In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
plt.style.use("fivethirtyeight")

from tqdm.notebook import tqdm
from sklearn.base import BaseEstimator
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectKBest, chi2


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

Getting data

In [2]:
# Folder (relative to the notebook) that holds the competition CSV files.
dataset_folder = 'datasets'

dataset_name_train = 'train.csv'
dataset_name_test = 'test.csv'
# File name kept under its own variable so that `sample_submission` only ever
# refers to the loaded DataFrame, never to a string.
dataset_name_sample_submission = 'sample_submission.csv'

train = pd.read_csv(dataset_folder + '/' + dataset_name_train)
test = pd.read_csv(dataset_folder + '/' + dataset_name_test)
sample_submission = pd.read_csv(dataset_folder + '/' + dataset_name_sample_submission)


X and y

In [3]:
# Training features: everything except the target and the row identifier.
X_train = train.drop(columns=['SalePrice', 'Id'])
# Regression target.
y_train = train['SalePrice']

# Test features: only the row identifier is removed.
X_test = test.drop(columns=['Id'])


Get numeric and categorical features

In [4]:
# Column names of the numeric features.
num_features = X_train.select_dtypes(include=np.number).columns.tolist()
# Column names of the categorical features. 'category' is included next to
# 'object' because a later cell casts these columns to the category dtype;
# without it, re-running this cell afterwards would return an empty list.
cat_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

Delete duplicates

In [5]:
# Remove rows of `train` that are exact duplicates across all columns.
train = train.drop_duplicates()

# X_train / y_train were split from `train` before this cell, so dropping
# rows from `train` alone would not reach them. Keep only the surviving rows
# (matched by index label) so the de-duplication affects the training data.
X_train = X_train.loc[train.index]
y_train = y_train.loc[train.index]

Fill cat features with "NaN"

In [6]:
# Replace missing categorical values with the literal string 'NaN' so that
# "missing" becomes an ordinary category level.
imputer_cat = SimpleImputer(strategy='constant', fill_value='NaN')
X_train[cat_features] = imputer_cat.fit_transform(X_train[cat_features])
X_train[cat_features] = X_train[cat_features].astype('category')

# The imputer is fitted on the training data only; the test set is merely
# transformed with it, so nothing is ever learned from test rows.
X_test[cat_features] = imputer_cat.transform(X_test[cat_features])
X_test[cat_features] = X_test[cat_features].astype('category')

Make encoders

In [7]:
# Numeric columns: fill gaps with the column median, then rescale with MinMaxScaler.
num_transformer_linear = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])
# Categorical columns: one-hot encode; handle_unknown='ignore' keeps levels
# unseen during fit from raising at transform time.
cat_transformer_linear = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer_linear, num_features),
        ("cat", cat_transformer_linear, cat_features),
    ]
)


Result lists

In [8]:
# Three parallel containers, filled in by later cells: the pipelines to tune,
# their parameter grids, and a display name for each.
pipelines, params, names = [], [], []

Pipelines

In [9]:
# Tree pipeline: shared preprocessing, then a reduction to 20 components.
# TruncatedSVD is seeded like every other estimator in this notebook so that
# repeated runs give the same components and therefore the same CV scores.
# The final 'tree' step is a placeholder that the parameter grid fills in.
pipeline_tree = Pipeline([
    ('preprocessor', preprocessor),
    ('trun_svd', TruncatedSVD(n_components=20, random_state=1)),
    ('tree', None),
])

# LightGBM and CatBoost pipelines have no preprocessing step; their single
# placeholder step is likewise supplied by the parameter grid.
pipeline_lgm = Pipeline([('lgm', None)])

pipeline_catboost = Pipeline([('cat', None)])

Parameters

In [10]:
# Every grid is a list with one dict. The key named after the pipeline step
# supplies the estimator placed into that step; `<step>__<param>` keys list
# the values tried for that estimator's hyper-parameter.

# Random forest: n_estimators and max_depth each take the values 1, 6, 11, 16.
param_tree = [
    {
        'tree': [RandomForestRegressor(random_state=1)],
        'tree__n_estimators': range(1, 20, 5),
        'tree__max_depth': range(1, 20, 5),
    },
]

# Random forest with library defaults (only the seed is fixed).
param_tree_default = [
    {
        'tree': [RandomForestRegressor(random_state=1)],
    },
]

# LightGBM: sweep over number of trees and maximum depth at learning rate 0.1.
param_lgm = [
    {
        'lgm': [LGBMRegressor(random_state=1, learning_rate=0.1)],
        'lgm__n_estimators': [50, 100, 200, 400, 800, 1600],
        'lgm__max_depth': [5, 10, 20, 40, 80, 160],
    },
]

# LightGBM with library defaults (only the seed is fixed).
param_lgm_default = [
    {
        'lgm': [LGBMRegressor(random_state=1)],
    },
]

# CatBoost: single configuration, silent training, with the categorical
# column names passed through `cat_features`.
param_catboost = [
    {
        'cat': [CatBoostRegressor(random_state=1, verbose=False, cat_features=cat_features)],
    },
]

Names

In [11]:
# Display names, in the same order as the pipelines and parameter grids.
names = [
    'TREE',
    'TREE_DEFAULT',
    'LGM',
    'LGM_DEFAULT',
    'CAT_BOOST',
]

In [12]:
# Build the lists by assignment rather than .append so that re-running this
# cell does not duplicate entries. Each pipeline is paired by position with
# the grid at the same index; the tree and LightGBM pipelines each appear
# twice, once with the tuned grid and once with the default one.
pipelines = [
    pipeline_tree,
    pipeline_tree,
    pipeline_lgm,
    pipeline_lgm,
    pipeline_catboost,
]

params = [
    param_tree,
    param_tree_default,
    param_lgm,
    param_lgm_default,
    param_catboost,
]

# The search loop zips these lists together, and zip() silently truncates to
# the shortest one, so fail loudly if they ever get out of step.
assert len(pipelines) == len(params) == len(names), "pipelines, params and names must align"



Loop over pipelines

In [13]:
# Fitted GridSearchCV object per experiment name.
results = {}
# zip() has no length, so the progress bar is given the total explicitly;
# otherwise it cannot show how many experiments remain.
for pipeline, param_grid, name in tqdm(zip(pipelines, params, names), total=len(names)):
    gs = GridSearchCV(
        pipeline,
        param_grid,
        scoring='neg_root_mean_squared_error',
        # Single-metric search: refit the best candidate on the full training data.
        refit=True,
        n_jobs=10,
        cv=5,
    )

    gs.fit(X_train, y_train)

    # top models table: the five best candidates by cross-validation rank
    print('Top ' + name + ' models:')
    results_df = pd.DataFrame(gs.cv_results_).sort_values(by="rank_test_score")
    display(
        results_df[['params', 'mean_test_score', 'mean_fit_time', 'rank_test_score']]
        .head(5)
        .rename(columns={'mean_test_score': 'cv_score',
                         'rank_test_score': 'rank_cv_score'})
    )

    results[name] = gs




0it [00:00, ?it/s]

Top TREE models:


Unnamed: 0,params,cv_score,mean_fit_time,rank_cv_score
15,"{'tree': RandomForestRegressor(max_depth=16, n...",-41137.299629,0.200399,1
11,"{'tree': RandomForestRegressor(max_depth=16, n...",-41826.311087,0.2098,2
14,"{'tree': RandomForestRegressor(max_depth=16, n...",-41929.139802,0.167398,3
7,"{'tree': RandomForestRegressor(max_depth=16, n...",-42428.320191,0.159999,4
10,"{'tree': RandomForestRegressor(max_depth=16, n...",-42475.337825,0.159799,5


Top TREE_DEFAULT models:


Unnamed: 0,params,cv_score,mean_fit_time,rank_cv_score
0,{'tree': RandomForestRegressor(random_state=1)},-40884.708183,0.9124,1


Top LGM models:


Unnamed: 0,params,cv_score,mean_fit_time,rank_cv_score
24,"{'lgm': LGBMRegressor(max_depth=40, n_estimato...",-27638.697967,0.120999,1
18,"{'lgm': LGBMRegressor(max_depth=40, n_estimato...",-27638.697967,0.123399,1
30,"{'lgm': LGBMRegressor(max_depth=40, n_estimato...",-27638.697967,0.120199,1
12,"{'lgm': LGBMRegressor(max_depth=40, n_estimato...",-27645.633771,0.126999,4
1,"{'lgm': LGBMRegressor(max_depth=40, n_estimato...",-27662.88751,0.111398,5


Top LGM_DEFAULT models:


Unnamed: 0,params,cv_score,mean_fit_time,rank_cv_score
0,{'lgm': LGBMRegressor(random_state=1)},-27901.079555,0.1666,1


Top CAT_BOOST models:


Unnamed: 0,params,cv_score,mean_fit_time,rank_cv_score
0,{'cat': <catboost.core.CatBoostRegressor objec...,-26391.026523,45.84183,1


Make test submission with best estimator

In [14]:
# Predict with the CatBoost search's refitted best estimator.
predictions = results['CAT_BOOST'].best_estimator_.predict(X_test)

# Build the submission in its own frame instead of adding a column to `test`,
# so the raw test data stays unchanged and re-running earlier cells cannot
# pick up predictions as a feature.
# The prediction column is named after the training target.
# NOTE(review): presumably this matches the header of sample_submission.csv — confirm.
submission = pd.DataFrame({'Id': test['Id'], y_train.name: predictions})
submission.to_csv('test_submission.csv', index=False)