In [140]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
plt.style.use("fivethirtyeight")

from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectKBest, chi2


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

Getting data

In [141]:
dataset_folder = 'datasets'

dataset_name_train = 'train.csv'
dataset_name_test = 'test.csv'
sample_submission = 'sample_submission.csv'

train = pd.read_csv(dataset_folder +'/'+ dataset_name_train)
test = pd.read_csv(dataset_folder +'/'+ dataset_name_test)
sample_submission = pd.read_csv(dataset_folder +'/'+ sample_submission)


X and y

In [142]:
X_train = train.drop(columns={'SalePrice', "Id"}, axis=1)
y_train = train['SalePrice']

X_test = test.drop(columns={"Id"}, axis=1)


Get num and cat features

In [143]:
num_features = X_train.select_dtypes(include=np.number).columns.tolist()
cat_features = X_train.select_dtypes(['object']).columns.tolist()

Delete duplicates

In [144]:
train.drop_duplicates(inplace=True)

Fill cat features with "NaN"

In [145]:
imputer_cat = SimpleImputer(strategy='constant', fill_value='NaN')
X_train[cat_features] = imputer_cat.fit_transform(X_train[cat_features])
X_train[cat_features] = X_train[cat_features].astype('category')

Make encoders

In [146]:
cat_transformer_linear = OneHotEncoder(handle_unknown='ignore')
num_transformer_linear = Pipeline([ ('imputer', SimpleImputer(strategy='median')), ('scaler', MinMaxScaler()) ])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer_linear, num_features),
        ("cat", cat_transformer_linear, cat_features)
    ]
)

In [147]:
# imputer = SimpleImputer(strategy='median')
# X_train[num_features] = imputer.fit_transform(X_train[num_features])

# #top models table
#     print('Top ' + name + ' models:')
#     results_df = pd.DataFrame(gs.cv_results_)
#     results_df = results_df.sort_values(by=["rank_test_score"])
#     display(results_df[['params', 'mean_test_score', 'mean_fit_time', 'rank_test_score']]
#             .head(5)
#             .rename(columns={'mean_test_score': 'cv_score'})
#             .rename(columns={'rank_test_score': 'rank_cv_score'}))

#     #best cv score
#     print('best cv score of ' + name + ' model:')
#     print(gs.best_score_)

#     #test score of best estimator + time prediction calculation
#     print('Test ' + name + ' score:')
#     start = time.time()
#     print(mean_squared_error(y_test, gs.best_estimator_.predict(X_test), squared=False))
#     pred_time = time.time() - start

#     #time prediction speed of best estimator
#     print('Prediction time best ' + name + ' model:')
#     print(pred_time)   
#     print()
#     print()     

# results = []
# for pipeline, param_grid, name in zip(pipelines, param_grids, names):   
#     results.append(gs_and_result(pipeline, param_grid, name))  


Result lists

In [148]:
pipelines = []
params = []
names = []

Pipelines

In [149]:
pipeline_tree = Pipeline([('preprocessor_linear', preprocessor), ('trun_svd', TruncatedSVD(n_components=20)),
                          ('tree', None)])  
   
pipeline_lgm = Pipeline([('lgm', None)])         

pipeline_catboost = Pipeline([ ('cat', None)])                          

Parametres

In [150]:
param_tree = [
    {'tree': (RandomForestRegressor(random_state=1),),
     'tree__n_estimators': range(1, 20, 5),
     'tree__max_depth': range(1, 20, 5),
     }
]

param_tree_default = [
    {'tree': (RandomForestRegressor(random_state=1),),     
     }
]

param_lgm = [
    {'lgm': (LGBMRegressor(random_state=1, learning_rate=0.1),),
     'lgm__n_estimators': [50, 100, 200, 400, 800, 1600],
     'lgm__max_depth': [5, 10, 20, 40, 80, 160],
     }
]

param_lgm_default = [
    {'lgm': (LGBMRegressor(random_state=1),),}
]

param_catboost = [
     {'cat': (CatBoostRegressor(random_state=1, verbose=False, cat_features=cat_features),),
      }
 ]

In [151]:
names = ['TREE' , "TREE_DEFAULT", "LGM", "LGM_DEFAULT", "CAT_BOOST"]

In [152]:
pipelines.append(pipeline_tree)
pipelines.append(pipeline_tree)
pipelines.append(pipeline_lgm)
pipelines.append(pipeline_lgm)
pipelines.append(pipeline_catboost)

params.append(param_tree)
params.append(param_tree_default)
params.append(param_lgm)
params.append(param_lgm_default)
params.append(param_catboost)



In [153]:
results = {}
for pipeline, param_grid, name in zip(pipelines, params, names):   
     gs = GridSearchCV(pipeline,
                  param_grid,
                  scoring='neg_root_mean_squared_error',
                  refit='neg_root_mean_squared_error',
                  n_jobs=10,
                  cv=5 )

     gs.fit(X_train, y_train)  
     
     #top models table
     print('Top ' + name + ' models:')
     results_df = pd.DataFrame(gs.cv_results_)
     results_df = results_df.sort_values(by=["rank_test_score"])
     display(results_df[['params', 'mean_test_score', 'mean_fit_time', 'rank_test_score']]
            .head(5)
            .rename(columns={'mean_test_score': 'cv_score'})
            .rename(columns={'rank_test_score': 'rank_cv_score'}))

     results[name] = gs     




Top TREE models:


Unnamed: 0,params,cv_score,mean_fit_time,rank_cv_score
15,"{'tree': RandomForestRegressor(max_depth=16, n...",-41019.154889,0.1774,1
11,"{'tree': RandomForestRegressor(max_depth=16, n...",-41433.769892,0.198399,2
14,"{'tree': RandomForestRegressor(max_depth=16, n...",-42211.302952,0.1528,3
7,"{'tree': RandomForestRegressor(max_depth=16, n...",-42511.345882,0.157,4
10,"{'tree': RandomForestRegressor(max_depth=16, n...",-42626.783391,0.157801,5


Top TREE_DEFAULT models:


Unnamed: 0,params,cv_score,mean_fit_time,rank_cv_score
0,{'tree': RandomForestRegressor(random_state=1)},-40644.703536,0.88377,1


Top LGM models:


Unnamed: 0,params,cv_score,mean_fit_time,rank_cv_score
24,"{'lgm': LGBMRegressor(max_depth=40, n_estimato...",-27638.697967,0.118797,1
18,"{'lgm': LGBMRegressor(max_depth=40, n_estimato...",-27638.697967,0.119656,1
30,"{'lgm': LGBMRegressor(max_depth=40, n_estimato...",-27638.697967,0.126128,1
12,"{'lgm': LGBMRegressor(max_depth=40, n_estimato...",-27645.633771,0.113882,4
1,"{'lgm': LGBMRegressor(max_depth=40, n_estimato...",-27662.88751,0.110163,5


Top LGM_DEFAULT models:


Unnamed: 0,params,cv_score,mean_fit_time,rank_cv_score
0,{'lgm': LGBMRegressor(random_state=1)},-27901.079555,0.158435,1


In [None]:
results['CAT_BOOST'].best_estimator_.predict()

Pipeline(steps=[('cat',
                 <catboost.core.CatBoostRegressor object at 0x000001DB48AACC40>)])

In [None]:
CatBoostRegressor(random_state=1, )

<catboost.core.CatBoostRegressor at 0x1db489ecd60>