Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
import joblib
import shap
import matplotlib.pylab as pl

Reading the data

In [140]:
# Load the pre-cleaned train/test feature tables.
train_data = pd.read_csv("./Data/cleaned_train.csv")
test_data = pd.read_csv("./Data/cleaned_test.csv")

# Load the training target and index it by "Id" in one expression.
target = pd.read_csv("./Data/cleaned_target.csv").set_index("Id")

Label encoding the features

In [141]:
# Label-encode every non-numeric column, fitting on the training data and
# applying the same mapping to the test data.
#
# One LabelEncoder is fitted per column and kept in `encoders` (keyed by column
# name) so the mapping of every column can be reused or persisted later.
# Refitting a single shared encoder would only retain the classes of the last
# encoded column.
#
# `enc` stays bound for backward compatibility with later cells: after the loop
# it refers to the encoder of the last label-encoded column (or to an unfitted
# encoder if there was none).
#
# NOTE: LabelEncoder.transform raises ValueError when the test data contains a
# label that was not present in the training data.
enc = LabelEncoder()
encoders = {}
train_result = {}
test_result = {}
for col in train_data.columns:
    col_dtype = train_data[col].dtype
    # Width-independent numeric check (int32/float32 etc. are passed through too).
    if pd.api.types.is_integer_dtype(col_dtype) or pd.api.types.is_float_dtype(col_dtype):
        train_result[col] = train_data[col]
        test_result[col] = test_data[col]
    else:
        enc = LabelEncoder()
        # Keep the source index so encoded columns align with the passthrough ones.
        train_result[col] = pd.Series(enc.fit_transform(train_data[col]), index=train_data.index)
        test_result[col] = pd.Series(enc.transform(test_data[col]), index=test_data.index)
        encoders[col] = enc

# Index both feature tables by "Id" so it is not used as a model feature.
train_features = pd.DataFrame(train_result).set_index("Id")
test_features = pd.DataFrame(test_result).set_index("Id")

In [142]:
# Display the column names of the raw (pre-encoding) training table.
train_data.columns

Index(['Id', 'LotArea', 'OverallQual', 'GarageCars', 'YearBuilt',
       'TotRmsAbvGrd', 'OverallCond', 'MSSubClass', 'LotShape', 'LotConfig',
       'Neighborhood', 'YrSold', 'KitchenQual', 'ExterQual', 'ExterCond',
       'HeatingQC', 'Fence', 'BldgType', 'Functional', 'TotLivArea',
       'FullBath', 'HalfBath', 'PorchSF', 'ProxPos', 'ProxRoad', 'ProxRail'],
      dtype='object')

Random Forest

In [149]:
# Search space for the random forest: 7 x 12 x 5 = 420 parameter combinations.
rf_params = {
    'max_features': range(3, 10),
    'max_depth': range(1, 13),
    'min_samples_leaf': [4, 7, 14, 70, 140],
}

# Base estimator: 140 trees, built on all available cores.
rf = RandomForestRegressor(n_estimators=140, n_jobs=-1)

# Exhaustive 10-fold cross-validated search; train scores are recorded as well.
gs_rf = GridSearchCV(estimator=rf, param_grid=rf_params, cv=10, return_train_score=True)

In [150]:
# Seed NumPy's global RNG before fitting; neither the forest nor the search
# was given an explicit random_state.
np.random.seed(0)
# Run the grid search (420 combinations x 10 folds) against the SalePrice
# column of the target; %time reports the CPU and wall time of the fit.
%time gs_rf.fit(train_features, target['SalePrice'])

CPU times: user 7min 50s, sys: 56.5 s, total: 8min 47s
Wall time: 8min 10s


GridSearchCV(cv=10,
             estimator=RandomForestRegressor(n_estimators=140, n_jobs=-1),
             param_grid={'max_depth': range(1, 13),
                         'max_features': range(3, 10),
                         'min_samples_leaf': [4, 7, 14, 70, 140]},
             return_train_score=True)

In [151]:
# Mean cross-validated score of the best parameter combination. No `scoring`
# was passed to GridSearchCV, so the estimator's default scorer is used.
gs_rf.best_score_

0.860439491673491

In [152]:
# Parameter combination that achieved the best cross-validated score.
gs_rf.best_params_

{'max_depth': 12, 'max_features': 7, 'min_samples_leaf': 4}

In [154]:
# Feature importances of the best random forest, labelled by feature name
# and ordered from most to least important.
importances = (
    pd.Series(gs_rf.best_estimator_.feature_importances_, index=train_features.columns)
    .sort_values(ascending=False)
)
# Show the 20 most important features.
importances.head(20)

TotLivArea      0.268406
OverallQual     0.252931
GarageCars      0.102012
ExterQual       0.086475
YearBuilt       0.062913
KitchenQual     0.049624
LotArea         0.047378
FullBath        0.037322
TotRmsAbvGrd    0.027351
Neighborhood    0.014185
HeatingQC       0.009755
YrSold          0.008671
MSSubClass      0.007547
HalfBath        0.006500
OverallCond     0.005429
LotShape        0.003777
BldgType        0.002768
PorchSF         0.002223
LotConfig       0.002173
Fence           0.000784
dtype: float64

In [97]:
# Keep the best random forest found by the grid search for the test-set predictions.
model_rf = gs_rf.best_estimator_

XGBoost with early stopping

In [None]:
# Hold out 20% of the training data as the evaluation set for XGBoost early
# stopping; the remaining 80% is used by the grid search.
# random_state is fixed so the split is reproducible on its own: this cell
# runs before the np.random.seed(0) call of the fitting cell, so without it
# the split would depend on whatever RNG state earlier cells left behind.
train_features_grid, train_features_validation, target_grid, target_validation = train_test_split(
    train_features, target, test_size=0.2, random_state=0
)

In [None]:
# Search space for early-stopped XGBoost: 5 depths x 50 learning rates = 250 combinations.
xgb_params = {
    'max_depth': range(1, 6),
    'eta': np.linspace(0.0001, 0.5, num=50, endpoint=True),
}

# Upper bound of 1000 boosting rounds; early stopping is configured at fit time.
xgb = XGBRegressor(n_estimators=1000, n_jobs=-1)

# Exhaustive 10-fold cross-validated search; train scores are recorded as well.
gs_xgb = GridSearchCV(estimator=xgb, param_grid=xgb_params, cv=10, return_train_score=True)

In [None]:
# Seed NumPy's global RNG before fitting.
np.random.seed(0)
# Extra keyword arguments passed through GridSearchCV.fit to the estimator's fit():
# early stopping after 10 rounds, evaluated with MAE on the held-out validation split.
# The same validation split is used for every cross-validation fold.
# NOTE(review): newer XGBoost releases expect early_stopping_rounds / eval_metric
# on the estimator constructor rather than fit() - confirm against the installed version.
fit_params={"early_stopping_rounds":10, 
            "eval_metric" : "mae", 
            "eval_set" : [[train_features_validation, target_validation]]}
# Run the grid search (250 combinations x 10 folds); %time reports CPU and wall time.
%time gs_xgb.fit(train_features_grid, target_grid,verbose = 0, **fit_params)

XGBoost without early stopping

In [155]:
# Candidate learning rates: 50 evenly spaced values in [0.0001, 0.5].
eta = np.linspace(0.0001, 0.5, num=50, endpoint=True)
# Candidate numbers of boosting rounds: 25, 50, ..., 300.
n_estimators = list(range(25, 301, 25))

# Search space: 4 depths x 50 learning rates x 12 round counts = 2400 combinations.
xgb_params = {
    'max_depth': range(2, 6),
    'eta': eta,
    'n_estimators': n_estimators,
}

# Tree booster on all available cores; the number of rounds comes from the grid.
xgb = XGBRegressor(booster="gbtree", n_jobs=-1)

# Exhaustive 10-fold cross-validated search; train scores are recorded as well.
gs_xgb = GridSearchCV(estimator=xgb, param_grid=xgb_params, cv=10, return_train_score=True)

In [156]:
np.random.seed(0)
%time gs_xgb.fit(train_features, target, verbose = 0)

CPU times: user 7h 13min 51s, sys: 36min 20s, total: 7h 50min 11s
Wall time: 1h 1min 37s


GridSearchCV(cv=10,
             estimator=XGBRegressor(base_score=None, booster='gbtree',
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_...
       3.26565306e-01, 3.36767347e-01, 3.46969388e-01, 3.57171429e-01,
       3.67373469e-01, 3.77575510e-01, 3.87777551e-01, 3.97979592e-01,
       4.08181633e-01, 4.18383673e-01, 4.28585714e-01, 4.38787755e-01,
       4.48989796e-01, 4.59191837e-01, 4.69393878e-01, 4.79595918e-01,
       4

XGBoost outcomes

In [157]:
# Mean cross-validated score of the best XGBoost parameter combination. No `scoring`
# was passed to GridSearchCV, so the estimator's default scorer is used.
gs_xgb.best_score_

0.8797505460780949

In [158]:
# Parameter combination that achieved the best cross-validated score.
gs_xgb.best_params_

{'eta': 0.21434285714285714, 'max_depth': 2, 'n_estimators': 275}

In [159]:
# Feature importances of the best XGBoost model, labelled by feature name
# and ordered from most to least important.
importances = (
    pd.Series(gs_xgb.best_estimator_.feature_importances_, index=train_features.columns)
    .sort_values(ascending=False)
)
# Show the 20 most important features.
importances.head(20)

OverallQual     0.397834
KitchenQual     0.232355
GarageCars      0.090197
TotLivArea      0.079999
ExterQual       0.053834
YearBuilt       0.020539
LotArea         0.019549
HalfBath        0.015559
FullBath        0.014242
TotRmsAbvGrd    0.012263
OverallCond     0.011740
MSSubClass      0.011253
Neighborhood    0.008240
PorchSF         0.005668
LotShape        0.004193
ProxRoad        0.003923
ProxPos         0.003718
YrSold          0.003340
Functional      0.002856
LotConfig       0.002622
dtype: float32

In [160]:
# Keep the best XGBoost model found by the grid search for the test-set predictions.
model_xgb = gs_xgb.best_estimator_

Running the model on the test dataset

In [161]:
# Predict on the encoded test features with the tuned XGBoost model.
pred_xgb = pd.Series(model_xgb.predict(test_features))

# Pair each prediction with its Id (both are positionally indexed) and index by Id.
prediction_xgb = pd.DataFrame(
    {"Id": test_data["Id"], "SalePrice": pred_xgb}
).set_index("Id")

In [162]:
# Write the XGBoost predictions (Id index plus SalePrice column) to disk.
prediction_xgb.to_csv("./Data/prediction_xgb.csv")

In [104]:
# Predict on the encoded test features with the tuned random forest.
pred_rf = pd.Series(model_rf.predict(test_features))

# Pair each prediction with its Id (both are positionally indexed) and index by Id.
prediction_rf = pd.DataFrame(
    {"Id": test_data["Id"], "SalePrice": pred_rf}
).set_index("Id")

In [105]:
# Write the random forest predictions (Id index plus SalePrice column) to disk.
prediction_rf.to_csv("./Data/prediction_rf.csv")

In [107]:
# Simple ensemble: element-wise mean of the XGBoost and random forest predictions.
pred_av = pred_xgb.add(pred_rf).div(2)

# Pair each averaged prediction with its Id and index by Id.
prediction_av = pd.DataFrame(
    {"Id": test_data["Id"], "SalePrice": pred_av}
).set_index("Id")

In [108]:
# Write the averaged predictions (Id index plus SalePrice column) to disk.
prediction_av.to_csv("./Data/prediction_av.csv")

Save encoder and model for later

In [None]:
# Save the model to disk.
# No cell above defines `model` (only `model_rf` and `model_xgb`), so it is
# bound here explicitly. The XGBoost model is used because it achieved the
# higher cross-validated score of the two grid searches.
model = model_xgb
filename = 'fitted_model.sav'
joblib.dump(model, filename)

# Save the label encoder to disk.
# NOTE: `enc` holds the classes of only the last label-encoded column, so this
# file cannot be used to encode the other categorical columns.
filename = 'fitted_encoder.sav'
joblib.dump(enc, filename)

Checking model on the test data set