Packages

In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
import joblib
#import shap
import matplotlib.pylab as pl

Reading the data

In [6]:
# Load the pre-cleaned training features, test features and training target.
# Paths are relative to the notebook's working directory.
train_data = pd.read_csv("./Data/cleaned_train.csv")
test_data = pd.read_csv("./Data/cleaned_test.csv")

# Index the target frame by its `Id` column; the feature frames are indexed
# by `Id` as well once they have been encoded.
target = pd.read_csv("./Data/cleaned_target.csv").set_index("Id")

Label encoding the features

In [7]:
# Encode every non-numeric column as integer codes: the mapping is fitted on
# the training data and the same mapping is applied to the test data.
#
# One LabelEncoder is fitted per column and stored in `encoders` (keyed by
# column name). Refitting a single shared encoder would overwrite its classes
# on every column, leaving only the last column's mapping usable afterwards.
encoders = {}
# `enc` is still bound after the loop (to the encoder of the last categorical
# column, exactly as before) so later cells that reference it keep working.
enc = LabelEncoder()
train_result = {}
test_result = {}
for col in train_data.columns:
    col_dtype = train_data[col].dtype
    # Use pandas' dtype predicates rather than comparing with the "int"/"float"
    # aliases, whose bit width depends on the platform. Booleans are excluded
    # so they continue to be label-encoded like other non-numeric columns.
    is_number = (pd.api.types.is_numeric_dtype(col_dtype)
                 and not pd.api.types.is_bool_dtype(col_dtype))
    if is_number:
        train_result[col] = train_data[col]
        test_result[col] = test_data[col]
    else:
        enc = LabelEncoder()
        encoders[col] = enc
        # Reuse the source frame's index so encoded columns stay row-aligned
        # with the untouched numeric columns in the DataFrame constructor.
        train_result[col] = pd.Series(enc.fit_transform(train_data[col]), index = train_data.index)
        # NOTE: transform raises ValueError if the test data contains a
        # category that never occurred in the training data.
        test_result[col] = pd.Series(enc.transform(test_data[col]), index = test_data.index)

# Assemble the encoded frames and index them by `Id`.
train_features = pd.DataFrame(train_result).set_index("Id")
test_features = pd.DataFrame(test_result).set_index("Id")

In [8]:
# Show the column labels of the raw training frame (here `Id` is still a regular column;
# it only becomes the index in `train_features`).
train_data.columns

Index(['Id', 'LotArea', 'OverallQual', 'GarageCars', 'TotRmsAbvGrd',
       'OverallCond', 'GrLivArea', 'TotalBsmtSF', 'YrSold', 'KitchenQual',
       'ExterQual', 'ExterCond', 'HeatingQC', 'GarageQual', 'Fence',
       'BldgType', 'Functional', 'ProxPos', 'ProxRoad', 'ProxRail',
       'BasementQualFactor', 'FullBath', 'HalfBath', 'PorchSF', 'Neighborhood',
       'MSSubClass', 'SaleCondition'],
      dtype='object')

Random Forest

In [None]:
# Random forest with a fixed number of trees; n_jobs = -1 lets it use all cores.
rf = RandomForestRegressor(n_estimators = 140, n_jobs = -1)

# Hyper-parameter grid, searched exhaustively with 10-fold cross-validation.
rf_params = {
    'max_features': range(3,10),
    'max_depth': range(1, 13),
    'min_samples_leaf': [4, 7, 14, 70, 140],
}
gs_rf = GridSearchCV(estimator = rf, param_grid = rf_params, cv = 10, return_train_score = True)

In [None]:
# Seed NumPy's global RNG right before fitting: `rf` was created without an explicit random_state.
np.random.seed(0)
# Run the grid search on the encoded training features against the SalePrice column and time the call.
%time gs_rf.fit(train_features, target['SalePrice'])

In [None]:
# Best mean cross-validated score found by the random-forest grid search
# (estimator's default scorer, since no `scoring` argument was passed).
gs_rf.best_score_

In [None]:
# Hyper-parameter combination selected by the random-forest grid search.
gs_rf.best_params_

In [None]:
# Rank the features by the importance reported by the best random forest
# and display the 20 highest-ranked ones.
importances = (
    pd.Series(gs_rf.best_estimator_.feature_importances_, index=train_features.columns)
    .sort_values(ascending=False)
)
importances.head(20)

In [None]:
# Keep the best random-forest estimator from the grid search for the prediction cells below.
model_rf = gs_rf.best_estimator_

XGBoost with early stopping

In [None]:
# Hold out 20% of the training rows as a fixed validation set; the fitting cell
# uses it as the XGBoost `eval_set` for early stopping.
# random_state pins the split: np.random.seed(0) is only called in a later cell,
# so without it the split would depend on whatever global RNG state earlier
# cells happened to leave behind.
(train_features_grid, train_features_validation,
 target_grid, target_validation) = train_test_split(train_features, target, test_size = 0.2, random_state = 0)

In [None]:
# XGBoost regressor with a budget of 1000 boosting rounds; the fitting cell
# passes early_stopping_rounds, so fewer rounds may actually be used.
xgb = XGBRegressor(n_estimators = 1000, n_jobs = -1)

# Grid over tree depth and learning rate (`eta`), searched with 10-fold cross-validation.
xgb_params = {
    'max_depth': range(1, 6),
    'eta': np.linspace(0.0001, 0.5, num = 50, endpoint = True),
}
gs_xgb = GridSearchCV(estimator = xgb, param_grid = xgb_params, cv = 10, return_train_score = True)

In [None]:
np.random.seed(0)
fit_params={"early_stopping_rounds":10, 
            "eval_metric" : "mae", 
            "eval_set" : [[train_features_validation, target_validation]]}
%time gs_xgb.fit(train_features_grid, target_grid,verbose = 0, **fit_params)

XGBoost without early stopping

In [9]:
# Second XGBoost search, this time tuning the number of trees directly instead
# of relying on early stopping.
# NOTE: this rebinds `xgb`, `xgb_params` and `gs_xgb` from the early-stopping section.
xgb = XGBRegressor(booster = "gbtree", n_jobs = -1)

# 10 learning rates between 0.0001 and 0.5, and tree counts 25, 50, ..., 250.
eta = np.linspace(0.0001, 0.5, num = 10, endpoint = True)
n_estimators = list(range(25, 251, 25))

xgb_params = {
    'max_depth': range(2, 6),
    'eta': eta,
    'n_estimators' : n_estimators,
}
gs_xgb = GridSearchCV(estimator = xgb, param_grid = xgb_params, cv = 5, return_train_score = True)

In [10]:
np.random.seed(0)
%time gs_xgb.fit(train_features, target, verbose = 0)

CPU times: user 23min 22s, sys: 1min 40s, total: 25min 3s
Wall time: 3min 11s


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster='gbtree',
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_j...
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_parameters=None,
                                    verbosity=None),
             param_grid={'eta': array([1.00000000e-04, 5.56444444e-02, 1.11188889e-0

XGBoost outcomes

In [11]:
# Best mean cross-validated score of the XGBoost grid search
# (estimator's default scorer, since no `scoring` argument was passed).
gs_xgb.best_score_

0.8995630541909879

In [12]:
# Hyper-parameter combination selected by the XGBoost grid search.
gs_xgb.best_params_

{'eta': 0.22227777777777777, 'max_depth': 3, 'n_estimators': 150}

In [13]:
# Rank the features by the importance reported by the best XGBoost model
# and display the 20 highest-ranked ones.
# NOTE: this rebinds `importances` from the random-forest section.
importances = (
    pd.Series(gs_xgb.best_estimator_.feature_importances_, index=train_features.columns)
    .sort_values(ascending=False)
)
importances.head(20)

OverallQual           0.431289
GarageCars            0.114163
Neighborhood          0.075524
KitchenQual           0.065440
FullBath              0.053081
GrLivArea             0.046780
ExterQual             0.036624
TotalBsmtSF           0.034225
TotRmsAbvGrd          0.028013
MSSubClass            0.016262
GarageQual            0.013208
SaleCondition         0.013165
BasementQualFactor    0.011668
OverallCond           0.011624
HalfBath              0.010975
HeatingQC             0.006987
LotArea               0.005986
Functional            0.004914
PorchSF               0.004058
ProxPos               0.003851
dtype: float32

In [None]:
# Keep the best XGBoost estimator from whichever `gs_xgb` search was fitted last.
model_xgb = gs_xgb.best_estimator_

Running the model on the test dataset

In [None]:
# Predict SalePrice for the test set with the tuned XGBoost model.
# `pred_xgb` carries a default positional index, which pairs row-by-row with
# test_data["Id"] because `test_data` was read with the default index too.
pred_xgb = pd.Series(model_xgb.predict(test_features))
prediction_xgb = pd.DataFrame({"Id" : test_data["Id"], "SalePrice" : pred_xgb}).set_index("Id")

In [None]:
# Write the XGBoost predictions (indexed by Id) to disk.
prediction_xgb.to_csv("./Data/prediction_xgb.csv")

In [None]:
# Predict SalePrice for the test set with the tuned random forest.
# `pred_rf` carries a default positional index, which pairs row-by-row with
# test_data["Id"] because `test_data` was read with the default index too.
pred_rf = pd.Series(model_rf.predict(test_features))
prediction_rf = pd.DataFrame({"Id" : test_data["Id"], "SalePrice" : pred_rf}).set_index("Id")

In [None]:
# Write the random-forest predictions (indexed by Id) to disk.
prediction_rf.to_csv("./Data/prediction_rf.csv")

In [None]:
# Simple ensemble: the element-wise mean of the XGBoost and random-forest predictions.
pred_av = (pred_xgb + pred_rf) / 2
prediction_av = pd.DataFrame({"Id" : test_data["Id"], "SalePrice" : pred_av}).set_index("Id")

In [None]:
# Write the averaged predictions (indexed by Id) to disk.
prediction_av.to_csv("./Data/prediction_av.csv")

Save encoder and model for later

In [None]:
# Save the fitted model and the label encoder to disk with joblib.
# No cell shown in this notebook defines a bare `model`, so the original
# `joblib.dump(model, ...)` raised NameError on a fresh kernel; the tuned
# XGBoost estimator is saved explicitly instead.
# NOTE(review): confirm that XGBoost (rather than `model_rf`) is the model meant to be kept.
filename = 'fitted_model.sav'
joblib.dump(model_xgb, filename)
# NOTE(review): `enc` is the encoder bound after the label-encoding loop, i.e. it
# only holds the classes of the last categorical column that was encoded.
filename = 'fitted_encoder.sav'
joblib.dump(enc, filename)

Checking model on the test data set