Packages

In [42]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
import joblib
import matplotlib.pylab as pl
import seaborn as sns
import shap
from scipy.stats import pearsonr

Reading the data

In [17]:
train_data = pd.read_csv("./Data/cleaned_train.csv")
test_data = pd.read_csv("./Data/cleaned_test.csv")
target = pd.read_csv("./Data/cleaned_target.csv")

target.set_index("Id", inplace = True)

Label encoding the features

In [18]:
enc = LabelEncoder()
train_result = {}
test_result = {}
for col in train_data.columns:
    if (train_data[col].dtype == "int" or train_data[col].dtype == "float"):
        train_result[col] = train_data[col]
        test_result[col] = test_data[col]
    else:
        train_result[col] = pd.Series(enc.fit_transform(train_data[col]))
        test_result[col] = pd.Series(enc.transform(test_data[col]))
train_features = pd.DataFrame(train_result)
test_features = pd.DataFrame(test_result)

train_features.set_index("Id", inplace = True)
test_features.set_index("Id", inplace = True)

In [19]:
train_data.columns

Index(['Id', 'LotArea', 'OverallQual', 'GarageCars', 'TotRmsAbvGrd',
       'OverallCond', 'Fireplaces', 'CentralAir', 'Street', 'YrSold',
       'KitchenQual', 'ExterQual', 'ExterCond', 'HeatingQC', 'GarageQual',
       'PoolQC', 'BsmtCond', 'Fence', 'BsmtExposure', 'BldgType', 'Functional',
       'ProxPos', 'ProxRoad', 'ProxRail', 'Heating', 'BasementQualFactor',
       'TotLivArea', 'FullBath', 'HalfBath', 'PorchSF', 'Neighborhood',
       'MSSubClass', 'SaleCondition'],
      dtype='object')

In [59]:
train_features.columns

Index(['LotArea', 'OverallQual', 'GarageCars', 'TotRmsAbvGrd', 'OverallCond',
       'Fireplaces', 'CentralAir', 'Street', 'YrSold', 'KitchenQual',
       'ExterQual', 'ExterCond', 'HeatingQC', 'GarageQual', 'PoolQC',
       'BsmtCond', 'Fence', 'BsmtExposure', 'BldgType', 'Functional',
       'ProxPos', 'ProxRoad', 'ProxRail', 'Heating', 'BasementQualFactor',
       'TotLivArea', 'FullBath', 'HalfBath', 'PorchSF', 'Neighborhood',
       'MSSubClass', 'SaleCondition'],
      dtype='object')

Random Forest

In [None]:
rf = RandomForestRegressor(n_jobs = -1, n_estimators = 140)
rf_params = {'max_features': range(3,10), 'max_depth': range(1, 13), 'min_samples_leaf': [4, 7, 14, 70, 140]}
gs_rf = GridSearchCV(rf, rf_params, cv = 10, return_train_score = True)

In [None]:
np.random.seed(0)
%time gs_rf.fit(train_features, target['SalePrice'])

In [None]:
gs_rf.best_score_

In [None]:
gs_rf.best_params_

In [None]:
importances = pd.Series(gs_rf.best_estimator_.feature_importances_, index=train_features.columns).sort_values(ascending=False)
importances[:20]

In [None]:
model_rf = gs_rf.best_estimator_

XGBoost without early stopping

In [None]:
xgb = XGBRegressor(n_jobs = -1, booster = "gbtree")
eta_range = [0.01 * i for i in range(1, 21)]
max_depth_range = range(2, 6)
n_estimators = range(100, 301,550)
xgb_params = {'max_depth': max_depth_range, 'eta': eta_range, 'n_estimators' : n_estimators}
gs_xgb = GridSearchCV(xgb, xgb_params, cv = 10, return_train_score = True)

In [None]:
np.random.seed(0)
%time gs_xgb.fit(train_features, target, verbose = 0)

XGBoost with early stopping

In [20]:
train_features_grid, train_features_validation, target_grid, target_validation = train_test_split(train_features, target, test_size = 0.1)

In [21]:
xgb = XGBRegressor(n_jobs = -1, booster = "gbtree", n_estimators = 1000, colsample_bytree = 0.5)
eta_range = [0.01 * i for i in range(1, 41)]
max_depth_range = range(1, 7)
xgb_params = {'max_depth': max_depth_range, 'eta': eta_range}
gs_xgb = GridSearchCV(xgb, xgb_params, cv = 10, return_train_score = True)

In [22]:
np.random.seed(0)
fit_params={"early_stopping_rounds":10, 
            "eval_metric" : "mae", 
            "eval_set" : [[train_features_validation, target_validation]]}
%time gs_xgb.fit(train_features_grid, target_grid,verbose = 0, **fit_params)

CPU times: user 22min 37s, sys: 2min 35s, total: 25min 13s
Wall time: 3min 24s


GridSearchCV(cv=10,
             estimator=XGBRegressor(base_score=None, booster='gbtree',
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.5,
                                    gamma=None, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=1000, n_...
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_parame

Second round, narrowing in on eta

In [24]:
xgb = XGBRegressor(n_jobs = -1, booster = "gbtree", n_estimators = 1000, colsample_bytree = 0.5, max_depth = gs_xgb.best_params_["max_depth"])
eta_base = gs_xgb.best_params_["eta"] - 0.01
eta_range = [eta_base + 0.001 * i for i in range(1,20)]
xgb_params = {'eta': eta_range}
gs_xgb = GridSearchCV(xgb, xgb_params, cv = 10, return_train_score = True)

In [25]:
np.random.seed(0)
fit_params={"early_stopping_rounds":10, 
            "eval_metric" : "mae", 
            "eval_set" : [[train_features_validation, target_validation]]}
%time gs_xgb.fit(train_features_grid, target_grid,verbose = 0, **fit_params)

CPU times: user 5min 33s, sys: 32.1 s, total: 6min 5s
Wall time: 48.3 s


GridSearchCV(cv=10,
             estimator=XGBRegressor(base_score=None, booster='gbtree',
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.5,
                                    gamma=None, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=4, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=1000, n_job...
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_parameters=None,
                                    verbosity=None),
             param_grid={'eta': [0.011, 0.012, 0.013000000000000001, 0.014,
        

Fitting the residuals

In [52]:
model_xgb = gs_xgb.best_estimator_
predvalues_train_xgb = pd.Series(model_xgb.predict(train_features))
prediction_train_xgb = pd.DataFrame({"Id" : train_data["Id"], "SalePrice" : predvalues_train_xgb})
prediction_train_xgb.set_index("Id", inplace = True)
residuals = target - prediction_train_xgb

In [30]:
lm = Ridge()
lm_params = {"alpha" : np.linspace(0.001, 100, 100)}
gs_lm = GridSearchCV(lm, lm_params, cv = 10, return_train_score = True)

In [56]:
def resid_correl(col):
    return pearsonr(residuals["SalePrice"], col)[0]

In [78]:
correlations = [resid_correl(train_features[col]) for col in train_features.columns]
sorted_correlations = pd.Series(correlations, index=train_features.columns).sort_values(ascending=False)
mask = (sorted_correlations > 0.1) | (sorted_correlations < -0.1)
selected_cols = sorted_correlations[mask].index

In [79]:
np.random.seed(0)
%time gs_lm.fit(train_features[selected_cols], residuals)

CPU times: user 4.1 s, sys: 10.5 ms, total: 4.11 s
Wall time: 4.11 s


GridSearchCV(cv=10, estimator=Ridge(),
             param_grid={'alpha': array([1.00000000e-03, 1.01109091e+00, 2.02118182e+00, 3.03127273e+00,
       4.04136364e+00, 5.05145455e+00, 6.06154545e+00, 7.07163636e+00,
       8.08172727e+00, 9.09181818e+00, 1.01019091e+01, 1.11120000e+01,
       1.21220909e+01, 1.31321818e+01, 1.41422727e+01, 1.51523636e+01,
       1.61624545e+01, 1.71725455e+01, 1.81826364e+01, 1...
       7.67679091e+01, 7.77780000e+01, 7.87880909e+01, 7.97981818e+01,
       8.08082727e+01, 8.18183636e+01, 8.28284545e+01, 8.38385455e+01,
       8.48486364e+01, 8.58587273e+01, 8.68688182e+01, 8.78789091e+01,
       8.88890000e+01, 8.98990909e+01, 9.09091818e+01, 9.19192727e+01,
       9.29293636e+01, 9.39394545e+01, 9.49495455e+01, 9.59596364e+01,
       9.69697273e+01, 9.79798182e+01, 9.89899091e+01, 1.00000000e+02])},
             return_train_score=True)

In [80]:
gs_lm.best_score_

-0.014851930570850746

In [81]:
gs_lm.best_estimator_.coef_

array([[6302.51730177]])

XGBoost outcomes

In [26]:
gs_xgb.best_score_

0.8993437967453376

In [27]:
gs_xgb.best_params_

{'eta': 0.02}

In [28]:
importances = pd.Series(gs_xgb.best_estimator_.feature_importances_, index=train_features.columns).sort_values(ascending=False)
importances[:20]

ExterQual             0.289064
OverallQual           0.202760
TotLivArea            0.140788
GarageCars            0.079861
Neighborhood          0.064700
KitchenQual           0.040763
FullBath              0.022571
Fireplaces            0.017234
BsmtExposure          0.014612
TotRmsAbvGrd          0.013670
CentralAir            0.013066
MSSubClass            0.010575
LotArea               0.009415
BldgType              0.008786
BasementQualFactor    0.008375
HalfBath              0.007400
GarageQual            0.007373
ProxPos               0.006296
SaleCondition         0.006242
HeatingQC             0.005926
dtype: float32

Running the model on the test dataset

In [None]:
pred_xgb = pd.Series(model_xgb.predict(test_features))
prediction_xgb = pd.DataFrame({"Id" : test_data["Id"], "SalePrice" : pred_xgb})
prediction_xgb.set_index("Id", inplace = True)

In [None]:
prediction_xgb.to_csv("./Data/prediction_xgb.csv")

In [None]:
pred_rf = pd.Series(model_rf.predict(test_features))
prediction_rf = pd.DataFrame({"Id" : test_data["Id"], "SalePrice" : pred_rf})
prediction_rf.set_index("Id", inplace = True)

In [None]:
prediction_rf.to_csv("./Data/prediction_rf.csv")

In [None]:
pred_av = (pred_xgb + pred_rf) / 2
prediction_av = pd.DataFrame({"Id" : test_data["Id"], "SalePrice" : pred_av})
prediction_av.set_index("Id", inplace = True)

In [None]:
prediction_av.to_csv("./Data/prediction_av.csv")

Save encoder and model for later

In [None]:
filename = 'fitted_model.sav'
joblib.dump(model_xgb, filename)
filename = 'fitted_encoder.sav'
joblib.dump(enc, filename)

In [None]:
train_features.to_csv("./Data/encoded_train_data.csv")

In [None]:
shap_values = shap.TreeExplainer(model_xgb).shap_values(train_features)

In [None]:
shap.summary_plot(shap_values, train_features)

In [None]:
shap.dependence_plot("TotLivArea", shap_values, train_features)

In [None]:
shap.dependence_plot("OverallQual", shap_values, train_features)

In [None]:
shap.dependence_plot("LotArea", shap_values, train_features)

In [None]:
shap.dependence_plot("Neighborhood", shap_values, train_features)

In [None]:
shap_interaction_values = shap.TreeExplainer(model_xgb).shap_interaction_values(train_features)

In [None]:
shap.summary_plot(shap_interaction_values, train_features)

In [None]:
shap.dependence_plot(
    ("TotLivArea", "TotLivArea"),
    shap_interaction_values, train_features,
    display_features = train_features
)

In [None]:
shap.dependence_plot(
    ("TotLivArea", "OverallQual"),
    shap_interaction_values, train_features,
    display_features = train_features
)

In [None]:
shap.dependence_plot(
    ("Neighborhood", "OverallQual"),
    shap_interaction_values, train_features,
    display_features = train_features
)

In [None]:
tmp = np.abs(shap_interaction_values).sum(0)
for i in range(tmp.shape[0]):
    tmp[i,i] = 0
inds = np.argsort(-tmp.sum(0))[:50]
tmp2 = tmp[inds,:][:,inds]
pl.figure(figsize=(12,12))
pl.imshow(tmp2)
pl.yticks(range(tmp2.shape[0]), train_features.columns[inds], rotation=50.4, horizontalalignment="right")
pl.xticks(range(tmp2.shape[0]), train_features.columns[inds], rotation=50.4, horizontalalignment="left")
pl.gca().xaxis.tick_top()
pl.show()

In [39]:
def heatmap(valuesDF, threshold):
    # Make a mask to only show the lower left part of the table
    mask = np.triu(np.ones_like(valuesDF, dtype = bool))

    cmap = sns.diverging_palette(220, 20, as_cmap = True)

    # Draw the heatmap with the mask and correct aspect ratio
    visual = sns.heatmap(valuesDF, mask = mask, cmap = cmap, center = threshold,
                square = True, linewidths = .5, annot = True);

    return visual