Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
import joblib
import matplotlib.pylab as pl
import seaborn as sns
import shap
from scipy.stats import pearsonr

Reading the data

In [9]:
# Load the cleaned feature tables; "Id" stays an ordinary column in both for now.
train_data = pd.read_csv("./Data/cleaned_train.csv")
test_data = pd.read_csv("./Data/cleaned_test.csv")

# The target table is keyed by "Id" straight away.
target = pd.read_csv("./Data/cleaned_target.csv").set_index("Id")

Label encoding the features

In [21]:
# Feature names only: "Id" ends up as the index of the encoded frames, so it is
# dropped from the listing. Reading the names from train_data keeps this cell
# runnable on a fresh kernel, where train_features has not been created yet at
# this point in the notebook.
train_data.columns.drop("Id")

Index(['LotArea', 'LotFrontage', 'OverallQual', 'GarageCars', 'TotRmsAbvGrd',
       'OverallCond', 'Fireplaces', 'CentralAir', 'Street', 'YrSold',
       'AgeBuilt', 'AgeRemod', 'KitchenQual', 'ExterQual', 'ExterCond',
       'HeatingQC', 'GarageQual', 'PoolQC', 'BsmtCond', 'Utilities', 'Fence',
       'LandSlope', 'LotShape', 'BsmtExposure', 'BldgType', 'MasVnrType',
       'Foundation', 'Electrical', 'Functional', 'ProxPos', 'ProxRoad',
       'ProxRail', 'Heating', 'BasementQualFactor', 'TotLivArea', 'FullBath',
       'HalfBath', 'PorchSF', 'Neighborhood', 'MSSubClass', 'SaleCondition'],
      dtype='object')

In [10]:
# Integer-encode every non-numeric column; encoders are fitted on the training
# data only. One LabelEncoder is kept per column in `encoders`, so every fitted
# mapping is still available after the loop. `enc` stays bound (to the encoder of
# the last encoded column) because later cells reference that name.
enc = LabelEncoder()
encoders = {}
train_result = {}
test_result = {}
for col in train_data.columns:
    column_dtype = train_data[col].dtype
    # Width-independent dtype test: comparing against the strings "int"/"float"
    # only matches the platform's default integer/float widths.
    if pd.api.types.is_integer_dtype(column_dtype) or pd.api.types.is_float_dtype(column_dtype):
        train_result[col] = train_data[col]
        test_result[col] = test_data[col]
    else:
        enc = LabelEncoder()
        train_result[col] = pd.Series(enc.fit_transform(train_data[col]))
        # transform() raises ValueError if the test data holds a label that was
        # not present in the training column.
        test_result[col] = pd.Series(enc.transform(test_data[col]))
        encoders[col] = enc

# "Id" becomes the index of both encoded frames.
train_features = pd.DataFrame(train_result).set_index("Id")
test_features = pd.DataFrame(test_result).set_index("Id")

XGBoost with early stopping

In [11]:
# Hold out 10% of the training rows as a separate validation set.
# No random_state is passed to train_test_split, so the numpy seed set here is
# what makes the partition repeatable.
np.random.seed(0)
holdout_split = train_test_split(train_features, target, test_size=0.1)
train_features_grid, train_features_validation, target_grid, target_validation = holdout_split

In [12]:
# Base regressor; eta and max_depth are supplied by the grid search.
xgb = XGBRegressor(
    n_jobs=-1,
    booster="gbtree",
    n_estimators=1000,
    colsample_bytree=0.5,
)

# 50 evenly spaced learning rates from 0.01 to 0.5 (both ends included) and
# tree depths 1 through 6.
eta_range = np.linspace(0.01, 0.5, num=50, endpoint=True)
max_depth_range = range(1, 7)
xgb_params = {"max_depth": max_depth_range, "eta": eta_range}

gs_xgb = GridSearchCV(estimator=xgb, param_grid=xgb_params, cv=10, return_train_score=True)

In [13]:
np.random.seed(0)
fit_params={"early_stopping_rounds" : 10, 
            "eval_metric" : "mae", 
            "eval_set" : [[train_features_validation, target_validation]]}
%time gs_xgb.fit(train_features_grid, target_grid,verbose = 0, **fit_params)

CPU times: user 31min 10s, sys: 4min 28s, total: 35min 39s
Wall time: 5min 27s


GridSearchCV(cv=10,
             estimator=XGBRegressor(base_score=None, booster='gbtree',
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.5,
                                    gamma=None, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=1000, n_...
                                    tree_method=None, validate_parameters=None,
                                    verbosity=None),
             param_grid={'eta': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11,
       0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19,

In [14]:
# Best mean cross-validated score of the first grid search. No `scoring` was
# passed to GridSearchCV, so this is the estimator's default score
# (presumably R^2 for a regressor - confirm).
gs_xgb.best_score_

0.9066610829620364

Second round, narrowing in on eta

In [16]:
# Second round: fix max_depth at the first-round optimum and search eta on a fine
# grid of 200 points (upper end excluded) spanning +/- 0.01 around the
# first-round best eta.
# NOTE: gs_xgb is read and then rebound in this cell, so the cell cannot be
# re-run on its own once it (or the following fit) has executed.
first_round_best = gs_xgb.best_params_
eta_mid = first_round_best["eta"]
xgb = XGBRegressor(
    n_jobs=-1,
    booster="gbtree",
    n_estimators=1000,
    colsample_bytree=0.5,
    max_depth=first_round_best["max_depth"],
)
eta_range = np.linspace(eta_mid - 0.01, eta_mid + 0.01, num=200, endpoint=False)
xgb_params = {"eta": eta_range}
gs_xgb = GridSearchCV(estimator=xgb, param_grid=xgb_params, cv=10, return_train_score=True)

In [17]:
np.random.seed(0)
fit_params={"early_stopping_rounds" : 10, 
            "eval_metric" : "mae", 
            "eval_set" : [[train_features_validation, target_validation]]}
%time gs_xgb.fit(train_features_grid, target_grid,verbose = 0, **fit_params)

CPU times: user 27min, sys: 3min 17s, total: 30min 17s
Wall time: 4min 32s


GridSearchCV(cv=10,
             estimator=XGBRegressor(base_score=None, booster='gbtree',
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.5,
                                    gamma=None, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=5, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=1000, n_job...
       0.0952, 0.0953, 0.0954, 0.0955, 0.0956, 0.0957, 0.0958, 0.0959,
       0.096 , 0.0961, 0.0962, 0.0963, 0.0964, 0.0965, 0.0966, 0.0967,
       0.0968, 0.0969, 0.097 , 0.0971, 0.0972, 0.0973, 0.0974, 0.0975,
       0.0976, 0.0977, 0.0978, 0.0979, 0.098 , 0.0981, 0.0982, 0.0983,
       0

In [18]:
# Best mean cross-validated score of the second (eta-only) grid search.
gs_xgb.best_score_

0.908909678772958

Reducing feature space

In [None]:
# Rank the features by the tuned model's importance scores and keep those whose
# importance exceeds 0.01.
best_xgb = gs_xgb.best_estimator_
importances = pd.Series(best_xgb.feature_importances_, index=train_features.columns)
importances = importances.sort_values(ascending=False)
mask = importances > 0.01
selected_cols = importances.loc[mask].index
selected_cols

In [None]:
# Re-create the 90/10 hold-out split with an explicit random_state. Without it
# (and without a preceding seed call in this cell) the partition depends on
# whatever state numpy's global RNG happens to be in, so results could not be
# reproduced. random_state=0 is intended to match the seed used for the first
# split - TODO confirm the two partitions are identical.
train_features_grid, train_features_validation, target_grid, target_validation = train_test_split(
    train_features, target, test_size=0.1, random_state=0
)

In [None]:
np.random.seed(0)
fit_params={"early_stopping_rounds" : 10, 
            "eval_metric" : "mae", 
            "eval_set" : [[train_features_validation[selected_cols], target_validation]]}
%time gs_xgb.fit(train_features_grid[selected_cols], target_grid,verbose = 0, **fit_params)

In [None]:
# Best mean cross-validated score after refitting on the reduced feature set.
gs_xgb.best_score_

Fitting the residuals

In [22]:
model_xgb = gs_xgb.best_estimator_

# Predict with exactly the columns the model was fitted on. If the
# reduced-feature search was run, the estimator only knows a subset of
# train_features, and passing the full frame would fail on the feature mismatch.
model_columns = model_xgb.get_booster().feature_names
train_inputs = train_features if model_columns is None else train_features[model_columns]

predvalues_train_xgb = pd.Series(model_xgb.predict(train_inputs))
prediction_train_xgb = pd.DataFrame({"Id" : train_data["Id"], "SalePrice" : predvalues_train_xgb})
prediction_train_xgb.set_index("Id", inplace = True)

# Both frames are indexed by Id, so the subtraction aligns row by row.
residuals = target - prediction_train_xgb

In [23]:
# Ridge regression with alpha searched over 100 evenly spaced values in
# [0.001, 100], scored by 10-fold cross-validation.
lm = Ridge()
alpha_grid = np.linspace(0.001, 100, num=100)
lm_params = {"alpha": alpha_grid}
gs_lm = GridSearchCV(estimator=lm, param_grid=lm_params, cv=10, return_train_score=True)

In [24]:
def resid_correl(col, resid=None):
    """Return the Pearson correlation coefficient between residuals and ``col``.

    Parameters
    ----------
    col : array-like
        Values to correlate with the residuals; must have the same length.
    resid : array-like, optional
        Residual values. When omitted, the ``"SalePrice"`` column of the
        module-level ``residuals`` frame is used, as before.

    Returns
    -------
    float
        The correlation coefficient (first element of ``pearsonr``'s result).
    """
    if resid is None:
        # Fall back to the notebook-level residuals for existing callers.
        resid = residuals["SalePrice"]
    return pearsonr(resid, col)[0]

In [32]:
# Correlate every feature with the residuals and keep the features whose
# absolute correlation exceeds 0.25.
correlations = [resid_correl(train_features[name]) for name in train_features]
sorted_correlations = pd.Series(correlations, index=train_features.columns)
sorted_correlations = sorted_correlations.sort_values(ascending=False)
mask = sorted_correlations.abs() > 0.25
selected_cols = sorted_correlations.loc[mask].index

In [34]:
np.random.seed(0)
# Fit the Ridge grid search on the residual-correlated columns only; the
# regression target is the residual frame, not the sale price itself.
%time gs_lm.fit(train_features[selected_cols], residuals)

CPU times: user 4.1 s, sys: 12.2 ms, total: 4.11 s
Wall time: 4.12 s


GridSearchCV(cv=10, estimator=Ridge(),
             param_grid={'alpha': array([1.00000000e-03, 1.01109091e+00, 2.02118182e+00, 3.03127273e+00,
       4.04136364e+00, 5.05145455e+00, 6.06154545e+00, 7.07163636e+00,
       8.08172727e+00, 9.09181818e+00, 1.01019091e+01, 1.11120000e+01,
       1.21220909e+01, 1.31321818e+01, 1.41422727e+01, 1.51523636e+01,
       1.61624545e+01, 1.71725455e+01, 1.81826364e+01, 1...
       7.67679091e+01, 7.77780000e+01, 7.87880909e+01, 7.97981818e+01,
       8.08082727e+01, 8.18183636e+01, 8.28284545e+01, 8.38385455e+01,
       8.48486364e+01, 8.58587273e+01, 8.68688182e+01, 8.78789091e+01,
       8.88890000e+01, 8.98990909e+01, 9.09091818e+01, 9.19192727e+01,
       9.29293636e+01, 9.39394545e+01, 9.49495455e+01, 9.59596364e+01,
       9.69697273e+01, 9.79798182e+01, 9.89899091e+01, 1.00000000e+02])},
             return_train_score=True)

In [35]:
# Best mean cross-validated score of the Ridge model fitted on the residuals.
gs_lm.best_score_

-0.024553604419327135

In [36]:
# Coefficients of the best Ridge model, one per selected column.
gs_lm.best_estimator_.coef_

array([[8583.47883442]])

XGBoost outcomes

In [28]:
# Winning parameters of the most recent XGBoost grid search (eta only, because
# max_depth was fixed on the estimator for that search).
gs_xgb.best_params_

{'eta': 0.09179999999999999}

In [39]:
best_xgb = gs_xgb.best_estimator_
# Label the importances with the columns the model was actually fitted on. After
# the reduced-feature refit the model holds fewer features than train_features,
# and indexing by train_features.columns would fail on the length mismatch.
fitted_columns = best_xgb.get_booster().feature_names
importances = pd.Series(best_xgb.feature_importances_, index=fitted_columns).sort_values(ascending=False)
importances[:20]

KitchenQual     0.217538
GarageCars      0.211186
TotLivArea      0.127965
OverallQual     0.103596
Neighborhood    0.054028
ExterQual       0.048793
FullBath        0.038648
Fireplaces      0.019184
BldgType        0.016054
TotRmsAbvGrd    0.015010
Foundation      0.014299
CentralAir      0.013372
BsmtExposure    0.012627
LotFrontage     0.008988
GarageQual      0.007879
AgeRemod        0.007829
LotArea         0.007656
AgeBuilt        0.007639
HalfBath        0.006545
MSSubClass      0.006418
dtype: float32

Running the model on the test dataset

In [None]:
# Feed the model exactly the columns it was fitted on; it may have been trained
# on a subset of test_features' columns (reduced-feature search).
model_columns = model_xgb.get_booster().feature_names
test_inputs = test_features if model_columns is None else test_features[model_columns]

pred_xgb = pd.Series(model_xgb.predict(test_inputs))
prediction_xgb = pd.DataFrame({"Id" : test_data["Id"], "SalePrice" : pred_xgb})
prediction_xgb.set_index("Id", inplace = True)

In [None]:
# Write the XGBoost test predictions (indexed by Id) to disk.
prediction_xgb.to_csv("./Data/prediction_xgb.csv")

In [None]:
# Predict the test set with the random-forest model and index the result by Id.
# NOTE(review): model_rf is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel - confirm where it is meant to come from.
pred_rf = pd.Series(model_rf.predict(test_features))
prediction_rf = pd.DataFrame({"Id" : test_data["Id"], "SalePrice" : pred_rf})
prediction_rf.set_index("Id", inplace = True)

In [None]:
# Write the random-forest test predictions (indexed by Id) to disk.
prediction_rf.to_csv("./Data/prediction_rf.csv")

In [None]:
# Simple ensemble: element-wise mean of the two models' test predictions.
pred_sum = pred_xgb + pred_rf
pred_av = pred_sum / 2
prediction_av = pd.DataFrame({"Id": test_data["Id"], "SalePrice": pred_av}).set_index("Id")

In [None]:
# Write the averaged test predictions (indexed by Id) to disk.
prediction_av.to_csv("./Data/prediction_av.csv")

Save encoder and model for later

In [None]:
filename = 'fitted_model.sav'
joblib.dump(model_xgb, filename)

# `enc` was fitted again for every non-numeric column during encoding, so on its
# own it only carries the mapping of the last column it saw. Persist one encoder
# per non-numeric column as well, so that new data can be encoded column by
# column later. LabelEncoder orders classes by sorted unique value, so refitting
# on the same training column is expected to reproduce the mapping used above.
encoders_by_column = {
    col: LabelEncoder().fit(train_data[col])
    for col in train_data.columns
    if not (
        pd.api.types.is_integer_dtype(train_data[col].dtype)
        or pd.api.types.is_float_dtype(train_data[col].dtype)
    )
}
joblib.dump(encoders_by_column, 'fitted_encoders_by_column.sav')

# Original single-encoder artifact, kept unchanged for existing consumers.
filename = 'fitted_encoder.sav'
joblib.dump(enc, filename)

In [None]:
# Persist the label-encoded training features (indexed by Id).
train_features.to_csv("./Data/encoded_train_data.csv")

In [None]:
# SHAP values of the tuned XGBoost model for the training features.
tree_explainer = shap.TreeExplainer(model_xgb)
shap_values = tree_explainer.shap_values(train_features)

In [None]:
# Summary plot of the SHAP values across all training features.
shap.summary_plot(shap_values, train_features)

In [None]:
# SHAP dependence plot for the TotLivArea feature.
shap.dependence_plot("TotLivArea", shap_values, train_features)

In [None]:
# SHAP dependence plot for the OverallQual feature.
shap.dependence_plot("OverallQual", shap_values, train_features)

In [None]:
# SHAP dependence plot for the LotArea feature.
shap.dependence_plot("LotArea", shap_values, train_features)

In [None]:
# SHAP dependence plot for the (label-encoded) Neighborhood feature.
shap.dependence_plot("Neighborhood", shap_values, train_features)

In [None]:
# Pairwise SHAP interaction values of the tuned XGBoost model on the training features.
interaction_explainer = shap.TreeExplainer(model_xgb)
shap_interaction_values = interaction_explainer.shap_interaction_values(train_features)

In [None]:
# Summary plot of the SHAP interaction values.
shap.summary_plot(shap_interaction_values, train_features)

In [None]:
# Dependence plot from the interaction values for TotLivArea paired with itself.
shap.dependence_plot(
    ("TotLivArea", "TotLivArea"),
    shap_interaction_values, train_features,
    display_features = train_features
)

In [None]:
# Dependence plot from the interaction values for the TotLivArea / OverallQual pair.
shap.dependence_plot(
    ("TotLivArea", "OverallQual"),
    shap_interaction_values, train_features,
    display_features = train_features
)

In [None]:
# Dependence plot from the interaction values for the Neighborhood / OverallQual pair.
shap.dependence_plot(
    ("Neighborhood", "OverallQual"),
    shap_interaction_values, train_features,
    display_features = train_features
)

In [None]:
# Absolute interaction values summed over the first axis give one
# feature-by-feature strength matrix.
interaction_strength = np.abs(shap_interaction_values).sum(axis=0)

# Zero every diagonal entry (a feature paired with itself).
for diag in range(len(interaction_strength)):
    interaction_strength[diag, diag] = 0

# Order features by their column totals, largest first, keeping at most 50.
top_order = np.argsort(-interaction_strength.sum(axis=0))[:50]
top_block = interaction_strength[top_order, :][:, top_order]
top_labels = train_features.columns[top_order]
tick_positions = range(top_block.shape[0])

pl.figure(figsize=(12, 12))
pl.imshow(top_block)
pl.yticks(tick_positions, top_labels, rotation=50.4, horizontalalignment="right")
pl.xticks(tick_positions, top_labels, rotation=50.4, horizontalalignment="left")
pl.gca().xaxis.tick_top()
pl.show()

In [None]:
def heatmap(valuesDF, threshold):
    """Draw an annotated seaborn heatmap showing only the cells below the diagonal.

    Parameters
    ----------
    valuesDF
        Two-dimensional table of values to plot.
    threshold
        Value at which the diverging colour map is centred.

    Returns
    -------
    The object returned by ``sns.heatmap``.
    """
    # True marks a hidden cell: the diagonal and everything above it.
    hidden_cells = np.triu(np.ones_like(valuesDF, dtype=bool))
    palette = sns.diverging_palette(220, 20, as_cmap=True)

    # Square cells, thin separators, numeric annotation in every visible cell.
    return sns.heatmap(
        valuesDF,
        mask=hidden_cells,
        cmap=palette,
        center=threshold,
        square=True,
        linewidths=.5,
        annot=True,
    )