Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
import joblib
import matplotlib.pylab as pl
import seaborn as sns
import shap
from scipy.stats import pearsonr

In [2]:
# Seed passed to every np.random.seed(...) call in this notebook so the random steps are repeatable.
rand_seed = 0

Reading the data

In [67]:
# Pre-processed feature tables for the training and test sets.
train_data = pd.read_csv("./Data/prepped_train.csv")
test_data = pd.read_csv("./Data/prepped_test.csv")

# Training target, indexed by "Id" as soon as it is read.
target = pd.read_csv("./Data/prepped_target.csv").set_index("Id")

Label encoding the features

In [68]:
# Label-encode every non-numeric column, fitting on the training data only.
#
# A separate LabelEncoder is fitted for each encoded column and stored in
# `encoders` (column name -> fitted encoder), so the category -> integer
# mapping of every column stays available for later reuse or inversion.
# Refitting a single shared encoder would keep only the last column's mapping.
# `enc` is still bound (to the most recently fitted encoder) for any code
# that refers to it by that name.
enc = LabelEncoder()
encoders = {}
train_result = {}
test_result = {}
for col in train_data.columns:
    column = train_data[col]
    # Treat any integer/float width as numeric; booleans are still encoded,
    # as they were with the previous exact-dtype comparison.
    is_numeric = (pd.api.types.is_numeric_dtype(column)
                  and not pd.api.types.is_bool_dtype(column))
    if is_numeric:
        train_result[col] = column
        test_result[col] = test_data[col]
    else:
        enc = LabelEncoder()
        # Keep the source index so the encoded column stays row-aligned with
        # the numeric columns when the frames are assembled below.
        train_result[col] = pd.Series(enc.fit_transform(column), index=train_data.index)
        # transform() raises ValueError if the test set contains a label that
        # never occurs in the training column.
        test_result[col] = pd.Series(enc.transform(test_data[col]), index=test_data.index)
        encoders[col] = enc
train_features = pd.DataFrame(train_result).set_index("Id")
test_features = pd.DataFrame(test_result).set_index("Id")

XGBoost with early stopping

In [69]:
# Hold out 10% of the training rows; the held-out part is used as the
# early-stopping evaluation set in the grid searches.
# random_state is passed explicitly so the split does not depend on the state
# of the global NumPy RNG at the moment this cell runs. The global seed is
# still set for any later code that relies on it.
np.random.seed(rand_seed)
(train_features_grid, train_features_validation,
 target_grid, target_validation) = train_test_split(
    train_features, target, test_size=0.1, random_state=rand_seed
)

In [70]:
# Base regressor shared by every grid point of the coarse search.
xgb = XGBRegressor(
    n_jobs=-1,
    booster="gbtree",
    n_estimators=1000,
    colsample_bytree=0.5,
)

# Coarse grid: 50 evenly spaced eta values in [0.01, 0.5] crossed with tree depths 1..6.
eta_range = np.linspace(0.01, 0.5, num=50, endpoint=True)
max_depth_range = range(1, 7)
xgb_params = {'max_depth': max_depth_range, 'eta': eta_range}

# 10-fold cross-validated search; training scores are recorded as well.
gs_xgb = GridSearchCV(estimator=xgb, param_grid=xgb_params, cv=10, return_train_score=True)

In [71]:
np.random.seed(rand_seed)
fit_params={"early_stopping_rounds" : 20, 
            "eval_metric" : "mae", 
            "eval_set" : [[train_features_validation, target_validation]]}
%time gs_xgb.fit(train_features_grid, target_grid,verbose = 0, **fit_params)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/nielskreuk/opt/anaconda3/lib/python3.8/site-packages/IPython/core/magics/execution.py", line 1313, in time
    out = eval(code, glob, local_ns)
  File "<timed eval>", line 1, in <module>
  File "/Users/nielskreuk/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/Users/nielskreuk/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_search.py", line 841, in fit
    self._run_search(evaluate_candidates)
  File "/Users/nielskreuk/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_search.py", line 1288, in _run_search
    evaluate_candidates(ParameterGrid(self.param_grid))
  File "/Users/nielskreuk/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_search.py", line 795, in evaluate_candidates
    out = parallel(delayed(_fit_and_score)(clone(base_estimator),
  File "/Users/nielskreuk/opt/anaconda3/lib/python3.8/site-p

TypeError: object of type 'NoneType' has no len()

In [66]:
# Best cross-validated score of the search (no `scoring` was passed to GridSearchCV, so the estimator's default scorer applies).
gs_xgb.best_score_

0.9151926096986618

Second round, narrowing in on eta

In [23]:
# Second search: keep the best tree depth from the first search fixed and scan
# eta on a fine grid of 200 points spanning +/- 0.01 around the best coarse eta.
# NOTE: this cell rebinds `xgb` and `gs_xgb`. The new grid contains only 'eta',
# so once it has been fitted, re-running this cell fails on the "max_depth"
# lookup. Re-run the first search before repeating this cell.
max_d = gs_xgb.best_params_["max_depth"]
xgb = XGBRegressor(n_jobs = -1, booster = "gbtree", n_estimators = 1000, colsample_bytree = 0.5, max_depth = max_d)
eta_mid = gs_xgb.best_params_["eta"]
# Clamp the lower edge: if the coarse optimum is the smallest eta (0.01), the
# unclamped window would start at 0, a learning rate at which boosting cannot
# learn anything. For any larger optimum the grid is unchanged.
eta_low = max(eta_mid - 0.01, 1e-4)
eta_range = np.linspace(eta_low, eta_mid + 0.01, 200, endpoint = False)
xgb_params = {'eta': eta_range}
gs_xgb = GridSearchCV(xgb, xgb_params, cv = 10, return_train_score = True)

In [24]:
np.random.seed(rand_seed)
fit_params={"early_stopping_rounds" : 20, 
            "eval_metric" : "mae", 
            "eval_set" : [[train_features_validation, target_validation]]}
%time gs_xgb.fit(train_features_grid, target_grid,verbose = 0, **fit_params)

CPU times: user 1h 11min 12s, sys: 6min 27s, total: 1h 17min 39s
Wall time: 10min 5s


GridSearchCV(cv=10,
             estimator=XGBRegressor(base_score=None, booster='gbtree',
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.5,
                                    gamma=None, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=4, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=1000, n_job...
       0.0452, 0.0453, 0.0454, 0.0455, 0.0456, 0.0457, 0.0458, 0.0459,
       0.046 , 0.0461, 0.0462, 0.0463, 0.0464, 0.0465, 0.0466, 0.0467,
       0.0468, 0.0469, 0.047 , 0.0471, 0.0472, 0.0473, 0.0474, 0.0475,
       0.0476, 0.0477, 0.0478, 0.0479, 0.048 , 0.0481, 0.0482, 0.0483,
       0

In [25]:
# Best cross-validated score of the fine eta search (`gs_xgb` now refers to the second GridSearchCV).
gs_xgb.best_score_

0.9162778291822447

XGBoost outcomes

In [26]:
# Report the tuned hyper-parameters: depth comes from the first search, eta from the second.
print("max depth:", max_d)
print("eta:", gs_xgb.best_params_["eta"])

max depth: 4
eta: 0.0347


In [27]:
# Rank the features by the refit best estimator's feature_importances_ and show the top 20.
raw_importances = gs_xgb.best_estimator_.feature_importances_
importances = pd.Series(raw_importances, index=train_features.columns)
importances = importances.sort_values(ascending=False)
importances.head(20)

TotLivArea            0.123208
OverallQual           0.098204
ExterQual             0.082740
Neighborhood          0.081332
Fireplaces            0.079228
KitchenQual           0.076354
GarageCars            0.063990
CentralAir            0.051734
FullBath              0.048938
GarageQual            0.032782
AgeRemod              0.020963
OverallCond           0.019246
HalfBath              0.018572
BsmtExposure          0.017495
MSSubClass            0.014557
BasementQualFactor    0.014040
MSZoning              0.013913
ExterCond             0.013700
SaleCondition         0.012442
Heating               0.009698
dtype: float32

Running the model on the test dataset. The model predicts log prices, so the predictions are exponentiated to recover sale prices.

In [28]:
# Estimator refit by GridSearchCV with the best eta found in the second search.
model_xgb = gs_xgb.best_estimator_

In [29]:
# The model output is on the log-price scale (per the notebook text), so it is
# exponentiated before being paired with the test Ids.
log_price_pred = model_xgb.predict(test_features)
pred_xgb = pd.Series(np.exp(log_price_pred))
prediction_xgb = pd.DataFrame({"Id": test_data["Id"], "SalePrice": pred_xgb}).set_index("Id")

In [30]:
# Write the Id-indexed SalePrice predictions to CSV.
prediction_xgb.to_csv("./Data/prediction_xgb.csv")

Save encoder and model for later

In [None]:
# Persist the tuned model and the label encoder with joblib (written to the working directory).
# NOTE(review): `enc` is fitted once per encoded column in the label-encoding cell, so the
# object saved here only carries the mapping of the last column that was encoded --
# confirm this is what downstream consumers expect.
filename = 'fitted_model.sav'
joblib.dump(model_xgb, filename)
filename = 'fitted_encoder.sav'
joblib.dump(enc, filename)

In [None]:
# Save the label-encoded, Id-indexed training features.
train_features.to_csv("./Data/encoded_train_data.csv")

In [None]:
# SHAP values of the tuned model for every row of the encoded training features.
shap_values = shap.TreeExplainer(model_xgb).shap_values(train_features)

In [None]:
# Summary plot of the SHAP values across all training features.
shap.summary_plot(shap_values, train_features)

In [None]:
# SHAP dependence plot for the TotLivArea feature.
shap.dependence_plot("TotLivArea", shap_values, train_features)

In [None]:
# SHAP dependence plot for the OverallQual feature.
shap.dependence_plot("OverallQual", shap_values, train_features)

In [None]:
# SHAP dependence plot for the LotArea feature.
shap.dependence_plot("LotArea", shap_values, train_features)

In [None]:
# SHAP dependence plot for the (label-encoded) Neighborhood feature.
shap.dependence_plot("Neighborhood", shap_values, train_features)

In [None]:
# SHAP interaction values of the tuned model on the encoded training features.
shap_interaction_values = shap.TreeExplainer(model_xgb).shap_interaction_values(train_features)

In [None]:
# Summary plot of the SHAP interaction values.
shap.summary_plot(shap_interaction_values, train_features)

In [None]:
# Dependence plot on the interaction values with TotLivArea paired with itself.
# NOTE(review): presumably this shows the feature's main effect -- confirm against the shap docs.
shap.dependence_plot(
    ("TotLivArea", "TotLivArea"),
    shap_interaction_values, train_features,
    display_features = train_features
)

In [None]:
# Dependence plot of the SHAP interaction values for the TotLivArea / OverallQual pair.
shap.dependence_plot(
    ("TotLivArea", "OverallQual"),
    shap_interaction_values, train_features,
    display_features = train_features
)

In [None]:
# Dependence plot of the SHAP interaction values for the Neighborhood / OverallQual pair.
shap.dependence_plot(
    ("Neighborhood", "OverallQual"),
    shap_interaction_values, train_features,
    display_features = train_features
)

In [None]:
# Heat map of the summed absolute SHAP interaction values between feature pairs.
# Absolute interactions are summed over the first axis and the diagonal
# (a feature paired with itself) is zeroed so that only cross-feature terms remain.
tmp = np.abs(shap_interaction_values).sum(0)
np.fill_diagonal(tmp, 0)

# Keep at most the 50 features with the largest column totals, strongest first.
inds = np.argsort(-tmp.sum(0))[:50]
tmp2 = tmp[np.ix_(inds, inds)]

tick_positions = range(tmp2.shape[0])
tick_labels = train_features.columns[inds]

pl.figure(figsize=(12, 12))
pl.imshow(tmp2)
pl.yticks(tick_positions, tick_labels, rotation=50.4, horizontalalignment="right")
pl.xticks(tick_positions, tick_labels, rotation=50.4, horizontalalignment="left")
# Put the x labels on top of the matrix.
pl.gca().xaxis.tick_top()
pl.show()

In [None]:
def heatmap(valuesDF, threshold):
    """Draw an annotated seaborn heat map showing only the lower-left triangle.

    Parameters
    ----------
    valuesDF
        2-D table of values to plot (assumed to be a DataFrame, going by
        the name -- TODO confirm with callers).
    threshold
        Value at which the diverging colour map is centred.

    Returns
    -------
    The object returned by ``sns.heatmap``.
    """
    # Mask the diagonal and everything above it so only the lower-left part is drawn.
    upper_triangle = np.triu(np.ones_like(valuesDF, dtype=bool))
    palette = sns.diverging_palette(220, 20, as_cmap=True)

    # Square cells keep the aspect ratio correct; every visible cell is annotated.
    return sns.heatmap(
        valuesDF,
        mask=upper_triangle,
        cmap=palette,
        center=threshold,
        square=True,
        linewidths=.5,
        annot=True,
    )