Packages

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
import matplotlib.pylab as plt
#import seaborn as sns
import shap
from IPython.display import Javascript
#from scipy.stats import pearsonr

In [None]:
# Seed handed to np.random.seed before the train/validation split and before
# each grid-search fit in this notebook.
rand_seed = 0

Reading the data

In [None]:
# Prepared feature tables for the training and the test set.
train_data = pd.read_csv("./Data/prepped_train.csv")
test_data = pd.read_csv("./Data/prepped_test.csv")

# Target values, indexed by the "Id" column.
target = pd.read_csv("./Data/prepped_target.csv").set_index("Id")

# Base predictions; their "predictions" column is later summed with the
# XGBoost output.
pred_base = pd.read_csv("./Data/prediction_base.csv")

Label encoding the features

In [None]:
# Build the model inputs: numeric columns pass through unchanged, every other
# column is replaced by integer codes from a LabelEncoder. Both resulting
# frames are indexed by "Id".
enc = LabelEncoder()
train_result = {}
test_result = {}
for col in train_data.columns:
    train_col = train_data[col]
    test_col = test_data[col]
    # Use pandas' dtype predicate rather than comparing against the strings
    # "int"/"float": those aliases match only one integer/float width, so a
    # numeric column of another width (including "Id") would be label-encoded
    # and its unseen test values would make transform() raise.
    # Booleans are still encoded, as they were before.
    is_plain_numeric = (
        pd.api.types.is_numeric_dtype(train_col)
        and not pd.api.types.is_bool_dtype(train_col)
    )
    if is_plain_numeric:
        train_result[col] = train_col
        test_result[col] = test_col
    else:
        # Fit on the labels of both frames: LabelEncoder.transform raises a
        # ValueError for labels it has not seen, which would happen for any
        # category that occurs only in the test set.
        enc.fit(pd.concat([train_col, test_col], ignore_index=True))
        # Keep the source frame's index so the encoded column lines up with
        # the pass-through numeric columns.
        train_result[col] = pd.Series(enc.transform(train_col), index=train_data.index)
        test_result[col] = pd.Series(enc.transform(test_col), index=test_data.index)

train_features = pd.DataFrame(train_result).set_index("Id")
test_features = pd.DataFrame(test_result).set_index("Id")

XGBoost with early stopping

In [None]:
# Seed NumPy's global RNG first: no random_state is passed to the split below.
np.random.seed(rand_seed)

# Hold out 10% of the training rows; the held-out part is used as the
# early-stopping eval_set when the grid searches are fitted.
(
    train_features_grid,
    train_features_validation,
    target_grid,
    target_validation,
) = train_test_split(train_features, target, test_size=0.1)

In [None]:
# Base regressor; max_depth and eta are supplied by the parameter grid.
xgb = XGBRegressor(
    n_jobs=-1,
    booster="gbtree",
    n_estimators=1000,
    colsample_bytree=0.5,
)

# Coarse grid: tree depths 1..6 crossed with 50 evenly spaced eta values
# between 0.01 and 0.5 (both ends included).
max_depth_range = range(1, 7)
eta_range = np.linspace(0.01, 0.5, 50, endpoint=True)
xgb_params = {"max_depth": max_depth_range, "eta": eta_range}

# Exhaustive 10-fold cross-validated search that also records training scores.
gs_xgb = GridSearchCV(xgb, xgb_params, cv=10, return_train_score=True)

In [None]:
np.random.seed(rand_seed)
fit_params={"early_stopping_rounds" : 20, 
            "eval_metric" : "mae", 
            "eval_set" : [[train_features_validation, target_validation]]}
%time gs_xgb.fit(train_features_grid, target_grid,verbose = 0, **fit_params)

In [None]:
# Best mean cross-validated score of the coarse (max_depth x eta) search.
# No `scoring` was passed to GridSearchCV, so this is the estimator's own score.
gs_xgb.best_score_

Second round, narrowing in on eta

In [None]:
# Second search: fix max_depth at the best value from the coarse search and
# refine eta only.
max_d = gs_xgb.best_params_["max_depth"]
xgb = XGBRegressor(n_jobs = -1, booster = "gbtree", n_estimators = 1000, colsample_bytree = 0.5, max_depth = max_d)
eta_mid = gs_xgb.best_params_["eta"]
# 200 eta values in [eta_mid - 0.01, eta_mid + 0.01), upper end excluded.
# The lower bound is clamped to a small positive value: when the coarse search
# picks its smallest eta (0.01), eta_mid - 0.01 is 0, and a learning rate of 0
# would put a model that cannot learn into the grid.
eta_range = np.linspace(max(eta_mid - 0.01, 1e-4), eta_mid + 0.01, 200, endpoint = False)
xgb_params = {'eta': eta_range}
# Rebinds gs_xgb: from here on it refers to the refined search.
gs_xgb = GridSearchCV(xgb, xgb_params, cv = 10, return_train_score = True)

In [None]:
np.random.seed(rand_seed)
fit_params={"early_stopping_rounds" : 20, 
            "eval_metric" : "mae", 
            "eval_set" : [[train_features_validation, target_validation]]}
%time gs_xgb.fit(train_features_grid, target_grid,verbose = 0, **fit_params)

In [None]:
# Best mean cross-validated score of the refined eta search.
gs_xgb.best_score_

XGBoost outcomes

In [None]:
# Report the selected hyperparameters: depth from the coarse search, eta from
# the refined one.
best_eta = gs_xgb.best_params_["eta"]
print(f"max depth: {max_d!s}")
print(f"eta: {best_eta!s}")

In [None]:
# Feature importances of the refit best estimator, labelled by column name and
# ordered from most to least important.
importances = pd.Series(
    gs_xgb.best_estimator_.feature_importances_,
    index=train_features.columns,
).sort_values(ascending=False)

# Show the 20 most important features.
importances.head(20)

Running the model on the test dataset: the XGBoost predictions are added to the base predictions, and the exponential of the sum is taken because the predicted values are log prices.

In [None]:
# Final model: the estimator GridSearchCV refit with the best eta from the
# refined search (max_depth was already fixed on the base estimator).
model_xgb = gs_xgb.best_estimator_

In [None]:
# XGBoost output for the test rows.
pred_xgb = pd.Series(model_xgb.predict(test_features))

# Add the base predictions position by position, then exponentiate the sum.
log_scale_total = pred_xgb.to_numpy() + pred_base["predictions"].to_numpy()
pred_combined = np.exp(log_scale_total)

In [None]:
# Pair each test "Id" with its predicted "SalePrice" and index the frame by "Id".
prediction = pd.DataFrame(
    {
        "Id": test_data["Id"],
        "SalePrice": pred_combined,
    }
).set_index("Id")

In [None]:
# Write the predictions to disk; the "Id" index is saved as the first column.
prediction.to_csv("./Data/prediction.csv")

SHAP visualizations

In [None]:
# Tree explainer for the final XGBoost model.
explainer = shap.TreeExplainer(model_xgb)
# SHAP values for every row of the full training feature frame (this includes
# the rows that were held out for validation).
shap_values = explainer.shap_values(train_features)

In [None]:
# Summary plot of the SHAP values computed on the training features.
shap.summary_plot(shap_values, train_features)

In [None]:
shap.initjs()
# range(1, 2) yields only i = 1, so exactly one force plot (row position 1 of
# the training features) is rendered and saved.
for i in range(1, 2):
    filename = f"ShapForces{i}.png"
    # matplotlib=True with show=False returns the figure instead of displaying
    # it, so it can be written to a PNG file.
    force_fig = shap.force_plot(
        explainer.expected_value,
        shap_values[i, :],
        train_features.iloc[i, :],
        show=False,
        matplotlib=True,
    )
    force_fig.savefig(filename, format="png", dpi=150, bbox_inches="tight")