Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
import joblib

Reading the data

In [140]:
# Load the cleaned train/test feature tables and the training target.
train_data = pd.read_csv("./Data/cleaned_train.csv")
test_data = pd.read_csv("./Data/cleaned_test.csv")

# The target frame is keyed by the house Id.
target = pd.read_csv("./Data/cleaned_target.csv").set_index("Id")

Label encoding the features

In [141]:
# Encode every non-numeric column as integer codes; numeric columns pass through unchanged.
# One LabelEncoder is fitted per column and kept in `encoders`: a single shared encoder is
# overwritten by each fit_transform call and would only remember the last column it saw.
encoders = {}
train_result = {}
test_result = {}
for col in train_data.columns:
    # is_numeric_dtype recognises every integer/float width, whereas comparing the dtype
    # with the strings "int"/"float" only matches the platform-default width.
    if pd.api.types.is_numeric_dtype(train_data[col]):
        train_result[col] = train_data[col]
        test_result[col] = test_data[col]
    else:
        col_encoder = LabelEncoder()
        train_result[col] = pd.Series(col_encoder.fit_transform(train_data[col]), index = train_data.index)
        # transform() raises ValueError if the test set contains a label unseen during fitting.
        test_result[col] = pd.Series(col_encoder.transform(test_data[col]), index = test_data.index)
        encoders[col] = col_encoder
train_features = pd.DataFrame(train_result)
test_features = pd.DataFrame(test_result)

# `enc` now refers to the column -> fitted encoder mapping, so anything that persists
# `enc` keeps the fitted state of every encoded column instead of just the last one.
enc = encoders

train_features.set_index("Id", inplace = True)
test_features.set_index("Id", inplace = True)

In [142]:
# Display the column labels of the raw training frame (still includes the Id column).
train_data.columns

Index(['Id', 'LotArea', 'OverallQual', 'GarageCars', 'YearBuilt',
       'TotRmsAbvGrd', 'OverallCond', 'MSSubClass', 'LotShape', 'LotConfig',
       'Neighborhood', 'YrSold', 'KitchenQual', 'ExterQual', 'ExterCond',
       'HeatingQC', 'Fence', 'BldgType', 'Functional', 'TotLivArea',
       'FullBath', 'HalfBath', 'PorchSF', 'ProxPos', 'ProxRoad', 'ProxRail'],
      dtype='object')

Random Forest

In [88]:
# Hyper-parameter grid explored for the random forest.
rf_params = {
    'max_features': range(3,10),
    'max_depth': range(1, 9),
    'min_samples_leaf': [7, 14, 70, 140],
}

# 140-tree forest using all cores, tuned with 10-fold cross-validation;
# training-fold scores are kept alongside the validation scores.
rf = RandomForestRegressor(n_estimators = 140, n_jobs = -1)
gs_rf = GridSearchCV(estimator = rf, param_grid = rf_params, cv = 10, return_train_score = True)

In [89]:
# Seed NumPy's global RNG, then run the random-forest grid search on the
# encoded training features against the SalePrice column (wall time reported by %time).
np.random.seed(0)
%time gs_rf.fit(train_features, target['SalePrice'])

CPU times: user 3min 53s, sys: 27.2 s, total: 4min 20s
Wall time: 4min 8s


GridSearchCV(cv=10,
             estimator=RandomForestRegressor(n_estimators=140, n_jobs=-1),
             param_grid={'max_depth': range(1, 9), 'max_features': range(3, 10),
                         'min_samples_leaf': [7, 14, 70, 140]},
             return_train_score=True)

In [90]:
# Mean cross-validated score of the best random-forest parameter combination.
gs_rf.best_score_

0.8482345761570352

In [91]:
# Parameter combination that achieved the best cross-validated score.
gs_rf.best_params_

{'max_depth': 8, 'max_features': 4, 'min_samples_leaf': 7}

In [93]:
# Rank the features by the importance reported by the refitted best random forest.
importances = (
    pd.Series(
        gs_rf.best_estimator_.feature_importances_,
        index = train_features.columns,
    )
    .sort_values(ascending = False)
)
# Show the ten most important features.
importances.head(10)

TotLivArea      0.349021
OverallQual     0.280249
GarageCars      0.121423
KitchenQual     0.079766
YearBuilt       0.070614
LotArea         0.045688
TotRmsAbvGrd    0.031019
FullBath        0.010442
YrSold          0.005473
HalfBath        0.005005
dtype: float64

In [97]:
# Keep the best random forest found by the grid search for the prediction cells below.
model_rf = gs_rf.best_estimator_

XGBoost with early stopping

In [None]:
# Hold out 20% of the training rows as the early-stopping validation set.
# random_state pins the split: the np.random.seed call for this section only runs in a
# later cell, so without it the hold-out set differs from run to run.
# The 1-D SalePrice column is used as the target, matching the random-forest fit.
train_features_grid, train_features_validation, target_grid, target_validation = train_test_split(
    train_features, target['SalePrice'], test_size = 0.2, random_state = 0
)

In [None]:
# Search space: tree depth 1-5 and 50 evenly spaced learning rates in [0.0001, 0.5].
xgb_params = {
    'max_depth': range(1, 6),
    'eta': np.linspace(0.0001, 0.5, num = 50, endpoint = True),
}

# Up to 1000 boosting rounds per candidate, tuned with 10-fold cross-validation.
xgb = XGBRegressor(n_estimators = 1000, n_jobs = -1)
gs_xgb = GridSearchCV(estimator = xgb, param_grid = xgb_params, cv = 10, return_train_score = True)

In [None]:
# Seed NumPy's global RNG, then run the XGBoost grid search with early stopping:
# every fit is evaluated with MAE on the held-out validation split and the extra
# keyword arguments below are forwarded to the estimator's fit().
# NOTE(review): newer xgboost releases expect early_stopping_rounds / eval_metric on the
# XGBRegressor constructor rather than in fit() — confirm against the installed version.
np.random.seed(0)
fit_params={"early_stopping_rounds":10, 
            "eval_metric" : "mae", 
            "eval_set" : [[train_features_validation, target_validation]]}
%time gs_xgb.fit(train_features_grid, target_grid,verbose = 0, **fit_params)

XGBoost without early stopping

In [132]:
# Candidate learning rates: 10 evenly spaced values in [0.0001, 0.5].
eta = np.linspace(0.0001, 0.5, num = 10, endpoint = True)
# Candidate numbers of boosting rounds: 25, 50, ..., 300.
n_estimators = list(range(25, 301, 25))

xgb_params = {
    'max_depth': range(2, 6),
    'eta': eta,
    'n_estimators' : n_estimators,
}

# Tree booster tuned with 5-fold cross-validation over the grid above.
xgb = XGBRegressor(booster = "gbtree", n_jobs = -1)
gs_xgb = GridSearchCV(estimator = xgb, param_grid = xgb_params, cv = 5, return_train_score = True)

In [None]:
np.random.seed(0)
%time gs_xgb.fit(train_features, target, verbose = 0)

XGBoost outcomes

In [134]:
# Mean cross-validated score of the best XGBoost parameter combination.
gs_xgb.best_score_

0.8666272741374355

In [135]:
# Parameter combination that achieved the best cross-validated score.
gs_xgb.best_params_

{'eta': 0.22227777777777777, 'max_depth': 2, 'n_estimators': 225}

In [136]:
# Rank the features by the importance reported by the refitted best XGBoost model.
importances = (
    pd.Series(
        gs_xgb.best_estimator_.feature_importances_,
        index = train_features.columns,
    )
    .sort_values(ascending = False)
)
# Show the twenty most important features.
importances.head(20)

OverallQual     0.411811
GarageCars      0.193343
KitchenQual     0.088963
TotLivArea      0.075533
ExterQual       0.054579
TotRmsAbvGrd    0.027472
LotArea         0.020413
YearBuilt       0.020058
HalfBath        0.016944
MSSubClass      0.016129
OverallCond     0.012560
BldgType        0.011847
Neighborhood    0.010533
FullBath        0.009533
LotShape        0.009362
PorchSF         0.004994
YrSold          0.004358
HeatingQC       0.003800
Functional      0.002851
LotConfig       0.002528
dtype: float32

In [95]:
# Keep the best XGBoost model found by the grid search for the prediction cells below.
# NOTE(review): gs_xgb must already have been fitted in this kernel for best_estimator_ to exist.
model_xgb = gs_xgb.best_estimator_

Running the model on the test dataset

In [102]:
# Predict the test set with the tuned XGBoost model and key the result by house Id.
pred_xgb = pd.Series(model_xgb.predict(test_features))
prediction_xgb = pd.DataFrame(
    {"Id" : test_data["Id"], "SalePrice" : pred_xgb}
).set_index("Id")

In [103]:
# Write the XGBoost predictions (indexed by Id) to CSV.
prediction_xgb.to_csv("./Data/prediction_xgb.csv")

In [104]:
# Predict the test set with the tuned random forest and key the result by house Id.
pred_rf = pd.Series(model_rf.predict(test_features))
prediction_rf = pd.DataFrame(
    {"Id" : test_data["Id"], "SalePrice" : pred_rf}
).set_index("Id")

In [105]:
# Write the random-forest predictions (indexed by Id) to CSV.
prediction_rf.to_csv("./Data/prediction_rf.csv")

In [107]:
# Blend the two models with an unweighted mean of their predictions, keyed by house Id.
pred_av = pred_xgb.add(pred_rf) / 2
prediction_av = pd.DataFrame(
    {"Id" : test_data["Id"], "SalePrice" : pred_av}
).set_index("Id")

In [108]:
# Write the averaged predictions (indexed by Id) to CSV.
prediction_av.to_csv("./Data/prediction_av.csv")

Save encoder and model for later

In [None]:
# Save the fitted model and the label encoder to disk for later reuse.
# The name `model` is never assigned in this notebook (only `model_rf` and `model_xgb`
# are), so dumping it raised NameError on a fresh kernel.
# NOTE(review): model_xgb is saved because its grid search reported the higher
# cross-validated score of the two — confirm this is the model intended for reuse.
filename = 'fitted_model.sav'
joblib.dump(model_xgb, filename)
filename = 'fitted_encoder.sav'
joblib.dump(enc, filename)

Checking model on the test data set