Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
import joblib

Reading the data

In [51]:
# Load the pre-cleaned training features, test features and training target.
train_data = pd.read_csv("./Data/cleaned_train.csv")
test_data = pd.read_csv("./Data/cleaned_test.csv")

# The target frame is keyed by its "Id" column right after loading, so the
# name `target` is bound only once and never mutated in place.
target = pd.read_csv("./Data/cleaned_target.csv").set_index("Id")

Label encoding the features

In [52]:
# Turn every non-numeric column into integer codes. Each encoder is fitted on
# the training column and the same mapping is then applied to the test column.
#
# One LabelEncoder is kept per column in `encoders` (column name -> fitted
# encoder). Re-fitting a single shared encoder would discard the mapping of
# every column except the last one, making it impossible to decode or re-apply
# the encoding later.
#
# `enc` stays bound for backward compatibility: after the loop it refers to the
# encoder of the last non-numeric column (or to an unfitted encoder when there
# is none). Persist `encoders` if the full mapping is needed.
#
# NOTE: LabelEncoder.transform raises ValueError when the test column contains
# a label that was not present in the training column.
enc = LabelEncoder()
encoders = {}
train_result = {}
test_result = {}
for col in train_data.columns:
    column_dtype = train_data[col].dtype
    # Width-independent numeric check (int32/float32/... count as numeric,
    # unlike a comparison against the platform-default "int"/"float").
    # Booleans are deliberately excluded so they keep being label-encoded.
    passes_through = (
        pd.api.types.is_numeric_dtype(column_dtype)
        and not pd.api.types.is_bool_dtype(column_dtype)
    )
    if passes_through:
        train_result[col] = train_data[col]
        test_result[col] = test_data[col]
    else:
        enc = LabelEncoder()
        # Re-use the source index so the encoded columns align row-for-row
        # with the untouched numeric columns when the frames are assembled.
        train_result[col] = pd.Series(
            enc.fit_transform(train_data[col]), index=train_data.index
        )
        test_result[col] = pd.Series(
            enc.transform(test_data[col]), index=test_data.index
        )
        encoders[col] = enc

train_features = pd.DataFrame(train_result).set_index("Id")
test_features = pd.DataFrame(test_result).set_index("Id")

Id
LotArea
OverallQual
GarageCars
YearBuilt
TotRmsAbvGrd
KitchenQual
Functional
YrSold
TotLivArea
FullBath
PorchSF


Random Forest

In [None]:
# Random-forest grid search: 4 x 8 x 4 = 128 parameter combinations, each
# evaluated with 10-fold cross-validation on a 100-tree forest.
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
rf_params = dict(
    max_features=[3, 5, 7, 9],
    max_depth=range(1, 9),
    min_samples_leaf=[5, 25, 100, 150],
)
gs_rf = GridSearchCV(
    estimator=rf,
    param_grid=rf_params,
    cv=10,
    return_train_score=True,
)

In [None]:
# Seed NumPy's global RNG immediately before fitting; the forest is created
# without a random_state, so this is presumably what makes the search
# repeatable between runs.
np.random.seed(0)
# Run the full random-forest grid search against the SalePrice column and
# report how long it takes.
%time gs_rf.fit(train_features, target['SalePrice'])

In [None]:
# Mean cross-validated score of the best random-forest parameter combination
# (the estimator's default scorer is used, since no `scoring` was passed).
gs_rf.best_score_

In [None]:
# Parameter combination from `rf_params` that achieved the best score above.
gs_rf.best_params_

In [None]:
# Feature importances of the best random forest, labelled with the feature
# names and ordered from most to least important.
importances = (
    pd.Series(
        data=gs_rf.best_estimator_.feature_importances_,
        index=train_features.columns,
    )
    .sort_values(ascending=False)
)
# Show the six most important features.
importances.head(6)

XGBoost with early stopping

In [None]:
# Hold out 20% of the training rows as a validation set for XGBoost early
# stopping; the remaining 80% is what the grid search is fitted on.
# A fixed random_state makes the split identical on every run. Without it the
# hold-out set (and therefore the early-stopping point) changes each time the
# cell is executed, because no seed is set before this cell.
train_features_grid, train_features_validation, target_grid, target_validation = train_test_split(
    train_features, target, test_size=0.2, random_state=0
)

In [None]:
# XGBoost grid search used together with early stopping: 5 depths x 50
# learning rates = 250 combinations, each scored with 10-fold CV.
# n_estimators=1000 is the maximum number of boosting rounds per model.
xgb = XGBRegressor(n_estimators=1000, n_jobs=-1)
xgb_params = dict(
    max_depth=range(1, 6),
    eta=np.linspace(0.0001, 0.5, num=50, endpoint=True),
)
gs_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_params,
    cv=10,
    return_train_score=True,
)

In [None]:
np.random.seed(0)
fit_params={"early_stopping_rounds":10, 
            "eval_metric" : "mae", 
            "eval_set" : [[train_features_validation, target_validation]]}
%time gs_xgb.fit(train_features_grid, target_grid,verbose = 0, **fit_params)

XGBoost without early stopping

In [53]:
# XGBoost grid search without early stopping; the number of boosting rounds is
# tuned as an ordinary grid parameter instead.
# Grid size: 4 depths x 10 learning rates x 12 round counts = 480 combinations,
# each scored with 5-fold CV.
# This rebinds `xgb`, `xgb_params` and `gs_xgb` from the early-stopping section.
xgb = XGBRegressor(booster="gbtree", colsample_bytree=0.5, n_jobs=-1)
eta = np.linspace(0.0001, 0.5, num=10, endpoint=True)
n_estimators = list(range(25, 301, 25))  # 25, 50, ..., 300
xgb_params = dict(
    max_depth=range(2, 6),
    eta=eta,
    n_estimators=n_estimators,
)
gs_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_params,
    cv=5,
    return_train_score=True,
)

In [54]:
np.random.seed(0)
%time gs_xgb.fit(train_features, target, verbose = 0)

CPU times: user 26min 33s, sys: 2min 44s, total: 29min 18s
Wall time: 3min 48s


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster='gbtree',
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.5,
                                    gamma=None, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jo...
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_parameters=None,
                                    verbosity=None),
             param_grid={'eta': array([1.00000000e-04, 5.56444444e-02, 1.11188889e-0

XGBoost outcomes

In [55]:
# Mean cross-validated score of the best XGBoost parameter combination
# (the estimator's default scorer is used, since no `scoring` was passed).
gs_xgb.best_score_

0.8576462612160046

In [56]:
# Parameter combination from `xgb_params` that achieved the best score above.
gs_xgb.best_params_

{'eta': 0.05564444444444445, 'max_depth': 3, 'n_estimators': 300}

In [57]:
# Feature importances of the best XGBoost model, labelled with the feature
# names and ordered from most to least important.
importances = (
    pd.Series(
        data=gs_xgb.best_estimator_.feature_importances_,
        index=train_features.columns,
    )
    .sort_values(ascending=False)
)
# Show up to the twenty most important features.
importances.head(20)

OverallQual     0.330626
GarageCars      0.224456
TotLivArea      0.146135
KitchenQual     0.102785
FullBath        0.084738
TotRmsAbvGrd    0.037348
YearBuilt       0.030034
LotArea         0.022591
PorchSF         0.010866
Functional      0.005460
YrSold          0.004961
dtype: float32

In [None]:
# Keep the best estimator from the XGBoost grid search as the final model;
# it is used for the test-set predictions and saved to disk below.
model = gs_xgb.best_estimator_

Running the model on the test dataset

In [None]:
# Predict a sale price for every test row.
pred = pd.Series(model.predict(test_features))

# Pair each prediction with the Id of its test row (both series carry the
# default positional index, so they line up row by row) and key the resulting
# frame by Id.
prediction = pd.DataFrame(
    {
        "Id": test_data["Id"],
        "SalePrice": pred,
    }
).set_index("Id")

In [None]:
# Write the Id-indexed predictions to disk; the Id index is written as the
# first column of the CSV.
prediction.to_csv("./Data/prediction.csv")

Save encoder and model for later

In [None]:
# Save the fitted model to disk so it can be reloaded with joblib.load.
filename = 'fitted_model.sav'
joblib.dump(model, filename)
# Save the label encoder next to the model.
# NOTE(review): `enc` is (re)fitted once per non-numeric column in the
# encoding loop, so the object saved here only holds the classes of the last
# such column - confirm this is sufficient for whoever loads the file.
filename = 'fitted_encoder.sav'
joblib.dump(enc, filename)

Checking model on the test data set