Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
import joblib

Reading the data

In [2]:
# Read the cleaned CSV from the project's Data directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer="./Data/cleaned.csv")

# Bare last expression so the notebook renders the frame below the cell.
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BedroomAbvGr,FullBath,GrLivArea,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,8450,1Fam,2Story,7,5,2003,2003,3,2,1710,2,2008,WD,Normal,208500
1,2,20,RL,9600,1Fam,1Story,6,8,1976,1976,3,2,1262,5,2007,WD,Normal,181500
2,3,60,RL,11250,1Fam,2Story,7,5,2001,2002,3,2,1786,9,2008,WD,Normal,223500
3,4,70,RL,9550,1Fam,2Story,7,5,1915,1970,3,1,1717,2,2006,WD,Abnorml,140000
4,5,60,RL,14260,1Fam,2Story,8,5,2000,2000,4,2,2198,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,7917,1Fam,2Story,6,5,1999,2000,3,2,1647,8,2007,WD,Normal,175000
1456,1457,20,RL,13175,1Fam,1Story,6,6,1978,1988,3,2,2073,2,2010,WD,Normal,210000
1457,1458,70,RL,9042,1Fam,2Story,7,9,1941,2006,4,2,2340,5,2010,WD,Normal,266500
1458,1459,20,RL,9717,1Fam,1Story,5,6,1950,1996,2,1,1078,4,2010,WD,Normal,142125


Splitting target and features
Label encoding the features

In [3]:
# Regression target: the SalePrice column, taken for every row of `data`.
target = data.loc[:, 'SalePrice']

In [4]:
# Build the feature matrix: every column of `data` except the target.
#
# Numeric columns are passed through unchanged; every other column is
# label-encoded. Each encoded column gets its OWN LabelEncoder, stored in
# `encoders` (column name -> fitted encoder), so the mapping of every column
# stays available after the loop.
#
# Fix: the dtype test now looks at the column currently being processed.
# Previously it inspected one fixed column (position 2 of the feature columns)
# on every iteration, so all columns took the same branch regardless of their
# own dtype.
feature_frame = data.drop(['SalePrice'], axis=1)  # computed once instead of on every iteration

# `enc` stays bound to the most recently fitted encoder (the last non-numeric
# column) so code that refers to `enc` after this cell keeps working. If there
# is no non-numeric column it remains an unfitted LabelEncoder.
enc = LabelEncoder()
encoders = {}
result = {}
for col in feature_frame.columns:
    column = feature_frame[col]
    if pd.api.types.is_numeric_dtype(column):
        # Already numeric: keep the original values.
        result[col] = column
    else:
        # Fresh encoder per column: refitting a single shared encoder would
        # overwrite the classes learned for the previous column.
        enc = LabelEncoder()
        # Reuse the frame's index so the encoded values stay row-aligned.
        result[col] = pd.Series(enc.fit_transform(column), index=feature_frame.index)
        encoders[col] = enc
features = pd.DataFrame(result)

Random Forest

In [15]:
# Base estimator: a random forest with 100 trees; the other hyper-parameters
# are chosen by the grid search configured below.
rf = RandomForestRegressor(n_estimators=100)

# Search space: 2 x 5 x 2 = 20 parameter combinations.
rf_params = {
    'max_features': range(3, 9, 3),       # 3, 6
    'max_depth': range(4, 21, 4),         # 4, 8, 12, 16, 20
    'min_samples_leaf': [5, 10],
}

# Exhaustive search with 5-fold cross-validation; training-fold scores are
# recorded alongside the validation scores.
gs_rf = GridSearchCV(
    estimator=rf,
    param_grid=rf_params,
    cv=5,
    return_train_score=True,
)

In [16]:
# Seed NumPy's global RNG immediately before fitting. The forest above is
# created without a `random_state`, so this seed is what makes the run
# repeatable; it must stay ahead of the fit call.
np.random.seed(0)
# Run the cross-validated grid search over all feature columns against the target.
gs_rf.fit(features, target)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'max_depth': range(4, 21, 4),
                         'max_features': range(3, 9, 3),
                         'min_samples_leaf': [5, 10]},
             return_train_score=True)

In [7]:
# Mean cross-validated score of the best parameter combination. No `scoring`
# was passed to GridSearchCV, so the estimator's own default `score` is used.
gs_rf.best_score_

0.8241046520946236

In [8]:
# Parameter combination from `rf_params` that achieved the best score above.
gs_rf.best_params_

{'max_depth': 8, 'max_features': 6, 'min_samples_leaf': 5}

In [9]:
# Feature importances of the refitted best random forest, labelled with the
# feature column names and ordered from most to least important.
best_rf = gs_rf.best_estimator_
importances = pd.Series(
    best_rf.feature_importances_,
    index=features.columns,
).sort_values(ascending=False)

# Show the six most important features.
importances.iloc[:6]

OverallQual     0.379758
GrLivArea       0.235529
YearBuilt       0.136476
LotArea         0.065316
YearRemodAdd    0.056715
FullBath        0.048805
dtype: float64

In [26]:
# Base estimator: gradient-boosted trees with 100 boosting rounds.
xgb = XGBRegressor(n_estimators=100)

# Search space: 5 tree depths x 3 values of `eta` = 15 combinations.
xgb_params = {
    'max_depth': range(4, 21, 4),     # 4, 8, 12, 16, 20
    'eta': [0.001, 0.1, 0.3],
}

# Same search protocol as for the random forest: 5-fold cross-validation with
# training-fold scores recorded.
gs_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_params,
    cv=5,
    return_train_score=True,
)

In [27]:
# Run the cross-validated grid search for the XGBoost regressor on the same
# features and target used for the random forest.
gs_xgb.fit(features, target)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_para

In [None]:
# Keep the best XGBoost estimator found by the grid search; this is the object
# written to disk by the save cell at the end of the notebook.
model = gs_xgb.best_estimator_

In [28]:
# Mean cross-validated score of the best XGBoost parameter combination
# (estimator's default `score`, since no `scoring` was passed to GridSearchCV).
gs_xgb.best_score_

0.8406336256607343

In [29]:
# Parameter combination from `xgb_params` that achieved the best score above.
gs_xgb.best_params_

{'eta': 0.1, 'max_depth': 4}

Save encoder and model for later

In [None]:
# Persist the fitted artifacts with joblib so they can be reloaded later.

# 1) The selected model.
filename = 'fitted_model.sav'
joblib.dump(value=model, filename=filename)

# 2) The label encoder object `enc` from the encoding cell.
filename = 'fitted_encoder.sav'
joblib.dump(value=enc, filename=filename)