Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
import joblib

Reading the data

In [2]:
# Load the three pre-cleaned CSVs in one pass: training features, the training
# target and the features to predict on. Paths are relative to the notebook's
# working directory.
# NOTE(review): the prediction set is read from "cleaned.csv" (no "_test"
# suffix) -- confirm this is the intended file.
train_data, target, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./Data/cleaned_train.csv",
        "./Data/cleaned_train_target.csv",
        "./Data/cleaned.csv",
    )
)

Splitting target and features
Label encoding the features

In [3]:
# Build model-ready feature frames from the raw train/test frames.
#
# * Numeric columns (any integer or float width) are passed through unchanged.
#   The previous `dtype == "int"` test only matched the platform's default
#   integer width and sent float columns through the label encoder.
# * Every other column is label-encoded with its OWN encoder, fitted on the
#   training column and then applied to the matching test column.
#
# `encoders` maps column name -> fitted LabelEncoder, so each column's mapping
# can be reused later. `enc` is kept for backward compatibility and, as before,
# ends up as the encoder fitted on the last encoded column only.
enc = LabelEncoder()
encoders = {}
train_result = {}
test_result = {}
for col in train_data.columns:
    if pd.api.types.is_numeric_dtype(train_data[col]):
        train_result[col] = train_data[col]
        test_result[col] = test_data[col]
    else:
        col_encoder = LabelEncoder()
        # Keep the source index so the encoded column stays row-aligned with
        # the pass-through columns when the frames are assembled below.
        train_result[col] = pd.Series(
            col_encoder.fit_transform(train_data[col]), index=train_data.index
        )
        # LabelEncoder.transform raises ValueError when the test column holds a
        # category that never occurred in the training column.
        test_result[col] = pd.Series(
            col_encoder.transform(test_data[col]), index=test_data.index
        )
        encoders[col] = col_encoder
        enc = col_encoder
train_features = pd.DataFrame(train_result)
test_features = pd.DataFrame(test_result)

Random Forest

In [None]:
# Random forest candidate: 100 trees built on all available cores. Tree depth,
# minimum leaf size and the number of features tried per split are chosen by an
# exhaustive grid search with 10-fold cross-validation.
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
rf_params = {
    'max_features': [3, 5, 7, 9],
    'max_depth': range(1, 9),
    'min_samples_leaf': [5, 25, 100, 150],
}
gs_rf = GridSearchCV(
    estimator=rf,
    param_grid=rf_params,
    cv=10,
    return_train_score=True,
)

In [None]:
# Seed NumPy's global RNG so that repeated runs of the search start from the
# same random state (the forest above has no explicit random_state).
np.random.seed(0)
# Run the full grid search on the SalePrice column; %time reports its cost.
%time gs_rf.fit(train_features, target['SalePrice'])

In [None]:
# Mean cross-validated score of the best parameter combination. No `scoring`
# was passed to GridSearchCV, so this is the estimator's default score.
gs_rf.best_score_

In [None]:
# Parameter combination from rf_params that achieved the best score above.
gs_rf.best_params_

In [None]:
# Rank the features by the importance the best random forest assigns to them
# and show the six most important ones.
importances = pd.Series(
    gs_rf.best_estimator_.feature_importances_,
    index=train_features.columns,
).sort_values(ascending=False)
importances.head(6)

XGBoost

In [4]:
# Gradient-boosting candidate: 100 boosting rounds on all available cores. The
# grid covers tree depths 1-5 and 50 evenly spaced learning rates (`eta`)
# between 0.0001 and 0.5 inclusive, evaluated with 10-fold cross-validation.
xgb = XGBRegressor(n_estimators=100, n_jobs=-1)
xgb_params = {
    'max_depth': range(1, 6),
    'eta': np.linspace(0.0001, 0.5, num=50, endpoint=True),
}
gs_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_params,
    cv=10,
    return_train_score=True,
)

In [5]:
np.random.seed(0)
%time gs_xgb.fit(train_features, target)

CPU times: user 18min 53s, sys: 1min 43s, total: 20min 37s
Wall time: 2min 42s


GridSearchCV(cv=10,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs...
       2.85757143e-01, 2.95959184e-01, 3.06161224e-01, 3.16363265e-01,
       3.26565306e-01, 3.36767347e-01, 3.46969388e-01, 3.57171429e-01,
       3.67373469e-01, 3.77575510e-01, 3.87777551e-01, 3.97979592e-01,
       4.08181633e-01, 4.18383673e-01, 4.28585714e-01, 4.38787755e-01,
       4

In [6]:
# Mean cross-validated score of the best parameter combination. No `scoring`
# was passed to GridSearchCV, so this is the estimator's default score.
gs_xgb.best_score_

0.8588433653183161

In [7]:
# Parameter combination from xgb_params that achieved the best score above.
gs_xgb.best_params_

{'eta': 0.3367673469387755, 'max_depth': 2}

In [9]:
# Rank the features by the importance the best XGBoost model assigns to them
# and show the six most important ones.
importances = pd.Series(
    gs_xgb.best_estimator_.feature_importances_,
    index=train_features.columns,
).sort_values(ascending=False)
importances.head(6)

OverallQual      0.539660
GrLivArea        0.112523
YearBuilt        0.082477
YearRemodAdd     0.065863
LotArea          0.058380
SaleCondition    0.041851
dtype: float32

In [10]:
# Keep the winning XGBoost estimator as the final model. GridSearchCV was left
# at its default refit behaviour, so this is the estimator it refitted with the
# best parameters after the search.
model = gs_xgb.best_estimator_

Running the model on the test dataset

In [11]:
# Predict SalePrice for every row of the encoded test features and pair each
# prediction with its house Id, which becomes the index of the result frame.
pred = pd.Series(model.predict(test_features))
prediction = pd.DataFrame(
    {"Id": test_data["Id"], "SalePrice": pred}
).set_index("Id")

In [12]:
# Write the predictions to disk; the Id index is written as the first column.
prediction.to_csv("./Data/prediction.csv")

Save encoder and model for later

In [None]:
# save the model to disk
filename = 'fitted_model.sav'
joblib.dump(model, filename)
filename = 'fitted_encoder.sav'
joblib.dump(enc, filename)

Checking model on the test data set