In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import os
import scipy
from sklearn.decomposition import PCA
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import *
import hyperopt
from hyperopt import *
from hyperopt import fmin, tpe, hp, space_eval
import matplotlib.pyplot as plt
%matplotlib inline 


<div class="alert alert-block alert-info">
<b>Loading the data:</b> We load the data from the mentioned path
</div>

In [2]:
path_of_input_file = r'D:\kaggle_trials\house-prices-advanced-regression-techniques\train.csv'
df                 = pd.read_csv(path_of_input_file)
df.head(2)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500


<div class="alert alert-block alert-info">
<b>Preprocessing data :</b> We separate out the numerical and categorical columns from the data to be used for scaling and encoding respectively 
</div>

In [3]:
cols_needed           = list(df.columns)
cols_needed           = cols_needed[:len(cols_needed)-1]

possible_numeric_cols = list(df._get_numeric_data().columns)
possible_numeric_cols.remove('Id')

categorical_columns   = list(set(cols_needed)- set(possible_numeric_cols))

numerical_columns     = []
for i in range(len(possible_numeric_cols)):
    col_name  = possible_numeric_cols[i]
    if len(df[col_name].unique())<10:
        categorical_columns.append(col_name)
    else:
        numerical_columns.append(col_name)

<div class="alert alert-block alert-info">
<b>Missing value Treatment:</b> We impute the numerical missing values with their respective means and the categorical values with their modes.
</div>

In [4]:
for i in range(len(categorical_columns)):
    df[categorical_columns[i]] = df[categorical_columns[i]].fillna(df[categorical_columns[i]].mode()[0])
mean_impute_dict    ={}
for i in range(len(numerical_columns)):
    mean_impute_dict[numerical_columns[i]] = np.nanmean(np.float_(df[numerical_columns[i]].values))
for i in range(len(numerical_columns)):
    df[numerical_columns[i]]   = df[numerical_columns[i]].fillna(mean_impute_dict[numerical_columns[i]])

<div class="alert alert-block alert-info">
<b>Scaling and Encoding:</b> We scale and one hot encode the data to get the matrix we need for calculations
</div>

In [5]:
ohe                  = OneHotEncoder()
scalar               = MinMaxScaler()
encoded_matrix       = ohe.fit_transform(df[categorical_columns])
scaled_matrix        = scalar.fit_transform(df[numerical_columns])
X_complete_matrix    = scipy.sparse.hstack((encoded_matrix,scaled_matrix)).A
Y                    = scalar.fit_transform(df[['SalePrice']])


<div class="alert alert-block alert-info">
<b>Train Test Split :</b> We split the data to train and test set 
</div>

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_complete_matrix, Y, test_size=0.2, random_state=42)

<div class="alert alert-block alert-info">
<b>Implementing the model:</b> We now implement the model with tuned parameters and get the R^2 score
</div>

In [7]:
model = GaussianProcessRegressor()
model.fit(X_train, y_train)

GaussianProcessRegressor(alpha=1e-10, copy_X_train=True, kernel=None,
                         n_restarts_optimizer=0, normalize_y=False,
                         optimizer='fmin_l_bfgs_b', random_state=None)

In [8]:
y_pred = model.predict(X_test)
print('The coefficient of determination is:- ',r2_score(y_pred,y_test))

The coefficient of determination is:-  -218.8804257341233


<div class="alert alert-block alert-info">
<b>Conclusion:</b> Clearly Gaussian Process Regression is not giving a good coefficient of determination at all. We will use Gradient Boosting Regressor in this scenario to see what improvements we can make
</div>

In [9]:
gradient_boost_reg_grid  = {'loss'         : hp.choice('loss',['ls','lad','huber','quantile']),
                            'learning_rate': hp.uniform('learning_rate',0.0,1.0),
                            'n_estimators' : hp.choice('n_estimators',range(50,300)),
                            'max_features' : hp.choice('max_features',['auto','sqrt','log2',None]),
                            'min_samples_split': hp.uniform('min_samples_split',0.0,1.0),
                            'min_samples_leaf' : hp.uniform('min_samples_leaf',0.0,0.5),
                         }

In [10]:
def hyperopt_train_test(params):
    reg = GradientBoostingRegressor(**params,random_state=19)
    return cross_val_score(reg, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, gradient_boost_reg_grid, algo=tpe.suggest, max_evals=20, trials=trials)
best_parameters = space_eval(gradient_boost_reg_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)

100%|██████████████████████████████████████████████████| 20/20 [00:49<00:00,  2.46s/it, best loss: -0.9715407832381585]
The best parameter tuned on training set is given by :-  {'learning_rate': 0.627881784944577, 'loss': 'huber', 'max_features': None, 'min_samples_leaf': 0.01756016919912151, 'min_samples_split': 0.4790955420164398, 'n_estimators': 250}


In [11]:
model = GradientBoostingRegressor(**best_parameters)
model.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.627881784944577, loss='huber',
                          max_depth=3, max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=0.01756016919912151,
                          min_samples_split=0.4790955420164398,
                          min_weight_fraction_leaf=0.0, n_estimators=250,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [12]:
y_pred = model.predict(X_test)
print('The coefficient of determination is:- ',r2_score(y_pred,y_test))

The coefficient of determination is:-  0.9195001148340186


<div class="alert alert-block alert-info">
<b>Conclusion:</b> Gradient Boosting regressor is an improvement over Gaussian Process Regressor in this case.
</div>