In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import os
import scipy
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import *
import hyperopt
from hyperopt import *
from hyperopt import fmin, tpe, hp, space_eval
import matplotlib.pyplot as plt
%matplotlib inline 


<div class="alert alert-block alert-info">
<b>Loading the data:</b> We load the data from the mentioned path
</div>

In [2]:
path_of_input_file = r'D:\kaggle_trials\insurance-premium-prediction\insurance.csv'
df                 = pd.read_csv(path_of_input_file)
df.head(4)

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47


<div class="alert alert-block alert-info">
<b>Categorical and Numerical Columns Identification:</b> We identify categorical and numerical columns from the data. We do set a threshold that if any categorical value is classified as numerical, then it has to be classified back to categorical if the number of distinct values of that column in the dataframe is less than 10 
</div>

In [3]:
cols_needed           = list(df.columns)
cols_needed           = cols_needed[:len(cols_needed)-1]

possible_numeric_cols = list(df._get_numeric_data().columns)
possible_numeric_cols.remove('expenses')

categorical_columns   = list(set(cols_needed)- set(possible_numeric_cols))

numerical_columns     = []
for i in range(len(possible_numeric_cols)):
    col_name  = possible_numeric_cols[i]
    if len(df[col_name].unique())<10:
        categorical_columns.append(col_name)
    else:
        numerical_columns.append(col_name)

<div class="alert alert-block alert-info">
<b>Missing Value Treatment:</b> We impute the categorical missing values with their mode and the numerical missing values with their mean
</div>

In [4]:
for i in range(len(categorical_columns)):
    df[categorical_columns[i]] = df[categorical_columns[i]].fillna(df[categorical_columns[i]].mode()[0])
mean_impute_dict    ={}
for i in range(len(numerical_columns)):
    mean_impute_dict[numerical_columns[i]] = np.nanmean(np.float_(df[numerical_columns[i]].values))
for i in range(len(numerical_columns)):
    df[numerical_columns[i]]   = df[numerical_columns[i]].fillna(mean_impute_dict[numerical_columns[i]])

<div class="alert alert-block alert-info">
<b>Encoding and Feature Scaling:</b> We do the one hot encoding of categorical values and scale(by using MinMaxScaler) the numerical values to get the final feature matrix X. Subseqently, we consider the SalePrice column to be our target variable
</div>

In [5]:
ohe                  = OneHotEncoder()
scalar               = MinMaxScaler()
encoded_matrix       = ohe.fit_transform(df[categorical_columns])
scaled_matrix        = scalar.fit_transform(df[numerical_columns])
X_complete_matrix    = scipy.sparse.hstack((encoded_matrix,scaled_matrix)).A
Y                    = scalar.fit_transform(df[['expenses']])


<div class="alert alert-block alert-info">
<b>Train Test split:</b> We perform train test split on the data
</div>

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_complete_matrix, Y, test_size=0.2, random_state=42)

<div class="alert alert-block alert-info">
<b>Ridge Regressor parameters:</b> We specify ridge regressor parameters here to hypertune them
</div>

In [7]:
ridge_reg_grid  = {'alpha' : hp.uniform('alpha',0.01,5),
                      'solver'          : hp.choice('solver',['auto','svd','cholesky','lsqr','sparse_cg','sag','saga'])
                     }

In [8]:
def hyperopt_train_test(params):
    reg = Ridge(**params)
    return cross_val_score(reg, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, ridge_reg_grid, algo=tpe.suggest, max_evals=30, trials=trials)
best_parameters = space_eval(ridge_reg_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)

100%|██████████████████████████████████████████████████| 30/30 [00:00<00:00, 84.07it/s, best loss: -0.7352459216210612]
The best parameter tuned on training set is given by :-  {'alpha': 0.34064687620483347, 'solver': 'sag'}


In [9]:
model = Ridge(**best_parameters)
model.fit(X_train, y_train)

Ridge(alpha=0.34064687620483347, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='sag', tol=0.001)

In [10]:
y_pred = model.predict(X_test)
print('The coefficient of determination is:- ',r2_score(y_pred,y_test))

The coefficient of determination is:-  0.708046687521789


<div class="alert alert-block alert-info">
<b>Conclusion:</b> We did not get the results as expected even after parameter tuning. We will proceed ahead and apply Random Forest regressor on the data to see if we can get better results or not
</div>

In [25]:
random_forest_grid = {'n_estimators' : hp.choice('n_estimators',range(5,50)),
                      'min_samples_split' : hp.uniform('min_samples_split',0.01,0.95),
                      'min_samples_leaf'  : hp.choice('min_samples_leaf',range(1,10)),
                      'max_features'      : hp.choice('max_features',['auto','sqrt','log2',None])
                     }

In [26]:
def hyperopt_train_test(params):
    reg = RandomForestRegressor(**params)
    return cross_val_score(reg, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, random_forest_grid, algo=tpe.suggest, max_evals=30, trials=trials)
best_parameters = space_eval(random_forest_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)

100%|██████████████████████████████████████████████████| 30/30 [00:02<00:00, 10.09it/s, best loss: -0.8458595332069737]
The best parameter tuned on training set is given by :-  {'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 0.024477890823528636, 'n_estimators': 37}


In [28]:
rf_regressor = RandomForestRegressor(**best_parameters)
rf_regressor.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=4,
                      min_samples_split=0.024477890823528636,
                      min_weight_fraction_leaf=0.0, n_estimators=37,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [30]:
y_pred = rf_regressor.predict(X_test)
print('The coefficient of determination is:- ',r2_score(y_pred,y_test))

The coefficient of determination is:-  0.8595531131950803


<div class="alert alert-block alert-info">
<b>Conclusion:</b> We did see an improvement in the R^2 score from Random Forest Regressor. We will go ahead and try out one more regressor type-Adaboost Regressor
</div>

In [31]:
adaboost_reg_grid  = {'n_estimators' : hp.choice('n_estimators',range(5,50)),
                      'learning_rate' : hp.uniform('learning_rate',0.05,1.01),
                      'loss'          : hp.choice('loss',['linear','square','exponential'])
                     }

In [34]:
def hyperopt_train_test(params):
    reg = AdaBoostRegressor(**params)
    return cross_val_score(reg, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, adaboost_reg_grid, algo=tpe.suggest, max_evals=30, trials=trials)
best_parameters = space_eval(adaboost_reg_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)

100%|██████████████████████████████████████████████████| 30/30 [00:02<00:00, 11.76it/s, best loss: -0.8408061742218745]
The best parameter tuned on training set is given by :-  {'learning_rate': 0.07592731447572787, 'loss': 'linear', 'n_estimators': 7}


In [35]:
model_adaboost = AdaBoostRegressor(**best_parameters)
model_adaboost.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=None, learning_rate=0.07592731447572787,
                  loss='linear', n_estimators=7, random_state=None)

In [36]:
y_pred = model_adaboost.predict(X_test)

In [37]:
print('The coefficient of determination is:- ',r2_score(y_pred,y_test))

The coefficient of determination is:-  0.8350157655059629


<div class="alert alert-block alert-info">
<b>Conclusion :</b> Adaboost and Random Forests clearly outperformed the Ridge regressor
</div>