### Overfitting and Regularization assignment

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
%matplotlib inline
# Render floats in pandas output with three decimal places.
pd.options.display.float_format = '{:.3f}'.format

# Install a blanket "ignore" filter, so no warning is shown from here on.
# NOTE(review): this also hides warnings raised by the modelling libraries
# used below; confirm that silencing them is intended.
import warnings
warnings.filterwarnings(action="ignore")

# Connection settings for the PostgreSQL database that holds the data set.
# Each value can be overridden through an environment variable, so credentials
# do not have to be edited into (or committed with) the notebook. The
# fallbacks are the values this notebook originally hard-coded, which keeps
# the cell runnable as before when no variable is set.
postgres_user = os.environ.get('HOUSEPRICES_DB_USER', 'dsbc_student')
postgres_pw = os.environ.get('HOUSEPRICES_DB_PW', '7*.8G9QH21')
postgres_host = os.environ.get('HOUSEPRICES_DB_HOST', '142.93.121.174')
postgres_port = os.environ.get('HOUSEPRICES_DB_PORT', '5432')
postgres_db = os.environ.get('HOUSEPRICES_DB_NAME', 'houseprices')

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

# Read the whole `houseprices` table into a DataFrame. The engine is disposed
# in `finally` so its connections are released even when the query fails.
try:
    houseprices = pd.read_sql_query('select * from houseprices', con=engine)
finally:
    engine.dispose()

In [2]:
# Preview the first rows of the loaded table.
houseprices.head()

Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,...,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Names of the columns stored with the `object` dtype.
houseprices.select_dtypes(include = 'object').columns

Index(['mszoning', 'street', 'alley', 'lotshape', 'landcontour', 'utilities',
       'lotconfig', 'landslope', 'neighborhood', 'condition1', 'condition2',
       'bldgtype', 'housestyle', 'roofstyle', 'roofmatl', 'exterior1st',
       'exterior2nd', 'masvnrtype', 'exterqual', 'extercond', 'foundation',
       'bsmtqual', 'bsmtcond', 'bsmtexposure', 'bsmtfintype1', 'bsmtfintype2',
       'heating', 'heatingqc', 'centralair', 'electrical', 'kitchenqual',
       'functional', 'fireplacequ', 'garagetype', 'garagefinish', 'garagequal',
       'garagecond', 'paveddrive', 'poolqc', 'fence', 'miscfeature',
       'saletype', 'salecondition'],
      dtype='object')

In [4]:
# Names of the columns stored as `int64` or `float64`.
houseprices.select_dtypes(include = ['int64', 'float64']).columns

Index(['id', 'mssubclass', 'lotfrontage', 'lotarea', 'overallqual',
       'overallcond', 'yearbuilt', 'yearremodadd', 'masvnrarea', 'bsmtfinsf1',
       'bsmtfinsf2', 'bsmtunfsf', 'totalbsmtsf', 'firstflrsf', 'secondflrsf',
       'lowqualfinsf', 'grlivarea', 'bsmtfullbath', 'bsmthalfbath', 'fullbath',
       'halfbath', 'bedroomabvgr', 'kitchenabvgr', 'totrmsabvgrd',
       'fireplaces', 'garageyrblt', 'garagecars', 'garagearea', 'wooddecksf',
       'openporchsf', 'enclosedporch', 'threessnporch', 'screenporch',
       'poolarea', 'miscval', 'mosold', 'yrsold', 'saleprice'],
      dtype='object')

Let's fit a linear regression model (OLS), but first let's define the training and test data for our model. For this model we will use the train/test split method, compare the model's performance on the training data with its performance on the test data, and determine whether or not the model is overfitting.

In [5]:
# Indicator column derived from `centralair`. `drop_first=True` drops the
# first category's dummy; assigning the result to a single column assumes
# exactly one dummy column remains (i.e. `centralair` has two categories)
# -- TODO confirm against the data.
houseprices['is_z_centralair'] = pd.get_dummies(houseprices.centralair, drop_first=True)

# Numeric predictors taken directly from the table.
num_columns = ['overallqual', 'totalbsmtsf', 'firstflrsf', 'grlivarea', 'garagecars', 'garagearea',
               'secondflrsf']

# Interaction terms: garage capacity x garage area, and first x second floor area.
houseprices['garage_cars_area'] = houseprices['garagecars'] * houseprices['garagearea']
houseprices['first_second_flrsf'] = houseprices['firstflrsf'] * houseprices['secondflrsf']
num_columns.extend(['is_z_centralair', 'garage_cars_area', 'first_second_flrsf'])

# Dummy-encode `garagefinish` (first category dropped).
garage_dummies = pd.get_dummies(houseprices.garagefinish, prefix='garagefinish', drop_first=True)
garage_cols = list(garage_dummies.columns)

# Drop any dummy columns left over from an earlier run of this cell before
# appending the fresh ones. Without this, re-running the cell would append the
# same columns again and `houseprices[num_columns]` would return duplicates.
houseprices = pd.concat(
    [houseprices.drop(columns=garage_cols, errors='ignore'), garage_dummies], axis=1)
num_columns.extend(garage_cols)

# Feature matrix and target used by the models below.
X = houseprices[num_columns]
Y = houseprices['saleprice']

In [6]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=448,
)

In [7]:
# Ordinary least squares fitted on the training split (`fit` returns the estimator itself).
ols_model = LinearRegression().fit(X_train, Y_train);

Now let's predict the sale price for both the training and the test data.

In [8]:
# OLS predictions for the training split first, then the test split.
predicted_from_train, predicted_from_test = (
    ols_model.predict(split) for split in (X_train, X_test)
)

Let's move on and check the performance of our model using several evaluation metrics.

In [9]:
# Report the OLS fit: R-squared on both splits, then MSE, MAE, RMSE and MAPE
# on the test split. One print call emits all messages, one per line.
print(
    'The R-squared performance of the model from the training and testing data is : {} and {} respectively'
    .format(ols_model.score(X_train, Y_train), ols_model.score(X_test, Y_test)),
    'The mean of squared errors(MSE) of the test model is: {}'
    .format(mse(Y_test, predicted_from_test)),
    'The absolute mean error of the test model is: {}'
    .format(mean_absolute_error(Y_test, predicted_from_test)),
    'The root mean squared error of the test model is: {}'
    .format(rmse(Y_test, predicted_from_test)),
    'The mean absolute percentage error of the test model is: {}'
    .format(np.mean(np.abs((Y_test - predicted_from_test)/Y_test) * 100)),
    sep='\n',
)

The R-squared performance of the model from the training and testing data is : 0.808963720954686 and 0.6491280747467483 respectively
The mean of squared errors(MSE) of the test model is: 2038824503.1904469
The absolute mean error of the test model is: 24437.858537571312
The root mean squared error of the test model is: 45153.344319003074
The mean absolute percentage error of the test model is: 15.249982256971535


From the R-squared results above, we can see that our model performed worse on the test data than on the training data. In case our model is suffering from overfitting, let's try to regularize it using the Ridge, Lasso and ElasticNet regression techniques. To find the optimal hyperparameter (parameter tuning) that minimizes each model's cost function, let's use scikit-learn's built-in cross-validation estimators, which select the best value from a list of candidate hyperparameters.

In [10]:
# Lasso with the penalty chosen by 5-fold cross-validation over a log-spaced
# grid of 30 candidates between 1e-4 and 1e10.
alphas = np.logspace(-4, 10, 30)
lasso_cv = LassoCV(alphas=alphas, cv=5)
lasso_cv.fit(X_train, Y_train)

# Predictions for the training split first, then the test split.
pred_train, pred_test = (lasso_cv.predict(split) for split in (X_train, X_test))

# Selected penalty, R-squared on both splits, then test-split error metrics;
# one print call emits all messages, one per line.
print(
    "The Best parameter value is: {}".format(lasso_cv.alpha_),
    'The R-squared performance of the model from the training and testing data is : {} and {} respectively'
    .format(lasso_cv.score(X_train, Y_train), lasso_cv.score(X_test, Y_test)),
    'The mean of squared errors(MSE) of the test model is: {}'
    .format(mse(Y_test, pred_test)),
    'The absolute mean error of the test model is: {}'
    .format(mean_absolute_error(Y_test, pred_test)),
    'The root mean squared error of the test model is: {}'
    .format(rmse(Y_test, pred_test)),
    'The mean absolute percentage error of the test model is: {}'
    .format(np.mean(np.abs((Y_test - pred_test)/Y_test) * 100)),
    sep='\n',
)

The Best parameter value is: 20.43359717856944
The R-squared performance of the model from the training and testing data is : 0.8089604741343502 and 0.649064861341611 respectively
The mean of squared errors(MSE) of the test model is: 2039191819.6670518
The absolute mean error of the test model is: 24434.123882358897
The root mean squared error of the test model is: 45157.41156960894
The mean absolute percentage error of the test model is: 15.249262438861548


In [11]:
# Ridge with the penalty chosen by 5-fold cross-validation over a log-spaced
# grid of 30 candidates between 1e-4 and 1e10.
alphas = np.logspace(-4, 10, 30)
ridge_cv = RidgeCV(alphas=alphas, cv=5)
ridge_cv.fit(X_train, Y_train)

# Predictions for the training split first, then the test split.
pred_train, pred_test = (ridge_cv.predict(split) for split in (X_train, X_test))

# Selected penalty, R-squared on both splits, then test-split error metrics;
# one print call emits all messages, one per line.
print(
    "The Best parameter value is: {}".format(ridge_cv.alpha_),
    'The R-squared performance of the model from the training and testing data is : {} and {} respectively'
    .format(ridge_cv.score(X_train, Y_train), ridge_cv.score(X_test, Y_test)),
    'The mean of squared errors(MSE) of the test model is: {}'
    .format(mse(Y_test, pred_test)),
    'The absolute mean error of the test model is: {}'
    .format(mean_absolute_error(Y_test, pred_test)),
    'The root mean squared error of the test model is: {}'
    .format(rmse(Y_test, pred_test)),
    'The mean absolute percentage error of the test model is: {}'
    .format(np.mean(np.abs((Y_test - pred_test)/Y_test) * 100)),
    sep='\n',
)

The Best parameter value is: 0.7278953843983154
The R-squared performance of the model from the training and testing data is : 0.8089631293531379 and 0.6491739947007308 respectively
The mean of squared errors(MSE) of the test model is: 2038557674.4115481
The absolute mean error of the test model is: 24436.310361294072
The root mean squared error of the test model is: 45150.38952668679
The mean absolute percentage error of the test model is: 15.249629796106463


In [12]:
# ElasticNet with the penalty chosen by 5-fold cross-validation over a
# log-spaced grid of 30 candidates between 1e-4 and 1e10.
alphas = np.logspace(-4, 10, 30)
elastic_cv = ElasticNetCV(alphas=alphas, cv=5)
elastic_cv.fit(X_train, Y_train)

# Predictions for the training split first, then the test split.
pred_train, pred_test = (elastic_cv.predict(split) for split in (X_train, X_test))

# Selected penalty, R-squared on both splits, then test-split error metrics;
# one print call emits all messages, one per line.
print(
    "The Best parameter value is: {}".format(elastic_cv.alpha_),
    'The R-squared performance of the model from the training and testing data is : {} and {} respectively'
    .format(elastic_cv.score(X_train, Y_train), elastic_cv.score(X_test, Y_test)),
    'The mean of squared errors(MSE) of the test model is: {}'
    .format(mse(Y_test, pred_test)),
    'The absolute mean error of the test model is: {}'
    .format(mean_absolute_error(Y_test, pred_test)),
    'The root mean squared error of the test model is: {}'
    .format(rmse(Y_test, pred_test)),
    'The mean absolute percentage error of the test model is: {}'
    .format(np.mean(np.abs((Y_test - pred_test)/Y_test) * 100)),
    sep='\n',
)

The Best parameter value is: 0.0009236708571873865
The R-squared performance of the model from the training and testing data is : 0.8089634339469872 and 0.6491607345166639 respectively
The mean of squared errors(MSE) of the test model is: 2038634725.8546722
The absolute mean error of the test model is: 24436.84084509711
The root mean squared error of the test model is: 45151.24279413217
The mean absolute percentage error of the test model is: 15.24977707312676


From the results of the four models above, it is quite hard to determine which model performed better, because all four performed about the same.