### Challenge: model comparison

In [1]:
import warnings

import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import neighbors
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
%matplotlib inline


In [25]:
# Load seaborn's bundled "mpg" example dataset into a DataFrame.
mpg_1 = sns.load_dataset(name='mpg')

In [24]:
# Preview the first five rows to check the columns and value ranges.
mpg_1.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [4]:
# Summarise column dtypes and non-null counts; this is where the missing
# horsepower values (392 non-null out of 398 rows) show up.
mpg_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
mpg             398 non-null float64
cylinders       398 non-null int64
displacement    398 non-null float64
horsepower      392 non-null float64
weight          398 non-null int64
acceleration    398 non-null float64
model_year      398 non-null int64
origin          398 non-null object
name            398 non-null object
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


In [5]:
# Impute the missing horsepower readings with the column mean.
# The result is assigned back to the frame rather than calling
# fillna(inplace=True) on the selected column: the in-place form operates on
# an intermediate object, emits a FutureWarning on recent pandas and leaves
# mpg_1 unchanged under copy-on-write, so the NaNs would survive into the
# model fits below.
mpg_1['horsepower'] = mpg_1['horsepower'].fillna(mpg_1['horsepower'].mean())

Let's first start model fitting using linear regression.

In [12]:
# Treat cylinders and model_year as categorical labels: casting them to str
# makes get_dummies emit string-named indicator columns.
for column in ('cylinders', 'model_year'):
    mpg_1[column] = mpg_1[column].astype(str)

# Continuous predictors used as-is.
numeric_features = mpg_1[['horsepower', 'displacement', 'weight', 'acceleration']]

# One-hot encode each categorical column, dropping the first level of each
# to avoid a redundant (perfectly collinear) indicator.
dummy_blocks = [
    pd.get_dummies(mpg_1[column], drop_first=True)
    for column in ('cylinders', 'model_year', 'origin')
]

# Design matrix: numeric columns followed by the cylinders, model_year and
# origin indicators, in that order. Target is miles per gallon.
X = pd.concat([numeric_features, *dummy_blocks], axis=1)
Y = mpg_1['mpg']

In [13]:
# Baseline: ordinary least squares on the full design matrix, scored with
# 10-fold cross-validation (one score per fold, displayed as the cell output).
# NOTE(review): an integer cv presumably yields consecutive, unshuffled folds
# and the rows look ordered by model_year — confirm whether the weak last
# folds come from that ordering rather than from overfitting.
ols_mpg_model = LinearRegression()
ols_fold_scores = cross_val_score(estimator=ols_mpg_model, X=X, y=Y, cv=10)
ols_fold_scores

array([ 0.60081488,  0.87441431,  0.65828614,  0.82595539,  0.70287861,
        0.84244152,  0.45444124,  0.34667844,  0.29734354, -0.7902755 ])

From the cross-validation scores we can see that our model is overfitting. So let's use regularization methods to combat this overfitting and come up with the best model that generalizes to new data.

In [11]:
# Register the FutureWarning filter *before* any fitting happens; installing
# it after the fit/cross-validation calls (as before) cannot silence the
# warnings those calls emit. `warnings` is imported in the import cell.
warnings.filterwarnings("ignore", category=FutureWarning)

# Candidate regularisation strengths: 30 values log-spaced between
# 10**-4 and 10**-0.5.
alphas = np.logspace(-4, -0.5, 30)
lasso = Lasso(random_state=0, max_iter=20000)
tuned_parameters = [{'alpha': alphas}]
n_folds = 5

# Inner search: pick the alpha with the best mean 5-fold score, then refit
# the winning model on all of X, Y.
grid_mpg_model = GridSearchCV(lasso, param_grid=tuned_parameters, cv=n_folds, refit=True)
grid_mpg_model.fit(X, Y)
print('The best score of our model is: {}'.format(grid_mpg_model.best_score_))
print('The best alpha estimator out of the tunned parameters alphas is: {}'
      .format(grid_mpg_model.best_estimator_.alpha))

# Nested cross-validation: the whole grid search is re-run inside each of the
# 10 outer training folds, so the outer scores are not biased by the tuning.
print('Finally to check if our model is overfitting or not lets perform nested cross validation: {}'
      .format(cross_val_score(grid_mpg_model, X, Y, cv = 10)))

The best score of our model is: 0.4927622908372824
The best alpha estimator out of the tunned parameters alphas is: 0.02592943797404667
Finally to check if our model is overfitting or not lets perform nested cross validation: [ 0.72487836  0.7047704   0.57491964  0.82266974  0.79202113  0.86297206
  0.68172157  0.5840682   0.03114789 -0.27515622]


In [21]:
# Register the FutureWarning filter *before* any fitting happens; installing
# it after the fit/cross-validation calls (as before) cannot silence the
# warnings those calls emit. `warnings` is imported in the import cell.
warnings.filterwarnings("ignore", category=FutureWarning)

# Candidate regularisation strengths: 30 values log-spaced between
# 10**-4 and 10**-0.5. Only alpha is tuned; l1_ratio keeps the estimator's
# default value.
alphas = np.logspace(-4, -0.5, 30)
elasticnet = ElasticNet(random_state=0, max_iter=20000)
tuned_parameters = [{'alpha': alphas}]
n_folds = 5

# Inner search: pick the alpha with the best mean 5-fold score, then refit
# the winning model on all of X, Y.
grid_mpg_model = GridSearchCV(elasticnet, param_grid=tuned_parameters, cv=n_folds, refit=True)
grid_mpg_model.fit(X, Y)
print('The best score of our model is: {}'.format(grid_mpg_model.best_score_))
print('The best alpha estimator out of the tunned parameters alphas is: {}'
      .format(grid_mpg_model.best_estimator_.alpha))

# Nested cross-validation: the whole grid search is re-run inside each of the
# 10 outer training folds, so the outer scores are not biased by the tuning.
print('Finally to check if our model is overfitting or not lets perform nested cross validation: {}'
      .format(cross_val_score(grid_mpg_model, X, Y, cv = 10)))

The best score of our model is: 0.4999789572803573
The best alpha estimator out of the tunned parameters alphas is: 0.014873521072935119
Finally to check if our model is overfitting or not lets perform nested cross validation: [ 0.62499306  0.7479665   0.61127429  0.81767843  0.74228437  0.86690585
  0.70213913  0.61609483  0.0946442  -0.21848438]


From the regression model perspective, the ElasticNet model performed best, with the R-squared and alpha values shown above. Now let's move on and check KNN regression on our dataset.

In [22]:
# KNN is fitted on the four continuous predictors only (no dummy columns).
x = mpg_1.loc[:, ['horsepower', 'displacement', 'weight', 'acceleration']]

# Ten nearest neighbours, each weighted by the inverse of its distance.
# NOTE(review): the features are not rescaled, so weight (thousands) likely
# dominates the distance over acceleration (tens) — confirm this is intended.
knn_mpg_model = neighbors.KNeighborsRegressor(weights='distance', n_neighbors=10)
knn_fold_scores = cross_val_score(estimator=knn_mpg_model, X=x, y=Y, cv=10)
knn_fold_scores

array([ 0.54474441,  0.56951609,  0.23684298,  0.68384988,  0.61558683,
        0.84880819,  0.74411962,  0.67161077, -0.8279847 ,  0.00409968])

From the above results we can observe that both the ordinary least squares and ElasticNet regression models outperformed the KNN regressor. This may be due to KNN's limited ability to handle categorical data: the categorical variables, which have a significant effect on the target variable, were not included in the KNN regressor, and this resulted in a less accurate model. If all the feature variables had been numeric in both cases, the performance might have been more comparable. Let's test this hypothesis in practice by fitting the linear regression model to only the numerical variables that are included in the KNN model.

In [23]:
# Like-for-like comparison with KNN: refit plain least squares on the same
# four numeric predictors (x) and score it with the same 10-fold scheme.
ols_mpg_model = LinearRegression()
ols_numeric_fold_scores = cross_val_score(estimator=ols_mpg_model, X=x, y=Y, cv=10)
ols_numeric_fold_scores

array([ 0.54989445,  0.52431975,  0.0673587 ,  0.64076227,  0.62995486,
        0.85072885,  0.69457333,  0.64852017, -0.87044176, -0.01175375])

We can see that when the same features are given to both models, they perform about the same. This illustrates the drawback of the KNN regression model.