In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score

In [2]:
# Data Reading & Cleaning
#
# Missing values in the CSV are written as the literal string '?'.  Passing
# na_values="?" makes read_csv parse them as NaN, so the affected column is
# loaded with a numeric dtype instead of as strings.  (Replacing '?' after
# loading leaves an object column of numeric strings; taking its median fails
# on recent pandas versions.)
cdata = pd.read_csv("car-mpg.csv", na_values="?").drop(columns="car_name")

# Turn the numeric origin codes into readable labels before one-hot encoding.
cdata["origin"] = cdata["origin"].replace({1: "america", 2: "europe", 3: "asia"})

# Impute every numeric column with its own median.  The string-valued origin
# column is skipped by numeric_only=True and is encoded on the next line.
cdata = cdata.fillna(cdata.median(numeric_only=True))

# One indicator column per origin label (origin_america, origin_asia, origin_europe).
cdata = pd.get_dummies(cdata, columns=["origin"])

In [3]:
# Target: mpg, selected with double brackets so that y stays a one-column DataFrame.
y = cdata[['mpg']]

# Predictors: every remaining column except the target and the origin_europe
# indicator, which is left out of the feature matrix.
X = cdata.drop(columns=['mpg', 'origin_europe'])

In [4]:
from sklearn import preprocessing

# Standardise predictors and target column by column, then wrap the resulting
# arrays back into DataFrames so the original column names are kept.
# NOTE(review): the scaling statistics are computed over all rows, i.e. before
# the train/test split done later in the notebook, so held-out rows influence
# the scaling.  Consider fitting the scaler on the training rows only.
X_scaled = pd.DataFrame(preprocessing.scale(X), columns=X.columns)
y_scaled = pd.DataFrame(preprocessing.scale(y), columns=y.columns)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state is fixed so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_scaled,
    test_size=0.3,
    random_state=1,
)

In [9]:
# Fit A Simple Regression Model

# Un-regularized linear regression fitted on the training split.
regression_model = LinearRegression().fit(X_train, y_train)

# y_train is a one-column DataFrame, so the coefficients sit in row 0 of coef_;
# pair each one with the name of the feature it belongs to.
for col_name, coef in zip(X_train.columns, regression_model.coef_[0]):
    print("Coefficients of {} is {}".format(col_name, coef))

intercept = regression_model.intercept_[0]
print("Intercept of model is {}".format(intercept))

# Model score (R^2 for scikit-learn regressors) on the training split, then on the test split.
print(regression_model.score(X_train, y_train), regression_model.score(X_test, y_test), sep="\n")

Coefficients of cyl is 0.3210223856916103
Coefficients of disp is 0.3248343091848393
Coefficients of hp is -0.22916950059437619
Coefficients of wt is -0.7112101905072298
Coefficients of acc is 0.014713682764191044
Coefficients of yr is 0.37558119495107434
Coefficients of car_type is 0.3814769484233099
Coefficients of origin_america is -0.13618215843840364
Coefficients of origin_asia is -0.006137890589388876
Intercept of model is 0.01928411610363973
0.8343770256960538
0.8513421387780067


In [10]:
# Fit A Ridge Model (L2 Regularization)

# alpha is the strength of the L2 penalty; fit() returns the fitted estimator.
ridge_model = Ridge(alpha=0.3).fit(X_train, y_train)
print("Ridge model:", ridge_model.coef_)

# Score on the training split first, then on the test split.
print(ridge_model.score(X_train, y_train), ridge_model.score(X_test, y_test), sep="\n")

Ridge model: [[ 0.31658439  0.31300635 -0.22875871 -0.70101302  0.01295503  0.37442624
   0.37733935 -0.13479863 -0.00552806]]
0.8343615817491262
0.8519030143807422


In [11]:
# Fit A Lasso Model (L1 Regularization)

# alpha is the strength of the L1 penalty.
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train)

# These are the Lasso coefficients, so they are labelled as such (the label
# previously read "Ridge model:", copied from the ridge cell).
print("Lasso model:", lasso_model.coef_)

# Score on the training split first, then on the test split.
print(lasso_model.score(X_train, y_train))
print(lasso_model.score(X_test, y_test))

Ridge model: [-0.         -0.         -0.01690287 -0.51890013  0.          0.28138241
  0.1278489  -0.01642647  0.        ]
0.7938010766228453
0.8375229615977083


In [12]:
# The results are more or less similar, but the models are less complex.  Complexity is a function of the number of variables and the size of their coefficients.
## Note - with Lasso, we get an almost equally good result on the test set, though not on the training set.  Further, the number of
# dimensions (non-zero coefficients) is much smaller in the Lasso model than in the ridge or un-regularized model.

In [16]:
## Adding Interaction Terms to Increase model complexity

from sklearn.preprocessing import PolynomialFeatures

# degree=2 with interaction_only=True adds pairwise products of distinct
# features but no squared terms.
# NOTE(review): the recorded shape has 46 columns for 9 input features, which
# presumably includes a constant bias column - confirm whether that is wanted,
# since the estimators used here also fit their own intercept.
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_poly = poly.fit_transform(X_scaled)

# Same split settings as before; this overwrites the earlier X_train / X_test /
# y_train / y_test with the expanded feature set.
X_train, X_test, y_train, y_test = train_test_split(
    X_poly,
    y_scaled,
    test_size=0.3,
    random_state=1,
)
X_train.shape

(278, 46)

In [17]:
# Regression Model

# Refit the existing un-regularized estimator on the interaction-expanded
# training data (this replaces its previously fitted coefficients).
# NOTE(review): several coefficients in the recorded output are of order 1e11,
# which suggests (near-)collinear columns in the expanded features - verify.
regression_model.fit(X_train, y_train)
print(regression_model.coef_[0])

# Score on the training split first, then on the test split.
print(regression_model.score(X_train, y_train), regression_model.score(X_test, y_test), sep="\n")

[ 4.48534254e-14 -4.80956948e+11 -5.51112240e-01 -2.98339652e-01
 -3.80620057e-01 -1.98438818e-01  3.84788700e-01 -6.40384302e+11
 -1.64919947e+11  7.48400761e+11 -1.48422513e-01 -1.81963946e-01
  4.15779915e-03  3.45336460e-01 -2.49827130e-01 -1.54090741e+12
 -1.49419515e+12  1.28985232e+00  6.07234883e-02  1.84721766e-02
 -6.64053099e-02  4.64088056e-01 -2.54320635e-01  1.87056165e-01
 -7.78781465e-01  3.58271877e-02 -7.18845289e-02 -2.54334131e-01
 -8.91939139e-02 -2.22423476e-01 -2.11968460e-02 -2.01984593e-02
  6.81451859e-02 -4.09083199e-01 -9.20136632e-03  2.62071451e-01
  6.66780746e-02  2.20932612e-01 -1.82740598e-01 -1.79983902e-01
  5.35457649e-02 -1.02800084e-01  1.23152231e-02 -8.77924484e+11
  5.30348753e-01  5.78932167e+11]
0.9149133827619226
0.8586272303875542


In [18]:
# Ridge Model with L2 Regularization

# Refit the existing ridge estimator (same alpha as when it was created) on the
# interaction-expanded training data.
ridge_model.fit(X_train, y_train)
print(ridge_model.coef_)

# Score on the training split first, then on the test split.
print(ridge_model.score(X_train, y_train), ridge_model.score(X_test, y_test), sep="\n")

[[ 0.          0.46885355 -0.37402127 -0.27089854 -0.46317828 -0.16311881
   0.38622902  0.23163975  0.05750483  0.05631796 -0.18330425 -0.14913042
  -0.00190077  0.31347509 -0.21325107  0.02784534 -0.15822531  0.76956105
   0.04194593  0.03727214 -0.05285657  0.39111239 -0.28346519  0.21829204
  -0.32564817  0.02123812 -0.06697644 -0.23952141 -0.09061334 -0.20705001
  -0.00695606 -0.03625414  0.07317645 -0.39531717 -0.03127763  0.07598792
   0.06354046  0.18321052 -0.18031527 -0.12535362  0.04665224 -0.09707233
   0.01599254 -0.23507189  0.24895732 -0.10142059]]
0.9141778746823366
0.860308526311051


In [19]:
# Lasso Model with L1 Regularization

# Refit the existing lasso estimator (same alpha as when it was created) on the
# interaction-expanded training data.
lasso_model.fit(X_train, y_train)
print(lasso_model.coef_)

# Score on the training split first, then on the test split.
print(lasso_model.score(X_train, y_train), lasso_model.score(X_test, y_test), sep="\n")

[ 0.         -0.         -0.         -0.03743801 -0.53257778  0.
  0.28388087  0.1133023  -0.0031615   0.          0.          0.
  0.         -0.         -0.          0.          0.         -0.
  0.          0.         -0.         -0.         -0.          0.
 -0.          0.00183785 -0.         -0.         -0.          0.
 -0.         -0.         -0.01793031 -0.          0.         -0.
  0.          0.         -0.05293357  0.          0.         -0.
  0.         -0.          0.         -0.        ]
0.8126132956294494
0.8502441371995254
