In [None]:
%matplotlib inline


# Numerical libraries
import numpy as np

# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.metrics import r2_score

# to handle data in form of rows and columns
import pandas as pd

# importing plotting libraries
import matplotlib.pyplot as plt

#importing seaborn for statistical plots
import seaborn as sns

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that files stored
# there can be read by path in the cells that follow.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the raw car-mpg CSV on the mounted Google Drive.
DATA_PATH = "/content/drive/MyDrive/Python Course/Model Tuning/Week 1 _ Feature Engineering and Cross Validation/car-mpg.csv"

# Missing values are recorded as '?' in the file. Treating '?' as NaN while parsing
# keeps the affected columns numeric. Replacing '?' after the read leaves those
# columns as strings (object dtype), and taking the median of a string column
# raises a TypeError on recent pandas releases.
mpg_df = pd.read_csv(DATA_PATH, na_values='?')

# car_name is dropped; it is not used as a predictor.
mpg_df = mpg_df.drop('car_name', axis=1)

# Map the numeric origin codes to labels, then one-hot encode them into
# origin_america / origin_asia / origin_europe columns.
mpg_df['origin'] = mpg_df['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})
mpg_df = pd.get_dummies(mpg_df, columns=['origin'])

# Fill any remaining gaps in each column with that column's median.
mpg_df = mpg_df.apply(lambda col: col.fillna(col.median()), axis=0)

# separate independent and dependent variables

In [None]:
# Predictors: every column of mpg_df except the target 'mpg'.
X = mpg_df.drop(columns='mpg')

# Target: selected with a list so it stays a one-column DataFrame, not a Series.
y = mpg_df.loc[:, ['mpg']]


In [None]:
from sklearn import preprocessing

# Standardise every predictor column of X. scale() hands back a plain numpy array,
# so it is wrapped in a DataFrame again to keep the original column names.
# NOTE(review): the scaling statistics are computed on the full dataset, before the
# train/test split performed later, so the held-out rows influence them.
X_scaled = pd.DataFrame(preprocessing.scale(X), columns=X.columns)

In [None]:

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.30,
    random_state=1,
)

# fit a simple linear model

In [None]:
# Plain (unregularised) linear regression on the scaled training predictors.
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

# y_train has a single column, so coef_ is 2-D with one row; walk that row
# alongside the matching column names.
for col_name, coef in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, coef))

The coefficient for cyl is 2.5059518049385026
The coefficient for disp is 2.5357082860560514
The coefficient for hp is -1.7889335736325294
The coefficient for wt is -5.551819873098727
The coefficient for acc is 0.11485734803440747
The coefficient for yr is 2.9318465482116087
The coefficient for car_type is 2.977869737601945
The coefficient for origin_america is -0.583295529016598
The coefficient for origin_asia is 0.34749313804322646
The coefficient for origin_europe is 0.3774164680868858


In [None]:
# With a single-column target, intercept_ is a length-1 array; unpack its only entry.
(intercept,) = regression_model.intercept_

print("The intercept for our model is {}".format(intercept))

The intercept for our model is 23.665107741982705


# Create a regularized RIDGE model and note the coefficients

In [None]:
# Ridge regression with penalty strength alpha=0.3; fit() returns the
# estimator itself, so construction and fitting can be chained.
ridge = Ridge(alpha=0.3).fit(X_train, y_train)
print("Ridge model:", ridge.coef_)


Ridge model: [[ 2.47057467  2.44494419 -1.78573889 -5.47285499  0.10115618  2.92319984
   2.94492098 -0.57949986  0.34667456  0.37344909]]


# Create a regularized LASSO model and note the coefficients

In [None]:
# Lasso regression with penalty strength alpha=0.2, fitted on the training split.
lasso = Lasso(alpha=0.2).fit(X_train, y_train)
print("Lasso model:", lasso.coef_)

# Note that many of the coefficients are now exactly 0: Lasso has effectively dropped those dimensions from the model.

Lasso model: [ 0.          0.         -0.3475456  -4.01181473  0.          2.64248634
  1.07111166 -0.54724128  0.          0.        ]


## Let us compare their scores

In [None]:
# Score of the plain linear model: training split first, held-out split second.
print(
    regression_model.score(X_train, y_train),
    regression_model.score(X_test, y_test),
    sep="\n",
)


0.8343770256960538
0.8513421387780067


In [None]:
# Score of the Ridge model: training split first, held-out split second.
print(
    ridge.score(X_train, y_train),
    ridge.score(X_test, y_test),
    sep="\n",
)

0.8343617931312616
0.8518882171608504


In [None]:
# Score of the Lasso model: training split first, held-out split second.
print(
    lasso.score(X_train, y_train),
    lasso.score(X_test, y_test),
    sep="\n",
)

0.8114389394513553
0.8547810865027448


In [None]:
# The three models give more or less similar results, but the regularized models are less complex. Complexity is a function of the number of variables and the size of their coefficients.
# Note: with Lasso we get an equally good result on the test set, though not on the training set. Further, the Lasso model
# uses far fewer dimensions than the Ridge or unregularized model.

# Let us generate polynomial models reflecting the non-linear interaction between some dimensions

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Degree-2 expansion limited to interaction terms (products of two distinct
# features). Leaving out interaction_only would also add each feature's square.
poly = PolynomialFeatures(degree=2, interaction_only=True)

In [None]:
# Expand the scaled predictors into polynomial features, then repeat the same
# 70/30 split with the same seed. This rebinds X_train / X_test / y_train / y_test
# to the expanded data for the cells that follow.
X_poly = poly.fit_transform(X_scaled)
X_train, X_test, y_train, y_test = train_test_split(
    X_poly,
    y,
    test_size=0.30,
    random_state=1,
)
X_train.shape

(278, 56)

# Fit a simple non-regularized linear model on the polynomial features

In [None]:
# Refit the same unregularised estimator on the expanded training matrix (this
# replaces the fit it held before) and show its single row of coefficients.
print(regression_model.fit(X_train, y_train).coef_[0])


[ 3.24082770e-13 -3.45886171e+10 -4.44319332e+00 -2.20713292e+00
 -2.95250387e+00 -1.53883744e+00  3.01473348e+00 -4.60540335e+10
 -2.07900267e+10  1.09400877e+11 -8.68487255e+10 -1.26832814e+00
 -1.16659123e+00 -1.35749094e-01  2.81359972e+00 -1.97650100e+00
 -1.10816273e+11 -1.93916592e+11 -7.12573640e+10 -6.80153296e+10
  3.85037368e-01  1.71579269e-01 -5.25571363e-01  3.49994333e+00
 -2.04658007e+00 -2.62191546e+10 -2.16090047e+10 -2.06258483e+10
  1.83546538e-01 -6.24509796e-01 -1.89501969e+00 -5.63179799e-01
 -1.08850675e+10 -8.97113113e+09 -8.56296678e+09 -1.93246841e-01
  5.19730568e-01 -3.53975296e+00  4.92858486e+10  4.06198503e+10
  3.87717472e+10  5.21789551e-01  1.75518036e+00  9.36501148e+09
  7.71834867e+09  7.36718284e+09  3.74290466e-01  6.87454454e+09
  5.66578394e+09  5.40800475e+09 -2.72196349e+10  2.96019913e+10
  2.82551736e+10  7.01517623e+09 -1.44902352e+11  8.29552777e+10]


In [None]:
# Fresh Ridge model (alpha=0.3) fitted on the polynomial training features;
# this rebinds the name `ridge` to the new estimator.
ridge = Ridge(alpha=0.3).fit(X_train, y_train)
print("Ridge model:", ridge.coef_)

Ridge model: [[ 0.          3.73512981 -2.93500874 -2.13974194 -3.56547812 -1.28898893
   3.01290805  2.04739082  0.0786974   0.21972225 -0.3302341  -1.46231096
  -1.17221896  0.00856067  2.48054694 -1.67596093  0.99537516 -2.29024279
   4.7699338  -2.08598898  0.34009408  0.35024058 -0.41761834  3.06970569
  -2.21649433  1.86339518 -2.62934278  0.38596397  0.12088534 -0.53440382
  -1.88265835 -0.7675926  -0.90146842  0.52416091  0.59678246 -0.26349448
   0.5827378  -3.02842915 -0.36548074  0.5956112  -0.15941014  0.49168856
   1.45652375 -0.43819158 -0.20964198  0.77665496  0.36489921 -0.4750838
   0.3551047   0.23188557 -1.42941282  2.06831543 -0.34986402 -0.32320394
   0.39054656  0.06283411]]


In [None]:
# Score of the Ridge model on the polynomial features: training split, then held-out split.
print(
    ridge.score(X_train, y_train),
    ridge.score(X_test, y_test),
    sep="\n",
)


0.9143225702003367
0.861339805369855


In [None]:
# Fresh Lasso model (alpha=0.1) fitted on the polynomial training features;
# this rebinds the name `lasso` to the new estimator.
lasso = Lasso(alpha=0.1).fit(X_train, y_train)
print("Lasso model:", lasso.coef_)


Lasso model: [ 0.         -0.         -0.         -1.59613165 -5.22452383 -0.
  2.86907439  0.03030592 -0.10514919  0.          0.         -0.
 -0.          0.          0.28971732 -0.          0.         -0.
  0.11457443 -0.          0.          1.15720495  0.          0.
 -0.          0.          0.         -0.          0.04724906  0.
 -0.6925298  -0.          0.          0.         -0.         -0.
 -0.         -0.67082659  0.         -0.         -0.          0.16918498
 -0.         -0.61771612  0.          0.36046427  0.         -0.37086554
  0.          0.         -0.         -0.          0.18165859 -0.
 -0.         -0.        ]


In [None]:
# Score of the Lasso model on the polynomial features: training split, then held-out split.
print(
    lasso.score(X_train, y_train),
    lasso.score(X_test, y_test),
    sep="\n",
)


0.8900519684208551
0.880222844847697
