In [1]:
# Numerical libraries
import numpy as np

## Import linear regression machine learning libraries
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.metrics import r2_score

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Read the raw car mileage table from the CSV next to the notebook and show it.
csv_path = "car-mpg.csv"
mpg_df = pd.read_csv(csv_path)
mpg_df

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type,car_name
0,18.0,8,307.0,130,3504,12.0,70,1,0,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,0,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,0,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,0,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,0,ford torino
...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,1,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,1,ford ranger


In [3]:
# Remove the free-text car_name column; the frame is re-displayed to confirm.
mpg_df = mpg_df.drop(columns=["car_name"])
mpg_df

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type
0,18.0,8,307.0,130,3504,12.0,70,1,0
1,15.0,8,350.0,165,3693,11.5,70,1,0
2,18.0,8,318.0,150,3436,11.0,70,1,0
3,16.0,8,304.0,150,3433,12.0,70,1,0
4,17.0,8,302.0,140,3449,10.5,70,1,0
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,1
394,44.0,4,97.0,52,2130,24.6,82,2,1
395,32.0,4,135.0,84,2295,11.6,82,1,1
396,28.0,4,120.0,79,2625,18.6,82,1,1


In [4]:
# Swap the numeric origin codes for readable region names so the one-hot
# columns created later get meaningful names.
origin_labels = {1: "America", 2: "Europe", 3: "Asia"}
mpg_df["origin"] = mpg_df["origin"].replace(origin_labels)
mpg_df

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type
0,18.0,8,307.0,130,3504,12.0,70,America,0
1,15.0,8,350.0,165,3693,11.5,70,America,0
2,18.0,8,318.0,150,3436,11.0,70,America,0
3,16.0,8,304.0,150,3433,12.0,70,America,0
4,17.0,8,302.0,140,3449,10.5,70,America,0
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,America,1
394,44.0,4,97.0,52,2130,24.6,82,Europe,1
395,32.0,4,135.0,84,2295,11.6,82,America,1
396,28.0,4,120.0,79,2625,18.6,82,America,1


In [5]:
# One-hot encode the region labels. dtype=int keeps the indicator columns as
# 0/1 integers whatever the installed pandas version uses as its default
# dummy dtype.
# NOTE(review): all three origin_* indicators are kept, so they are perfectly
# collinear (they always sum to 1). Consider drop_first=True if the
# un-regularized model's coefficients need to be interpretable.
mpg_df = pd.get_dummies(mpg_df, columns=["origin"], dtype=int)

# "?" is treated as the missing-value marker. A column that contained it holds
# strings, and replacing "?" with NaN does not change that, so convert every
# column to a numeric dtype explicitly. Without this the median below relies on
# implicit string-to-number coercion, which recent pandas versions reject with
# a TypeError. pd.to_numeric raises on any other unexpected text rather than
# hiding it.
mpg_df = mpg_df.replace("?", np.nan)
mpg_df = mpg_df.apply(pd.to_numeric)

# Fill the remaining gaps with each column's own median.
mpg_df = mpg_df.fillna(mpg_df.median())
mpg_df

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,car_type,origin_America,origin_Asia,origin_Europe
0,18.0,8,307.0,130,3504,12.0,70,0,1,0,0
1,15.0,8,350.0,165,3693,11.5,70,0,1,0,0
2,18.0,8,318.0,150,3436,11.0,70,0,1,0,0
3,16.0,8,304.0,150,3433,12.0,70,0,1,0,0
4,17.0,8,302.0,140,3449,10.5,70,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,1,0,0
394,44.0,4,97.0,52,2130,24.6,82,1,0,0,1
395,32.0,4,135.0,84,2295,11.6,82,1,1,0,0
396,28.0,4,120.0,79,2625,18.6,82,1,1,0,0


In [6]:
# Target: mpg, kept as a one-column DataFrame (note the double brackets).
y = mpg_df[["mpg"]]

# Features: every column except the target.
X = mpg_df.drop(columns=["mpg"])

In [7]:
# Display the feature matrix (all columns of mpg_df except mpg).
X

Unnamed: 0,cyl,disp,hp,wt,acc,yr,car_type,origin_America,origin_Asia,origin_Europe
0,8,307.0,130,3504,12.0,70,0,1,0,0
1,8,350.0,165,3693,11.5,70,0,1,0,0
2,8,318.0,150,3436,11.0,70,0,1,0,0
3,8,304.0,150,3433,12.0,70,0,1,0,0
4,8,302.0,140,3449,10.5,70,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
393,4,140.0,86,2790,15.6,82,1,1,0,0
394,4,97.0,52,2130,24.6,82,1,0,0,1
395,4,135.0,84,2295,11.6,82,1,1,0,0
396,4,120.0,79,2625,18.6,82,1,1,0,0


In [8]:
# Display the target (the mpg column as a one-column DataFrame).
y

Unnamed: 0,mpg
0,18.0
1,15.0
2,18.0
3,16.0
4,17.0
...,...
393,27.0
394,44.0
395,32.0
396,28.0


In [9]:
from sklearn import preprocessing

In [10]:
# Standardise the feature columns and the target with preprocessing.scale.
# scale() returns plain numpy arrays, so each result is wrapped back into a
# DataFrame that carries the original column names.
# NOTE(review): scaling is done on the full data set before the train/test
# split; ideally the training and test sets should be scaled separately, with
# the scaling statistics taken from the training rows only.
scaled_features = preprocessing.scale(X)
X_scaled = pd.DataFrame(scaled_features, columns=X.columns)

scaled_target = preprocessing.scale(y)
y_scaled = pd.DataFrame(scaled_target, columns=y.columns)


In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 30% of the scaled rows for testing; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_scaled,
    test_size=0.30,
    random_state=1,
)

# fit a simple linear model

In [13]:
# Baseline: ordinary least squares with no regularization, fitted on the
# scaled training split.
regression_model = LinearRegression()
regression_model.fit(X=X_train, y=y_train)

In [14]:
# Pair each feature name with its fitted coefficient. coef_ is 2-D because the
# target was passed as a one-column DataFrame, hence the [0].
# The very large origin_* values in the output come from the three perfectly
# collinear indicator columns.
for col_name, coefficient in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, coefficient))

The coefficient for cyl is 0.3411638230480514
The coefficient for disp is 0.3324060814582771
The coefficient for hp is -0.22570986907902518
The coefficient for wt is -0.7177369483173448
The coefficient for acc is 0.020387646114638246
The coefficient for yr is 0.3733822858457041
The coefficient for car_type is 0.37898851619278756
The coefficient for origin_America is -8769455385568.458
The coefficient for origin_Asia is -7227510019209.508
The coefficient for origin_Europe is -6898676135401.727


In [16]:
# intercept_ holds one value per target column; there is a single target here.
intercept = regression_model.intercept_[0]
intercept_message = "The intercept for our model is {}"
print(intercept_message.format(intercept))

The intercept for our model is 0.01998499651378257


## create a regularized Ridge model and note the coefficients

In [17]:
# Ridge (L2-penalised) regression on the same training split, with the
# penalty strength alpha set to 0.3.
ridge = Ridge(alpha=0.3)
ridge.fit(X_train, y_train)
print("Ridge_model:", ridge.coef_)

Ridge_model: [[ 0.31649043  0.31320707 -0.22876025 -0.70109447  0.01295851  0.37447352
   0.37725608 -0.07423624  0.04441039  0.04784031]]


In [21]:
## Here the Ridge coefficients differ only fractionally from the un-regularized ones above (the exception is the origin_* dummies, whose huge values are shrunk to small ones)

In [18]:
# Here we observe that none of the Ridge coefficients is exactly zero, although some are very close to 0

# create a regularized lasso model and note the coefficients

In [19]:
# Lasso (L1-penalised) regression on the same training split, with the
# penalty strength alpha set to 0.2.
lasso = Lasso(alpha=0.2)
lasso.fit(X_train, y_train)
print("Lasso_model:", lasso.coef_)

Lasso_model: [-0.         -0.         -0.         -0.49040652  0.          0.20770417
  0.09573255 -0.          0.          0.        ]


In [20]:
# Here we observe that many of the coefficients are exactly zero, which means those dimensions have been dropped from the model

## Let us compare their scores

In [22]:
# R^2 of the un-regularized model: training split first, then test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(regression_model.score(features, target))

0.8337582284259674
0.8498484499774868


In [23]:
# R^2 of the Ridge model: training split first, then test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(ridge.score(features, target))

0.8343617931312616
0.851888217160851


In [24]:
# R^2 of the Lasso model: training split first, then test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(lasso.score(features, target))

0.7449291598497929
0.7889774437561077


In [25]:
## Lasso scores about 0.74 (train) vs 0.79 (test), so its performance looks slightly lower than the other two models.
## But keep in mind that this 0.74 is reached with only 3 non-zero coefficients (wt, yr, car_type), whereas the score of 0.83 (in ridge) uses 10 dimensions;
## similarly, the score of 0.83 (in regression_model) uses 10 dimensions.
## So, without losing much accuracy, we have reduced the number of dimensions
## that we use for our model building.
## This model is therefore likely to survive in production much better than the above two:
## as discussed earlier, the more dimensions there are, the more likely the model is in the overfit zone.

## More or less similar results, but with a less complex model. Complexity is a function of variables and coefficients.
## Note - with Lasso we get a comparably good result on the test set, though not so on training. Further, the number of dimensions is much smaller
## in the Lasso model than in the Ridge or un-regularized model.

In [26]:
# <-------------------------------------------------------------------------------------------------------------------------->

## Let us generate polynomial models reflecting the non-linear interaction between some dimensions

In [27]:
from sklearn.preprocessing import PolynomialFeatures

In [28]:
# Build the degree-2 feature generator from sklearn.preprocessing.
#
# With interaction_only=True the transformer outputs a constant bias column,
# the original columns, and the product of every pair of distinct columns
# (e.g. hp * wt). It does NOT add squared terms such as hp ** 2;
# PolynomialFeatures(2) would be the variant that includes the squares too.
# For the 10 independent dimensions this gives 1 + 10 + 45 = 56 columns.
#
# The products are generated for every pair of columns; the transformer does
# not look at how the columns are related. The motivation for interaction
# terms is that some dimensions do interact (the original notes mention hp and
# wt as positively correlated, and hp and acc as inversely correlated; that is
# not checked in this notebook).
#
# Do not go beyond degree 2: the number of generated dimensions grows quickly
# with the degree, while the number of rows stays the same.
poly = PolynomialFeatures(degree=2, interaction_only=True)

In [30]:
# Expand the scaled features with the pairwise interaction terms, then redo
# the same 70/30 split (same random_state) on the expanded matrix.
# Note: the target passed here is the unscaled y, not y_scaled, so the
# coefficients of the models fitted below are in raw mpg units and are not
# directly comparable with the earlier ones.
X_poly = poly.fit_transform(X_scaled)
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size = 0.30, random_state = 1)

# Sanity check: (rows, generated feature columns). The attribute is `shape`;
# the previous `shapee` raised AttributeError.
X_train.shape

(278, 56)

In [31]:
## The dataset with degree-2 features is split into a training set and a test set

## Fit a simple non-regularized linear model on poly feature 

In [32]:
# Refit the same un-regularized estimator on the expanded training split
# (this replaces its earlier fit), then show the first row of its coefficients.
regression_model.fit(X_train, y_train)
poly_coefficients = regression_model.coef_[0]
print(poly_coefficients)

[-9.67853872e-13  8.35590194e+10 -4.44193447e+00 -2.20474084e+00
 -2.94978351e+00 -1.53682900e+00  3.01560586e+00  1.11257119e+11
  4.79177898e+10 -2.53728330e+11  2.13885648e+11 -1.27931356e+00
 -1.14059162e+00 -1.47628546e-01  2.83468437e+00 -1.98167706e+00
  2.67709434e+11  4.71946046e+11  1.75013992e+11  1.67051287e+11
  3.83430481e-01  1.69248581e-01 -5.35890579e-01  3.49282074e+00
 -2.04669762e+00  6.45293885e+10  5.31830976e+10  5.07633978e+10
  1.77463531e-01 -6.20143890e-01 -1.89239502e+00 -5.46977997e-01
  3.08549275e+10  2.54296633e+10  2.42726763e+10 -1.94962025e-01
  5.19100189e-01 -3.55959702e+00 -1.19598006e+11 -9.85689250e+10
 -9.40842819e+10  5.16204834e-01  1.77455139e+00 -3.78345784e+10
 -3.11820726e+10 -2.97633652e+10  3.72213364e-01 -1.64465344e+10
 -1.35547177e+10 -1.29380115e+10  6.30449376e+10 -7.37476036e+10
 -7.03922695e+10 -3.79537550e+09  3.57608932e+11 -1.92325457e+11]


In [33]:
## We have built a simple linear model with no regularization.

## Here X_train is the one with 56 dimensions.

## Looking at the coefficients carefully, most of them are printed in scientific notation with large exponents (e+10, e+11, ...), i.e. their magnitudes are enormous, both positive and negative.

## This clearly indicates that with 56 dimensions WE ARE NOW IN THE SITUATION OF THE CURSE OF DIMENSIONALITY.

## The number of data points we have is too small compared with the permutations and combinations possible in these 56 dimensions.

## As a result, the model that we generated is an overfit model WITH many sharp peaks and valleys.

## The sharp peaks and valleys are evident from the magnitudes of these coefficients.

## Such a model is expected to perform very well on the training set but poorly on the test set.

In [34]:
# Ridge regression (alpha 0.3) refitted on the expanded training split;
# this rebinds the name `ridge` used for the earlier model.
ridge = Ridge(alpha=0.3)
ridge.fit(X_train, y_train)
print("Ridge model:", ridge.coef_)

Ridge model: [[ 0.          3.73512981 -2.93500874 -2.13974194 -3.56547812 -1.28898893
   3.01290805  2.04739082  0.0786974   0.21972225 -0.3302341  -1.46231096
  -1.17221896  0.00856067  2.48054694 -1.67596093  0.99537516 -2.29024279
   4.7699338  -2.08598898  0.34009408  0.35024058 -0.41761834  3.06970569
  -2.21649433  1.86339518 -2.62934278  0.38596397  0.12088534 -0.53440382
  -1.88265835 -0.7675926  -0.90146842  0.52416091  0.59678246 -0.26349448
   0.5827378  -3.02842915 -0.36548074  0.5956112  -0.15941014  0.49168856
   1.45652375 -0.43819158 -0.20964198  0.77665496  0.36489921 -0.4750838
   0.3551047   0.23188557 -1.42941282  2.06831543 -0.34986402 -0.32320394
   0.39054656  0.06283411]]


In [35]:
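## Now notice that one Ridge coefficient is printed as exactly 0. -- the first one, which belongs to the constant bias column added by PolynomialFeatures (presumably because the model fits its own intercept).
## Many of the other coefficients are very close to 0 but are not 0:
## Ridge shrinks coefficients towards zero without eliminating them.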

In [36]:
# R^2 of the Ridge model on the expanded features: training split, then test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(ridge.score(features, target))

0.9143225702003358
0.861339805369854


In [37]:
# Lasso regression (alpha 0.1) refitted on the expanded training split;
# this rebinds the name `lasso` used for the earlier model.
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
print("Lasso model:", lasso.coef_)

Lasso model: [ 0.         -0.         -0.         -1.59613165 -5.22452383 -0.
  2.86907439  0.03030592 -0.10514919  0.          0.         -0.
 -0.          0.          0.28971732 -0.          0.         -0.
  0.11457443 -0.          0.          1.15720495  0.          0.
 -0.          0.          0.         -0.          0.04724906  0.
 -0.6925298  -0.          0.          0.         -0.         -0.
 -0.         -0.67082659  0.         -0.         -0.          0.16918498
 -0.         -0.61771612  0.          0.36046427  0.         -0.37086554
  0.          0.         -0.         -0.          0.18165859 -0.
 -0.         -0.        ]


In [38]:
# R^2 of the Lasso model on the expanded features: training split, then test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(lasso.score(features, target))

0.8900519684208552
0.880222844847697
