In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge

from sklearn.metrics import r2_score

In [3]:
from sklearn.ensemble import RandomForestRegressor

In [4]:
df = pd.read_csv('./data/Auto.csv')

In [5]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [6]:
df = df.replace('?', np.nan)
df = df.dropna()
df.shape

(392, 9)

In [7]:
df.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'year', 'origin', 'name'],
      dtype='object')

In [8]:
X = df[[ 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'year']]
y = df[['mpg']]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Linear Regression Model

In [10]:
reg = LinearRegression()

In [11]:
reg.fit(X_train, y_train)

LinearRegression()

In [12]:
pred_linReg = reg.predict(X_test)

In [13]:
r2_linReg = r2_score(y_test, pred_linReg)
r2_linReg

0.755053072656052

In [14]:
linReg = {'r2_linReg': r2_linReg}

## Lasso Models L1 Regularization

In [15]:
reg_lso = Lasso()

In [16]:
reg_lso.fit(X_train, y_train)

Lasso()

In [17]:
pred_lso = reg_lso.predict(X_test)

In [18]:
r2_lasso = r2_score(y_test, pred_lso)
r2_lasso

0.7640096281987518

In [19]:
lasso = {'r2_lasso': r2_lasso}

### Reduce the amount of L1 Regularization

In [20]:
reg_lso_2 = Lasso(alpha=0.5)

In [21]:
reg_lso_2.fit(X_train, y_train)

Lasso(alpha=0.5)

In [22]:
pred_lso_2 = reg_lso_2.predict(X_test)

In [23]:
r2_lasso_2 = r2_score(y_test, pred_lso_2)
r2_lasso_2

0.7636777619489863

In [24]:
lasso_2 = {'r2_lasso_2': r2_lasso_2}

## Ridge Model L2 Regularization

In [25]:
reg_ridge = Ridge()

In [26]:
reg_ridge.fit(X_train, y_train)

Ridge()

In [27]:
pred_ridge = reg_ridge.predict(X_test)

In [28]:
r2_ridge = r2_score(y_test, pred_ridge)
r2_ridge

0.7551055833585947

In [29]:
ridge = {'r2_ridge': r2_ridge}

### Reduced Ridge Regression

In [30]:
reg_ridge_2 = Ridge(alpha=0.5)

In [31]:
reg_ridge_2.fit(X_train, y_train)

Ridge(alpha=0.5)

In [32]:
pred_ridge_2 = reg_ridge_2.predict(X_test)

In [33]:
r2_ridge_2 = r2_score(y_test, pred_ridge_2)
r2_ridge_2

0.755079491298146

In [34]:
ridge_2 = {'r2_ridge_2': r2_ridge_2}

## Compare non-linear model: Random Forest

In [35]:
reg_rf = RandomForestRegressor()

In [36]:
reg_rf.fit(X_train, y_train)
pred_reg_rf = reg_rf.predict(X_test)
r2_reg_rf = r2_score(y_test, pred_reg_rf)
r2_reg_rf

  reg_rf.fit(X_train, y_train)


0.7796410751833223

In [37]:
r2_rf = {'r2_reg_rf': r2_reg_rf}

## Compare the various models

In [38]:
obj = [linReg, lasso, lasso_2, ridge, ridge_2, r2_rf]

In [39]:
o2 = {}
names = []
for d in obj:
    for k,v in d.items():
        names.append(k)
        o2[k] = v

In [40]:
o2

{'r2_linReg': 0.755053072656052,
 'r2_lasso': 0.7640096281987518,
 'r2_lasso_2': 0.7636777619489863,
 'r2_ridge': 0.7551055833585947,
 'r2_ridge_2': 0.755079491298146,
 'r2_reg_rf': 0.7796410751833223}

In [41]:
d2 = pd.DataFrame(o2, index=['R2'])

In [42]:
d2 = d2.T
d2

Unnamed: 0,R2
r2_linReg,0.755053
r2_lasso,0.76401
r2_lasso_2,0.763678
r2_ridge,0.755106
r2_ridge_2,0.755079
r2_reg_rf,0.779641


In [43]:
d3 = d2.sort_values(by='R2')
d3

Unnamed: 0,R2
r2_linReg,0.755053
r2_ridge_2,0.755079
r2_ridge,0.755106
r2_lasso_2,0.763678
r2_lasso,0.76401
r2_reg_rf,0.779641
