In [37]:
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV   
from sklearn.metrics import mean_squared_error

import sklearn as sk
import pandas as pd
import numpy as np

# Read the dataset

In [38]:
df = pd.read_csv("auto-mpg.csv")
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


# Remove the "car name" coulmn

In [39]:
df = df.drop('car name', axis = 1)

# Normalizing the dataset using L2 norm

In [40]:
df.replace('?', pd.NA, inplace=True)
df = df.dropna()
normal_df = Normalizer(norm='l2').transform(df)



In [41]:
normal_df

array([[0.00511277, 0.00227234, 0.08720114, ..., 0.00340851, 0.019883  ,
        0.00028404],
       [0.00403885, 0.00215405, 0.09423973, ..., 0.00309645, 0.01884795,
        0.00026926],
       [0.00521025, 0.00231567, 0.09204783, ..., 0.00318404, 0.0202621 ,
        0.00028946],
       ...,
       [0.01389964, 0.00173746, 0.05863912, ..., 0.00503862, 0.03561783,
        0.00043436],
       [0.01064467, 0.00152067, 0.04562   , ..., 0.0070711 , 0.03117367,
        0.00038017],
       [0.01137482, 0.00146772, 0.04366461, ..., 0.00711843, 0.03008822,
        0.00036693]])

# Split the dataset into train and test

In [42]:
# Convert the array into a dataframe
normal_df = pd.DataFrame(normal_df)
y = normal_df.iloc[:,:1]
X = normal_df.iloc[:,1:] 

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [44]:
fit1 = LinearRegression().fit(X_train, y_train)

In [45]:
coefficients = fit1.coef_
# Coefficient of the year attribute:
coefficients[0][5]

0.5698716404255911

# Predict on the test dataset

In [46]:
y_pred = fit1.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(mse)

1.8746510415302598e-06


(C) (10 points) Try linear regression with regularization (Ridge and Lasso) as implemented in sklearn (RidgeCV and LassoCV). Use the cross-validation approach and compare the coefficients for the different attributes.

# Before we perform Ridge or Lasso on our Linear Model, let's perform Cross-Validation to find an optimal Alpha value

In [47]:
# Ridge
# alpha_test = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
# ridge = Ridge()
# grid_search1 = GridSearchCV(estimator=ridge, param_grid=alpha_test, scoring='neg_mean_squared_error', cv=5)
# grid_search1.fit(X_train, y_train)
# best_alpha1 = grid_search1.best_params_['alpha']
# best_ridge1 = grid_search1.best_estimator_

In [48]:
ridge_regression = RidgeCV(alphas=[0.001, 0.01, 0.1, 1, 10, 100]).fit(X_train, y_train)
ridge_regression.coef_

array([[ 0.02053449, -0.01912991, -0.03005851, -0.04396165,  0.10153373,
         0.54081557,  0.03769747]])

In [49]:
# MSE on test using Ridge
y_pred = ridge_regression.predict(X_test)
ridge_mse = mean_squared_error(y_test, y_pred)
print(ridge_mse)

1.9148776877261556e-06


In [50]:
# Lasso
# lasso = Lasso()
# grid_search2 = GridSearchCV(estimator=lasso, param_grid=alpha_test, scoring='neg_mean_squared_error', cv=5)
# grid_search2.fit(X_train, y_train)
# best_alpha2 = grid_search2.best_params_['alpha']
# best_ridge2 = grid_search2.best_estimator_

In [51]:
lasso_regression = LassoCV(cv=5, random_state=0).fit(X_train, y_train)
lasso_regression.coef_

  y = column_or_1d(y, warn=True)


array([ 0.        ,  0.00883293, -0.04832203, -0.        ,  0.        ,
        0.64119412,  0.        ])

In [52]:
# MSE on test using Lasso
y_pred = lasso_regression.predict(X_test)
lasso_mse = mean_squared_error(y_test, y_pred)
print(lasso_mse)

1.8765639641889232e-06


In [53]:
print("coefficients of Ordinary Linear Regression: ", coefficients)
print("coefficients of Ridge: ", ridge_regression.coef_)
print("coefficients of Lasso: ", lasso_regression.coef_)


coefficients of Ordinary Linear Regression:  [[ 0.84020765 -0.11601289 -0.1712066  -1.61009463 -0.27982384  0.56987164
   0.84684289]]
coefficients of Ridge:  [[ 0.02053449 -0.01912991 -0.03005851 -0.04396165  0.10153373  0.54081557
   0.03769747]]
coefficients of Lasso:  [ 0.          0.00883293 -0.04832203 -0.          0.          0.64119412
  0.        ]
