Featurization, Model Selection & Tuning - Linear Regression

In [2]:
# Import numerical libraries
import numpy as np
import pandas as pd

#Import graphical plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#Import Linear Regression Machine Learning Libraries
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score


In [3]:
# Location of the car-mpg CSV file.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the file (ideally via a relative path) to run the notebook elsewhere.
csv_path = r'C:\Users\aashutosh\Downloads\car-mpg.csv'
data = pd.read_csv(csv_path)

# Preview the first rows of the raw frame.
data.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type,car_name
0,18.0,8,307.0,130,3504,12.0,70,1,0,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,0,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,0,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,0,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,0,ford torino


In [4]:
# Drop the free-text car name; it is not used as a model feature.
data = data.drop(['car_name'], axis=1)

# Replace the numeric origin codes with readable labels so the dummy columns
# created below get meaningful names (origin_america, origin_europe, origin_asia).
data['origin'] = data['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})

# One-hot encode origin as 0/1 integer columns.
data = pd.get_dummies(data, columns=['origin'], dtype=int)

# '?' is used as a missing-value marker in the file; turn it into NaN.
data = data.replace('?', np.nan)

# Convert every column to numeric; anything that cannot be parsed becomes NaN.
# errors='coerce' is required for that: errors='ignore' is deprecated in pandas
# and would silently leave an unparseable column as object dtype.
data = data.apply(pd.to_numeric, errors='coerce')

# Fill the remaining missing values with each column's median in one
# vectorised call (columns without NaN are left untouched).
data = data.fillna(data.median(numeric_only=True))


In [5]:
# Preview the cleaned frame: car_name has been dropped and origin is one-hot encoded.
data.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
0,18.0,8,307.0,130.0,3504,12.0,70,0,1,0,0
1,15.0,8,350.0,165.0,3693,11.5,70,0,1,0,0
2,18.0,8,318.0,150.0,3436,11.0,70,0,1,0,0
3,16.0,8,304.0,150.0,3433,12.0,70,0,1,0,0
4,17.0,8,302.0,140.0,3449,10.5,70,0,1,0,0


2. Model building

In [6]:
# Dependent variable: keep mpg as a one-column DataFrame (double brackets),
# not a Series, so its column name is preserved.
y = data[['mpg']]

# Independent variables: every column except mpg.
x = data.drop(columns=['mpg'])

In [8]:
# Standardise features and target with sklearn's preprocessing.scale.
# scale() returns a plain array, so each result is wrapped back into a
# DataFrame that carries the original column names.
# NOTE(review): scaling is applied to the full dataset before the train/test
# split, so test rows contribute to the scaling statistics. Consider fitting
# the scaler on the training split only.
x_s = pd.DataFrame(preprocessing.scale(x), columns=x.columns)
y_s = pd.DataFrame(preprocessing.scale(y), columns=y.columns)

In [9]:
# Hold out 30% of the rows as a test set; random_state is fixed so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x_s,
    y_s,
    test_size=0.30,
    random_state=1,
)

# Show (rows, columns) of the training features.
x_train.shape

(278, 10)

2.a Linear Model

In [11]:
# Fit a plain linear regression on the training split.
regression_model = LinearRegression()
regression_model.fit(x_train, y_train)

# The code reads coefficients from row 0 of coef_ and the intercept from
# element 0 of intercept_; pair each coefficient with its feature name.
for col_name, weight in zip(x_train.columns, regression_model.coef_[0]):
    print(f'The coefficient for {col_name} is {weight}')

intercept = regression_model.intercept_[0]
print(f'The intercept is {intercept}')

The coefficient for cyl is 0.32102238569161323
The coefficient for disp is 0.3248343091848378
The coefficient for hp is -0.22916950059437616
The coefficient for wt is -0.7112101905072294
The coefficient for acc is 0.01471368276419148
The coefficient for yr is 0.37558119495107445
The coefficient for car_type is 0.38147694842331026
The coefficient for origin_america is -0.07472247547584143
The coefficient for origin_asia is 0.044515252035678604
The coefficient for origin_europe is 0.048348549539454194
The intercept is 0.01928411610363977


2.b Regularized Ridge Regression

In [12]:
# alpha is the penalty term (lambda); it helps to reduce the magnitude of the
# coefficients.
ridge_model = Ridge(alpha=0.4).fit(x_train, y_train)

# The training data has 10 feature columns, so 10 coefficients are printed.
print(f'Ridge model coefficents: {ridge_model.coef_}')

Ridge model coefficents: [ 0.31495967  0.30948411 -0.22861679 -0.69782283  0.01239531  0.37411266
  0.37586629 -0.07408168  0.04437854  0.0476772 ]


2.c Regularized Lasso Regression

In [None]:
# alpha is the penalty term (lambda); it helps to reduce the magnitude of the
# coefficients.
lasso_model = Lasso(alpha=0.1).fit(x_train, y_train)

# The training data has 10 feature columns, so 10 coefficients are printed.
print(f'Lasso model coef:{lasso_model.coef_}')

Lasso model coef:[-0.         -0.         -0.01690287 -0.51890013  0.          0.28138241
  0.1278489  -0.01642647  0.          0.        ]
