In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('auto-mpg.csv')

In [3]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [4]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
model year        int64
origin            int64
car name         object
dtype: object

In [5]:
obj_df = df.select_dtypes(include=['object']).copy()
num_df = df.select_dtypes(include=['float64', 'int64']).copy()

In [6]:
obj_df.head().T

Unnamed: 0,0,1,2,3,4
horsepower,130,165,150,150,140
car name,chevrolet chevelle malibu,buick skylark 320,plymouth satellite,amc rebel sst,ford torino


In [7]:
num_df.head().T

Unnamed: 0,0,1,2,3,4
mpg,18.0,15.0,18.0,16.0,17.0
cylinders,8.0,8.0,8.0,8.0,8.0
displacement,307.0,350.0,318.0,304.0,302.0
weight,3504.0,3693.0,3436.0,3433.0,3449.0
acceleration,12.0,11.5,11.0,12.0,10.5
model year,70.0,70.0,70.0,70.0,70.0
origin,1.0,1.0,1.0,1.0,1.0


In [8]:
#IQR method
Q1 = num_df.quantile(0.25)
Q3 = num_df.quantile(0.75)
IQR = Q3 - Q1

print(IQR)

mpg               11.50
cylinders          4.00
displacement     157.75
weight          1384.25
acceleration       3.35
model year         6.00
origin             1.00
dtype: float64


In [9]:
print((num_df < (Q1 - 1.5 * IQR)) | (num_df > (Q3 + 1.5 * IQR)))

       mpg  cylinders  displacement  weight  acceleration  model year  origin
0    False      False         False   False         False       False   False
1    False      False         False   False         False       False   False
2    False      False         False   False         False       False   False
3    False      False         False   False         False       False   False
4    False      False         False   False         False       False   False
..     ...        ...           ...     ...           ...         ...     ...
393  False      False         False   False         False       False   False
394  False      False         False   False          True       False   False
395  False      False         False   False         False       False   False
396  False      False         False   False         False       False   False
397  False      False         False   False         False       False   False

[398 rows x 7 columns]


In [10]:
#Removing outliers
num_df_out = num_df[~((num_df < (Q1 - 1.5 * IQR)) | (num_df > (Q3 + 1.5 * IQR))).any(axis=1)]

In [11]:
obj_df.head().T

Unnamed: 0,0,1,2,3,4
horsepower,130,165,150,150,140
car name,chevrolet chevelle malibu,buick skylark 320,plymouth satellite,amc rebel sst,ford torino


In [12]:
#label encoding
from sklearn.preprocessing import LabelEncoder

lb_make = LabelEncoder()
label_obj_df = obj_df.apply(lb_make.fit_transform)

In [13]:
label_obj_df.head().T

Unnamed: 0,0,1,2,3,4
horsepower,15,33,27,27,22
car name,49,36,231,14,161


In [14]:
new_df = pd.concat([num_df_out, label_obj_df], axis=1)

In [15]:
new_df.head().T

Unnamed: 0,0,1,2,3,4
mpg,18.0,15.0,18.0,16.0,17.0
cylinders,8.0,8.0,8.0,8.0,8.0
displacement,307.0,350.0,318.0,304.0,302.0
weight,3504.0,3693.0,3436.0,3433.0,3449.0
acceleration,12.0,11.5,11.0,12.0,10.5
model year,70.0,70.0,70.0,70.0,70.0
origin,1.0,1.0,1.0,1.0,1.0
horsepower,15.0,33.0,27.0,27.0,22.0
car name,49.0,36.0,231.0,14.0,161.0


In [16]:
new_df.isna().sum()

mpg             8
cylinders       8
displacement    8
weight          8
acceleration    8
model year      8
origin          8
horsepower      0
car name        0
dtype: int64

In [17]:
mena_num_df = new_df.fillna(new_df.mean())

#### lasso

In [18]:
#lasso regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

X = mena_num_df.drop('mpg', axis=1)
y = mena_num_df['mpg']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [19]:
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)

In [20]:
y_pred = lasso.predict(X_test)

In [21]:
print(mean_squared_error(y_test, y_pred))

10.834734885814798


In [22]:
#r2 score
from sklearn.metrics import r2_score

print(r2_score(y_test, y_pred))

0.8258432055490966


#### Ridge

In [23]:
#ridge regression
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1)
ridge.fit(X_train, y_train)

In [24]:
y_pred = ridge.predict(X_test)

In [25]:
print(mean_squared_error(y_test, y_pred))

10.746788369431064


In [26]:
print(r2_score(y_test, y_pred))

0.8272568518946632


#### ElasticNet

In [27]:
# elastic net
from sklearn.linear_model import ElasticNet

elastic = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic.fit(X_train, y_train)

In [28]:
y_pred = elastic.predict(X_test)

In [29]:
print(mean_squared_error(y_test, y_pred))

10.830856032105528


In [30]:
print(r2_score(y_test, y_pred))

0.825905553980809
