In [1]:
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
import pandas as pd
import numpy as np



In [2]:
# Reading the auto mpg dataset from the url
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
column_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin', 'car name']

df = pd.read_csv(url, names=column_names, na_values='?', comment='\t', sep=' ', skipinitialspace=True)
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,
394,44.0,4,97.0,52.0,2130.0,24.6,82,2,
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,


In [3]:
# Removing irelevent columns
df.drop('car name', inplace = True, axis=1)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [4]:
f'Records: {df.shape[0]} & Features: {df.shape[1]}'

'Records: 398 & Features: 8'

In [5]:
# Checking for null values
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model year      0
origin          0
dtype: int64

In [6]:
# Removing records with null values
df.dropna(subset = ['horsepower'], inplace=True)
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
dtype: int64

In [7]:
# Checking the data type of columns
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    float64
 5   acceleration  392 non-null    float64
 6   model year    392 non-null    int64  
 7   origin        392 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 27.6 KB


In [8]:
# Generate descriptive statistics of the DataFrame
df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,23.445918,5.471939,194.41199,104.469388,2977.584184,15.541327,75.979592,1.576531
std,7.805007,1.705783,104.644004,38.49116,849.40256,2.758864,3.683737,0.805518
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0
25%,17.0,4.0,105.0,75.0,2225.25,13.775,73.0,1.0
50%,22.75,4.0,151.0,93.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,275.75,126.0,3614.75,17.025,79.0,2.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


In [9]:
# Apply scaling to the dateset
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
df_scaled.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,0.3842,0.494388,0.326646,0.317768,0.386897,0.448888,0.498299,0.288265
std,0.20758,0.341157,0.270398,0.209191,0.240829,0.164218,0.306978,0.402759
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.212766,0.2,0.095607,0.157609,0.173589,0.34375,0.25,0.0
50%,0.365691,0.2,0.21447,0.258152,0.337539,0.446429,0.5,0.0
75%,0.531915,1.0,0.536822,0.434783,0.56755,0.537202,0.75,0.5
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Calculate the correlation to find the most relevant features
df_scaled.corr()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
mpg,1.0,-0.777618,-0.805127,-0.778427,-0.832244,0.423329,0.580541,0.565209
cylinders,-0.777618,1.0,0.950823,0.842983,0.897527,-0.504683,-0.345647,-0.568932
displacement,-0.805127,0.950823,1.0,0.897257,0.932994,-0.5438,-0.369855,-0.614535
horsepower,-0.778427,0.842983,0.897257,1.0,0.864538,-0.689196,-0.416361,-0.455171
weight,-0.832244,0.897527,0.932994,0.864538,1.0,-0.416839,-0.30912,-0.585005
acceleration,0.423329,-0.504683,-0.5438,-0.689196,-0.416839,1.0,0.290316,0.212746
model year,0.580541,-0.345647,-0.369855,-0.416361,-0.30912,0.290316,1.0,0.181528
origin,0.565209,-0.568932,-0.614535,-0.455171,-0.585005,0.212746,0.181528,1.0


In [11]:
# Seperating features and target variable
x = df_scaled[['cylinders', 'displacement', 'horsepower', 'weight']]
y = df_scaled['mpg']

In [12]:
#Spliting data into 80% training data & 20% testing data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [13]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((313, 4), (313,), (79, 4), (79,))

In [14]:
# Creating a dataframe to store results
results = pd.DataFrame()
Name = []
r2_list = []
mse_list = []

In [15]:
# Apply Linear Regression model
lr = LinearRegression()
lr_pred = lr.fit(x_train, y_train).predict(x_test)
Name.append('Linear Regression')
r2_list.append(r2_score(y_test, lr_pred))
mse_list.append(mean_squared_error(y_test, lr_pred))
print("R2 score using Linear Regression is: {}%".format(r2_score(y_test, lr_pred)*100))
print("Mean Squared Error using Linear Regression is: {}".format(mean_squared_error(y_test, lr_pred)))

R2 score using Linear Regression is: 71.78657423120652%
Mean Squared Error using Linear Regression is: 0.013834164692978459


In [16]:
# Applying & fine tuning Lasso Regression model
alpha_values = np.arange(0.1,1.1, 0.1)
best_alpha = 0
best_mse = 1
for alpha in alpha_values:
    model = Lasso(alpha=alpha, random_state=1)
    y_pred = model.fit(x_train, y_train).predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    if mse < best_mse:
        best_mse = mse
        best_alpha = alpha
print("Best alpha:", best_alpha)
print("Best MSE score:", best_mse)

Best alpha: 0.1
Best MSE score: 0.04906435554862313


In [17]:
# Apply Lasso Regression model
lasso = Lasso(alpha=best_alpha)
lasso_pred = lasso.fit(x_train, y_train).predict(x_test)
Name.append('Lasso Regression')
r2_list.append(r2_score(y_test, lasso_pred))
mse_list.append(mean_squared_error(y_test, lasso_pred))
print("R2 score using Lasso Regression is: {}%".format(r2_score(y_test, lasso_pred)*100))
print("Mean Squared Error using Lasso Regression is: {}".format(mean_squared_error(y_test, lasso_pred)))

R2 score using Lasso Regression is: -0.061954146559961565%
Mean Squared Error using Lasso Regression is: 0.04906435554862313


In [18]:
# Applying & fine tuning Ridge Regression model
alpha_values = np.arange(0.1,1.1, 0.1)
best_alpha = 0
best_mse = 1
for alpha in alpha_values:
    model = Ridge(alpha=alpha, random_state=1)
    y_pred = model.fit(x_train, y_train).predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    if mse < best_mse:
        best_mse = mse
        best_alpha = alpha
print("Best alpha:", best_alpha)
print("Best MSE score:", best_mse)

Best alpha: 0.1
Best MSE score: 0.013872340568850484


In [19]:
rd = Ridge(alpha=best_alpha)
rd_pred = rd.fit(x_train, y_train).predict(x_test)
Name.append('Ridge Regression')
r2_list.append(r2_score(y_test, rd_pred))
mse_list.append(mean_squared_error(y_test, rd_pred))
print("R2 score using Ridge Regression is: {}%".format(r2_score(y_test, rd_pred)*100))
print("Mean Squared Error using Ridge Regression is: {}".format(mean_squared_error(y_test, rd_pred)))

R2 score using Ridge Regression is: 71.7087182663559%
Mean Squared Error using Ridge Regression is: 0.013872340568850484


In [20]:
# Applying & fine tuning K-Neighbors Regressor model
n_values = range(1,101)
best_n = 0
best_mse = 1
for n in n_values:
    model = KNeighborsRegressor(n_neighbors=n)
    y_pred = model.fit(x_train, y_train).predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    if mse < best_mse:
        best_mse = mse
        best_n = n
print("Best n:", best_n)
print("Best MSE score:", best_mse)

Best n: 75
Best MSE score: 0.012372645627559934


In [21]:
knn = KNeighborsRegressor(n_neighbors=best_n)
knn_pred = knn.fit(x_train, y_train).predict(x_test)
Name.append('K-Neighbors Regressor')
r2_list.append(r2_score(y_test, knn_pred))
mse_list.append(mean_squared_error(y_test, knn_pred))
print("R2 score using K-Neighbors Regressor is: {}%".format(r2_score(y_test, knn_pred)*100))
print("Mean Squared Error using K-Neighbors Regressor is: {}".format(mean_squared_error(y_test, knn_pred)))

R2 score using K-Neighbors Regressor is: 74.76719941364276%
Mean Squared Error using K-Neighbors Regressor is: 0.012372645627559934


In [22]:
# Apply Support Vector Regressor model
svr = SVR(kernel='linear')
svr_pred = svr.fit(x_train, y_train).predict(x_test)
Name.append('Support Vector Regressor')
r2_list.append(r2_score(y_test, svr_pred))
mse_list.append(mean_squared_error(y_test, svr_pred))
print("R2 score using Support Vector Regressor is: {}%".format(r2_score(y_test, svr_pred)*100))
print("Mean Squared Error using Support Vector Regressor is: {}".format(mean_squared_error(y_test, svr_pred)))

R2 score using Support Vector Regressor is: 71.40895600286066%
Mean Squared Error using Support Vector Regressor is: 0.01401932592808751


In [23]:
# Apply Decision Trees model
dt = DecisionTreeRegressor()
dt_pred = dt.fit(x_train, y_train).predict(x_test)
Name.append('Decision Trees')
r2_list.append(r2_score(y_test, dt_pred))
mse_list.append(mean_squared_error(y_test, dt_pred))
print("R2 score using Decision Trees is: {}%".format(r2_score(y_test, dt_pred)*100))
print("Mean Squared Error using Decision Trees is: {}".format(mean_squared_error(y_test, dt_pred)))

R2 score using Decision Trees is: 62.280559056539154%
Mean Squared Error using Decision Trees is: 0.01849534198417292


In [24]:
# Applying & fine tuning Random Forest Regressor model
n_values = range(10, 501, 10)
best_n = 0
best_mse = 1
best_rf = None
for n in n_values:
    model = RandomForestRegressor(n_estimators=n, random_state=1)
    y_pred = model.fit(x_train, y_train).predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    if mse < best_mse:
        best_mse = mse
        best_n = n
        best_rf = model
print("Best n:", best_n)
print("Best MSE score:", best_mse)

Best n: 30
Best MSE score: 0.014880285224777301


In [25]:
rf_pred = best_rf.fit(x_train, y_train).predict(x_test)
Name.append('Random Forest Regressor')
r2_list.append(r2_score(y_test, rf_pred))
mse_list.append(mean_squared_error(y_test, rf_pred))
print("R2 score using Random Forest Regressor is: {}%".format(r2_score(y_test, rf_pred)*100))
print("Mean Squared Error using Linear Regression is: {}".format(mean_squared_error(y_test, rf_pred)))

R2 score using Random Forest Regressor is: 69.65311372787035%
Mean Squared Error using Linear Regression is: 0.014880285224777301


In [26]:
# Showing the results 
results['Name'] = Name
results['r2_score'] = r2_list
results['MSE'] = mse_list
results

Unnamed: 0,Name,r2_score,MSE
0,Linear Regression,0.717866,0.013834
1,Lasso Regression,-0.00062,0.049064
2,Ridge Regression,0.717087,0.013872
3,K-Neighbors Regressor,0.747672,0.012373
4,Support Vector Regressor,0.71409,0.014019
5,Decision Trees,0.622806,0.018495
6,Random Forest Regressor,0.696531,0.01488
