In [19]:
# Authors:  Eddie F. Carrizales and Jesper S. Bajwa
# Date:  09/22/2022

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
%matplotlib inline

# Accessing public google drive file
url='https://drive.google.com/file/d/1aSPL-6CFXTMxeA1kIF18hRlNOdyvyrDT/view?usp=sharing'
file_id=url.split('/')[-2]
dwn_url='https://drive.google.com/uc?id=' + file_id

In [20]:
# Reading the data file into a Pandas dataframe 'df'
original_df = pd.read_csv(dwn_url, names = ["mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration", "model year", "origin", "car name"])

In [21]:
# Remove any entries that do not have complete data
df_clean = original_df.dropna(axis = 'rows', how = 'any')

# Remove unnecessary column 'car name'
df_clean = df_clean.drop(columns = ['car name'])

# Move target variable columns 'mgp' to the last column position
df_clean = df_clean.iloc[:, [1,2,3,4,5,6,7,0]]

In [22]:
# Displays our clean dataset
df_clean

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin,mpg
0,8,307.0,130.0,3504,12.0,70,1,18.0
1,8,350.0,165.0,3693,11.5,70,1,15.0
2,8,318.0,150.0,3436,11.0,70,1,18.0
3,8,304.0,150.0,3433,12.0,70,1,16.0
4,8,302.0,140.0,3449,10.5,70,1,17.0
...,...,...,...,...,...,...,...,...
401,4,140.0,86.0,2790,15.6,82,1,27.0
402,4,97.0,52.0,2130,24.6,82,2,44.0
403,4,135.0,84.0,2295,11.6,82,1,32.0
404,4,120.0,79.0,2625,18.6,82,1,28.0


In [23]:
# Normalize data
mms = MinMaxScaler()

df_clean_normed = df_clean.copy()
df_clean_normed[['displacement', 'horsepower', 'weight', 'acceleration', 'mpg']] = mms.fit_transform(df_clean_normed[['displacement', 'horsepower', 'weight', 'acceleration', 'mpg']])

In [24]:
# Displays our clean normalized dataset
df_clean_normed

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin,mpg
0,8,0.617571,0.456522,0.536150,0.238095,70,1,0.239362
1,8,0.728682,0.646739,0.589736,0.208333,70,1,0.159574
2,8,0.645995,0.565217,0.516870,0.178571,70,1,0.239362
3,8,0.609819,0.565217,0.516019,0.238095,70,1,0.186170
4,8,0.604651,0.510870,0.520556,0.148810,70,1,0.212766
...,...,...,...,...,...,...,...,...
401,4,0.186047,0.217391,0.333711,0.452381,82,1,0.478723
402,4,0.074935,0.032609,0.146583,0.988095,82,2,0.930851
403,4,0.173127,0.206522,0.193365,0.214286,82,1,0.611702
404,4,0.134367,0.179348,0.286929,0.630952,82,1,0.505319


In [25]:
# The thing we are trying to predict is the quality of the wine.
# Using this corr() function, we are able to see the correlation that each of the columns have to
#   the quality of the wine.
# As we can see, there are positive and negative values describing how strongly the correlate with
#   the quality which has a value of 1.0, from this data we want to pick the columns that have an absolute
#   value which is closest to 1.0 (i.e., correlates the strongest)
# Note: In this case they all have somewhat high correlation, thus we will keep all the columns
df_clean_normed.corr()["mpg"]

cylinders      -0.777618
displacement   -0.805127
horsepower     -0.778427
weight         -0.832244
acceleration    0.423329
model year      0.580541
origin          0.565209
mpg             1.000000
Name: mpg, dtype: float64

In [26]:
# Creating Train Test split 80/20
X = df_clean_normed[['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin']]
y = df_clean_normed['mpg']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [27]:
#Creating the model and training 
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [28]:
model.coef_ #m1, m2, m3, mN are the coefficients (also called betas)

array([-0.02093152,  0.29208121, -0.10273994, -0.63804135,  0.01574573,
        0.02024223,  0.03352799])

In [29]:
model.intercept_ # the intercept is b (in y = mx + b)

-0.9187239728967851

In [30]:
# model evaluation for training set
y_train_predict = model.predict(X_train)
rmse_train = (np.sqrt(mean_squared_error(y_train, y_train_predict)))
mse_train = (mean_squared_error(y_train, y_train_predict))
r2_train = r2_score(y_train, y_train_predict)

print("The model performance for training set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse_train)) # the lower the better
print('MSE is {}'.format(mse_train)) # the lower the better
print('R2 score is {}'.format(r2_train)) # how well it appoximates the line (higher better)
print("\n")

The model performance for training set
--------------------------------------
RMSE is 0.08338863649904185
MSE is 0.0069536646971693344
R2 score is 0.8292924627961259




In [31]:
# model evaluation for test set
y_test_predict = model.predict(X_test)
rmse_test = (np.sqrt(mean_squared_error(y_test, y_test_predict)))
mse_test = (mean_squared_error(y_test, y_test_predict))
r2_test = r2_score(y_test, y_test_predict)

print("The model performance for test set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse_test)) # the lower the better
print('MSE is {}'.format(mse_test)) # the lower the better
print('R2 score is {}'.format(r2_test)) # how well it appoximates the line (higher better)
print("\n")

The model performance for test set
--------------------------------------
RMSE is 0.10438966436586289
MSE is 0.010897202026417505
R2 score is 0.7838182265584779




In [32]:
# Predictions table with predictions column
predictions_df = pd.DataFrame()
#predictions_df[['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin']] = X_test
predictions_df['Actual mpg'] = y_test
predictions_df['Predicted mpg'] = model.predict(X_test)
predictions_df

Unnamed: 0,Actual mpg,Predicted mpg
316,0.864362,0.600096
107,0.239362,0.334064
335,0.558511,0.447925
375,0.505319,0.532115
353,0.619681,0.675333
...,...,...
377,0.664894,0.568604
369,0.436170,0.503769
308,0.670213,0.543422
328,0.617021,0.620309


In [33]:
# Convert X_train to matrix for analitical solution calulations
XX = np.asmatrix(X_train)

In [34]:
# Calculation of the analytical solution for linear regression for our data
thetavec = (np.linalg.inv(XX.T.dot(XX)).dot(XX.T)).dot(y_train)
print(thetavec)

[[-0.03750547  0.33995981 -0.31010239 -0.51993787 -0.10036493  0.01019469
   0.02900652]]


In [35]:
# Analytical solution evaluation
y_test_ana = X_test.dot(thetavec.T)
rmse_ana = (np.sqrt(mean_squared_error(y_test, y_test_ana)))
mse_ana = (mean_squared_error(y_test, y_test_ana))
r2_ana = r2_score(y_test, y_test_ana)

print("The model performance using the analytical solution")
print("------------------------------------------")
print("RMSE is {}".format(rmse_ana))
print("MSE is {}".format(mse_ana))
print("R2 score {}".format(r2_ana)) 

The model performance using the analytical solution
------------------------------------------
RMSE is 0.11204635642789242
MSE is 0.012554385988766308
R2 score 0.7509425427791996
