In [1]:
# Authors:  Eddie F. Carrizales
# Date:  09/25/2022

# Location of the white-wine quality CSV on the UCI archive; the load cell reads it with ';' as the field separator.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
%matplotlib inline

In [2]:
# The source file separates its fields with semicolons, not commas.
df = pd.read_csv(url, sep=";")

In [3]:
# Keep only the three predictors used by the model, with the target last.
# Selecting columns by name (instead of dropping the unwanted ones) keeps this
# cell re-runnable: the old drop() raised KeyError on a second execution because
# the dropped labels no longer existed in the overwritten `df`.
quality = df["quality"]  # target column; still defined here in case later cells use it

df = df[["chlorides", "density", "alcohol", "quality"]]
df  # Shows our resulting table

Unnamed: 0,chlorides,density,alcohol,quality
0,0.045,1.00100,8.8,6
1,0.049,0.99400,9.5,6
2,0.050,0.99510,10.1,6
3,0.058,0.99560,9.9,6
4,0.058,0.99560,9.9,6
...,...,...,...,...
4893,0.039,0.99114,11.2,6
4894,0.047,0.99490,9.6,5
4895,0.041,0.99254,9.4,6
4896,0.022,0.98869,12.8,7


In [4]:
# Hold out 10% of the rows for testing; train on the remaining 90%.

X = df.drop(columns=["quality"]).to_numpy()   # independent features
Y = df["quality"].to_numpy()                  # target values

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.1, random_state=5
)
# One shape per line: train features, test features, train target, test target.
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape, sep="\n")

(4408, 3)
(490, 3)
(4408,)
(490,)


In [5]:
# Fit a linear regression on the training split; fit() returns the estimator itself.
model = LinearRegression().fit(X_train, Y_train)
model  # display the fitted estimator

LinearRegression()

In [6]:
# Fitted coefficients m1, m2 and m3 (also called betas), one per column of X
# in order: chlorides, density, alcohol.
model.coef_

array([-2.55428504, 21.0194092 ,  0.34029958])

In [7]:
# Fitted intercept: the constant term b in y = m1*x1 + m2*x2 + m3*x3 + b.
model.intercept_

-18.47910631571404

In [8]:
# Score the fitted model on the same rows it was trained on.
y_train_predict = model.predict(X_train)
r2 = r2_score(Y_train, y_train_predict)
rmse = np.sqrt(mean_squared_error(Y_train, y_train_predict))

# A single print call; the final "\n" item reproduces the trailing blank lines.
print(
    "The model performance for training set",
    "--------------------------------------",
    'RMSE is {}'.format(rmse),
    'R2 score is {}'.format(r2),
    "\n",
    sep="\n",
)

The model performance for training set
--------------------------------------
RMSE is 0.7989885805221255
R2 score is 0.195858874118079




In [9]:
# Score the fitted model on the held-out rows.
y_test_predict = model.predict(X_test)
r2 = r2_score(Y_test, y_test_predict)
rmse = np.sqrt(mean_squared_error(Y_test, y_test_predict))

print(
    "The model performance for testing set",
    "--------------------------------------",
    'RMSE is {}'.format(rmse),
    'R2 score is {}'.format(r2),
    sep="\n",
)

The model performance for testing set
--------------------------------------
RMSE is 0.751232776539804
R2 score is 0.18976416943777918


In [10]:
# Held-out features next to the true and the predicted quality, one row per test sample.
predictions_df = pd.DataFrame(X_test, columns=["Chlorides", "Density", "Alcohol"])
predictions_df["Actual Quality"] = Y_test
predictions_df["Predicted Quality"] = model.predict(X_test)
predictions_df

Unnamed: 0,Chlorides,Density,Alcohol,Actual Quality,Predicted Quality
0,0.045,0.99390,9.0,6,5.359838
1,0.076,0.99204,10.1,6,5.615888
2,0.042,0.99260,10.6,5,5.884655
3,0.037,0.99180,11.0,5,6.016731
4,0.036,0.99068,11.4,7,6.131863
...,...,...,...,...,...
485,0.044,1.00030,8.6,7,5.360797
486,0.038,0.99820,9.2,5,5.536161
487,0.037,0.99324,12.2,7,6.455358
488,0.037,0.99200,11.1,6,6.054964
