In [21]:
# Import the usual
import pandas as pd
import numpy as np
import matplotlib.pyplot as pyplot
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import linear_model

# Load the red-wine quality table; the file is semicolon-delimited
red = pd.read_csv("winequality-red.csv", sep=";")

# The response is the quality column; the predictors are the eleven
# measurement columns listed below
feature_columns = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]
y = red["quality"]
x = red[feature_columns]

# Fraction of rows held out for testing, and the seed handed to
# train_test_split (from sklearn.model_selection) so the split is repeatable.
# NOTE(review): only 1% of the rows are held out, so the test metrics depend
# noticeably on the seed -- confirm this fraction is intended.
test_size = 0.01
seed = 352

# Separate predictors and response into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=test_size,
    random_state=seed,
)

# Create the linear regression model and train it on the training subset
# (fit returns the fitted estimator itself)
reg = linear_model.LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows and for the rows the model was trained on
predictions_test = reg.predict(X_test)
predictions_train = reg.predict(X_train)

# Compute the metrics first, then report them: training before test,
# mean squared error before coefficient of determination
mse_train = mean_squared_error(y_train, predictions_train)
mse_test = mean_squared_error(y_test, predictions_test)
r2_train = r2_score(y_train, predictions_train)
r2_test = r2_score(y_test, predictions_test)

print("Mean squared error for training data: %.6f" % mse_train)
print("Mean squared error for test data: %.6f" % mse_test)
print('Coefficient of Determination for training data: %.6f' % r2_train)
print('Coefficient of Determination for test data: %.6f' % r2_test)

# Results:
# Mean squared error for training data: 0.416223
# Mean squared error for test data: 0.474954
# Coefficient of Determination for training data: 0.361297
# Coefficient of Determination for test data: 0.235295

# For this assignment I drew on the scikit-learn documentation, namely https://scikit-learn.org/stable/modules/linear_model.html#, as well as the
# example given there for fitting a linear regression, https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html#sphx-glr-auto-examples-linear-model-plot-ols-py,
# modifying it as necessary. I did find that, as expected, the starting seed for train_test_split, as well as the fraction of the sample taken for the
# test subset, influenced the accuracy of the predictions a fair bit, due to the nature of choosing a test set. Further exploration
# of the warmup exercise would be interesting, possibly by averaging the results over a range of test sets as well as random seeds.

Mean squared error for training data: 0.416223
Mean squared error for test data: 0.474954
Coefficient of Determination for training data: 0.361297
Coefficient of Determination for test data: 0.235295
