In [5]:
# Import the usual
import pandas as pd
import numpy as np
import matplotlib.pyplot as pyplot
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import linear_model

# Load the red-wine quality data set (the file is semicolon-delimited).
red = pd.read_csv("winequality-red.csv", sep=';')

# Separate out inputs and response: "quality" is the response, the listed
# columns are the inputs.
feature_columns = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]
y = red["quality"]
x = red[feature_columns]

# Fraction of the rows held out as the test set.
test_size = 0.01
# Seed for train_test_split so the same split is produced on every run.
seed = 352
# Separate out input and response testing and training data.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=test_size, random_state=seed
)

# Create model. max_iter is raised because the solver otherwise runs into its
# preset iteration limit on this data.
reg = linear_model.LogisticRegression(max_iter=10000)
# Train model
reg.fit(X_train, y_train)
# Find predictions for test and training data:
predictions_test = reg.predict(X_test)
predictions_train = reg.predict(X_train)

# Report metrics. Besides classification accuracy, the mean squared error and
# coefficient of determination are reported as well (treating the predicted
# quality labels as numeric scores), matching the recorded cell output.
reported_metrics = (
    ("Accuracy Score", accuracy_score),
    ("Mean squared error", mean_squared_error),
    ("Coefficient of Determination", r2_score),
)
for metric_label, metric_fn in reported_metrics:
    print(f"{metric_label} for training data: {metric_fn(y_train, predictions_train):.6f}")
    print(f"{metric_label} for test data: {metric_fn(y_test, predictions_test):.6f}")

# Results (recorded notebook output for test_size = 0.01, seed = 352)
# Accuracy Score for training data: 0.603917
# Accuracy Score for test data: 0.562500
# Mean squared error for training data: 0.500948
# Mean squared error for test data: 0.625000
# Coefficient of Determination for training data: 0.231284
# Coefficient of Determination for test data: -0.006289


# For this assignment I drew mainly on the scikit-learn documentation, namely: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
# I drew on my sparse experience with Python, as well as the linear regression example in the same documentation, to produce the work above.
# I also found that the logistic regression model would run into the preset iteration limit for the solver chosen. I tested putting together a pipeline for scaling the data,
# but found that simply increasing the number of iterations seemed to bypass this issue. I did find that, as expected, the starting seed for train_test_split,
# as well as the fraction of the sample taken for the test subset, influenced the accuracy of the predictions a fair bit, due to the nature of choosing a test set. Further exploration
# of the warm-up exercise would be interesting, possibly by averaging results over a range of test sets as well as random seeds.

Accuracy Score for training data: 0.603917
Accuracy Score for test data: 0.562500
Mean squared error for training data: 0.500948
Mean squared error for test data: 0.625000
Coefficient of Determination for training data: 0.231284
Coefficient of Determination for test data: -0.006289
