In [1]:
#############
## IMPORTS ##
#############
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from sklearn.svm import SVC

In [2]:
#####################################
## READING RED AND WHITE WINE DATA ##
#####################################
redWineData = pd.read_csv("./data/winequality-red.csv")
whiteWineData = pd.read_csv("./data/winequality-white.csv")

In [3]:
###################################################
## SPLITTING DATA INTO TRAINING DATA & TEST DATA ##
###################################################
# Making the fractions that are used to split the data into training, testing and validation data.
trainingData = 2/3
testData = 1/3
# Getting the dimensions of the redWineData and the whiteWineData.
redDimension = redWineData.shape
whiteDimension = whiteWineData.shape
# Getting the number of row that would be the cutoff for red and white wine trianing data. 
redTrainingRows = redDimension[0] * trainingData
whiteTrainingRows = whiteDimension[0] * trainingData
# Getting the training and y data for the red wine. 
redTrainingData = redWineData.loc[0:redTrainingRows]
redQualityTrainingData = redTrainingData['quality'].reset_index(drop=True)
redTrainingData = redTrainingData.drop('quality', axis=1).reset_index(drop=True)
# Getting the test and y data for red wine. 
redTestData = redWineData.loc[redTrainingRows+1:]
redQualityTestData = redTestData['quality'].reset_index(drop=True)
redTestData = redTestData.drop('quality', axis=1).reset_index(drop=True)
# Getting the training and y data for the white wine. 
whiteTrainingData = whiteWineData.loc[0:whiteTrainingRows]
whiteQualityTrainingData = whiteTrainingData['quality']
whiteTrainingData = whiteTrainingData.drop('quality', axis=1).reset_index(drop=True)
# Getting the test and y data for red wine. 
whiteTestData = whiteWineData.loc[whiteTrainingRows+1:]
whiteQualityTestData = whiteTestData['quality']
whiteTestData = whiteTestData.drop('quality', axis=1).reset_index(drop=True)

print("The red wine training data has " + str(len(redTrainingData)) + " rows.")
print("The red wine test data has " + str(len(redTestData)) + " rows.")
print("The white wine training data has " + str(len(whiteTrainingData)) + " rows.")
print("The white wine test data has " + str(len(whiteTestData)) + " rows.")

The red wine training data has 1067 rows.
The red wine test data has 532 rows.
The white wine training data has 3266 rows.
The white wine test data has 1631 rows.


In [4]:
##############################
## STANDARDIZING THE X DATA ##
##############################
# Does it make sense to scale the whole dataaset, or does it make sense to scale each individual feature one at a time?
# I don't see the use to scale the entire dataset around one common mean, rather than scaling to the mean of each individual feature. 
# The only was you would scale the entire dataset if you had very similar features, otherwise I think it makes more sense to scale and fit each feature individually. 
# NO NEED TO SCALE DATA. THIS WAS DONE AS AN ATTEMPT TO TRY GET MORE ACCURATE RESULTS.

# Creating a standard scalar object. 
sc = StandardScaler()
# Scaling all the training, test and validation data.
redTrainingDataScaled = sc.fit_transform(redTrainingData)
redTestDataScaled = sc.fit_transform(redTestData)
whiteTrainingDataScaled = sc.fit_transform(whiteTrainingData)
whiteTestDataScaled = sc.fit_transform(whiteTestData)

In [7]:
###############################
## LOGISTIC REGRESSION MODEL ##
###############################
# Making a logistic regression object for the Red Wine Data. 
redLR = LogisticRegression(solver='lbfgs', max_iter=10000)
# Fitting the model using the training data, with the predictors in the redTrainingData variable, and the redQualityTrainingData is the target variable.
redLR.fit(redTrainingData, redQualityTrainingData)
# Getting the prediction for the test data.
redPredictionsArray = redLR.predict(redTestData)
# Making the true data to compare against into a numpy array.
redTrueDataArray = np.array(redQualityTestData.values)
# Calculating the accuracy of the model constructed. 
redAccuracyLR = accuracy_score(redPredictionsArray, redTrueDataArray)
# Calculating the importance of each feature in the model. We need to take the expoenents as the coef attribute calculates the log of the coefficients.
redFeatureImportance = np.exp(redLR.coef_[0])
# Making a list of the importance and the column names
redFeatureImportanceList = list(zip(redTrainingData.columns, redFeatureImportance))
# Sorting this list out in ascending order.
redFeatureImportanceList.sort(key=lambda x:x[1], reverse=True)
print("The 5 most importance features in the Logistic Regressoin Model made to predict the quality of Red Wine are: ")
# Getting the 5 most importance features
for i in range(5):
    print(redFeatureImportanceList[i])
# Printing out the accuracy.
print("The Logistic Regression model made to predict the Red Wine Quality has an accuracy of " + str(redAccuracyLR))

# Making a logistic regression object for the White Wine Data. 
whiteLR = LogisticRegression(solver='liblinear', max_iter=10000)
# Fitting the model using the scaled training data, with the predictors in the redTrainingDataScaled variable, and the redQualityTrainingData is the target variable.
whiteLR.fit(whiteTrainingData, whiteQualityTrainingData)
# Getting the prediction for the test data.
whitePredictionsArray = whiteLR.predict(whiteTestData)
# Making the true data to compare against into a numpy array.
whiteTrueDataArray = np.array(whiteQualityTestData.values)
# Calculating the accuracy of the model constructed. 
whiteAccuracyLR = accuracy_score(whitePredictionsArray, whiteTrueDataArray)
# Calculating the importance of each feature in the model. We need to take the expoenents as the coef attribute calculates the log of the coefficients.
whiteFeatureImportance = np.exp(whiteLR.coef_[0])
# Making a list of the importance and the column names
whiteFeatureImportanceList = list(zip(whiteTrainingData.columns, whiteFeatureImportance))
# Sorting this list out in ascending order.
whiteFeatureImportanceList.sort(key=lambda x:x[1], reverse=True)
print("The 5 most importance features in the Logistic Regressoin Model made to predict the quality of White Wine are: ")
# Getting the 5 most importance features
for i in range(5):
    print(whiteFeatureImportanceList[i])
# Printing out the accuracy.
print("The Logistic Regression model made to predict the Red Wine Quality has an accuracy of " + str(whiteAccuracyLR))


The 5 most importance features in the Logistic Regressoin Model made to predict the quality of Red Wine are: 
('volatile acidity', 3.1970032774585913)
('pH', 2.0139131108433563)
('fixed acidity', 1.270604443687973)
('chlorides', 1.174998107796296)
('residual sugar', 1.1293050310318276)
The Logistic Regression model made to predict the Red Wine Quality has an accuracy of 0.6109022556390977
The 5 most importance features in the Logistic Regressoin Model made to predict the quality of White Wine are: 
('volatile acidity', 2.377301200392446)
('fixed acidity', 1.548316218774758)
('chlorides', 1.079301635380718)
('total sulfur dioxide', 1.0034373387749984)
('free sulfur dioxide', 1.0007232859558965)
The Logistic Regression model made to predict the Red Wine Quality has an accuracy of 0.5236051502145923


In [14]:
####################################
## RANDOM FOREST CLASSIFIER MODEL ##
####################################
# Creating the random forest classifier object.
redRF = RandomForestClassifier(n_estimators=100, min_samples_split=100, random_state=1)
# Fitting the model using the redTrainingDataScaled data, and the redQualityTrainingDataScaled as the target variable.
redRF.fit(redTrainingDataScaled, redQualityTrainingData)
# Generating the prediction values for the test data. 
redRFPredictions = redRF.predict(redTestDataScaled)
# Turning the predictions numpy array into a Pandas series. 
redRFPredictions = pd.Series(redRFPredictions, index = redTestData.index)
# Calculating the precision of the model. 
redScoreRF = precision_score(redRFPredictions, redQualityTestData, average="weighted")
# Getting the importance of each feature. 
redFeatureImportance = redRF.feature_importances_
# Making a list of the importances and the feature names. 
redFeatureImportanceList = sorted(list(zip(redTrainingData.columns, redFeatureImportance)), key=lambda x:x[1], reverse=True)
print("The 5 most importance features in the Random Forest Classifier Model made to predict the quality of Red Wine are: ")
# Getting the 5 most importance features
for i in range(5):
    print(redFeatureImportanceList[i])
# Printing out the precision score.
print("The Random Forest Classifier model made to predict the Red Wine Quality has an precision of " + str(redScoreRF))

# Creating the random forest classifier object.
whiteRF = RandomForestClassifier(n_estimators=100, min_samples_split=100, random_state=1)
# Fitting the model using the redTrainingDataScaled data, and the redQualityTrainingDataScaled as the target variable.
whiteRF.fit(whiteTrainingDataScaled, whiteQualityTrainingData)
# Generating the prediction values for the test data. 
whiteRFPredictions = whiteRF.predict(whiteTestDataScaled)
# Turning the predictions numpy array into a Pandas series. 
whiteRFPredictions = pd.Series(whiteRFPredictions, index = whiteTestData.index)
# Calculating the precision of the model. 
whiteScoreRF = precision_score(whiteRFPredictions, whiteQualityTestData, average="weighted")
# Getting the importance of each feature. 
whiteFeatureImportance = whiteRF.feature_importances_
# Making a list of the importances and the feature names. 
whiteFeatureImportanceList = sorted(list(zip(whiteTrainingData.columns, whiteFeatureImportance)), key=lambda x:x[1], reverse=True)
print("The 5 most importance features in the Random Forest Classifier Model made to predict the quality of Red Wine are: ")
# Getting the 5 most importance features
for i in range(5):
    print(whiteFeatureImportanceList[i])
# Printing out the precision score.
print("The Random Forest Classifier model made to predict the White Wine Quality has an precision of " + str(whiteScoreRF))


The 5 most importance features in the Random Forest Classifier Model made to predict the quality of Red Wine are: 
('alcohol', 0.2846502156130831)
('total sulfur dioxide', 0.16854049490383124)
('sulphates', 0.13834113520264815)
('volatile acidity', 0.11520021137546858)
('density', 0.08125250847419094)
The Random Forest Classifier model made to predict the Red Wine Quality has an precision of 0.6725133472122985
The 5 most importance features in the Random Forest Classifier Model made to predict the quality of Red Wine are: 
('alcohol', 0.20829446801493368)
('density', 0.13687580445075784)
('volatile acidity', 0.1362369597623192)
('chlorides', 0.08316279742708009)
('free sulfur dioxide', 0.07484491798634728)
The Random Forest Classifier model made to predict the White Wine Quality has an precision of 0.667593081516935


In [25]:
###############
## SVM MODEL ##
###############
# Making a SVM object.
redSVM = SVC(kernel="linear", probability=True)
# Fitting the model using the redTrainingDataScaled data, and the redQualityTrainingData as the target variable. 
redSVM.fit(redTrainingDataScaled, redQualityTrainingData)
# Getting the redTrainingDataScaled score.
redSVMScoreTraining = redSVM.score(redTrainingDataScaled, redQualityTrainingData)
# Printing out the score for the SVM mdel made above. 
print("The score for the SVM used to model the Red Wine training data is " + str(redSVMScoreTraining))
# Getting the redTestDataScaled score.
redSVMScoreTest = redSVM.score(redTestDataScaled, redQualityTestData)
# Printing out the score for the SVM mdel made above. 
print("The score for the SVM used to model the Red Wine test data is " + str(redSVMScoreTest))
# Calculating the coefficient for the SVM model used for the training data. 
redSVMTrainingCoeff = redSVM.coef_
# Getting the most important coefficient value which would be the first one in the list.
importanceRedSVM = redSVMTrainingCoeff[0]
# Making a list of the feature names, the importance coeffeicient.
importanceRedList = list(zip(redTrainingData.columns, importanceRedSVM))
# Sorting the list of importance of the features. 
importanceRedList.sort(key=lambda x: x[1], reverse=True)
# Printing out the 5 most important features.
for i in range(5):
    print(importanceRedList[i])
# Getting the probablity of the prediction, using the redTrainingDataScaled.
redTestProbability = redSVM.predict_proba(redTestDataScaled)
# Getting the prediction values for the redTestDataScaled. 
redPredictionValues = redSVM.predict(redTestDataScaled)

# Making a SVM object.
whiteSVM = SVC(kernel="linear", probability=True)
# Fitting the model using the redTrainingDataScaled data, and the redQualityTrainingData as the target variable. 
whiteSVM.fit(whiteTrainingDataScaled, whiteQualityTrainingData)
# Getting the redTrainingDataScaled score.
whiteSVMScoreTraining = whiteSVM.score(whiteTrainingDataScaled, whiteQualityTrainingData)
# Printing out the score for the SVM mdel made above. 
print("The score for the SVM used to model the White Wine training data is " + str(whiteSVMScoreTraining))
# Getting the redTestDataScaled score.
whiteSVMScoreTest = whiteSVM.score(whiteTestDataScaled, whiteQualityTestData)
# Printing out the score for the SVM mdel made above. 
print("The score for the SVM used to model the White Wine test data is " + str(redSVMScoreTest))
# Calculating the coefficient for the SVM model used for the training data. 
whiteSVMTrainingCoeff = whiteSVM.coef_
# Getting the most important coefficient value which would be the first one in the list.
importanceWhiteSVM = whiteSVMTrainingCoeff[0]
# Making a list of the feature names, the importance coeffeicient.
importanceWhiteList = list(zip(whiteTrainingData.columns, importanceWhiteSVM))
# Sorting the list of importance of the features. 
importanceWhiteList.sort(key=lambda x: x[1], reverse=True)
# Printing out the 5 most important features.
for i in range(5):
    print(importanceWhiteList[i])
# Getting the probablity of the prediction, using the redTrainingDataScaled.
whiteTestProbability = whiteSVM.predict_proba(whiteTestDataScaled)
# Getting the prediction values for the redTestDataScaled. 
whitePredictionValues = whiteSVM.predict(whiteTestDataScaled)


The score for the SVM used to model the Red Wine training data is 0.5998125585754451
The score for the SVM used to model the Red Wine test data is 0.5695488721804511
[('fixed acidity', 0.5480521123718827), ('volatile acidity', 0.4818175390776087), ('citric acid', 0.6688554517617892), ('residual sugar', 0.2762336492578549), ('chlorides', 0.2511072948233213), ('free sulfur dioxide', 0.07696827692871955), ('total sulfur dioxide', -1.103416131262917), ('density', 0.7129117630460664), ('pH', 0.5346304045970975), ('sulphates', -0.29440240327862965), ('alcohol', -0.3370718213157221)]
('density', 0.7129117630460664)
('citric acid', 0.6688554517617892)
('fixed acidity', 0.5480521123718827)
('pH', 0.5346304045970975)
('volatile acidity', 0.4818175390776087)
[7 7 5 6 5 5 5 5 6 7 6 6 5 7 5 5 6 5 5 6 6 6 6 6 6 6 7 5 5 5 5 7 5 7 6 6 6
 7 6 7 7 5 6 6 6 6 5 7 6 6 6 7 7 7 6 6 6 6 6 7 6 5 6 6 5 7 6 7 6 6 6 5 5 5
 6 6 6 5 6 6 6 6 6 7 6 5 6 6 5 6 6 6 6 6 6 7 5 5 5 6 7 6 6 6 6 7 5 5 6 5 7
 5 6 6 6 6 5 5 6 