In [67]:
#############
## IMPORTS ##
#############

import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [31]:
#####################################
## READING RED AND WHITE WINE DATA ##
#####################################

# Load both wine-quality CSV files (paths are relative to the notebook's
# working directory) into one DataFrame per wine type.
redWineData, whiteWineData = (
    pd.read_csv(csvPath)
    for csvPath in ("./data/winequality-red.csv", "./data/winequality-white.csv")
)

In [None]:
######################################
## SUMMARY OF RED & WHITE WINE DATA ##
######################################
# Descriptive statistics (the output of DataFrame.describe()) for each wine
# type; the 'mean' row of these summaries is what the bar charts below plot.
redSummary, whiteSummary = (
    wineFrame.describe() for wineFrame in (redWineData, whiteWineData)
)

In [None]:
#######################################################################################
## CREATING BAR CHARTS TO COMPARE THE MEANS OF ALL ATTRIBUTES IN RED AND WHITE WINES ##
#######################################################################################

def plotAttributeMeans(summary, wineLabel, color):
    """Draw a bar chart of the per-attribute means on its own figure.

    Parameters
    ----------
    summary : pandas.DataFrame
        Output of ``DataFrame.describe()``; its ``'mean'`` row is plotted
        against its column names.
    wineLabel : str
        Name of the wine type, used as the bar label and in the title.
    color : str
        Matplotlib colour for the bars.

    Returns
    -------
    (matplotlib.figure.Figure, matplotlib.axes.Axes)
        The figure and axes that were created, so the caller can tweak or
        save them.
    """
    # A dedicated figure per chart: drawing both wines through the implicit
    # pyplot axes stacks the bars on top of each other and lets the second
    # title overwrite the first.
    fig, ax = plt.subplots()
    ax.bar(summary.columns, summary.loc['mean'], label=wineLabel, color=color)
    ax.tick_params(axis='x', labelrotation=45)
    ax.set_xlabel("Attributes")
    ax.set_ylabel("Mean value")
    ax.set_title(f"Mean value of Attributes: {wineLabel}")
    return fig, ax


redMeansFigure, redMeansAxes = plotAttributeMeans(redSummary, "Red Wine", "red")
whiteMeansFigure, whiteMeansAxes = plotAttributeMeans(whiteSummary, "White Wine", "blue")
# To export a chart, call e.g.
# redMeansFigure.savefig("./figures/redWineAttributesMeans.jpeg").
plt.show()

In [None]:
###############################################################
## COMPARING THE ALCOHOL CONTENT AND THE QUALITY OF THE WINE ##
###############################################################
redQuality = redWineData['quality']
whiteQuality = whiteWineData['quality']
redAlcohol = redWineData['alcohol']
whiteAlcohol = whiteWineData['alcohol']

# Aggregate to one value per quality score. Passing the raw columns to bar()
# draws one bar per row, all stacked at the same x positions, so only the
# tallest bar per score is visible.
redMeanAlcohol = redAlcohol.groupby(redQuality).mean()
whiteMeanAlcohol = whiteAlcohol.groupby(whiteQuality).mean()

# Shift the two series half a bar width left/right so they sit side by side
# instead of hiding each other.
barWidth = 0.4
fig, ax = plt.subplots()
redBar = ax.bar(redMeanAlcohol.index - barWidth / 2, redMeanAlcohol.values,
                width=barWidth, color='red', label="Red Wine")
blueBar = ax.bar(whiteMeanAlcohol.index + barWidth / 2, whiteMeanAlcohol.values,
                 width=barWidth, color='blue', label="White Wine")

# Quality is on the x axis and alcohol on the y axis.
ax.set_xlabel("Wine Quality")
ax.set_ylabel("Mean Alcohol Content")
ax.set_title("Mean Alcohol Content by Wine Quality for Red & White Wines")
ax.legend(loc='upper right')
plt.show()

In [63]:
##########################################################################
## SPLITTING DATA INTO TRAINING DATA, VALIDATION DATA & PREDICTION DATA ##
##########################################################################
# Fraction of each data set assigned to every split; together they must cover
# the whole data set.
trainingData = 2/3
validationData = 1/6
predictionData = 1/6
assert abs(trainingData + validationData + predictionData - 1) < 1e-9

redDimension = redWineData.shape
whiteDimension = whiteWineData.shape

# Whole-number row boundaries: rows [0, trainingRows) are training,
# [trainingRows, validationRows) are validation, everything after is test.
# Fractional bounds combined with inclusive label slicing would leave rows
# out of every split, so the bounds are rounded to integers up front.
redTrainingRows = round(redDimension[0] * trainingData)
redValidationRows = redTrainingRows + round(redDimension[0] * validationData)

whiteTrainingRows = round(whiteDimension[0] * trainingData)
whiteValidationRows = whiteTrainingRows + round(whiteDimension[0] * validationData)


def splitFeaturesAndQuality(wineData, trainingRows, validationRows):
    """Split a wine DataFrame into training, validation and test parts.

    Parameters
    ----------
    wineData : pandas.DataFrame
        Frame containing a ``'quality'`` column plus the feature columns.
    trainingRows : int
        Position of the first validation row (end of the training range).
    validationRows : int
        Position of the first test row (end of the validation range).

    Returns
    -------
    list of (pandas.DataFrame, pandas.Series)
        ``(features, quality)`` pairs in the order training, validation,
        test. ``features`` is ``wineData`` without the ``'quality'`` column.
    """
    features = wineData.drop('quality', axis=1)
    quality = wineData['quality']
    # Positional, half-open slices: independent of the index labels, and
    # adjacent ranges neither overlap nor leave gaps.
    rowRanges = (
        slice(0, trainingRows),
        slice(trainingRows, validationRows),
        slice(validationRows, None),
    )
    return [(features.iloc[rows], quality.iloc[rows]) for rows in rowRanges]


# NOTE(review): rows are taken in file order without shuffling -- confirm the
# CSV rows are not sorted in a way that would bias the splits.
(
    (redTrainingData, redQualityTrainingData),
    (redValidationData, redQualityValidationData),
    (redTestData, redQualityPredictionData),
) = splitFeaturesAndQuality(redWineData, redTrainingRows, redValidationRows)

(
    (whiteTrainingData, whiteQualityTrainingData),
    (whiteValidationData, whiteQualityValidationData),
    (whiteTestData, whiteQualityPredictionData),
) = splitFeaturesAndQuality(whiteWineData, whiteTrainingRows, whiteValidationRows)

# Every row must land in exactly one split.
assert len(redTrainingData) + len(redValidationData) + len(redTestData) == redDimension[0]
assert len(whiteTrainingData) + len(whiteValidationData) + len(whiteTestData) == whiteDimension[0]

In [64]:
##############################
## STANDARDIZING THE X DATA ##
##############################
# One scaler per wine type, fitted on the training split only. The validation
# and test splits are transformed with the training mean and standard
# deviation, so all three splits share one scale and no statistics leak from
# the held-out data into preprocessing.
redScaler = StandardScaler()
redTrainingDataScaled = redScaler.fit_transform(redTrainingData)
redValidationDataScaled = redScaler.transform(redValidationData)
redTestDataScaled = redScaler.transform(redTestData)

whiteScaler = StandardScaler()
whiteTrainingDataScaled = whiteScaler.fit_transform(whiteTrainingData)
whiteValidationDataScaled = whiteScaler.transform(whiteValidationData)
whiteTestDataScaled = whiteScaler.transform(whiteTestData)

# `sc` is kept as an alias for any cell that still refers to the old shared
# scaler name; prefer redScaler / whiteScaler in new code.
sc = whiteScaler

In [66]:
############################
## NORMALIZING THE X DATA ##
############################
def normalizeColumns(scaledValues, columns):
    """Rescale every column to unit L2 norm and wrap the result in a DataFrame.

    Parameters
    ----------
    scaledValues : array-like of shape (n_samples, n_features)
        Standardized feature values.
    columns : sequence of str
        Column names for the returned frame.

    Returns
    -------
    pandas.DataFrame
        Normalized values with a fresh default index and the given column
        names.
    """
    # axis=0 normalizes each feature (column) rather than each sample (row).
    return pd.DataFrame(preprocessing.normalize(scaledValues, axis=0), columns=columns)


# NOTE(review): each split is normalized with its own column norms, which
# depend on the number of rows in the split -- confirm this is intended rather
# than reusing the training norms for validation/test.
redTrainingDataNormalized = normalizeColumns(redTrainingDataScaled, redTrainingData.columns)
redValidationDataNormalized = normalizeColumns(redValidationDataScaled, redValidationData.columns)
redTestDataNormalized = normalizeColumns(redTestDataScaled, redTestData.columns)
# The white training split uses the *scaled* values, like every other split.
whiteTrainingDataNormalized = normalizeColumns(whiteTrainingDataScaled, whiteTrainingData.columns)
whiteValidationDataNormalized = normalizeColumns(whiteValidationDataScaled, whiteValidationData.columns)
whiteTestDataNormalized = normalizeColumns(whiteTestDataScaled, whiteTestData.columns)

In [None]:
##################################
## PRINCIPAL COMPONENT ANALYSIS ##
##################################
# A float n_components in (0, 1) asks PCA to keep just enough components to
# explain that fraction of the variance.
# NOTE(review): 0.05 keeps only 5% of the variance (likely a single
# component) -- confirm whether 0.95 was intended.
pca = PCA(0.05)

# Learn the projection from the training split only, then apply the same
# projection to validation and test. Refitting on each split would give every
# split its own axes (and possibly a different number of components), making
# the outputs incomparable.
redTrainingDataPCA = pca.fit_transform(redTrainingDataNormalized)
redValidationDataPCA = pca.transform(redValidationDataNormalized)
redTestDataPCA = pca.transform(redTestDataNormalized)

