In [121]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

wwine = pd.read_csv('winequality-white.csv', delimiter = ';')
rwine = pd.read_csv('winequality-red.csv', delimiter = ';')

wwine['wine_type'] = 'White Wine'
rwine['wine_type'] = 'Red Wine'

wines = pd.concat([wwine, rwine])

# bucket wine quality scores into qualitative quality labels
wines['quality_label'] = wines['quality'].apply(lambda value: 'Low'
if value <= 5 else 'Medium'
if value <= 7 else 'High')

wines['quality_label'] = pd.Categorical(wines['quality_label'], categories=['Low', 'Medium', 'High'])

X = wines.iloc[:,:-3]
y = np.array(wines['quality_label'])

X_scaled = StandardScaler().fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)

#print(Counter(wtp_train_y), Counter(wtp_test_y))
#print('Features:', list(X.columns))

In [122]:
# Logistic Regression
model_lr = LogisticRegression().fit(X_train, y_train)
model_lr_prediction = model_lr.predict(X_test)
lr = accuracy_score(y_test, model_lr_prediction)
lr

0.7069230769230769

In [123]:
# K-Fold Logistic Regression
lr_plot = cross_val_score(LogisticRegression(), X, y, cv=13)
print('Maximum: ', lr_plot.max())
print('Mean: ', lr_plot.mean())
#lr_plot = pd.Series(lr_plot)
#lr_plot.plot()

Maximum:  0.866
Mean:  0.6720055495606598


In [124]:
kappa_lr = cohen_kappa_score(y_test, model_lr_prediction)
kappa_lr

0.3687172839931021

In [125]:
# Decision Tree
model_dt = DecisionTreeClassifier().fit(X_train, y_train)
y_predict = model_dt.predict(X_test)
dt = accuracy_score(y_test, y_predict)
dt

0.7346153846153847

In [130]:
kappa_dt = cohen_kappa_score(y_test, y_predict)
kappa_dt

0.46454154727793706

In [143]:
# KNN
model_knn = KNeighborsClassifier(n_neighbors = 7, metric = 'minkowski', p = 2).fit(X_train, y_train)
model_knn_prediction = model_knn.predict(X_test)
knn = accuracy_score(y_test, model_knn_prediction)
knn

0.6569230769230769

In [144]:
kappa_knn = cohen_kappa_score(y_test, model_knn_prediction)
kappa_knn

0.2725657676827493

In [145]:
# SVM
model_svm = SVC(kernel='linear').fit(X_train, y_train)
model_svm_prediction = model_svm.predict(X_test)
svm = accuracy_score(y_test, model_svm_prediction)
svm

0.7369230769230769

In [146]:
kappa_svm = cohen_kappa_score(y_test, model_svm_prediction)
kappa_svm

0.43372290887600495

In [147]:
model_svm2 = SVC(kernel='poly').fit(X_train, y_train)
model_svm_prediction2 = model_svm2.predict(X_test)
svm2 = accuracy_score(y_test, model_svm_prediction2)
svm2

0.6123076923076923

In [149]:
kappa_svm2 = cohen_kappa_score(y_test, model_svm_prediction2)
kappa_svm2

0.006393536098608266

In [150]:
model_svm3 = SVC(kernel='rbf').fit(X_train, y_train)
model_svm_prediction3 = model_svm3.predict(X_test)
svm3 = accuracy_score(y_test, model_svm_prediction3)
svm3

0.6138461538461538

In [151]:
kappa_svm3 = cohen_kappa_score(y_test, model_svm_prediction3)
kappa_svm3

0.008387528110374909

In [126]:
# Random Forest Classifier Training
model_rf = RandomForestClassifier(n_estimators=800, criterion = 'entropy', random_state = 5).fit(X_train, y_train)
model_rf_prediction = model_rf.predict(X_test)
rf = accuracy_score(y_test, model_rf_prediction)
print('Accuracy Score: ', rf)

Accuracy Score:  0.8323076923076923


In [127]:
print(classification_report(y_test, model_rf_prediction))

              precision    recall  f1-score   support

        High       1.00      0.32      0.49        34
         Low       0.83      0.74      0.78       471
      Medium       0.83      0.91      0.87       795

    accuracy                           0.83      1300
   macro avg       0.89      0.66      0.71      1300
weighted avg       0.84      0.83      0.83      1300



In [128]:
confus_matrix = pd.DataFrame(
    confusion_matrix(y_test, model_rf_prediction),
    columns=['Predicted high', 'Predicted low', 'Predicted medium'],
    index=['True high', 'True low,', 'True medium']
    )
print(confus_matrix)

             Predicted high  Predicted low  Predicted medium
True high                11              0                23
True low,                 0            348               123
True medium               0             72               723


In [129]:
kappa_rf = cohen_kappa_score(y_test, model_rf_prediction)
print('Kappa: ', kappa_rf)

Kappa:  0.646170614681797
