# Wine classification

In this part different machine learning models will be used to predict the wine quality

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the data into dataframes

df_red = pd.read_csv('winequality-red.csv', delimiter = ';')
df_white = pd.read_csv('winequality-white.csv', delimiter = ';')
df_combined = pd.read_csv('winequality.csv')

df_red.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Red wine classification

In [4]:
target = df_red['quality']
attr = df_red.drop(['quality'], axis=1)

attr.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [5]:
# Separate the data into training and test data

x_train, x_test, y_train, y_test = train_test_split(attr, target, test_size = 0.2)

In [6]:
# Scale the data

scaler = StandardScaler()
scaler.fit_transform(x_train, y_train)
scaler.transform(x_test)

array([[-0.75786917, -0.58771411,  0.40264116, ...,  0.18588158,
        -0.63805493, -1.16873704],
       [ 0.38116099, -0.36299472,  0.19779707, ...,  0.05830308,
        -0.17454211, -0.8863591 ],
       [ 0.49506401,  0.36734328, -0.16068009, ..., -0.32443244,
         0.0572143 , -0.79223312],
       ...,
       [-0.81482068,  1.35049059, -0.46794622, ...,  0.56861709,
        -0.34835942, -0.13335127],
       [-0.87177219,  0.81678205, -1.28732258, ...,  1.07893111,
        -0.81187223, -0.8863591 ],
       [ 0.89372457, -0.19445518,  1.58049467, ..., -0.13306468,
        -0.46423762, -1.356989  ]])

**Create a dummy model for determining a benchmark**

In [7]:
model = DummyClassifier()

model.fit(x_train, y_train)

pred = model.predict(x_test)

In [8]:
model.score(x_test, y_test)

0.43125

In [9]:
print(confusion_matrix(y_true = y_test, y_pred=pred))

[[  0   0   1   0   0   0]
 [  0   0  12   0   0   0]
 [  0   0 138   0   0   0]
 [  0   0 131   0   0   0]
 [  0   0  37   0   0   0]
 [  0   0   1   0   0   0]]


In [10]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00        12
           5       0.43      1.00      0.60       138
           6       0.00      0.00      0.00       131
           7       0.00      0.00      0.00        37
           8       0.00      0.00      0.00         1

    accuracy                           0.43       320
   macro avg       0.07      0.17      0.10       320
weighted avg       0.19      0.43      0.26       320



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Logistic Regression model**

In [11]:
model = LogisticRegression(max_iter = 10000)

model.fit(x_train, y_train)

pred = model.predict(x_test)

In [12]:
model.score(x_test, y_test)

0.5875

In [13]:
print(confusion_matrix(y_true = y_test, y_pred = pred))

[[  0   0   1   0   0   0]
 [  0   0   9   3   0   0]
 [  0   0 112  26   0   0]
 [  0   0  50  68  13   0]
 [  0   0   5  24   8   0]
 [  0   0   0   0   1   0]]


In [14]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00        12
           5       0.63      0.81      0.71       138
           6       0.56      0.52      0.54       131
           7       0.36      0.22      0.27        37
           8       0.00      0.00      0.00         1

    accuracy                           0.59       320
   macro avg       0.26      0.26      0.25       320
weighted avg       0.54      0.59      0.56       320



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Decision Tree model**

In [15]:
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
pred = model.predict(x_test)

model.score(x_test, y_test)

0.64375

In [16]:
print(confusion_matrix(y_true = y_test, y_pred = pred))

[[  0   0   0   1   0   0]
 [  0   0   8   4   0   0]
 [  1   6 109  19   3   0]
 [  1   2  33  73  21   1]
 [  1   0   1  10  24   1]
 [  0   0   0   0   1   0]]


In [17]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00        12
           5       0.72      0.79      0.75       138
           6       0.68      0.56      0.61       131
           7       0.49      0.65      0.56        37
           8       0.00      0.00      0.00         1

    accuracy                           0.64       320
   macro avg       0.32      0.33      0.32       320
weighted avg       0.65      0.64      0.64       320



**Random Forest model**

In [18]:
model = RandomForestClassifier()
model.fit(x_train, y_train)
pred = model.predict(x_test)

model.score(x_test, y_test)

0.75

In [19]:
print(confusion_matrix(y_true = y_test, y_pred = pred))

[[  0   0   0   1   0   0]
 [  1   1   8   2   0   0]
 [  0   0 117  20   1   0]
 [  0   0  24  98   9   0]
 [  0   0   2  11  24   0]
 [  0   0   0   0   1   0]]


In [20]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       1.00      0.08      0.15        12
           5       0.77      0.85      0.81       138
           6       0.74      0.75      0.75       131
           7       0.69      0.65      0.67        37
           8       0.00      0.00      0.00         1

    accuracy                           0.75       320
   macro avg       0.53      0.39      0.40       320
weighted avg       0.75      0.75      0.74       320



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Gradient Boost model**

In [21]:
model = GradientBoostingClassifier()
model.fit(x_train, y_train)
pred = model.predict(x_test)

model.score(x_test, y_test)

0.684375

In [22]:
print(confusion_matrix(y_true = y_test, y_pred = pred))

[[  0   0   0   1   0   0]
 [  0   1   7   4   0   0]
 [  0   1 117  19   1   0]
 [  0   0  38  83   9   1]
 [  0   0   5  13  18   1]
 [  0   0   0   0   1   0]]


The best model had a score of 0.75 which is quite low, so we will modify the problem.

# Transform the problem into a binary classification problem with 2 classes:
   1. not good (quality < 7) : 0
   2. good (quality >= 7) : 1

In [23]:
# Create the target attribute 'isGood'

df_red['isGood'] = np.where(df_red['quality'] >= 7, 1, 0)

df_red['isGood'].value_counts()

0    1382
1     217
Name: isGood, dtype: int64

In [24]:
# Separate the data into train and test 

target = df_red['isGood']
attr = df_red.drop(['isGood', 'quality'], axis = 1)
x_train, x_test, y_train, y_test = train_test_split(attr, target, test_size = 0.3)


In [25]:
# Scale the data

scaler = StandardScaler()
scaler.fit_transform(x_train, y_train)
scaler.transform(x_test)

array([[ 2.17595498, -0.27921325,  1.39084062, ..., -1.39449787,
        -0.28593733,  0.06763826],
       [ 0.65178259, -0.39127911, -0.16821239, ..., -1.26339023,
         0.54630164, -0.40081983],
       [ 0.08727429, -1.9602012 ,  1.23493532, ...,  0.11323998,
         0.6743384 ,  1.28562928],
       ...,
       [-1.09819312,  1.4858241 , -1.41545481, ...,  1.16210108,
        -0.86210277,  1.09824605],
       [ 0.53888093, -0.95160843,  0.61131412, ...,  1.03099344,
         0.48228325,  0.44240473],
       [-0.36433234, -1.68003654,  0.50737725, ...,  0.63767053,
         0.35424649,  1.56670413]])

**Dummy model**

In [26]:
model = DummyClassifier()

model.fit(x_train, y_train)

pred = model.predict(x_test)

In [27]:
model.score(x_test, y_test)

0.8770833333333333

In [28]:
print(confusion_matrix(y_test, pred))

[[421   0]
 [ 59   0]]


In [29]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.88      1.00      0.93       421
           1       0.00      0.00      0.00        59

    accuracy                           0.88       480
   macro avg       0.44      0.50      0.47       480
weighted avg       0.77      0.88      0.82       480



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Decision Tree model**

In [30]:
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
pred = model.predict(x_test)

model.score(x_test, y_test)

0.8520833333333333

In [31]:
print(confusion_matrix(y_test, pred))

[[373  48]
 [ 23  36]]


In [32]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.94      0.89      0.91       421
           1       0.43      0.61      0.50        59

    accuracy                           0.85       480
   macro avg       0.69      0.75      0.71       480
weighted avg       0.88      0.85      0.86       480



**Logistic Regression model**

In [33]:
model = LogisticRegression(max_iter = 10000)
model.fit(x_train, y_train)
pred = model.predict(x_test)

model.score(x_test, y_test)

0.8833333333333333

In [34]:
print(confusion_matrix(y_test, pred))

[[403  18]
 [ 38  21]]


In [35]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.91      0.96      0.94       421
           1       0.54      0.36      0.43        59

    accuracy                           0.88       480
   macro avg       0.73      0.66      0.68       480
weighted avg       0.87      0.88      0.87       480



**Gradient Boost model**

In [36]:
model = GradientBoostingClassifier()
model.fit(x_train, y_train)
pred = model.predict(x_test)

model.score(x_test, y_test)

0.9083333333333333

In [37]:
print(confusion_matrix(y_test, pred))

[[400  21]
 [ 23  36]]


In [38]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95       421
           1       0.63      0.61      0.62        59

    accuracy                           0.91       480
   macro avg       0.79      0.78      0.78       480
weighted avg       0.91      0.91      0.91       480



**Random Forest model**

In [39]:
model = RandomForestClassifier()
model.fit(x_train, y_train)
pred = model.predict(x_test)

model.score(x_test, y_test)

0.9333333333333333

In [40]:
print(confusion_matrix(y_test, pred))

[[409  12]
 [ 20  39]]


In [41]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96       421
           1       0.76      0.66      0.71        59

    accuracy                           0.93       480
   macro avg       0.86      0.82      0.84       480
weighted avg       0.93      0.93      0.93       480



The Random Forest Classifier seems to be the best option in classifying red wine quality.

# White wine classification

In [42]:
# Create the target attribute 'isGood'

df_white['isGood'] = np.where(df_white['quality'] >= 7, 1, 0)
df_white['isGood'].value_counts()

0    3838
1    1060
Name: isGood, dtype: int64

In [43]:
# Separate the train and test data

target = df_white['isGood']
attr = df_white.drop(['isGood', 'quality'], axis = 1)
x_train, x_test, y_train, y_test = train_test_split(attr, target, test_size = 0.3)

In [44]:
# Scale the data

scaler = StandardScaler()
scaler.fit_transform(x_train, y_train)
scaler.transform(x_test)

array([[-1.47482831,  0.41058272, -0.03310804, ...,  0.38549912,
         0.01332137,  0.48768449],
       [-0.29480043,  0.99666541, -0.35924122, ..., -1.00645105,
        -1.04867041,  1.62153742],
       [ 1.12123302, -0.76158265, -0.7669077 , ..., -0.74131769,
         2.22580424,  1.29757944],
       ...,
       [-0.05879485,  0.21522183, -0.7669077 , ..., -0.80760103,
        -0.07517795, -0.88913691],
       [-0.88481437,  1.19202631, -0.19617463, ...,  0.18664909,
         2.49130218, -0.80814742],
       [-0.05879485,  0.11754138, -0.11464133, ..., -0.94016771,
         1.16381246,  0.56867399]])

**Dummy model**

In [45]:
model = DummyClassifier()
model.fit(x_train, y_train)
pred = model.predict(x_test)

model.score(x_test, y_test)

0.780952380952381

**Logistic Regression**

In [46]:
model = LogisticRegression(max_iter = 10000)
model.fit(x_train, y_train)
pred = model.predict(x_test)

model.score(x_test, y_test)

0.8020408163265306

In [47]:
print(confusion_matrix(y_test, pred))

[[1095   53]
 [ 238   84]]


In [48]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.82      0.95      0.88      1148
           1       0.61      0.26      0.37       322

    accuracy                           0.80      1470
   macro avg       0.72      0.61      0.62      1470
weighted avg       0.78      0.80      0.77      1470



**Decision Tree model**

In [49]:
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
pred = model.predict(x_test)

model.score(x_test, y_test)

0.8027210884353742

In [50]:
print(confusion_matrix(y_test, pred))

[[995 153]
 [137 185]]


In [51]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.88      0.87      0.87      1148
           1       0.55      0.57      0.56       322

    accuracy                           0.80      1470
   macro avg       0.71      0.72      0.72      1470
weighted avg       0.81      0.80      0.80      1470



**Gradient Boost model**

In [52]:
model = GradientBoostingClassifier()
model.fit(x_train, y_train)
pred = model.predict(x_test)

model.score(x_test, y_test)

0.8258503401360544

In [53]:
print(confusion_matrix(y_test, pred))

[[1095   53]
 [ 203  119]]


In [54]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.84      0.95      0.90      1148
           1       0.69      0.37      0.48       322

    accuracy                           0.83      1470
   macro avg       0.77      0.66      0.69      1470
weighted avg       0.81      0.83      0.80      1470



**Random Forest model**

In [55]:
model = RandomForestClassifier()
model.fit(x_train, y_train)
pred = model.predict(x_test)

model.score(x_test, y_test)

0.8734693877551021

In [56]:
print(confusion_matrix(y_test, pred))

[[1110   38]
 [ 148  174]]


In [57]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1148
           1       0.82      0.54      0.65       322

    accuracy                           0.87      1470
   macro avg       0.85      0.75      0.79      1470
weighted avg       0.87      0.87      0.86      1470



Again, the Random Forest Classifier seems to be the best classifier for wine quality.

# Combined dataset of white and red wines

In [58]:
# Create the target attribute 'isGood'

df_combined['isGood'] = np.where(df_combined['quality'] >= 7, 1, 0)
df_combined['isGood'].value_counts()

0    5220
1    1277
Name: isGood, dtype: int64

In [59]:
# Separate the train and test data

target = df_combined['isGood']
attr = df_combined.drop(['isGood', 'quality'], axis = 1)
x_train, x_test, y_train, y_test = train_test_split(attr, target, test_size = 0.3)

In [60]:
# Scale the data

scaler = StandardScaler()
scaler.fit_transform(x_train, y_train)
scaler.transform(x_test)

array([[ 1.61801381, -1.00176215, -0.78011745, ..., -0.20182305,
         1.83268667, -0.56219541],
       [-0.02487281, -0.23113025, -0.5343967 , ...,  1.19390228,
         0.74960228, -0.56219541],
       [ 1.58012977, -0.53938301,  1.06278818, ...,  0.28668082,
         0.08308881, -0.56219541],
       ...,
       [ 0.57540243, -0.46231982,  0.01847499, ..., -0.34139558,
        -1.08330976, -0.56219541],
       [ 0.4718883 , -0.23113025, -0.35010614, ..., -0.48096812,
        -0.75005302, -0.56219541],
       [-1.50501814,  3.15965009,  1.73852025, ...,  1.12411602,
         2.41588595,  1.77874096]])

**Dummy model**

In [61]:
model = DummyClassifier()
model.fit(x_train, y_train)
pred = model.predict(x_test)

model.score(x_test, y_test)

0.8082051282051282

**Logistic Regression model**

In [62]:
model = LogisticRegression(max_iter = 10000)
model.fit(x_train, y_train)
pred = model.predict(x_test)

model.score(x_test, y_test)

0.8174358974358974

In [63]:
print(confusion_matrix(y_test, pred))

[[1504   72]
 [ 284   90]]


In [64]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.84      0.95      0.89      1576
           1       0.56      0.24      0.34       374

    accuracy                           0.82      1950
   macro avg       0.70      0.60      0.61      1950
weighted avg       0.79      0.82      0.79      1950



**Decision Tree model**

In [65]:
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
pred = model.predict(x_test)

model.score(x_test, y_test)

0.8435897435897436

In [66]:
print(confusion_matrix(y_test, pred))

[[1414  162]
 [ 143  231]]


In [67]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.91      0.90      0.90      1576
           1       0.59      0.62      0.60       374

    accuracy                           0.84      1950
   macro avg       0.75      0.76      0.75      1950
weighted avg       0.85      0.84      0.85      1950



**Gradient Boost model**

In [68]:
model = GradientBoostingClassifier()
model.fit(x_train, y_train)
pred = model.predict(x_test)

model.score(x_test, y_test)

0.8451282051282051

In [69]:
print(confusion_matrix(y_test, pred))

[[1504   72]
 [ 230  144]]


In [70]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.87      0.95      0.91      1576
           1       0.67      0.39      0.49       374

    accuracy                           0.85      1950
   macro avg       0.77      0.67      0.70      1950
weighted avg       0.83      0.85      0.83      1950



**Random Forest model**

In [71]:
model = RandomForestClassifier()
model.fit(x_train, y_train)
pred = model.predict(x_test)

model.score(x_test, y_test)

0.8902564102564102

In [72]:
print(confusion_matrix(y_test, pred))

[[1533   43]
 [ 171  203]]


In [73]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.90      0.97      0.93      1576
           1       0.83      0.54      0.65       374

    accuracy                           0.89      1950
   macro avg       0.86      0.76      0.79      1950
weighted avg       0.89      0.89      0.88      1950



The Random Forest Classifier is the best option for a wine quality classifier with an accuracy of 0.89