In [324]:
import pandas as pd
import numpy as np
import tensorflow as tf
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
%matplotlib inline

from keras.models import Sequential
from keras.layers import Dense, Flatten, Activation, Conv2D, MaxPooling2D
from keras.activations import linear, relu, sigmoid

import warnings
warnings.filterwarnings('ignore')

## Dataset load

In [325]:
df = pd.read_csv("/content/drive/MyDrive/WineQT.csv")
df = df.sample(frac=1).reset_index()
df.head()

Unnamed: 0,index,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,510,8.4,0.56,0.04,2.0,0.082,10.0,22.0,0.9976,3.22,0.44,9.6,5,718
1,654,8.6,0.22,0.36,1.9,0.064,53.0,77.0,0.99604,3.47,0.87,11.0,7,925
2,843,7.0,0.745,0.12,1.8,0.114,15.0,64.0,0.99588,3.22,0.59,9.5,6,1194
3,60,7.7,0.49,0.26,1.9,0.062,9.0,31.0,0.9966,3.39,0.64,9.6,5,87
4,1109,6.6,0.88,0.04,2.2,0.066,12.0,20.0,0.99636,3.53,0.56,9.9,5,1556


In [326]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 1143 non-null   int64  
 1   fixed acidity         1143 non-null   float64
 2   volatile acidity      1143 non-null   float64
 3   citric acid           1143 non-null   float64
 4   residual sugar        1143 non-null   float64
 5   chlorides             1143 non-null   float64
 6   free sulfur dioxide   1143 non-null   float64
 7   total sulfur dioxide  1143 non-null   float64
 8   density               1143 non-null   float64
 9   pH                    1143 non-null   float64
 10  sulphates             1143 non-null   float64
 11  alcohol               1143 non-null   float64
 12  quality               1143 non-null   int64  
 13  Id                    1143 non-null   int64  
dtypes: float64(11), int64(3)
memory usage: 125.1 KB


## Features correlation with quality to choose best 4 features

In [327]:
num_df = df.corrwith(df["quality"]).apply(lambda x: np.absolute(x)).sort_values(ascending=False)
print(num_df)

quality                 1.000000
alcohol                 0.484866
volatile acidity        0.407394
sulphates               0.257710
citric acid             0.240821
total sulfur dioxide    0.183339
density                 0.175208
chlorides               0.124085
fixed acidity           0.121970
Id                      0.069708
index                   0.067794
free sulfur dioxide     0.063260
pH                      0.052453
residual sugar          0.022002
dtype: float64


In [328]:
X = df[['alcohol', 'volatile acidity', 'sulphates', 'citric acid']]
y = df['quality']

In [329]:
X.head()

Unnamed: 0,alcohol,volatile acidity,sulphates,citric acid
0,9.6,0.56,0.44,0.04
1,11.0,0.22,0.87,0.36
2,9.5,0.745,0.59,0.12
3,9.6,0.49,0.64,0.26
4,9.9,0.88,0.56,0.04


In [330]:
y.head()

0    5
1    7
2    6
3    5
4    5
Name: quality, dtype: int64

## Function for normalization

In [331]:
def normalize_features(normalize_x):
    for column in normalize_x.columns:
        normalize_x[column] = (normalize_x[column] - normalize_x[column].min()) / (
                normalize_x[column].max() - normalize_x[column].min())
    return normalize_x

## Splitting the data

In [332]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

normalize_features(x_train)
normalize_features(x_test)


Unnamed: 0,alcohol,volatile acidity,sulphates,citric acid
345,0.250000,0.600000,0.233533,0.00
103,0.196429,0.278261,0.185629,0.46
309,0.732143,0.078261,0.113772,0.49
528,0.482143,0.373913,0.263473,0.08
1088,0.178571,0.391304,0.143713,0.24
...,...,...,...,...
102,0.285714,0.452174,0.119760,0.00
308,0.428571,0.226087,0.191617,0.47
591,0.107143,0.295652,0.191617,0.25
4,0.267857,0.608696,0.137725,0.04


## SVM Model

In [333]:
from sklearn import svm

svm_model = svm.SVC()
svm_model.fit(x_train, y_train)

predictions = svm_model.predict(x_test)

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         8
           5       0.77      0.61      0.68       157
           6       0.50      0.73      0.59       128
           7       0.48      0.34      0.40        41
           8       0.00      0.00      0.00         8

    accuracy                           0.59       343
   macro avg       0.29      0.28      0.28       343
weighted avg       0.59      0.59      0.58       343



## Random Forest Model

In [334]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(x_train, y_train.ravel())

predictions = rf_model.predict(x_test)

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.25      0.12      0.17         8
           5       0.72      0.50      0.59       157
           6       0.46      0.68      0.55       128
           7       0.43      0.44      0.43        41
           8       0.00      0.00      0.00         8

    accuracy                           0.54       343
   macro avg       0.31      0.29      0.29       343
weighted avg       0.56      0.54      0.53       343



## ANN Model

In [335]:
model = Sequential(
    [
        Flatten(),
        Dense(256, activation = "linear"),
        Dense(128, activation = "linear"),
        Dense(64, activation = "linear"),
        Dense(10, activation = "sigmoid")
    ]
)

model.compile(
    optimizer = "adam",
    loss = "sparse_categorical_crossentropy",
    metrics=['accuracy']
)


model.fit(x_train, y_train.ravel(), epochs=15)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f0a0a2dc550>

In [336]:
predictions = model.predict(x_test)

y_pred_labels = [np.argmax(i) for i in predictions]

print(classification_report(y_test, y_pred_labels))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         8
           5       0.78      0.48      0.60       157
           6       0.47      0.72      0.57       128
           7       0.44      0.54      0.48        41
           8       0.00      0.00      0.00         8

    accuracy                           0.55       343
   macro avg       0.28      0.29      0.27       343
weighted avg       0.58      0.55      0.54       343

