In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
import os
from sklearn import metrics as m
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the wine-quality table; the path is relative to the notebook's working directory.
df = pd.read_csv("winequality-red.csv")

In [3]:
# Re-index the quality grades 3..8 as consecutive class ids 0..5, the label form
# used with sparse categorical cross-entropy. Grades outside 3..8 would map to NaN.
quality_mapping = {grade: grade - 3 for grade in range(3, 9)}
df["quality"] = df["quality"].map(quality_mapping)

In [4]:
# Engineered ratio features.
# (Disabled experiment: df["acidity_by_Ph"] = df["fixed acidity"] / df["pH"])
df["density_sugar"] = df["density"].div(df["residual sugar"])
df["sulphate_by_chloride"] = df["sulphates"].div(df["chlorides"])

In [5]:
# Rank every column by the magnitude of its correlation with the target.
corr_matrix = df.corr()
corr_vals = corr_matrix["quality"].abs().sort_values(ascending=False)
# Position 0 is expected to be "quality" itself (correlation 1.0), so keep positions 1..8.
top8_corr = corr_vals.iloc[1:9]

In [6]:
top8_corr # these are the attributes we carry forward into the modelling

alcohol                 0.480738
volatile acidity        0.391735
sulphate_by_chloride    0.363317
sulphates               0.270777
citric acid             0.233733
total sulfur dioxide    0.185404
density                 0.173251
fixed acidity           0.127766
Name: quality, dtype: float64

In [7]:
# Wrap the Series in a one-column DataFrame; the attribute names stay in the index.
top8_corr = pd.DataFrame(top8_corr)

We observe that the attributes that correlate best with wine quality are alcohol, volatile acidity, sulphate_by_chloride, sulphates, citric acid, total sulfur dioxide, density and fixed acidity. We will use these in the modelling.

Note: To prevent the model from overfitting, I decided to remove two attributes that did not seem to have much importance or correlation with the target (citric acid and fixed acidity). The drop in the next cell is currently commented out, so all eight attributes are kept.

In [8]:
# top8_corr.drop(["citric acid", "fixed acidity"], axis=0, inplace=True)

In [9]:
# Names of the selected attributes, taken from the frame's index.
list_top8 = top8_corr.index.tolist()

In [10]:
# The target column must travel with the selected features. The guard keeps the cell
# idempotent: re-running it no longer appends "quality" a second time (which would
# produce a duplicated column when the list is used to index the frame).
if "quality" not in list_top8:
    list_top8.append("quality")

In [11]:
# Restrict the frame to the selected attributes plus the target.
# NOTE(review): df_model is not referenced again in the cells visible here; the split and
# the network below are built from df with all 13 feature columns. Confirm this is intended.
df_model = df[list_top8]

In [12]:
# Shape check after adding the two engineered columns.
df.shape

(1596, 14)

We can see that most of the selected attributes show a skewed distribution and have different scales. We will therefore standardize the data before modelling. But first, let's split the data into train and test sets.

#### 2. Splitting data: 
First we check the distribution of the target values to see if a stratified split is needed.

We can see that the dataset is skewed for classification,
so we may want to stratify the data before splitting. Furthermore, we will want to use **stratified k-fold cross-validation** (for classification problems).

-> There are several ways to choose an appropriate number of bins. If
you have a lot of samples (> 10k, > 100k), you don't need to worry about the
number of bins: just divide the data into 10 or 20 bins. If you do not have a lot of
samples, you can use a simple rule such as Sturges' rule to calculate the appropriate
number of bins.


$\text{Number of bins} = 1 + \log_2(N)$, where $N$ is the number of samples in the dataset (about 12 in our case).

For the stratification we will use the most correlated attribute (alcohol).

In [13]:
def strater(col_item):
    """Assign a numeric reading to one of five ordinal strata.

    Upper bounds are inclusive: <= 9.3 -> 1, <= 10 -> 2, <= 11 -> 3, <= 12 -> 4.
    Any value that satisfies none of these comparisons (larger values, NaN) -> 5.
    """
    # Walk the bounds in ascending order; the first one that holds decides the stratum.
    for stratum, upper_bound in enumerate((9.3, 10, 11, 12), start=1):
        if col_item <= upper_bound:
            return stratum
    return 5

In [14]:
# Helper column: the alcohol stratum of every row, used only to drive the stratified split.
df["strat"] = df["alcohol"].map(strater)

In [15]:
# A single stratified shuffle that sends 20 % of the rows to the test set; random_state pins the shuffle.
split = StratifiedShuffleSplit(n_splits = 1, test_size=0.20, random_state=10)

In [16]:
# split() yields *positional* row indices, so rows are selected with .iloc; label-based
# .loc only gives the same rows while df keeps its default 0..n-1 index.
# The explicit copies make the two sets independent frames, so later column drops on
# them never touch (or warn about) df.
train_index, test_index = next(split.split(df, df["strat"]))
strat_train_set = df.iloc[train_index].copy()
strat_test_set = df.iloc[test_index].copy()

In [17]:
# Every column of the full frame except the target (the helper "strat" column is still included).
# NOTE(review): X is not used in any later cell visible here; confirm whether it is still needed.
X = df.drop("quality", axis=1)

We go with the stratified split, then:

In [18]:
# The "strat" helper column was only needed to drive the split.
# Rebinding the names (instead of inplace=True) avoids in-place mutation, and
# errors="ignore" keeps the cell re-runnable once the column is already gone.
strat_train_set = strat_train_set.drop(columns="strat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="strat", errors="ignore")

In [19]:
# Separate the training features from the target; the target is kept as a NumPy array.
X_train_full = strat_train_set.drop(columns="quality")
y_train_full = strat_train_set["quality"].to_numpy()

In [20]:
# Same feature/target separation for the test set.
X_test = strat_test_set.drop(columns="quality")
y_test = strat_test_set["quality"].to_numpy()

In [21]:
# Hold out the first 300 training rows for validation; fit on the remainder.
X_valid, X_train = X_train_full.iloc[:300], X_train_full.iloc[300:]
y_valid, y_train = y_train_full[:300], y_train_full[300:]

In [22]:
# Learn the scaling statistics on the training rows only, then apply those same
# statistics to the training, validation and test matrices.
scaler = StandardScaler().fit(X_train)
X_train, X_valid, X_test = (
    scaler.transform(part) for part in (X_train, X_valid, X_test)
)

In [23]:
# Shape check: df now also carries the helper "strat" column.
df.shape

(1596, 15)

In [24]:
# Rows and feature columns of the scaled training matrix.
X_train.shape

(976, 13)

In [25]:
# Rows and feature columns of the scaled validation matrix.
X_valid.shape

(300, 13)

In [26]:
# Rows and feature columns of the scaled test matrix.
X_test.shape

(320, 13)

In [27]:
# 1) Model

# Seed NumPy and TensorFlow so weight initialisation and shuffling are repeatable.
np.random.seed(42)
tf.random.set_seed(42)

# SELU is designed to be paired with LeCun-normal initialisation, so it is set
# explicitly on every hidden layer instead of relying on the layer's default initialiser.
# The input width is read from the scaled training matrix rather than hard-coded,
# so the network follows any change in the number of feature columns.
# The output layer has one softmax unit per quality class (ids 0..5).
model = keras.models.Sequential([
    keras.layers.Input(shape=X_train.shape[1:]),
    keras.layers.Dense(100, activation="selu", kernel_initializer="lecun_normal",
                       kernel_regularizer=keras.regularizers.l2(0.01)),
    keras.layers.Dense(50, activation="selu", kernel_initializer="lecun_normal",
                       kernel_regularizer=keras.regularizers.l2(0.01)),
    keras.layers.Dense(50, activation="selu", kernel_initializer="lecun_normal",
                       kernel_regularizer=keras.regularizers.l2(0.01)),
    keras.layers.Dense(20, activation="selu", kernel_initializer="lecun_normal",
                       kernel_regularizer=keras.regularizers.l2(0.01)),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(6, activation="softmax")
])

In [28]:
# 2) compile

# Integer class ids as labels -> sparse categorical cross-entropy.
# The optimizer's step size is passed as `learning_rate`; the old `lr` alias is
# deprecated and rejected by current Keras optimizers.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=keras.optimizers.SGD(learning_rate=0.009),
              metrics=["accuracy"])

In [29]:
# 3) track
# Root directory that collects the TensorBoard logs of every run.
root_logdir = os.path.join(os.curdir, "my_logs")


def get_run_logdir():
    """Return a sub-directory path under root_logdir named after the current local time.

    The name has one-second resolution, so runs started in different seconds
    get different directories.
    """
    from time import strftime
    return os.path.join(root_logdir, strftime("run_SGD%Y_%m_%d-%H_%M_%S"))


run_logdir = get_run_logdir()

In [30]:
# def exponential_decay(lr0, s): # with this piece of code and using decay in the optimizer we are able to use exponential decay
#     # we start with a high learning rate (0.01) and divide it by 10 every s epochs. We can see how this helped reach better accuracy in less time
#     def exponential_decay_fn(epoch):
#         return lr0 * 0.1**(epoch / s)
#     return exponential_decay_fn

# exponential_decay_fn = exponential_decay(lr0=0.01, s=20)

In [31]:
%tensorboard --logdir {logs_base_dir}  --host localhost

UsageError: Line magic function `%tensorboard` not found.


In [None]:
# Load the TensorBoard notebook extension and open the dashboard on ./my_logs, served on port 6009.
%load_ext tensorboard
%tensorboard --logdir=./my_logs --port=6009

In [None]:
# 4) Fit

# Stop once the monitored validation loss has not improved for 20 epochs and roll the
# in-memory model back to its best epoch; without restore_best_weights the model that
# is saved and evaluated afterwards would be the last (up to 20 epochs stale) one.
early_stopping_cb = keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True)
# Keep an on-disk copy of the best model seen so far.
checkpoint_cb = keras.callbacks.ModelCheckpoint("my_wine_model.h5", save_best_only=True)
# Write training curves for TensorBoard into this run's own directory.
tensorboard_cb = keras.callbacks.TensorBoard(run_logdir)
# lr_scheduler = keras.callbacks.LearningRateScheduler(exponential_decay_fn)

history = model.fit(X_train, y_train, epochs=500,
                    validation_data=(X_valid, y_valid),
                    callbacks=[early_stopping_cb, checkpoint_cb, tensorboard_cb])

In [None]:
# Persist the model as it stands after fit() returns.
model.save("my_wine_class_model.h5")

In [None]:
# Reload the file written by the save cell, replacing the in-memory model.
model = keras.models.load_model("my_wine_class_model.h5") 

In [None]:
# Loss and accuracy (the compiled metric) on the held-out test split.
model.evaluate(X_test, y_test)

In [None]:
# Softmax output of the network for every test row: one score per quality class.
y_predicted=model.predict(X_test)

In [None]:
# Shape of the scaled test matrix that was passed to predict().
X_test.shape

In [None]:
# Collapse each row of class scores to the id of its highest-scoring class in one
# vectorised call. axis=1 also makes an accidental re-run on the already-reduced
# labels fail loudly instead of silently turning every prediction into class 0.
y_predicted = np.argmax(y_predicted, axis=1)

In [None]:
# Confusion matrix of the test predictions: rows are true classes, columns are predicted classes.
tf.math.confusion_matrix(labels=y_test, predictions=y_predicted)

In [None]:
# Recompute the class scores and their arg-max labels, build the confusion matrix,
# then draw it as an annotated heat map and write the figure to disk.
y_predicted = model.predict(X_test)
y_predicted_labels = list(np.argmax(y_predicted, axis=1))
corr_x = tf.math.confusion_matrix(labels=y_test, predictions=y_predicted_labels)

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(corr_x, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Truth')
fig.savefig("corr-mat.png", dpi=300)