In [None]:
# %% Deep learning - Section 13.131
#    APRF example 1: wine quality

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [2]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F
import pandas              as pd
import scipy.stats         as stats
import sklearn.metrics     as skm
import time

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Load and prepare data

# Load (semicolon-separated CSV)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
data = pd.read_csv(url,sep=';')

# Remove some outliers (see lec. 82 for why). The explicit .copy() makes the
# filtered frame independent of the loaded one, so the column assignments below
# do not trigger pandas' SettingWithCopyWarning
data = data[data['total sulfur dioxide']<200].copy()

# Z-score all the variables but quality
cols2zscore = data.keys()
cols2zscore = cols2zscore.drop('quality')

# Column-wise standardisation; ddof=0 (population std) matches what np.std uses
col_means         = data[cols2zscore].mean()
col_stds          = data[cols2zscore].std(ddof=0)
data[cols2zscore] = (data[cols2zscore] - col_means) / col_stds

# Binarise quality: 1 for quality > 5, 0 otherwise (i.e. quality < 6)
data['boolean_quality'] = (data['quality']>5).astype(int)

# Convert from pandas dataframe to PyTorch tensor
data_t = torch.tensor( data[cols2zscore].values ).float()
labels = torch.tensor( data['boolean_quality'].values ).float()

print(f'Data shape: {data_t.shape}')
print(f'Labels shape: {labels.shape}')

# Labels need to be two-dimensional (N x 1, i.e. a column matrix rather than a
# flat array) to match the single-unit model output; they are kept as floats
# (not long integers) because they are fed to BCEWithLogitsLoss
labels = labels[:,None]
print(f'Proper labels shape: {labels.shape}')


In [4]:
# %% Split into train and test data

# Hold out 10% of the samples for testing (scikit-learn split)
splits = train_test_split(data_t,labels,test_size=0.1)
train_features,test_features,train_labels,test_labels = splits

# Pair features with labels as PyTorch datasets
train_data = TensorDataset(train_features,train_labels)
test_data  = TensorDataset(test_features,test_labels)

# DataLoaders: shuffled mini-batches for training (incomplete last batch
# dropped), and one single batch holding the whole test set
batch_size   = 32
train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True,drop_last=True)
test_loader  = DataLoader(test_data,batch_size=len(test_data))


In [5]:
# %% Model class

# Fully connected network: 11 inputs -> 16 -> 32 -> 32 -> 1 raw output
class ANN(nn.Module):
    """Feed-forward binary classifier with ReLU hidden activations.

    The forward pass returns the raw (pre-sigmoid) output of the last linear
    layer, one value per input row.
    """

    def __init__(self):
        super().__init__()

        # Layers (created in input -> output order)
        self.input  = nn.Linear(11,16)
        self.hid1   = nn.Linear(16,32)
        self.hid2   = nn.Linear(32,32)
        self.output = nn.Linear(32,1)

    def forward(self,x):
        """Propagate `x` through the network and return the raw output."""

        # ReLU after every layer except the last one
        for layer in (self.input,self.hid1,self.hid2):
            x = F.relu(layer(x))

        # No activation on the output layer
        return self.output(x)


In [6]:
# %% Function to train the model

# Parameters
num_epochs = 1000

def train_model(model=None,train_dl=None,test_dl=None,n_epochs=None,lr=0.01):
    """Train a binary classifier with mini-batch SGD and BCEWithLogitsLoss.

    Every argument is optional; when omitted, the notebook-level objects are
    used, so a plain `train_model()` call behaves as before.

    Parameters
    ----------
    model : nn.Module, optional
        Network returning raw outputs (logits). Defaults to the global `ANN`.
    train_dl : iterable of (X,y) batches, optional
        Training batches. Defaults to the global `train_loader`.
    test_dl : iterable of (X,y) batches, optional
        Test batches; only the first batch is evaluated each epoch. Defaults to
        the global `test_loader`.
    n_epochs : int, optional
        Number of training epochs. Defaults to the global `num_epochs`.
    lr : float, optional
        SGD learning rate (default 0.01).

    Returns
    -------
    train_acc : list
        Per-epoch training accuracy in percent (mean over batches).
    test_acc : list
        Per-epoch test accuracy in percent.
    losses : list
        Per-epoch training loss (mean over batches).
    """

    # Fall back to the notebook-level objects for anything not provided
    model    = ANN          if model    is None else model
    train_dl = train_loader if train_dl is None else train_dl
    test_dl  = test_loader  if test_dl  is None else test_dl
    n_epochs = num_epochs   if n_epochs is None else n_epochs

    # Loss function (takes raw outputs) and optimizer
    loss_fun  = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.SGD(model.parameters(),lr=lr)

    # Initialise per-epoch accuracies and losses
    train_acc = []
    test_acc  = []
    losses    = []

    # Loop over epochs
    for epoch_i in range(n_epochs):

        # Switch training mode on
        model.train()

        batch_acc  = []
        batch_loss = []

        for X,y in train_dl:

            # Forward propagation and loss
            yHat = model(X)
            loss = loss_fun(yHat,y)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Batch training accuracy (a raw output > 0 is a class-1 prediction)
            batch_acc.append( 100*torch.mean(((yHat>0) == y).float()).item() )
            batch_loss.append(loss.item())

        # Average accuracy and loss over the batches of this epoch
        train_acc.append(np.mean(batch_acc))
        losses.append(np.mean(batch_loss))

        # Test accuracy (evaluation mode, no gradient tracking)
        model.eval()
        X,y = next(iter(test_dl))
        with torch.no_grad():
            yHat = model(X)
        test_acc.append( 100*torch.mean(((yHat>0) == y).float()).item() )

    # Function output
    return train_acc,test_acc,losses


In [8]:
# %% Fit the model

# After the first run of this cell the name `ANN` refers to the trained instance
# instead of the class, so a plain `ANN = ANN()` would call the instance without
# input and fail when the cell is re-run. Recover the class in either case.
model_class = ANN if isinstance(ANN,type) else type(ANN)

# Fresh (untrained) model, bound to the name the rest of the notebook uses
ANN = model_class()
train_acc,test_acc,losses = train_model()


In [None]:
# %% Get data to compute performance measures on train and test data

# Inference only: make sure the model is in evaluation mode and disable gradient
# tracking, so no autograd graph is built for the full-dataset forward passes
ANN.eval()
with torch.no_grad():

    # Predictions for (all) training data (i.e. raw output of last layer)
    train_preds = ANN(train_loader.dataset.tensors[0])

    # Predictions for test data (i.e. raw output of last layer)
    test_preds = ANN(test_loader.dataset.tensors[0])

print(train_preds.shape)
print(test_preds.shape)


In [None]:
# %% Compute performance measures on train and test data

# Metric functions, in output order: accuracy, precision, recall, F1
metric_funs = (skm.accuracy_score,skm.precision_score,skm.recall_score,skm.f1_score)

# True labels and hard predictions (>0 because using raw output, and with binary
# classification we only need the sign of the output)
train_true,train_hat = train_loader.dataset.tensors[1],(train_preds>0).float()
test_true,test_hat   = test_loader.dataset.tensors[1],(test_preds>0).float()

# Preallocate
train_metrics = np.zeros(4)
test_metrics  = np.zeros(4)

# Fill one slot per metric for both data splits
for idx,metric in enumerate(metric_funs):
    train_metrics[idx] = metric(train_true,train_hat)
    test_metrics[idx]  = metric(test_true,test_hat)


In [None]:
# %% Plotting

# Figure with golden-ratio aspect
phi = (1 + np.sqrt(5)) / 2
fig = plt.figure(figsize=(6*phi,6))

metric_names = ['Accuracy','Precision','Recall','F1-score']
x_pos        = np.arange(4)
fig_name     = 'figure2_aprf_example_wine.png'

# Train bars shifted slightly left, test bars slightly right
for shift,values in ((-.1,train_metrics),(.1,test_metrics)):
    plt.bar(x_pos+shift,values,.5)

plt.xticks([0,1,2,3],metric_names)
plt.ylim([.6,1])
plt.legend(['Train','Test'])
plt.title('Performance metrics')

# Save to disk, display, then download (Colab)
plt.savefig(fig_name)

plt.show()

files.download(fig_name)


In [None]:
# %% Plotting

# Confusion matrices (true quality along the rows, predicted along the columns,
# as labelled on the axes below)
train_conf = skm.confusion_matrix(train_loader.dataset.tensors[1],train_preds>0)
test_conf  = skm.confusion_matrix(test_loader.dataset.tensors[1],test_preds>0)

phi = (1 + np.sqrt(5)) / 2
fig,ax = plt.subplots(1,2,figsize=(phi*6,6))

cmap = plt.cm.Blues

# One panel per data split: (axis, matrix, colour-scale maximum, title);
# the colour scale tops out at half the number of samples in the split
panels = [
    (ax[0],train_conf,len(train_preds)/2,'TRAIN confusion matrix'),
    (ax[1],test_conf, len(test_preds)/2, 'TEST confusion matrix'),
]

for axis,conf,vmax,title in panels:

    axis.imshow(conf,cmap=cmap,vmax=vmax)
    axis.set_xticks([0,1])
    axis.set_yticks([0,1])
    axis.set_xticklabels(['bad','good'])
    axis.set_yticklabels(['bad','good'])
    axis.set_xlabel('Predicted quality')
    axis.set_ylabel('True quality')
    axis.set_title(title)

    # Add text labels (white on dark cells, black on light ones)
    for (i,j),count in np.ndenumerate(conf):
        val   = count/vmax
        color = 'white' if val > 0.5 else 'black'
        axis.text(j,i,f"{count}",ha='center',va='center',color=color)

# Handle to the TRAIN image
im = ax[0].images[0]

fig_name = 'figure3_aprf_example_wine.png'
plt.savefig(fig_name)

plt.show()

files.download(fig_name)


In [None]:
# %% Exercise 1
#    The confusion matrices are rotated, in the sense that most visualizations (outside of Python) have reality in the columns
#    and predictions in the rows. Change the code to get the matrices in the orientation shown in the slides. Make sure
#    all the labels are correct! (hint: you might need to consult the help file:
#    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html)

# Here's the code

# Confusion matrices as returned by scikit-learn
train_conf = skm.confusion_matrix(train_loader.dataset.tensors[1],train_preds>0)
test_conf  = skm.confusion_matrix(test_loader.dataset.tensors[1],test_preds>0)

# Rotated matrices: transpose (predictions in the rows, reality in the columns),
# then reverse both axes so that 'good' comes first
train_conf_rot = train_conf.T[::-1,::-1]
test_conf_rot  = test_conf.T[::-1,::-1]

phi = (1 + np.sqrt(5)) / 2
fig,ax = plt.subplots(1,2,figsize=(phi*6,6))

cmap = plt.cm.Blues

# One panel per data split: (axis, matrix, colour-scale maximum, title)
panels = [
    (ax[0],train_conf_rot,len(train_preds)/2,'TRAIN confusion matrix'),
    (ax[1],test_conf_rot, len(test_preds)/2, 'TEST confusion matrix'),
]

for axis,conf,vmax,title in panels:

    axis.imshow(conf,cmap=cmap,vmax=vmax)
    axis.set_xticks([0,1])
    axis.set_yticks([0,1])
    axis.set_xticklabels(['good','bad'])
    axis.set_yticklabels(['good','bad'])
    axis.set_xlabel('True quality')
    axis.set_ylabel('Predicted quality')
    axis.set_title(title)

    # Cell counts (white on dark cells, black on light ones)
    for (i,j),count in np.ndenumerate(conf):
        val   = count/vmax
        color = 'white' if val > 0.5 else 'black'
        axis.text(j,i,f"{count}",ha='center',va='center',color=color)

# Handle to the TRAIN image
im = ax[0].images[0]

fig_name = 'figure4_aprf_example_wine_extra1.png'
plt.savefig(fig_name)

plt.show()

files.download(fig_name)


In [None]:
# %% Exercise 2
#    Normalize the confusion matrix to 'all' (the total N). The numbers are so long that they're difficult to read! Change
#    the text drawing code to transform the proportion into percent, and show only the first 2 digits after the period.

# Here's the code

# Confusion matrices as returned by scikit-learn
train_conf = skm.confusion_matrix(train_loader.dataset.tensors[1],train_preds>0)
test_conf  = skm.confusion_matrix(test_loader.dataset.tensors[1],test_preds>0)

# Proportions of the total number of samples in each split
train_conf_norm = train_conf / train_conf.sum()
test_conf_norm  = test_conf / test_conf.sum()

# Rotated matrices: transpose (predictions in the rows, reality in the columns),
# then reverse both axes so that 'good' comes first
train_conf_rot = train_conf_norm.T[::-1,::-1]
test_conf_rot  = test_conf_norm.T[::-1,::-1]

phi = (1 + np.sqrt(5)) / 2
fig,ax = plt.subplots(1,2,figsize=(phi*6,6))

cmap = plt.cm.Blues

# One panel per data split: (axis, matrix, colour-scale maximum, title);
# here the colour scale tops out at the largest proportion in the split
panels = [
    (ax[0],train_conf_rot,train_conf_norm.max(),'TRAIN confusion matrix'),
    (ax[1],test_conf_rot, test_conf_norm.max(), 'TEST confusion matrix'),
]

for axis,conf,vmax,title in panels:

    axis.imshow(conf,cmap=cmap,vmax=vmax)
    axis.set_xticks([0,1])
    axis.set_yticks([0,1])
    axis.set_xticklabels(['good','bad'])
    axis.set_yticklabels(['good','bad'])
    axis.set_xlabel('True quality')
    axis.set_ylabel('Predicted quality')
    axis.set_title(title)

    # Proportions written as percentages with two decimals
    for (i,j),val in np.ndenumerate(conf):
        percent_text = f"{val*100:.2f}%"
        color        = 'white' if val > 0.25 else 'black'
        axis.text(j,i,percent_text,ha='center',va='center',color=color)

# Handle to the TRAIN image
im = ax[0].images[0]

fig_name = 'figure5_aprf_example_wine_extra2.png'
plt.savefig(fig_name)

plt.show()

files.download(fig_name)
