In [None]:
# %% Deep learning - Section 10.93
#    Code challenge 11: predict sugar

#    1) Start from the code from video 10.083
#    2) Predict 'residual sugar' instead of quality
#    3) Use only one batch size (unless you want to explore)
#    4) Plot train/test losses and model prediction vs. observation, along with
#       correlation coefficients for train and test
#    5) Produce a correlation matrix of the model features and interpret them

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F
import pandas              as pd
import scipy.stats         as stats
import time

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Load and prepare data

# Load
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
data = pd.read_csv(url,sep=';')

# Remove some outliers (see lec. 82 for why); take an explicit copy so that the
# column assignments below act on an independent dataframe rather than on a
# slice of the raw one (this is what triggers pandas' SettingWithCopyWarning)
data = data[data['total sulfur dioxide']<200].copy()

# Z-score all the variables (the prediction target is z-scored as well)
cols2zscore = data.keys()

for col in cols2zscore:
    mean_val  = np.mean(data[col])
    std_val   = np.std(data[col])
    data[col] = (data[col] - mean_val) / std_val

# Convert from pandas dataframe to PyTorch tensor (remove sugar before
# generating the data tensor, since sugar is the variable to predict)
cols2zscore = cols2zscore.drop('residual sugar')
data_t      = torch.tensor( data[cols2zscore].values ).float()
labels      = torch.tensor( data['residual sugar'].values ).float()

print(f'Data shape: {data_t.shape}')
print(f'Labels shape: {labels.shape}')

# Labels need to be a 2D column (N x 1) to match the model output; they stay
# floats because the target is continuous (regression, not classification)
labels = labels[:,None]
print(f'Proper labels shape: {labels.shape}')

# Reminder of what the data are actually like
data.describe()


In [None]:
# %% Split into train and test data

# Hold out 10% of the samples for testing (scikit-learn split)
split = train_test_split(data_t,labels,test_size=0.1)
train_labels,test_labels = split[2],split[3]

# Wrap features and labels into PyTorch datasets
train_data = TensorDataset(split[0],train_labels)
test_data  = TensorDataset(split[1],test_labels)

# Only the test loader is built here: the train loader is created later, once
# per batch size, to allow a parametric test of the batch size. The test set is
# served as one single batch.
test_loader = DataLoader(test_data,batch_size=len(test_data))


In [None]:
# %% Model class

class model_class(nn.Module):
    """Fully connected network: input layer, two hidden layers, linear output.

    The output layer has no activation, so the model returns unbounded real
    values (suitable for regression with an MSE loss).

    Args:
        n_features: number of input features (default 11, the input width
                    used in this notebook).
        n_units:    width of the input and hidden layers (default 32).
        n_outputs:  number of output units (default 1).
    """
    def __init__(self,n_features=11,n_units=32,n_outputs=1):
        super().__init__()

        # Layers
        self.input  = nn.Linear(n_features,n_units)
        self.hid1   = nn.Linear(n_units,n_units)
        self.hid2   = nn.Linear(n_units,n_units)
        self.output = nn.Linear(n_units,n_outputs)

    # Forward propagation
    def forward(self,x):
        """Map a (batch x n_features) tensor to a (batch x n_outputs) tensor."""

        x = F.relu(self.input(x))
        x = F.relu(self.hid1(x))
        x = F.relu(self.hid2(x))
        x = self.output(x)

        return x


In [None]:
# %% Function to train the model

# Parameters
num_epochs = 1000

def _collect_predictions(model,loader):
    """Run `model` over every batch of `loader` without gradients and return
    the concatenated (predictions, targets) tensors."""
    preds,targets = [],[]

    with torch.no_grad():
        for X,y in loader:
            preds.append(model(X))
            targets.append(y)

    return torch.cat(preds,dim=0),torch.cat(targets,dim=0)

def train_model(model=None,train_dl=None,test_dl=None,n_epochs=None):
    """Train a regression model with MSE loss and SGD (lr=0.01).

    Every argument is optional: when omitted, the function falls back to the
    notebook-level names `ANN`, `train_loader`, `test_loader` and `num_epochs`,
    so the original call `train_model()` keeps working unchanged.

    Args:
        model:    the nn.Module to train (default: global `ANN`).
        train_dl: DataLoader with the training batches (default: global `train_loader`).
        test_dl:  DataLoader with the test batches (default: global `test_loader`).
        n_epochs: number of training epochs (default: global `num_epochs`).

    Returns:
        train_losses,test_losses: lists with one mean batch loss per epoch.
        train_preds,train_targets: final predictions and targets for every
            sample of the training dataset, in dataset order.
        test_preds,test_targets: final predictions and targets for the test loader.
    """

    # Fall back to the notebook globals when called without arguments
    if model is None:
        model = ANN
    if train_dl is None:
        train_dl = train_loader
    if test_dl is None:
        test_dl = test_loader
    if n_epochs is None:
        n_epochs = num_epochs

    # Loss function and optimizer
    loss_fun  = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(),lr=0.01)

    # Initialise losses
    train_losses = []
    test_losses  = []

    # Loop over epochs
    for epoch_i in range(n_epochs):

        # Switch training mode on
        model.train()

        batch_loss = []

        for X,y in train_dl:

            # Forward propagation and loss
            yHat = model(X)
            loss = loss_fun(yHat,y)

            # Only now do backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Track losses (no accuracy for continuous vars)
            batch_loss.append(loss.item())

        # Average losses from batch
        train_losses.append(np.mean(batch_loss))

        # Evaluate on test set
        model.eval()
        with torch.no_grad():

            batch_loss = []

            for X,y in test_dl:

                yHat_test = model(X)
                test_loss = loss_fun(yHat_test,y)
                batch_loss.append(test_loss.item())

            test_losses.append(np.mean(batch_loss))

    # Final predictions on all training and test data
    model.eval()

    # The training loader may shuffle and drop the last incomplete batch, which
    # would leave some samples out of the final predictions; re-wrap its dataset
    # in a deterministic loader that visits every sample exactly once
    eval_batch_size = train_dl.batch_size or len(train_dl.dataset)
    full_train_dl   = DataLoader(train_dl.dataset,batch_size=eval_batch_size,
                                 shuffle=False,drop_last=False)

    train_preds,train_targets = _collect_predictions(model,full_train_dl)
    test_preds,test_targets   = _collect_predictions(model,test_dl)

    # Function output
    return train_losses,test_losses, train_preds,train_targets, test_preds,test_targets


In [None]:
# %% Parametric experiment over mini-batches size (track time as well)

# Takes about 8 mins
batch_size_exp = np.arange(3,8)
num_exps       = len(batch_size_exp)

# Per-epoch losses: one column per batch size
train_losses_all = np.zeros((num_epochs,num_exps))
test_losses_all  = np.zeros((num_epochs,num_exps))

# Final predictions and targets: one list entry per batch size
train_preds_all,train_targets_all = [],[]
test_preds_all,test_targets_all   = [],[]

# Wall-clock duration of every run
elapsed_time = np.zeros((num_exps,1))

for i,exp_i in enumerate(batch_size_exp):

    start_time = time.time()

    # Batch sizes are powers of two; the train loader is rebuilt for each of them
    batch_size   = int(2**exp_i)
    train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True,drop_last=True)

    # Fresh model for every run
    ANN = model_class()
    train_losses,test_losses, train_preds,train_targets, test_preds,test_targets = train_model()

    # Store losses
    test_losses_all[:,i]  = test_losses
    train_losses_all[:,i] = train_losses

    # Store predictions and targets as numpy arrays
    test_targets_all.append(test_targets.numpy())
    test_preds_all.append(test_preds.numpy())
    train_targets_all.append(train_targets.numpy())
    train_preds_all.append(train_preds.numpy())

    elapsed_time[i] = time.time() - start_time
    print(f"Batch size 2^{exp_i} = {batch_size} completed in {elapsed_time[i, 0]:.2f} seconds")


In [None]:
# Stack the per-batch-size test predictions into a single array and show its shape
test_preds_all = np.array(test_preds_all)
np.shape(test_preds_all)


In [None]:
# %% Plotting

phi = ( 1 + np.sqrt(5) ) / 2

# One figure per batch size: loss curves on the left, predictions vs.
# observations on the right
for i, exp_i in enumerate(batch_size_exp):

    batch_size = 2**exp_i

    fig,ax = plt.subplots(1,2,figsize=(1.5*6*phi,6))

    # Training and test losses
    ax[0].plot(train_losses_all[:,i],label='Train loss')
    ax[0].plot(test_losses_all[:,i],label='Test loss')
    ax[0].set_title('Loss curves')
    ax[0].set_xlabel('Epoch')
    ax[0].set_ylabel('MSE loss')
    ax[0].legend()
    ax[0].grid(True)

    # True vs predicted values (and get correlation coefficients); the stored
    # arrays are N x 1 columns, so flatten them to 1D first
    train_preds   = train_preds_all[i].squeeze()
    train_targets = train_targets_all[i].squeeze()
    r_train       = np.corrcoef(train_preds,train_targets)[0,1]

    test_preds   = test_preds_all[i].squeeze()
    test_targets = test_targets_all[i].squeeze()
    r_test       = np.corrcoef(test_preds, test_targets)[0,1]

    ax[1].plot(train_targets,train_preds,'o',label=f'Train (r={r_train:.3f})',alpha=0.75)
    ax[1].plot(test_targets,test_preds,'x',label=f'Test (r={r_test:.3f})',alpha=0.75)

    # Identity line spanning the range of the training targets
    identity_range = [train_targets.min(),train_targets.max()]
    ax[1].plot(identity_range,identity_range,'k--', label='Identity')
    ax[1].set_title('True vs. predicted values')
    ax[1].set_xlabel('True values')
    ax[1].set_ylabel('Predicted values')
    ax[1].legend()
    ax[1].grid(True)

    fig.suptitle(f'Batch Size: {batch_size}',fontsize=14)
    fig.tight_layout(rect=[0,0.03,1,0.95])

    filename = f'figure{49+i}_code_challenge_11.png'
    plt.savefig(filename)
    plt.show()

    # Release the figure so that figures do not pile up in memory across iterations
    plt.close(fig)

    files.download(filename)


In [None]:
# %% Plotting

# Notice how the correlation patterns are not obvious, meaning that there are
# quite some subtle relationships between wine quality and the listed features
# (besides the quite strong correlation with alcohol eheheh)
# NOTE(review): this remark is about 'quality', while this challenge predicts
# 'residual sugar' - check whether the interpretation should be updated

# Visualise correlation matrix of data features
phi       = ( 1 + np.sqrt(5) ) / 2
var_names = data.keys()
tick_pos  = range(len(var_names))
corr_mat  = np.corrcoef(data.T)

fig = plt.figure(figsize=(6*phi,6))

# Fixed colour limits so that the colour scale covers the full [-1,1] range
plt.imshow(corr_mat,vmin=-1,vmax=1,cmap='jet')
plt.xticks(tick_pos,labels=var_names,rotation=90)
plt.yticks(tick_pos,labels=var_names)
plt.colorbar()
plt.title('Data correlation matrix')

# Save, show, and download the figure
corr_fig_name = 'figure54_code_challenge_11.png'
plt.savefig(corr_fig_name)

plt.show()

files.download(corr_fig_name)


In [None]:
# %% Exercise 1
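#    Why did the output node not have a sigmoid activation function? Is it possible to train this model using a sigmoid
#    function on the output?

# I would say that we don't use a sigmoid because we are dealing with a continuous
# target variable; in other words, it's not a classification task (which is what
# a sigmoid is useful for). Since a sigmoid can only output values between 0 and 1,
# training with it would only be possible after rescaling the targets into that range.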


In [None]:
# %% Exercise 2
#    Let's say you don't need to know the *exact* sugar amount, only the approximate amount. You could then label each
#    wine as being in one of three bins, according to the amount of sugar. What would you need to change in the model?

# In this case one might categorise the sugar variable, and then build a classification
# model similar to the one built for quality; see code in cells below.
# Main changes I've implemented:
# > Data preprocessing and categorisation: I opted for a tercile split of the sugar feature
#   (33rd and 66th percentiles), after normalisation (i.e. roughly 1/3, 1/3, and 1/3);
# > Model class now requires 3 output nodes;
# > Loss function needs to be nn.CrossEntropyLoss() for multiclass classification;
# > Adapt accuracy computation.


In [None]:
# %% Load and prepare data

# Load
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
data = pd.read_csv(url,sep=';')

# Remove some outliers (see lec. 82 for why); take an explicit copy so that the
# column assignments below act on an independent dataframe rather than on a
# slice of the raw one (this is what triggers pandas' SettingWithCopyWarning)
data = data[data['total sulfur dioxide']<200].copy()

# Z-score all the variables
cols2zscore = data.keys()

for col in cols2zscore:
    mean_val  = np.mean(data[col])
    std_val   = np.std(data[col])
    data[col] = (data[col] - mean_val) / std_val

# Categorise residual sugar into three bins, using the 33rd and 66th
# percentiles of the (z-scored) variable as cut points
q33 = data['residual sugar'].quantile(0.33)
q66 = data['residual sugar'].quantile(0.66)

data.loc[:,'categorical_residual_sugar'] = 0                              # low sugar
data.loc[data['residual sugar'] > q33, 'categorical_residual_sugar'] = 1  # medium sugar
data.loc[data['residual sugar'] > q66, 'categorical_residual_sugar'] = 2  # high sugar

# Convert from pandas dataframe to PyTorch tensor; both the continuous and the
# categorical sugar columns are excluded from the model features
cols2zscore = data.keys()
cols2zscore = cols2zscore.drop(['categorical_residual_sugar','residual sugar'])
data_t      = torch.tensor( data[cols2zscore].values ).float()
labels      = torch.tensor( data['categorical_residual_sugar'].values ).long()

print(f'Data shape: {data_t.shape}')
print(f'Labels shape: {labels.shape}')

# Labels are kept as a 1D tensor of long integers (class indices), so no extra
# dimension is added here, unlike in the regression case
print(f'Proper labels shape: {labels.shape}')

# Reminder of what the data are actually like
data.describe()


In [None]:
# %% Split into train and test data

# Split with scikit-learn, keeping 10% of the samples aside for testing
train_features,test_features,train_labels,test_labels = train_test_split(data_t,labels,test_size=0.1)

# Pair features and labels into PyTorch datasets
train_data = TensorDataset(train_features,train_labels)
test_data  = TensorDataset(test_features,test_labels)

# DataLoader for the test set only, served as one single batch
# > the train loader is created later, once per batch size, to allow a
#   parametric test of the batch size
n_test_samples = test_data.tensors[0].shape[0]
test_loader    = DataLoader(test_data,batch_size=n_test_samples)


In [None]:
# %% Model class

class model_class(nn.Module):
    """Fully connected classifier: input layer, two hidden layers, linear output.

    The output layer returns raw scores (logits), one per class; no softmax is
    applied inside the model.

    Args:
        n_features: number of input features (default 11, the input width
                    used in this notebook).
        n_units:    width of the input and hidden layers (default 32).
        n_outputs:  number of output units, i.e. classes (default 3).
    """
    def __init__(self,n_features=11,n_units=32,n_outputs=3):
        super().__init__()

        # Layers
        self.input  = nn.Linear(n_features,n_units)
        self.hid1   = nn.Linear(n_units,n_units)
        self.hid2   = nn.Linear(n_units,n_units)
        self.output = nn.Linear(n_units,n_outputs)

    # Forward propagation
    def forward(self,x):
        """Map a (batch x n_features) tensor to a (batch x n_outputs) tensor of logits."""

        x = F.relu(self.input(x))
        x = F.relu(self.hid1(x))
        x = F.relu(self.hid2(x))
        x = self.output(x)

        return x


In [None]:
# %% Function to train the model

# Parameters
num_epochs = 1000

def train_model(model=None,train_dl=None,test_dl=None,n_epochs=None):
    """Train a classifier with cross-entropy loss and SGD (lr=0.01).

    Every argument is optional: when omitted, the function falls back to the
    notebook-level names `ANN`, `train_loader`, `test_loader` and `num_epochs`,
    so the original call `train_model()` keeps working unchanged.

    Args:
        model:    the nn.Module to train; it must output one score per class
                  (default: global `ANN`).
        train_dl: DataLoader with the training batches (default: global `train_loader`).
        test_dl:  DataLoader for the test set; only its first batch is used for
                  the test accuracy (default: global `test_loader`).
        n_epochs: number of training epochs (default: global `num_epochs`).

    Returns:
        train_acc: list with the mean batch accuracy (%) of each epoch.
        test_acc:  list with the test accuracy (%) after each epoch.
        losses:    list with the mean batch training loss of each epoch.
    """

    # Fall back to the notebook globals when called without arguments
    if model is None:
        model = ANN
    if train_dl is None:
        train_dl = train_loader
    if test_dl is None:
        test_dl = test_loader
    if n_epochs is None:
        n_epochs = num_epochs

    # Loss function and optimizer
    loss_fun  = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(),lr=0.01)

    # Initialise accuracies and losses
    train_acc = []
    test_acc  = []
    losses    = []

    # Loop over epochs
    for epoch_i in range(n_epochs):

        # Switch training mode on
        model.train()

        batch_acc  = []
        batch_loss = []

        for X,y in train_dl:

            # Forward propagation and loss
            yHat = model(X)
            loss = loss_fun(yHat,y)

            # Only now do backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Batch training accuracy: the predicted class is the one with the
            # largest output score
            predicted_class = torch.argmax(yHat,dim=1)
            batch_acc.append( 100*torch.mean((predicted_class==y).float()).item() )
            batch_loss.append(loss.item())

        # Average accuracy and loss over batches
        train_acc.append(np.mean(batch_acc))
        losses.append(np.mean(batch_loss))

        # Test accuracy (computed on the first batch of the test loader)
        model.eval()
        X,y = next(iter(test_dl))
        with torch.no_grad():
            yHat = model(X)
        predicted_class = torch.argmax(yHat,dim=1)
        test_acc.append( 100*torch.mean((predicted_class==y).float()).item() )

    # Function output
    return train_acc,test_acc,losses


In [None]:
# %% Parametric experiment over mini-batches size (track time as well)

# Takes about 8 mins
batch_size_exp = np.arange(3,8)

# Results: one row per epoch, one column per batch size
results_shape = (num_epochs,len(batch_size_exp))
train_acc     = np.zeros(results_shape)
test_acc      = np.zeros(results_shape)
losses        = np.zeros(results_shape)

# Wall-clock duration of every run
elapsed_time = np.zeros((len(batch_size_exp),1))

for i,exp_i in enumerate(batch_size_exp):

    start_time = time.time()

    # Batch sizes are powers of two; the train loader is rebuilt for each of them
    batch_size   = int(2**exp_i)
    train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True,drop_last=True)

    # Fresh model for every run
    ANN = model_class()
    run_train_acc,run_test_acc,run_losses = train_model()

    train_acc[:,i] = run_train_acc
    test_acc[:,i]  = run_test_acc
    losses[:,i]    = run_losses

    elapsed_time[i] = time.time() - start_time
    print(f"Batch size 2^{exp_i} = {batch_size} completed in {elapsed_time[i, 0]:.2f} seconds")


In [None]:
# %% Functions for 1D smoothing filter

# Improved for edge effects - adaptive window
def smooth_adaptive(x,k):
    """Moving-average filter whose window shrinks near the edges of the signal.

    Each output sample is the mean of the input samples within `k // 2`
    positions on either side of it; close to the start and the end of the
    signal the window is truncated to the available samples instead of being
    padded.

    Args:
        x: 1D sequence or array of numbers.
        k: nominal window length (the effective window is 2*(k//2) + 1 samples).

    Returns:
        Array with the same length as `x`. Floating-point (and complex) inputs
        keep their dtype; any other input (e.g. integers) yields float64, so
        that the window means are not truncated to integers.
    """
    x = np.asarray(x)

    # An integer output buffer would silently truncate the means
    out_dtype = x.dtype if np.issubdtype(x.dtype,np.inexact) else np.float64
    smoothed  = np.zeros(x.shape,dtype=out_dtype)
    half_k    = k // 2
    n_samples = len(x)

    for i in range(n_samples):
        start       = max(0, i-half_k)
        end         = min(n_samples, i+half_k + 1)
        smoothed[i] = np.mean(x[start:end])

    return smoothed


In [None]:
# %% Plotting

phi    = ( 1 + np.sqrt(5) ) / 2
fig,ax = plt.subplots(1,2,figsize=(1.5*6*phi,6))

# One colour per batch size; accuracies are smoothed (window of 40 epochs)
# before plotting
cmaps = plt.cm.plasma(np.linspace(.1,.9,len(batch_size_exp)))
for i,colour in enumerate(cmaps):
    ax[0].plot(smooth_adaptive(train_acc[:,i],40),color=colour)
    ax[1].plot(smooth_adaptive(test_acc[:,i],40),color=colour)

# Make the legend easier to read (actual batch sizes rather than exponents)
leglabels = [2**int(exp_i) for exp_i in batch_size_exp]

# Panel titles plus the features common to both panels
for i,title in enumerate(('Train accuracy','Test accuracy')):
    ax[i].set_title(title)
    ax[i].legend(leglabels)
    ax[i].set_xlabel('Epoch')
    ax[i].set_ylabel('Accuracy (%)')
    ax[i].set_ylim([41,101])
    ax[i].grid()

# Save, show, and download the figure
acc_fig_name = 'figure55_code_challenge_11_extra2.png'
plt.savefig(acc_fig_name)

plt.show()

files.download(acc_fig_name)
