In [None]:
# %% Deep learning - Section 12.119
#    Code challenge 19: unbalanced data

#    1) Start from code from video 10.092 (wine dataset)
#    2) Use a leaky ReLU, Adam with initial lr = 0.001, and 500 epochs
#    3) Create a function exporting train/test dataloaders with a specified
#       quality threshold for binarising bad and good wine
#    4) Train the model using thresholds of 4.5/8, 5.5/8, and 6.5/8
#    5) Plot a 3x3 subplot matrix with losses, avg accuracy, and per category
#       accuracy on each line

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F
import pandas              as pd
import scipy.stats         as stats
import time

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Function to get data and flexibly split them into two categories

def gen_data(data,split_threshold,test_size=0.1,batch_size=32,random_state=None):
    """
    Build train/test DataLoaders from the wine table with binarised quality.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'total sulfur dioxide' and 'quality'; every
        other column is used as a (z-scored) feature. The input dataframe is
        not modified.
    split_threshold : float
        Wines whose quality is strictly greater than this value get label 1,
        all the others get label 0.
    test_size : float or int, optional
        Forwarded to train_test_split (default 0.1, i.e. a 90/10 split).
    batch_size : int, optional
        Batch size of the training loader (default 32).
    random_state : int or None, optional
        Forwarded to train_test_split; the default (None) leaves the split
        unseeded.

    Returns
    -------
    train_loader : DataLoader
        Shuffled mini-batches; an incomplete last batch is dropped.
    test_loader : DataLoader
        The whole test set as one single batch.
    """

    # Remove some outliers (see lec. 82 for why)
    data = data[data['total sulfur dioxide']<200]

    # Z-score all the variables but quality (population std, ddof=0)
    cols2zscore = data.columns.drop('quality')
    features    = data[cols2zscore]
    features_z  = (features - features.mean()) / features.std(ddof=0)

    # Binarise quality according to input threshold (1 if above, 0 otherwise)
    boolean_quality = data['quality'] > split_threshold

    # Convert from pandas to PyTorch tensors
    data_t = torch.tensor( features_z.values ).float()
    labels = torch.tensor( boolean_quality.values ).float()

    # Labels need to be a column (N x 1) of floats to match the single-logit
    # output that BCEWithLogitsLoss compares them against
    labels = labels[:,None]

    # Split with scikitlearn
    train_data,test_data,train_labels,test_labels = train_test_split(
        data_t,labels,test_size=test_size,random_state=random_state)

    # Convert into PyTorch datasets
    train_data = TensorDataset(train_data,train_labels)
    test_data  = TensorDataset(test_data,test_labels)

    # Convert into DataLoader objects (test set served as one batch)
    train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True,drop_last=True)
    test_loader  = DataLoader(test_data,batch_size=len(test_data))

    return train_loader,test_loader


In [None]:
# %% Test data function

# Fetch the red-wine table from the UCI repository (semicolon-separated CSV)
url  = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
data = pd.read_csv(url,delimiter=';')

# Smoke test of the loader factory: pass the raw table and a quality threshold
# (6.5 is the highest of the splits explored in the experiment further down)
train_loader,test_loader = gen_data(data,split_threshold=6.5)


In [None]:
# %% Model class

# The activation function is selected by name through the constructor arg
class ANN_wine(nn.Module):
    """
    Fully connected binary classifier for the wine data.

    Architecture: 11 inputs -> 32 -> 32 -> 32 -> 1 output. The output is a raw
    logit (no sigmoid is applied here), which is what nn.BCEWithLogitsLoss
    expects.

    Parameters
    ----------
    act_fun : str
        Name of an activation class in torch.nn that can be constructed
        without arguments (e.g. 'ReLU', 'LeakyReLU'). An unknown name raises
        AttributeError at construction time.

    Attributes
    ----------
    act_fun : str
        The activation name, as passed in.
    activation : nn.Module
        The activation module, shared by the three hidden stages.
    """
    def __init__(self,act_fun):
        super().__init__()

        # Layers
        self.input  = nn.Linear(11,32)
        self.hid1   = nn.Linear(32,32)
        self.hid2   = nn.Linear(32,32)
        self.output = nn.Linear(32,1)

        # Activation function: keep the name, and build the module once here
        # rather than at every forward pass, so that a wrong name fails early
        # and learnable / mode-dependent activations are registered with the
        # model (parameters seen by the optimizer, train/eval respected)
        self.act_fun    = act_fun
        self.activation = getattr(nn,act_fun)()

    # Forward propagation
    def forward(self,x):

        x = self.activation(self.input(x))
        x = self.activation(self.hid1(x))
        x = self.activation(self.hid2(x))
        x = self.output(x)

        return x


In [None]:
# %% Function to train the model

# Parameters
# NOTE: the challenge statement at the top of this file asks for 500 epochs,
# while the experiment is run with the value set here
num_epochs = 1000

def train_model(model=None,train_dl=None,test_dl=None,epochs=None,lr=0.001,weight_decay=0.0):
    """
    Train a binary classifier with Adam and BCEWithLogitsLoss.

    Every argument is optional: when left to None, the function falls back on
    the notebook-level objects (ANN, train_loader, test_loader, num_epochs),
    so a bare train_model() call behaves as it always did.

    Parameters
    ----------
    model : nn.Module or None
        Network to train (in place). Defaults to the global ANN.
    train_dl : DataLoader or None
        Training batches. Defaults to the global train_loader.
    test_dl : DataLoader or None
        Test data; only its first batch is evaluated at each epoch. Defaults
        to the global test_loader.
    epochs : int or None
        Number of training epochs. Defaults to the global num_epochs.
    lr : float
        Adam learning rate (default 0.001).
    weight_decay : float
        Adam weight decay, i.e. L2 penalty (default 0.0 = none).

    Returns
    -------
    train_acc : list of float
        Per-epoch mean of the batch training accuracies (%).
    test_acc : list of float
        Per-epoch accuracy (%) on the first batch of test_dl.
    losses : list of float
        Per-epoch mean of the batch training losses.
    model : nn.Module
        The trained model (same object that was passed in).
    """

    # Fall back on notebook globals for anything not passed explicitly
    if model is None:
        model = ANN
    if train_dl is None:
        train_dl = train_loader
    if test_dl is None:
        test_dl = test_loader
    if epochs is None:
        epochs = num_epochs

    # Loss function and optimizer
    loss_fun  = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(),lr=lr,weight_decay=weight_decay)

    # Initialise accuracies and losses
    train_acc = []
    test_acc  = []
    losses    = []

    # Loop over epochs
    for epoch_i in range(epochs):

        # Switch training mode on
        model.train()

        batch_acc  = []
        batch_loss = []

        for X,y in train_dl:

            # Forward propagation and loss
            yHat = model(X)
            loss = loss_fun(yHat,y)

            # Only now do backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Batch training accuracy (a logit > 0 means predicted class 1)
            batch_acc.append( 100*torch.mean(((yHat>0) == y).float()).item() )
            batch_loss.append(loss.item())

        # Average accuracy and loss over batches
        train_acc.append(np.mean(batch_acc))
        losses.append(np.mean(batch_loss))

        # Test accuracy (evaluation mode, no gradient tracking)
        model.eval()
        X,y = next(iter(test_dl))
        with torch.no_grad():
            yHat = model(X)
        test_acc.append( 100*torch.mean(((yHat>0) == y).float()).item() )

    # Function output
    return train_acc,test_acc,losses,model


In [None]:
# %% Parametric experiment on data split thresholds

# Data and parameters
url  = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
data = pd.read_csv(url,delimiter=';')

thresholds   = [4.5,5.5,6.5]
n_thresholds = len(thresholds)

# Preallocate per-epoch curves (one row per threshold)
train_acc  = np.zeros((n_thresholds,num_epochs))
test_acc   = np.zeros((n_thresholds,num_epochs))
losses     = np.zeros((n_thresholds,num_epochs))

# Preallocate per-class test accuracies and test-sample counts
test_acc_0 = np.zeros(n_thresholds)
test_acc_1 = np.zeros(n_thresholds)
n_samples  = np.zeros((n_thresholds,2))

# Lookup so that the class label (0 or 1) selects the array to write into
class_acc = (test_acc_0,test_acc_1)

# Run the experiment! (it takes ~5 mins)
for i,threshold in enumerate(thresholds):

    # Fresh loaders for this binarisation threshold
    train_loader,test_loader = gen_data(data,threshold)

    # Fresh model, trained through the notebook-level names used by train_model
    ANN = ANN_wine('LeakyReLU')
    train_acc[i,:],test_acc[i,:],losses[i,:],ANN = train_model()

    # Predictions on the test batch with the trained model
    ANN.eval()
    X_test, y_test = next(iter(test_loader))
    with torch.no_grad():
        yHat  = ANN(X_test)
        preds = (yHat > 0).float().flatten()

    # Flatten labels so they line up with the flattened predictions
    y_true = y_test.flatten()

    # Accuracy within each true class (0 and 1)
    for label in (0,1):
        idx = y_true == label
        n   = idx.sum().item()
        n_samples[i,label] = n

        # Nothing to score if the class is absent from the test set
        if n == 0:
            print(f'No samples for class {label}')
            continue

        acc = 100 * torch.mean((preds[idx] == y_true[idx]).float()).item()
        class_acc[label][i] = acc
        print(f'Accuracy for class {label} and threshold {threshold}: {acc:.2f}% ({n} samples)')


In [None]:
# %% Functions for 1D smoothing filter

# Improved for edge effects - adaptive window
def smooth_adaptive(x,k):
    """
    Moving-average filter whose window shrinks at the edges.

    Output sample i is the mean of the input samples lying within k//2
    positions of i, clipped to the array bounds. The full window therefore
    spans 2*(k//2)+1 samples (k for odd k, k+1 for even k) and gets shorter
    near both ends instead of padding.

    Parameters
    ----------
    x : array-like
        Signal to smooth.
    k : int
        Nominal window length.

    Returns
    -------
    numpy.ndarray
        Smoothed signal, same shape as x. The dtype is that of x when x is
        already floating, float64 otherwise, so that integer inputs are not
        truncated when the means are stored.
    """
    x         = np.asarray(x)
    out_dtype = x.dtype if np.issubdtype(x.dtype,np.inexact) else np.float64
    smoothed  = np.zeros_like(x,dtype=out_dtype)
    half_k    = k // 2
    n         = len(x)

    for i in range(n):
        start       = max(0, i-half_k)
        end         = min(n, i+half_k + 1)
        smoothed[i] = np.mean(x[start:end])

    return smoothed


In [None]:
# %% Plotting

# Golden-ratio figure, one row per threshold and one column per metric
phi = (1 + np.sqrt(5)) / 2
fig,ax = plt.subplots(3,3,figsize=(7*phi,7))

for i,threshold in enumerate(thresholds):

    # Handles for the three panels of this row
    ax_loss,ax_acc,ax_class = ax[i,0],ax[i,1],ax[i,2]

    # Column 0 (smoothed training loss)
    ax_loss.plot(smooth_adaptive(losses[i,:],25),color='firebrick')
    ax_loss.set(title=f'Loss (data threshold={threshold})',
                xlabel='Epoch',
                ylabel='Loss')

    # Column 1 (smoothed train and test accuracy)
    ax_acc.plot(smooth_adaptive(train_acc[i,:],25),label='Train',color='steelblue')
    ax_acc.plot(smooth_adaptive(test_acc[i,:],25),label='Test',color='darkorange')
    ax_acc.set(title=f'Accuracy (data threshold={threshold})',
               xlabel='Epoch',
               ylabel='Accuracy (%)',
               ylim=[45,105])
    ax_acc.legend()

    # Column 2 (final test accuracy split by true class)
    accs   = [test_acc_0[i],test_acc_1[i]]
    labels = ['Low qual. wines','High qual. wines']
    counts = n_samples[i,:]

    bars = ax_class.bar(labels,accs)

    # Annotate each bar with the number of test samples in that class
    for bar, count in zip(bars, counts):
        x_mid = bar.get_x() + bar.get_width()/2
        y_top = bar.get_height() + 1
        ax_class.text(x_mid,y_top,f'n={int(count)}',ha='center',va='bottom')

    ax_class.set(ylim=[0,115],
                 ylabel='Accuracy (%)',
                 title=f'Class-wise acc. (data threshold={threshold})')

plt.tight_layout()

# Save to disk before showing, then hand the file to the Colab download helper
plt.savefig('figure8_code_challenge_19.png')

plt.show()

files.download('figure8_code_challenge_19.png')


In [None]:
# %% Exercise 1
#    L2 regularization is supposed to help minimize over-training. Try adding an L2 regularizer and see if that
#    helps reduce the bias due to unbalanced N.

# Easily done by adding weight_decay=0.01 to the optimizer line. It doesn't seem
# to change a lot, if anything the first example, with only N=5 samples in one
# of the categories, shows nicely how the overall accuracy on test data (~97%)
# can be quite misleading! A bit like the cats and boats example.


In [None]:
# %% Exercise 2
#    Does the unbalanced design get better (that is, less unbalanced) if the train/test split is 80/20 instead of 90/10?
#    Try it and find out! Note that you don't need to train models for this question; you simply need to modify the data
#    splitting function and then compute the proportions of the two categories.

# Trying this without L2 regularisation for comparability with the results from
# original experiment. And no, changing to 20% test data doesn't seem to improve
# or change anything that much; most likely the unbalance is still too strong
# for the lower and upper splits.
