In [None]:
# %% Deep learning - Section 15.147
#    Code challenge 22: Xavier vs. Kaiming
#
#    1) Start from code from video 13.131 (wine dataset)
#    2) Model wine quality with both Xavier and Kaiming inits (keep default
#       bias init)
#    3) Run both models 10 times, and run a t-test to compare (average from last
#       five epochs)
#    4) Plot losses, train, and test performance for one run
#    5) Plot losses, train, and test performance distributions from 10 runs

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [2]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F
import pandas              as pd
import scipy.stats         as stats
import sklearn.metrics     as skm
import time

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')
plt.style.use('default')


In [None]:
# %% Load and prepare data

# Load the red-wine quality table (semicolon-separated CSV)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
data = pd.read_csv(url,sep=';')

# Remove some outliers (see lec. 82 for why). The explicit copy makes `data` an
# independent frame, so the column assignments below do not write into a
# filtered slice of the raw table (which triggers SettingWithCopyWarning)
data = data[data['total sulfur dioxide']<200].copy()

# Z-score all the variables but quality (population std, ddof=0, as np.std uses)
cols2zscore = data.keys().drop('quality')
data[cols2zscore] = (data[cols2zscore] - data[cols2zscore].mean()) / data[cols2zscore].std(ddof=0)

# Binarise quality: 1 when quality > 5, 0 otherwise
data['boolean_quality'] = (data['quality']>5).astype(int)

# Convert from pandas dataframe to PyTorch tensor
data_t = torch.tensor( data[cols2zscore].values ).float()
labels = torch.tensor( data['boolean_quality'].values ).float()

print(f'Data shape: {data_t.shape}')
print(f'Labels shape: {labels.shape}')

# Labels need to be multidimensional for PyTorch (i.e. an N x 1 matrix), not a
# 1D array; they are kept as floats because the model is trained with
# BCEWithLogitsLoss
labels = labels[:,None]
print(f'Proper labels shape: {labels.shape}')


In [4]:
# %% Split into train and test data

# Hold out 10% of the samples for testing (scikit-learn split)
X_train,X_test,train_labels,test_labels = train_test_split(data_t,labels,test_size=0.1)

# Pair features with their labels as PyTorch datasets
train_data = TensorDataset(X_train,train_labels)
test_data  = TensorDataset(X_test,test_labels)

# Mini-batches for training; the whole test set is served as one single batch
batch_size   = 32
n_test       = test_data.tensors[0].shape[0]
train_loader = DataLoader(train_data,shuffle=True,drop_last=True,batch_size=batch_size)
test_loader  = DataLoader(test_data,batch_size=n_test)


In [12]:
# %% Model class

class ANN(nn.Module):
    """Fully connected network: 11 inputs -> 16 -> 32 -> 32 -> 1 raw output."""

    def __init__(self):
        super().__init__()

        # Layers, created in input-to-output order
        self.input  = nn.Linear(in_features=11,out_features=16)
        self.hid1   = nn.Linear(in_features=16,out_features=32)
        self.hid2   = nn.Linear(in_features=32,out_features=32)
        self.output = nn.Linear(in_features=32,out_features=1)

    def forward(self,x):
        """Run x through the three ReLU layers and return the raw output."""

        # ReLU follows every layer except the last one
        for layer in (self.input,self.hid1,self.hid2):
            x = F.relu(layer(x))

        # No activation on the output layer
        return self.output(x)


In [69]:
# %% Function to train the model

# Parameters
num_epochs = 600

def train_model(model,lr=0.01,weight_decay=0.0,epochs=None,loaders=None):
    """Train `model` with mini-batch SGD and BCEWithLogitsLoss.

    Parameters
    ----------
    model : nn.Module
        Network returning one raw output per sample; an output > 0 is counted
        as a prediction of class 1.
    lr : float
        SGD learning rate.
    weight_decay : float
        L2 penalty passed to the SGD optimizer (0 disables it).
    epochs : int or None
        Number of training epochs; None falls back to the module-level
        `num_epochs`.
    loaders : (train_loader, test_loader) or None
        Data loaders to use; None falls back to the module-level
        `train_loader` and `test_loader`.

    Returns
    -------
    train_acc : list
        Per-epoch training accuracy (%), averaged over the mini-batches.
    test_acc : list
        Per-epoch accuracy (%) on the first batch of the test loader.
    losses : list
        Per-epoch training loss, averaged over the mini-batches.
    """

    # Resolve the defaults at call time so later changes to the globals are seen
    if epochs is None:
        epochs = num_epochs
    if loaders is None:
        loaders = (train_loader,test_loader)
    train_dl,test_dl = loaders

    # Loss function and optimizer
    loss_fun  = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.SGD(model.parameters(),lr=lr,weight_decay=weight_decay)

    # Initialise losses
    train_acc = []
    test_acc  = []
    losses    = []

    # Make sure the first epoch runs in training mode, whatever state the
    # caller left the model in
    model.train()

    # Loop over epochs
    for epoch_i in range(epochs):

        batch_acc  = []
        batch_loss = []

        for X,y in train_dl:

            # Forward propagation and loss
            yHat = model(X)
            loss = loss_fun(yHat,y)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Batch training accuracy (raw output > 0 means predicted class 1)
            batch_acc.append( 100*torch.mean(((yHat>0) == y).float()).item() )
            batch_loss.append(loss.item())

        # Average accuracy from batch
        train_acc.append(np.mean(batch_acc))
        losses.append(np.mean(batch_loss))

        # Test accuracy
        model.eval()

        X,y = next(iter(test_dl))
        with torch.no_grad():
            yHat = model(X)
        test_acc.append( 100*torch.mean(((yHat>0) == y).float()).item() )

        # Back to training mode for the next epoch
        model.train()

    # Function output
    return train_acc,test_acc,losses


In [18]:
# %% Functions for 1D smoothing filter

# Improved for edge effects - adaptive window
def smooth_adaptive(x,k):
    """Moving-average filter whose window shrinks at the edges.

    Each output sample is the mean of the input samples lying within k//2
    positions on either side of it; near the boundaries the window is clipped
    to the available samples instead of being padded.

    Parameters
    ----------
    x : array-like
        Signal to smooth (list or array).
    k : int
        Nominal window size; k//2 samples are taken on each side.

    Returns
    -------
    np.ndarray
        Float array with the same length as x.
    """
    # Always work in floats: with integer input, an integer output buffer
    # would silently truncate the window means
    x        = np.asarray(x,dtype=float)
    smoothed = np.empty_like(x)
    half_k   = k // 2

    for i in range(len(x)):
        # Clip the window to the valid index range
        start       = max(0, i-half_k)
        end         = min(len(x), i+half_k + 1)
        smoothed[i] = np.mean(x[start:end])

    return smoothed


In [70]:
# %% Fit the model once to test

# Xavier: new network, then re-draw every weight matrix (biases keep the
# default init)
ANN_Xavier = ANN()
for name,param in ANN_Xavier.named_parameters():
    if 'weight' not in name:
        continue
    nn.init.xavier_normal_(param.data)

# Train and keep the per-epoch histories
train_acc_xavier,test_acc_xavier,losses_xavier = train_model(ANN_Xavier)

# Kaiming: same recipe with the ReLU-specific uniform initialisation
ANN_Kaiming = ANN()
for name,param in ANN_Kaiming.named_parameters():
    if 'weight' not in name:
        continue
    nn.init.kaiming_uniform_(param.data,nonlinearity='relu')

# Train and keep the per-epoch histories
train_acc_kaiming,test_acc_kaiming,losses_kaiming = train_model(ANN_Kaiming)


In [None]:
# %% Plotting

phi = (1 + np.sqrt(5)) / 2
fig,ax = plt.subplots(1,3,figsize=(2*phi*6,6))

# Window size of the smoothing filter applied to every curve
smooth_k = 20

# (title, y-label, Xavier history, Kaiming history) for each panel, left to right
panels = [
    ('Loss', 'Loss',        losses_xavier,   losses_kaiming),
    ('Train','Accuracy (%)',train_acc_xavier,train_acc_kaiming),
    ('Test', 'Accuracy (%)',test_acc_xavier, test_acc_kaiming),
]

for axis,(title,ylabel,xavier_hist,kaiming_hist) in zip(ax,panels):
    axis.plot(smooth_adaptive(xavier_hist,smooth_k),label='Xavier')
    axis.plot(smooth_adaptive(kaiming_hist,smooth_k),label='Kaiming')
    axis.set_title(title)
    axis.set_xlabel('Epochs')
    axis.set_ylabel(ylabel)
    axis.legend()
    # Explicit boolean: the string 'on' only worked because it is truthy
    axis.grid(True)

plt.savefig('figure30_code_challenge_22.png')
plt.show()
files.download('figure30_code_challenge_22.png')


In [72]:
# %% Parametric experiment on weight initialisations strategies

# Parameters and preallocation (one entry per repetition)
reps = 10

train_acc_xavier,train_acc_kaiming = np.zeros(reps),np.zeros(reps)
test_acc_xavier,test_acc_kaiming   = np.zeros(reps),np.zeros(reps)
losses_xavier,losses_kaiming       = np.zeros(reps),np.zeros(reps)

def _last5_mean(history):
    """Mean of the final five entries of `history`, as a Python float."""
    return torch.tensor(history[-5:]).mean().item()

# Run experiment (takes ~13 mins)
for rep in range(reps):

    # Xavier: new network with re-drawn weight matrices
    ANN_Xavier = ANN()
    for name,param in ANN_Xavier.named_parameters():
        if 'weight' not in name:
            continue
        nn.init.xavier_normal_(param.data)

    train_acc,test_acc,losses = train_model(ANN_Xavier)
    train_acc_xavier[rep] = _last5_mean(train_acc)
    test_acc_xavier[rep]  = _last5_mean(test_acc)
    losses_xavier[rep]    = _last5_mean(losses)

    # Kaiming: new network with re-drawn weight matrices
    ANN_Kaiming = ANN()
    for name,param in ANN_Kaiming.named_parameters():
        if 'weight' not in name:
            continue
        nn.init.kaiming_uniform_(param.data,nonlinearity='relu')

    train_acc,test_acc,losses = train_model(ANN_Kaiming)
    train_acc_kaiming[rep] = _last5_mean(train_acc)
    test_acc_kaiming[rep]  = _last5_mean(test_acc)
    losses_kaiming[rep]    = _last5_mean(losses)


In [None]:
# %% Run t-tests

# Independent-samples t-tests, Xavier vs. Kaiming, one per metric
t_losses,p_losses       = stats.ttest_ind(losses_xavier,losses_kaiming)
t_train_acc,p_train_acc = stats.ttest_ind(train_acc_xavier,train_acc_kaiming)
t_test_acc,p_test_acc   = stats.ttest_ind(test_acc_xavier,test_acc_kaiming)

# plot
phi    = (1 + np.sqrt(5)) / 2
fig,ax = plt.subplots(1,3,figsize=(2*phi*6,6))

# Horizontal positions: every Xavier run at x=0, every Kaiming run at x=1
x_xavier  = np.zeros(reps)
x_kaiming = np.ones(reps)

# (Xavier values, Kaiming values, panel title) from left to right
panels = [
    (losses_xavier,   losses_kaiming,   f'Loss (t={t_losses:.2f}, p={p_losses:.3f})'),
    (train_acc_xavier,train_acc_kaiming,f'Train acc. (t={t_train_acc:.2f}, p={p_train_acc:.3f})'),
    (test_acc_xavier, test_acc_kaiming, f'Test acc. (t={t_test_acc:.2f}, p={p_test_acc:.3f})'),
]

for axis,(xavier_vals,kaiming_vals,title) in zip(ax,panels):

    axis.plot(x_xavier,xavier_vals,'bo',alpha=0.9)
    axis.plot(x_kaiming,kaiming_vals,'ro',alpha=0.9)
    axis.set_title(title)

    # Leave some room around the two columns and name them
    axis.set_xlim([-1,2])
    axis.set_xticks([0,1])
    axis.set_xticklabels(['Xavier','Kaiming'])

plt.savefig('figure31_code_challenge_22.png')
plt.show()
files.download('figure31_code_challenge_22.png')


In [None]:
# %% Exercise 1
#    Adam usually works better than SGD with fewer training epochs. Does Adam also equalize the differences attributable
#    to weight initialization?

# Adam does appear to equalise the differences: training accuracy hits a
# ceiling for both init methods, and test accuracy is not significantly
# different either


In [44]:
# %% Exercise 2
#    The discrepancy between training and test performance suggests that Kaiming initialization involved some overfitting.
#    What are some strategies you could employ to reduce overfitting?

# Using SGD again for comparability. Many options are available, such as dropout
# regularisation, or L1/L2 regularisation. Here, as an example, we can use L2
# with a decay of 0.01

# L2 seems to work a bit, as far as we can see from just one run

# %% Function to train the model
num_epochs = 600

def train_model(model,lr=0.01,weight_decay=0.01,epochs=None,loaders=None):
    """Train `model` with mini-batch SGD plus L2 regularisation.

    Parameters
    ----------
    model : nn.Module
        Network returning one raw output per sample; an output > 0 is counted
        as a prediction of class 1.
    lr : float
        SGD learning rate.
    weight_decay : float
        L2 penalty passed to the SGD optimizer (defaults to 0.01 here).
    epochs : int or None
        Number of training epochs; None falls back to the module-level
        `num_epochs`.
    loaders : (train_loader, test_loader) or None
        Data loaders to use; None falls back to the module-level
        `train_loader` and `test_loader`.

    Returns
    -------
    train_acc : list
        Per-epoch training accuracy (%), averaged over the mini-batches.
    test_acc : list
        Per-epoch accuracy (%) on the first batch of the test loader.
    losses : list
        Per-epoch training loss, averaged over the mini-batches.
    """

    # Resolve the defaults at call time so later changes to the globals are seen
    if epochs is None:
        epochs = num_epochs
    if loaders is None:
        loaders = (train_loader,test_loader)
    train_dl,test_dl = loaders

    loss_fun  = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.SGD(model.parameters(),lr=lr,weight_decay=weight_decay)

    train_acc = []
    test_acc  = []
    losses    = []

    # Make sure the first epoch runs in training mode, whatever state the
    # caller left the model in
    model.train()

    for epoch_i in range(epochs):

        batch_acc  = []
        batch_loss = []

        for X,y in train_dl:

            yHat = model(X)
            loss = loss_fun(yHat,y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Raw output > 0 means predicted class 1
            batch_acc.append( 100*torch.mean(((yHat>0) == y).float()).item() )
            batch_loss.append(loss.item())

        train_acc.append(np.mean(batch_acc))
        losses.append(np.mean(batch_loss))

        # Evaluate on the test batch without tracking gradients
        model.eval()

        X,y = next(iter(test_dl))
        with torch.no_grad():
            yHat = model(X)
        test_acc.append( 100*torch.mean(((yHat>0) == y).float()).item() )

        # Back to training mode for the next epoch
        model.train()

    return train_acc,test_acc,losses


In [68]:
# %% Exercise 3
#    The difference between X and K initialization is likely to increase with more weights. Change the number of units in
#    the hidden layers from 32 to 64.

# Also using SGD again for comparability. Indeed, the difference increases for
# the training sets, but on the test data there is no significant difference.

# %% Model class
class ANN(nn.Module):
    """Fully connected network: 11 inputs -> 64 -> 64 -> 64 -> 1 raw output."""

    def __init__(self):
        super().__init__()

        # Layers, created in input-to-output order
        self.input  = nn.Linear(in_features=11,out_features=64)
        self.hid1   = nn.Linear(in_features=64,out_features=64)
        self.hid2   = nn.Linear(in_features=64,out_features=64)
        self.output = nn.Linear(in_features=64,out_features=1)

    def forward(self,x):
        """Run x through the three ReLU layers and return the raw output."""

        # ReLU follows every layer except the last one
        for layer in (self.input,self.hid1,self.hid2):
            x = F.relu(layer(x))

        # No activation on the output layer
        return self.output(x)

