In [None]:
# %% Deep learning - Section 9.75
#    L2 regularisation in practice

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Import Iris dataset

# Load the Iris table through seaborn
iris = sns.load_dataset('iris')

# The first four columns are used as model inputs; convert them to a float32 tensor
feature_columns = iris.columns[0:4]
data = torch.tensor(iris[feature_columns].values).float()

# Encode each species name as an integer class label
species_codes = {'setosa':0, 'versicolor':1, 'virginica':2}
labels = torch.zeros(len(data),dtype=torch.long)
for species_name,species_code in species_codes.items():
    labels[iris.species==species_name] = species_code


In [None]:
# %% Split into train and test data

# Hold out 20% of the samples for testing (scikit-learn split)
train_data,test_data,train_labels,test_labels = train_test_split(
    data,
    labels,
    test_size=0.2)

# Pair features with labels as PyTorch datasets
train_data = TensorDataset(train_data,train_labels)
test_data  = TensorDataset(test_data,test_labels)

# Wrap the datasets in DataLoader objects:
#  - training batches are shuffled and an incomplete last batch is dropped
#  - the test loader serves the whole test set as a single batch
batch_size   = 64
n_test       = len(test_data)
train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True,drop_last=True)
test_loader  = DataLoader(test_data,batch_size=n_test)


In [None]:
# %% Function to generate the model

def gen_model(L2_lambda):
    """Create a fresh classifier together with its loss function and optimizer.

    L2_lambda is handed to the SGD optimizer as `weight_decay`.
    Returns the tuple (ANN, loss_fun, optimizer).
    """

    # Architecture: 4 inputs -> 64 -> 64 -> 3 outputs, ReLU after each non-final linear layer
    layers = [
        nn.Linear(4,64),
        nn.ReLU(),
        nn.Linear(64,64),
        nn.ReLU(),
        nn.Linear(64,3),
    ]
    ANN = nn.Sequential(*layers)

    # Optimizer (weight regularisation enters here, through weight_decay)
    optimizer = torch.optim.SGD(ANN.parameters(),lr=0.005,weight_decay=L2_lambda)

    # Loss function
    loss_fun = nn.CrossEntropyLoss()

    return ANN,loss_fun,optimizer


In [None]:
# %% Function to train the model

num_epochs = 1000

def train_model(model=None,loss_fn=None,opt=None,train_dl=None,test_dl=None,n_epochs=None):
    """Train a classifier and record per-epoch loss and accuracy.

    Every argument is optional. An argument left as None falls back to the
    corresponding notebook-level variable (ANN, loss_fun, optimizer,
    train_loader, test_loader, num_epochs), looked up at call time, so a bare
    `train_model()` call keeps working exactly as before.

    Parameters
    ----------
    model    : network to train (default: global ANN)
    loss_fn  : loss function called as loss_fn(predictions, labels) (default: global loss_fun)
    opt      : optimizer stepping the model parameters (default: global optimizer)
    train_dl : iterable of (X, y) training batches (default: global train_loader)
    test_dl  : iterable of (X, y) test batches; only the first batch is used (default: global test_loader)
    n_epochs : number of passes over train_dl (default: global num_epochs)

    Returns
    -------
    train_acc : list with the mean training-batch accuracy (%) of each epoch
    test_acc  : list with the accuracy (%) on the first test batch after each epoch
    losses    : list with the mean training-batch loss of each epoch
    """

    # Fall back to the notebook globals for anything not passed explicitly
    if model is None:
        model = ANN
    if loss_fn is None:
        loss_fn = loss_fun
    if opt is None:
        opt = optimizer
    if train_dl is None:
        train_dl = train_loader
    if test_dl is None:
        test_dl = test_loader
    if n_epochs is None:
        n_epochs = num_epochs

    # Initialise empty accuracies
    train_acc = []
    test_acc  = []
    losses    = []

    # Make sure training starts in train mode, whatever state the model arrived in
    model.train()

    # Loop over epochs
    for epoch_i in range(n_epochs):

        # Loop over training data batches
        batch_acc  = []
        batch_loss = []
        for X,y in train_dl:

            # Forward propagation and loss
            yHat = model(X)
            loss = loss_fn(yHat,y)

            # Backpropagation
            opt.zero_grad()
            loss.backward()
            opt.step()

            # Batch accuracy
            batch_acc.append( 100*torch.mean((torch.argmax(yHat,dim=1)==y).float()).item() )
            batch_loss.append( loss.item() )

        # Average training accuracy from batches
        train_acc.append( np.mean(batch_acc) )
        losses.append( np.mean(batch_loss) )

        # Test accuracy (eval mode; no_grad avoids building an autograd graph that is never used)
        model.eval()
        X,y = next(iter(test_dl))
        with torch.no_grad():
            pred_labels = torch.argmax(model(X),dim=1)
        test_acc.append( 100*torch.mean((pred_labels==y).float()).item() )

        # Reset to train mode (not needed for weight regularisation alone, but required
        # as soon as the model contains mode-dependent layers such as nn.Dropout)
        model.train()

    # Output
    return train_acc,test_acc,losses


In [None]:
# %% Test the model

# Single training run at one fixed L2 strength. train_model() is called without
# arguments: it reads ANN, loss_fun and optimizer from the notebook's global
# scope, so they have to be assigned (here, by gen_model) before the call.
L2_lambda = 0.01
ANN,loss_fun,optimizer    = gen_model(L2_lambda)
train_acc,test_acc,losses = train_model()


In [None]:
# %% Plotting

fig,ax = plt.subplots(1,2,figsize=(15,5))

# Titles use raw strings so the LaTeX "\lambda" is written literally instead of
# relying on Python passing through the unrecognised "\l" escape sequence
# (which triggers a SyntaxWarning on recent Python versions)

# Left panel: training loss per epoch
ax[0].plot(losses,'^-')
ax[0].set_ylabel('Loss')
ax[0].set_xlabel('Epochs')
ax[0].set_title(r'Losses with L2 $\lambda$=' + str(L2_lambda))

# Right panel: train and test accuracy per epoch
ax[1].plot(train_acc,'o-')
ax[1].plot(test_acc,'s-')
ax[1].set_title(r'Accuracy with L2 $\lambda$=' + str(L2_lambda))
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Accuracy (%)')
ax[1].legend(['Train','Test'])

# Save before show(), then offer the file for download (Colab helper)
plt.savefig('figure20_weight_regularisation.png')

plt.show()

files.download('figure20_weight_regularisation.png')


In [None]:
# %% Functions for 1D smoothing filter

# Improved for edge effects - adaptive window
def smooth_adaptive(x,k):
    """Smooth a sequence with a centred moving average whose window shrinks at the edges.

    The window for sample i covers indices i-k//2 ... i+k//2 (inclusive),
    clipped to the valid index range, so edge samples are averaged over fewer
    points rather than over padded values.

    Parameters
    ----------
    x : sequence or array of numbers
    k : non-negative int, nominal window length. Only k//2 is used, so an even k
        gives the same window as k+1.

    Returns
    -------
    numpy array with the shape of x. The dtype is floating point: the input's
    own dtype if it is already inexact, otherwise float (so integer input is
    no longer truncated).

    Raises
    ------
    ValueError if k is negative (every window would be empty).
    """
    if k < 0:
        raise ValueError('k must be non-negative, got ' + str(k))

    # Convert once, so list input is not re-converted on every window
    x = np.asarray(x)

    # Integer input must not dictate the output dtype, or the means get truncated
    out_dtype = x.dtype if np.issubdtype(x.dtype,np.inexact) else float
    smoothed  = np.zeros_like(x,dtype=out_dtype)
    half_k    = k // 2
    n         = len(x)

    for i in range(n):
        start       = max(0, i-half_k)
        end         = min(n, i+half_k + 1)
        smoothed[i] = np.mean(x[start:end])

    return smoothed


In [None]:
# %% Parametric experiment

# Regularisation strengths to compare, plus storage for the accuracy curves
# (one row per epoch, one column per strength)
l2_lambdas        = np.linspace(0,.1,10)
results_shape     = (num_epochs,len(l2_lambdas))
acc_results_train = np.zeros(results_shape)
acc_results_test  = np.zeros(results_shape)

# Loop over L2 regularisation strengths
for lambda_i,l2_lambda in enumerate(l2_lambdas):

    # Fresh model for every strength, then train it
    ANN,loss_fun,optimizer    = gen_model(l2_lambda)
    train_acc,test_acc,losses = train_model()

    # Keep the smoothed accuracy curves
    acc_results_train[:,lambda_i] = smooth_adaptive(train_acc,10)
    acc_results_test[:,lambda_i]  = smooth_adaptive(test_acc,10)


In [None]:
# %% Plotting

fig,ax = plt.subplots(1,2,figsize=(17,7))

# One colour per regularisation strength, sampled from the plasma colormap
cmaps = plt.cm.plasma(np.linspace(.1,.9,len(l2_lambdas)))
for col,line_color in enumerate(cmaps):
    ax[0].plot(acc_results_train[:,col],color=line_color)
    ax[1].plot(acc_results_test[:,col],color=line_color)

ax[0].set_title('Train accuracy')
ax[1].set_title('Test accuracy')

# Legend entries: the lambda values rounded to two decimals
leglabels = [np.round(lam,2) for lam in l2_lambdas]

# Formatting shared by both panels
for panel in ax:
    panel.legend(leglabels)
    panel.set_xlabel('Epoch')
    panel.set_ylabel('Accuracy (%)')
    panel.set_ylim([50,101])
    panel.grid()

# Save before show(), then offer the file for download (Colab helper)
plt.savefig('figure21_weight_regularisation.png')

plt.show()

files.download('figure21_weight_regularisation.png')


In [None]:
# %% Show average accuracy by L2 rates

# Pick only a range of epochs, defined relative to the number of epochs that
# were actually recorded. A fixed [1000,2000] window selects nothing when
# num_epochs is 1000, which turns every average into NaN. The 40%-80% span
# used here is exactly [1000,2000] for a 2500-epoch run.
n_recorded  = acc_results_train.shape[0]
first_epoch = (2*n_recorded)//5
last_epoch  = max((4*n_recorded)//5,first_epoch+1)  # keep at least one epoch in the window
epoch_range = [first_epoch,last_epoch]

# Average each lambda's (smoothed) accuracy over the selected epochs
plt.plot(l2_lambdas,
         np.mean(acc_results_train[epoch_range[0]:epoch_range[1],:],axis=0),
         'bo-',label='TRAIN')

plt.plot(l2_lambdas,
         np.mean(acc_results_test[epoch_range[0]:epoch_range[1],:],axis=0),
         'rs-',label='TEST')

plt.xlabel('L2 regularization amount')
plt.ylabel('Accuracy')
plt.title('Average accuracy by L2 regularization amount')
plt.legend()

# Save before show(), then offer the file for download (Colab helper)
plt.savefig('figure22_weight_regularisation.png')

plt.show()

files.download('figure22_weight_regularisation.png')


In [None]:
# %% Exercise 1
#    In general, regularization tends to benefit large, complex models, and has less impact (and sometimes even a negative
#    impact) on smaller or simpler model architectures. Modify the model architecture to have three hidden layers, and
#    see whether that changes the effect of L2 regularization on performance. (You might want to increase the number of
#    epochs.)

# Indeed, a deeper model shows more dramatic differences between regularisation amounts. Notably, as the regularisation
# increases, the accuracy decreases (or takes longer to increase) for both training and test data.

# Modified function (num_epochs increased to 2500)
def gen_model(L2_lambda):
    """Create the deeper classifier together with its loss function and optimizer.

    L2_lambda is handed to the SGD optimizer as `weight_decay`.
    Returns the tuple (ANN, loss_fun, optimizer).
    """

    # Architecture: 4 -> 64, then three 64 -> 64 layers (each followed by ReLU), then 64 -> 3
    layers = [nn.Linear(4,64),nn.ReLU()]
    for _ in range(3):
        layers += [nn.Linear(64,64),nn.ReLU()]
    layers.append(nn.Linear(64,3))
    ANN = nn.Sequential(*layers)

    # Optimizer (weight regularisation enters here, through weight_decay)
    optimizer = torch.optim.SGD(ANN.parameters(),lr=0.005,weight_decay=L2_lambda)

    # Loss function
    loss_fun = nn.CrossEntropyLoss()

    return ANN,loss_fun,optimizer


In [None]:
# %% Exercise 2
#    Multiple regularization methods can be combined. Add 15% dropout to the hidden layer(s) and see how that affects
#    the model's performance.

# A combination of weight (L2) and dropout (15%) regularisation does improve the performance of the model in the
# sense that, although it requires more training and the training accuracy is reduced, the testing accuracy is boosted.

# Modified function (simpler model, but num_epochs still increased to 2500)
def gen_model(L2_lambda,dropout_rate=0.0):
    """Create a classifier with dropout, together with its loss function and optimizer.

    Parameters
    ----------
    L2_lambda    : passed to the SGD optimizer as `weight_decay`
    dropout_rate : probability `p` of the nn.Dropout layers placed after each
                   ReLU. Defaults to 0.0 (dropout layers present but inactive),
                   so the one-argument call gen_model(L2_lambda) used in the
                   earlier cells keeps working after this redefinition.

    Returns
    -------
    The tuple (ANN, loss_fun, optimizer).
    """

    # Architecture. nn.Dropout is only active in training mode: the model has to be
    # put in eval() mode for testing and back in train() mode afterwards.
    ANN = nn.Sequential(
             nn.Linear(4,64),
             nn.ReLU(),
             nn.Dropout(p=dropout_rate),
             nn.Linear(64,64),
             nn.ReLU(),
             nn.Dropout(p=dropout_rate),
             nn.Linear(64,3))

    # Loss function
    loss_fun = nn.CrossEntropyLoss()

    # Optimizer (weight regularisation enters here, through weight_decay)
    optimizer = torch.optim.SGD(ANN.parameters(),lr=0.005,weight_decay=L2_lambda)

    return ANN,loss_fun,optimizer

# [... train model function ...]

# Parametric experiment
# Fixed dropout rate, a range of L2 strengths, and storage for the accuracy
# curves (one row per epoch, one column per strength)
dropout_rate      = 0.15
l2_lambdas        = np.linspace(0,.1,10)
results_shape     = (num_epochs,len(l2_lambdas))
acc_results_train = np.zeros(results_shape)
acc_results_test  = np.zeros(results_shape)

# Loop over L2 regularisation strengths
for lambda_i,l2_lambda in enumerate(l2_lambdas):

    # Fresh model for every strength, then train it
    ANN,loss_fun,optimizer    = gen_model(l2_lambda,dropout_rate)
    train_acc,test_acc,losses = train_model()

    # Keep the smoothed accuracy curves
    acc_results_train[:,lambda_i] = smooth_adaptive(train_acc,10)
    acc_results_test[:,lambda_i]  = smooth_adaptive(test_acc,10)