In [None]:
# %% Deep learning - Section 9.76
#    L1 regularisation in practice

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Import Iris dataset

# Load the Iris table through seaborn
iris = sns.load_dataset('iris')

# Features: the first four columns of the table, as a float tensor
feature_columns = iris.columns[0:4]
data = torch.tensor(iris[feature_columns].values).float()

# Targets: one integer class index per row, assigned by species name
# (0 = setosa, 1 = versicolor, 2 = virginica)
labels = torch.zeros(len(data),dtype=torch.long)
for class_index,species_name in enumerate(['setosa','versicolor','virginica']):
    labels[iris.species==species_name] = class_index


In [None]:
# %% Split into train and test data

# Hold out 20% of the samples as the test set (scikit-learn split).
# NOTE(review): no random_state is passed, so the split presumably differs
# between runs unless a seed is set elsewhere - confirm this is intended.
train_data,test_data,train_labels,test_labels = train_test_split(data,labels,test_size=0.2)

# Pair features with their labels as PyTorch datasets
train_data,test_data = TensorDataset(train_data,train_labels),TensorDataset(test_data,test_labels)

# DataLoaders: shuffled mini-batches for training (an incomplete final batch is
# dropped), and the whole test set delivered as one single batch
batch_size   = 64
train_loader = DataLoader(train_data,shuffle=True,drop_last=True,batch_size=batch_size)
test_loader  = DataLoader(test_data,batch_size=len(test_data))


In [None]:
# %% Function to generate the model

def gen_model():
    """Build a fresh network together with its loss function and optimizer.

    Returns:
        tuple: (ANN, loss_fun, optimizer). ANN is an nn.Sequential of three
        linear layers (4 -> 64 -> 64 -> 3) with a ReLU after each of the first
        two, loss_fun is nn.CrossEntropyLoss, and optimizer is SGD over the
        parameters of ANN with lr=0.005.
    """

    # Architecture, listed layer by layer and then chained together
    layers = [
        nn.Linear(4,64),
        nn.ReLU(),
        nn.Linear(64,64),
        nn.ReLU(),
        nn.Linear(64,3),
    ]
    ANN = nn.Sequential(*layers)

    # Loss function, then optimizer (no weight_decay option: the penalty is
    # added by hand in the training function)
    return ANN,nn.CrossEntropyLoss(),torch.optim.SGD(ANN.parameters(),lr=0.005)


In [None]:
# %% Explore model more in detail

# Only the network is needed here; the loss function and optimizer are dropped
tmp_model = gen_model()[0]

# Model architecture
print(tmp_model)

# One line per parameter tensor: name, shape and number of elements
for param_name,param in tmp_model.named_parameters():
    print(param_name,param.shape,param.numel())


In [None]:
# %% Function to train the model

# Parameters
num_epochs = 1000

def train_model(L1_lambda):
    """Train the notebook-level model `ANN` with a manual L1 weight penalty.

    The model, loss function, optimizer, data loaders and epoch count are read
    from the notebook namespace (`ANN`, `loss_fun`, `optimizer`,
    `train_loader`, `test_loader`, `num_epochs`), so they must be defined
    before this function is called.

    Args:
        L1_lambda: Multiplier of the L1 penalty. The term added to the loss is
            L1_lambda * sum(|w|) / n_weights, taken over every parameter whose
            name does not contain 'bias'.

    Returns:
        tuple: (train_acc, test_acc, losses), three lists with one entry per
        epoch: mean training accuracy over the batches (%), accuracy on the
        first batch of `test_loader` (%), and mean penalised batch loss.
    """

    # Initialise accuracies
    train_acc = []
    test_acc  = []
    losses    = []

    # Weight tensors (everything that is not a bias). The tensors are updated
    # in place by the optimizer, so the list can be collected once up front
    weight_params = [param for pname,param in ANN.named_parameters() if 'bias' not in pname]

    # Total number of penalised weights, used to normalise the penalty
    n_weights = sum(param.numel() for param in weight_params)

    # Loop over epochs
    for epoch_i in range(num_epochs):

        batch_acc  = []
        batch_loss = []

        for X,y in train_loader:

            # Forward propagation and loss
            yHat = ANN(X)
            loss = loss_fun(yHat,y)

            # Sum up all abs weights (gradients flow through the weights
            # themselves, so the zero start value needs no requires_grad)
            L1_term = torch.tensor(0.)
            for weight in weight_params:
                L1_term = L1_term + torch.sum(torch.abs(weight))

            # Add L1 penalty term to loss
            loss = loss + (L1_lambda*L1_term)/n_weights

            # Only now do backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Batch training accuracy (from the predictions made before the step)
            batch_acc.append( 100*torch.mean((torch.argmax(yHat,axis=1)==y).float()).item() )
            batch_loss.append(loss.item())

        # Average accuracy and loss over the batches of this epoch
        train_acc.append(np.mean(batch_acc))
        losses.append(np.mean(batch_loss))

        # Test accuracy; no gradients are needed for evaluation, so the
        # forward pass is run without building an autograd graph
        with torch.no_grad():
            X,y = next(iter(test_loader))
            pred_labels = torch.argmax(ANN(X),axis=1)
        test_acc.append( 100*torch.mean((pred_labels==y).float()).item() )

    # Function output
    return train_acc,test_acc,losses


In [None]:
# Test the model

# Penalty strength used for this single run
L1_lambda = 0.001

# Fresh network, loss function and optimizer, then one full training run
ANN,loss_fun,optimizer = gen_model()
train_acc,test_acc,losses = train_model(L1_lambda)


In [None]:
# %% Plotting

# Two panels: loss on the left, train/test accuracy on the right
fig,ax = plt.subplots(1,2,figsize=(15,5))

# The titles use raw strings: in a normal string literal '\l' is an invalid
# escape sequence that Python warns about; the rendered text is unchanged
ax[0].plot(losses,'^-')
ax[0].set_ylabel('Loss')
ax[0].set_xlabel('Epochs')
ax[0].set_title(r'Losses with L1 $\lambda$=' + str(L1_lambda))

ax[1].plot(train_acc,'o-')
ax[1].plot(test_acc,'s-')
ax[1].set_title(r'Accuracy with L1 $\lambda$=' + str(L1_lambda))
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Accuracy (%)')
ax[1].legend(['Train','Test'])

# Save before show, then display the figure
plt.savefig('figure27_weight_regularisation.png')

plt.show()

# Download the saved file (files is imported from google.colab)
files.download('figure27_weight_regularisation.png')


In [None]:
# %% Functions for 1D smoothing filter

# Improved for edge effects - adaptive window
def smooth_adaptive(x,k):
    """Smooth a 1D sequence with a centred moving average that shrinks at the edges.

    For each index i the output is the mean of x[i-k//2 : i+k//2+1], clipped to
    the valid index range. The full window therefore holds 2*(k//2)+1 samples
    (e.g. 11 for k=10), and fewer near the two ends of the sequence.

    Args:
        x: Sequence of numbers (list or array).
        k: Nominal window length; only k//2 (the half width) is used.

    Returns:
        numpy.ndarray: Float array with the same length as x. The output is
        always floating point, so integer input is not truncated.
    """
    # Force a float copy: zeros_like on integer input would give an integer
    # output array and silently truncate every window mean
    values   = np.asarray(x,dtype=float)
    smoothed = np.zeros_like(values)
    half_k   = k // 2
    n_points = len(values)

    for i in range(n_points):
        # Clip the window to the available samples at both ends
        start       = max(0, i-half_k)
        end         = min(n_points, i+half_k + 1)
        smoothed[i] = np.mean(values[start:end])

    return smoothed


In [None]:
# %% Parametric experiment on L1 lambda parameter

# Penalty strengths to test, plus one column of smoothed accuracies per strength
L1_lambdas = np.linspace(0,0.005,10)
acc_train  = np.zeros((num_epochs,len(L1_lambdas)))
acc_result = np.zeros((num_epochs,len(L1_lambdas)))

# Loop over L1 lambdas
for L1_i,lambda_value in enumerate(L1_lambdas):

    # A new, untrained model for every penalty strength
    ANN,loss_fun,optimizer = gen_model()
    train_acc,test_acc,losses = train_model(lambda_value)

    # Keep the smoothed accuracy curves of this run
    acc_train[:,L1_i]  = smooth_adaptive(train_acc,10)
    acc_result[:,L1_i] = smooth_adaptive(test_acc,10)


In [None]:
# %% Plotting

fig,ax = plt.subplots(1,2,figsize=(17,7))

# One colour of the plasma map per lambda; train curves left, test curves right
cmaps = plt.cm.plasma(np.linspace(.1,.9,len(L1_lambdas)))
for column,colour in enumerate(cmaps):
    ax[0].plot(acc_train[:,column],color=colour)
    ax[1].plot(acc_result[:,column],color=colour)

ax[0].set_title('Train accuracy')
ax[1].set_title('Test accuracy')

# Rounded lambda values make the legend easier to read
leglabels = [np.round(value,4) for value in L1_lambdas]

# Features shared by both panels
for panel in ax:
    panel.legend(leglabels)
    panel.set_xlabel('Epoch')
    panel.set_ylabel('Accuracy (%)')
    panel.set_ylim([50,101])
    panel.grid()


plt.savefig('figure28_weight_regularisation.png')

plt.show()

files.download('figure28_weight_regularisation.png')


In [None]:
# %% Show average accuracy by L1 rates

# Average only over a window of epochs (start inclusive, end exclusive)
epoch_range = [160,360]
first_epoch,last_epoch = epoch_range

# Mean smoothed accuracy inside the window, one value per lambda
mean_train = np.mean(acc_train[first_epoch:last_epoch,:],axis=0)
mean_test  = np.mean(acc_result[first_epoch:last_epoch,:],axis=0)

plt.plot(L1_lambdas,mean_train,'bo-',label='TRAIN')
plt.plot(L1_lambdas,mean_test,'rs-',label='TEST')

plt.xlabel('L1 regularization amount')
plt.ylabel('Accuracy')
plt.title('Average accuracy by L1 regularization amount')
plt.legend()

plt.savefig('figure29_weight_regularisation.png')

plt.show()

files.download('figure29_weight_regularisation.png')


In [None]:
# %% Exercise 1
#    In the previous video we used a pytorch function to implement L2 regularization, and in this video we implemented
#    L1 regularization manually. Modify the code here to create a manual L2 regularizer.

# Given the above code, this can be done relatively easily simply by squaring the
# weights in the computation of the penalty term. Note that the variables in the
# code below are still named L1_term/L1_lambda, but they are effectively an L2
# penalty term; I'm just being lazy

# %% Modified function

# Parameters
num_epochs = 1000

def train_model(L1_lambda):
    """Train the notebook-level model `ANN` with a manual L2 weight penalty.

    The model, loss function, optimizer, data loaders and epoch count are read
    from the notebook namespace (`ANN`, `loss_fun`, `optimizer`,
    `train_loader`, `test_loader`, `num_epochs`), so they must be defined
    before this function is called.

    Args:
        L1_lambda: Multiplier of the penalty. Despite the name (kept from the
            L1 version of this function) it scales an L2 term: the value added
            to the loss is L1_lambda * sum(w**2) / n_weights, taken over every
            parameter whose name does not contain 'bias'.

    Returns:
        tuple: (train_acc, test_acc, losses), three lists with one entry per
        epoch: mean training accuracy over the batches (%), accuracy on the
        first batch of `test_loader` (%), and mean penalised batch loss.
    """

    # Initialise accuracies
    train_acc = []
    test_acc  = []
    losses    = []

    # Weight tensors (everything that is not a bias). The tensors are updated
    # in place by the optimizer, so the list can be collected once up front
    weight_params = [param for pname,param in ANN.named_parameters() if 'bias' not in pname]

    # Total number of penalised weights, used to normalise the penalty
    n_weights = sum(param.numel() for param in weight_params)

    # Loop over epochs
    for epoch_i in range(num_epochs):

        batch_acc  = []
        batch_loss = []

        for X,y in train_loader:

            # Forward propagation and loss
            yHat = ANN(X)
            loss = loss_fun(yHat,y)

            # Sum up all squared weights (.abs() not needed because of
            # squaring). The accumulator keeps the name L1_term, but what it
            # holds is the L2 penalty
            L1_term = torch.tensor(0.)
            for weight in weight_params:
                L1_term = L1_term + torch.sum(weight**2)

            # Add L2 penalty term to loss
            loss = loss + (L1_lambda*L1_term)/n_weights

            # Only now do backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Batch training accuracy (from the predictions made before the step)
            batch_acc.append( 100*torch.mean((torch.argmax(yHat,axis=1)==y).float()).item() )
            batch_loss.append(loss.item())

        # Average accuracy and loss over the batches of this epoch
        train_acc.append(np.mean(batch_acc))
        losses.append(np.mean(batch_loss))

        # Test accuracy; no gradients are needed for evaluation, so the
        # forward pass is run without building an autograd graph
        with torch.no_grad():
            X,y = next(iter(test_loader))
            pred_labels = torch.argmax(ANN(X),axis=1)
        test_acc.append( 100*torch.mean((pred_labels==y).float()).item() )

    # Function output
    return train_acc,test_acc,losses

# %% Modified parametric experiment on manual L2 lambda parameter

# L2 penalty strengths to test (the L1_* names are kept from the cells above),
# plus one column of smoothed accuracies per strength
L1_lambdas = np.linspace(0,.1,10)
acc_train  = np.zeros((num_epochs,len(L1_lambdas)))
acc_result = np.zeros((num_epochs,len(L1_lambdas)))

# Loop over lambda values
for L1_i,lambda_value in enumerate(L1_lambdas):

    # A new, untrained model for every penalty strength
    ANN,loss_fun,optimizer = gen_model()
    train_acc,test_acc,losses = train_model(lambda_value)

    # Keep the smoothed accuracy curves of this run
    acc_train[:,L1_i]  = smooth_adaptive(train_acc,10)
    acc_result[:,L1_i] = smooth_adaptive(test_acc,10)


In [None]:
# %% Exercise 2
#    Based on your modification above, create a combined L1+L2 regularizer. Does it make sense to use the same lambda
#    parameter, or do you think it should be adjusted?

# Given the above code, this can also be done relatively easily simply by adding
# the L2 regularisation to the L1 regularisation already in place. The variables
# are still named L1_term/L1_lambda, but they are effectively an L1+L2 penalty
# term (elastic net); I'm just still lazy. The modified function, however, allows
# using different lambda values for L1 and L2, because for L2 regularisation, it
# makes sense to have larger lambdas (due to the squaring)

# %% Modified function to train the model

# Parameters
num_epochs = 1000

def train_model(L1_lambda,L2_lambda=0.):
    """Train the notebook-level model `ANN` with a manual L1+L2 weight penalty.

    The model, loss function, optimizer, data loaders and epoch count are read
    from the notebook namespace (`ANN`, `loss_fun`, `optimizer`,
    `train_loader`, `test_loader`, `num_epochs`), so they must be defined
    before this function is called.

    Args:
        L1_lambda: Multiplier of the L1 term, L1_lambda * sum(|w|) / n_weights.
        L2_lambda: Multiplier of the L2 term, L2_lambda * sum(w**2) / n_weights.
            Defaults to 0 (no L2 contribution), so single-argument calls
            written for the earlier definitions of train_model keep working
            after this cell has been run.

        Both sums run over every parameter whose name does not contain 'bias'.

    Returns:
        tuple: (train_acc, test_acc, losses), three lists with one entry per
        epoch: mean training accuracy over the batches (%), accuracy on the
        first batch of `test_loader` (%), and mean penalised batch loss.
    """

    # Initialise accuracies
    train_acc = []
    test_acc  = []
    losses    = []

    # Weight tensors (everything that is not a bias). The tensors are updated
    # in place by the optimizer, so the list can be collected once up front
    weight_params = [param for pname,param in ANN.named_parameters() if 'bias' not in pname]

    # Total number of penalised weights, used to normalise both penalties
    n_weights = sum(param.numel() for param in weight_params)

    # Loop over epochs
    for epoch_i in range(num_epochs):

        batch_acc  = []
        batch_loss = []

        for X,y in train_loader:

            # Forward propagation and loss
            yHat = ANN(X)
            loss = loss_fun(yHat,y)

            # Sum up all abs weights (L1) and all squared weights (L2)
            L1_term = torch.tensor(0.)
            L2_term = torch.tensor(0.)
            for weight in weight_params:
                L1_term = L1_term + torch.sum(torch.abs(weight))
                L2_term = L2_term + torch.sum(weight**2)

            # Add L1 and L2 penalty terms to loss
            loss = loss + (L1_lambda*L1_term)/n_weights + (L2_lambda*L2_term)/n_weights

            # Only now do backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Batch training accuracy (from the predictions made before the step)
            batch_acc.append( 100*torch.mean((torch.argmax(yHat,axis=1)==y).float()).item() )
            batch_loss.append(loss.item())

        # Average accuracy and loss over the batches of this epoch
        train_acc.append(np.mean(batch_acc))
        losses.append(np.mean(batch_loss))

        # Test accuracy; no gradients are needed for evaluation, so the
        # forward pass is run without building an autograd graph
        with torch.no_grad():
            X,y = next(iter(test_loader))
            pred_labels = torch.argmax(ANN(X),axis=1)
        test_acc.append( 100*torch.mean((pred_labels==y).float()).item() )

    # Function output
    return train_acc,test_acc,losses

# %% Modified parametric experiment on manual L2+L1 lambda parameter

# Paired L1 and L2 penalty strengths (the i-th L1 value is used together with
# the i-th L2 value), plus one column of smoothed accuracies per pair
L1_lambdas = np.linspace(0,0.005,10)
L2_lambdas = np.linspace(0,.1,10)
acc_train  = np.zeros((num_epochs,len(L1_lambdas)))
acc_result = np.zeros((num_epochs,len(L1_lambdas)))

# Loop over the (L1, L2) lambda pairs
for L1_i,(L1_value,L2_value) in enumerate(zip(L1_lambdas,L2_lambdas)):

    # A new, untrained model for every pair of penalty strengths
    ANN,loss_fun,optimizer = gen_model()
    train_acc,test_acc,losses = train_model(L1_value,L2_value)

    # Keep the smoothed accuracy curves of this run
    acc_train[:,L1_i]  = smooth_adaptive(train_acc,10)
    acc_result[:,L1_i] = smooth_adaptive(test_acc,10)
