In [None]:
# %% Deep learning - Section 10.103
#    Learning rate decay

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F
import pandas              as pd
import scipy.stats         as stats
import time

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Reminder

# As mentioned before, one way to adjust the learning rate is through the so-called
# learning rate decay: in this case the lr is set as a function of training progress
# (it shrinks as training goes on). It's unrelated to the actual model performance
# (unlike Adam or RMSprop), but it also works fine sometimes


In [None]:
# %% Create data

# Cluster settings: number of points per cluster and noise scale
n_by_clust = 300
blurring   = 1

# Cluster centres, each given as [x, y]
A,B,C = [1,1],[5,1],[4,4]

# Sample every cluster around its centre (x-coordinates drawn first, then y)
a = [ centre + blurring*np.random.randn(n_by_clust) for centre in A ]
b = [ centre + blurring*np.random.randn(n_by_clust) for centre in B ]
c = [ centre + blurring*np.random.randn(n_by_clust) for centre in C ]

# Class labels 0, 1, 2 in the same order as the clusters are stacked below
labels_np = np.repeat([0.,1.,2.],n_by_clust)

# Stack the clusters side by side and transpose to (samples x 2 coordinates)
data_np = np.concatenate((a,b,c),axis=1).T

# PyTorch tensors: float features, integer (long) labels as required by the
# cross-entropy loss
data   = torch.tensor(data_np,dtype=torch.float32)
labels = torch.tensor(labels_np,dtype=torch.long)


In [None]:
# %% Plotting

# Figure with a golden-ratio aspect
phi = ( 1 + np.sqrt(5) ) / 2
fig = plt.figure(figsize=(6*phi,6))

# One marker style per class label (0: squares, 1: circles, 2: triangles)
for label_i,marker in enumerate(('s','o','^')):
    rows = np.where(labels==label_i)[0]
    plt.plot(data[rows,0],data[rows,1],marker,alpha=.75)

plt.title('Some clusters')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.grid()

# Save to disk before showing, then hand the file to the Colab download helper
plt.savefig('figure96_leraning_rate_decay.png')

plt.show()

files.download('figure96_leraning_rate_decay.png')


In [None]:
# %% Split into train and test data

# Hold out 10% of the samples as test set (scikit-learn split)
train_data,test_data,train_labels,test_labels = train_test_split(data,labels,test_size=0.1)

# Wrap features and labels into PyTorch datasets
train_data = TensorDataset(train_data,train_labels)
test_data  = TensorDataset(test_data,test_labels)

# Loaders: shuffled minibatches for training (incomplete last batch dropped),
# and a single batch holding the whole test set
batch_size   = 16
train_loader = DataLoader(train_data,
                          batch_size=batch_size,
                          shuffle=True,
                          drop_last=True)
test_loader  = DataLoader(test_data,batch_size=len(test_data))


In [None]:
# %% Create the model

def gen_model(initial_learning_rate):
    """Build a fresh classifier together with its loss, optimizer and scheduler.

    The network is a 2 -> 8 -> 8 -> 3 fully connected model with ReLU after
    the first two layers. The optimizer is plain SGD started at
    `initial_learning_rate`; the scheduler is a StepLR that halves the
    learning rate every `batch_size*len(train_loader)` scheduler steps
    (both names are read from the notebook's global scope).

    Returns the tuple (model, loss function, optimizer, scheduler).
    """

    class model(nn.Module):
        def __init__(self):
            super().__init__()

            # Layers: input -> hidden -> output
            self.input  = nn.Linear(2,8)
            self.hid1   = nn.Linear(8,8)
            self.output = nn.Linear(8,3)

        def forward(self,x):
            # Two ReLU layers followed by a linear read-out (raw class scores)
            hidden = F.relu(self.input(x))
            hidden = F.relu(self.hid1(hidden))
            return self.output(hidden)

    net = model()

    # Cross-entropy loss on the raw outputs, optimised with SGD
    criterion = nn.CrossEntropyLoss()
    sgd       = torch.optim.SGD(net.parameters(),lr=initial_learning_rate)

    # Halve the learning rate after this many scheduler steps
    steps_per_decay = batch_size*len(train_loader)
    lr_decay = torch.optim.lr_scheduler.StepLR(sgd,step_size=steps_per_decay,gamma=.5)

    return net,criterion,sgd,lr_decay


In [None]:
# %% How many steps until the learning rate changes?

# This is the step_size handed to StepLR in gen_model: minibatches per epoch
# times batch size. Because train_loader is built with drop_last=True, it only
# counts samples that fall in complete minibatches, so it can be slightly
# smaller than the number of training datapoints. Note that the training loop
# steps the scheduler once per minibatch, so this value is the number of
# minibatches (not samples) between two learning-rate changes.
len(train_loader)*batch_size


In [None]:
# %% Explore learning rate decay parameter

# Throw-away network, only needed so the optimizer has parameters to manage
ANN = gen_model(initial_learning_rate=0.01)[0]

# Dedicated optimizer and scheduler with a short step size (halve every 5
# scheduler steps) so the decay is visible within a few iterations
optimizer = torch.optim.SGD(ANN.parameters(),lr=0.01)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer,step_size=5,gamma=.5)

# Print the learning rate in effect at every simulated minibatch
# (no training happens here, the scheduler is simply stepped)
for epoch_i in range(3):
    for batch_num in range(10):

        lr_now = scheduler.get_last_lr()[0]
        print(f'Batch {batch_num+1}, epoch {epoch_i}: LR = {lr_now}')
        scheduler.step()


In [None]:
# %% Function to train the model

def train_model(initial_learning_rate,toggle_lr_decay,num_epochs=50):
    """Train a fresh model from gen_model and record its performance.

    Parameters
    ----------
    initial_learning_rate : float
        Learning rate handed to gen_model for the optimizer.
    toggle_lr_decay : bool
        If True, the scheduler is stepped after every minibatch update;
        if False, the scheduler is never stepped and the learning rate
        stays at its initial value.
    num_epochs : int, optional
        Number of passes over train_loader (default: 50).

    Returns
    -------
    train_acc : list
        Mean minibatch training accuracy (%) per epoch.
    test_acc : list of float
        Test accuracy (%) per epoch, on the first batch of test_loader.
    losses : list
        Mean minibatch training loss per epoch.
    ANN : the trained model
    current_LR : list of float
        Learning rate reported by the scheduler after every minibatch.
    """

    # Model instance
    ANN,loss_fun,optimizer,scheduler = gen_model(initial_learning_rate)

    # Initialise
    losses     = []
    train_acc  = []
    test_acc   = []
    current_LR = []

    # Epochs loop
    for epoch_i in range(num_epochs):

        # Train mode on
        ANN.train()

        # Initialise and loop over batches
        batch_losses = []
        batch_acc    = []

        for X,y in train_loader:

            # Forward propagation and loss
            yHat = ANN(X)
            loss = loss_fun(yHat,y)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Toggle learning rate decay (after optimizer step)
            if toggle_lr_decay:
                scheduler.step()

            # Loss and accuracy from this batch, stored as plain Python floats
            # so the averages below do not depend on tensor-to-array conversion
            batch_losses.append(loss.item())

            matches = torch.argmax(yHat,dim=1) == y            # booleans
            batch_acc.append(100*matches.float().mean().item())  # percent

            # Learning rate currently reported by the scheduler
            current_LR.append(scheduler.get_last_lr()[0])

        # Average train accuracy and losses from batches
        train_acc.append(np.mean(batch_acc))
        losses.append(np.mean(batch_losses))

        # Test accuracy (turn autograd off)
        ANN.eval()
        X,y = next(iter(test_loader))
        with torch.no_grad():
            yHat = ANN(X)
        test_acc.append( 100*(torch.argmax(yHat,dim=1)==y).float().mean().item() )

    return train_acc,test_acc,losses,ANN,current_LR


In [None]:
# %% Sanity check

# Train once with learning rate decay switched on and once switched off, and
# plot the learning rate recorded after every minibatch for each run
for decay_on,lr_title,lr_fname in (
        (True, 'Learning rate should change',    'figure97_leraning_rate_decay.png'),
        (False,'Learning rate should not change','figure98_leraning_rate_decay.png') ):

    train_acc,test_acc,losses,ANN,current_LR = train_model(0.01,decay_on)

    phi = ( 1 + np.sqrt(5) ) / 2
    fig = plt.figure(figsize=(4*phi,4))

    plt.plot(current_LR)
    plt.title(lr_title)
    plt.xlabel('Number of minibatches')
    plt.ylabel('Learning rate')

    # Save, display, then download through the Colab helper
    plt.savefig(lr_fname)

    plt.show()

    files.download(lr_fname)


In [None]:
# %% Parametric experiment over learning rate decay

# Same initial learning rate, trained with decay ("dyn") ...
( train_acc_dyn, test_acc_dyn, losses_dyn,
  ANN, current_LR ) = train_model(initial_learning_rate=0.01,toggle_lr_decay=True)

# ... and without decay ("stat"); ANN and current_LR end up holding this last run
( train_acc_stat, test_acc_stat, losses_stat,
  ANN, current_LR ) = train_model(initial_learning_rate=0.01,toggle_lr_decay=False)


In [None]:
# %% Plotting

# Golden-ratio figure
phi = ( 1 + np.sqrt(5) ) / 2
fig = plt.figure(figsize=(6*phi,6))

# Blue: with learning rate decay; orange: without. Solid: train; dashed: test
for acc_series,line_style,line_color,line_label in (
        (train_acc_dyn, '-', 'tab:blue',  'Dyn: Train'),
        (test_acc_dyn,  '--','tab:blue',  'Dyn: Test'),
        (train_acc_stat,'-', 'tab:orange','Stat: Train'),
        (test_acc_stat, '--','tab:orange','Stat: Test') ):
    plt.plot(acc_series,line_style,color=line_color,label=line_label)

plt.xlabel('Training epochs')
plt.ylabel('Accuracy (%)')
plt.title('Accuracy over epochs')
plt.legend()

# Save, display, then download through the Colab helper
plt.savefig('figure99_leraning_rate_decay.png')

plt.show()

files.download('figure99_leraning_rate_decay.png')


In [None]:
# %% Exercise 1
#    When you run the experiment in the previous cell multiple times, you can get different answers. This indicates
#    that the network and/or training regimen is not stable enough. What can you do to increase the stability of the
#    model and training? That is, what can you change to make the results more similar each time you re-run the experiment?

# Many options are available, virtually any metaparameter can be changed to see
# if the performance becomes more stable (e.g., epochs number, optimizer, batch
# normalisation, activation and loss functions, weight regularisation, optimizers
# optimisation, just to name those discussed in previous videos). Here for example
# I switched to an RMSprop optimizer. I'd say the model is a bit more stable simply
# because learning is achieved faster; on the other hand, the performance is a bit
# more variable with respect to the training epoch (i.e. more stable across runs,
# but less stable across epochs)


In [None]:
# %% Exercise 2
#    There are several more options for dynamic learning rates in Pytorch. Try modifying the code!
#    See https://pytorch.org/docs/stable/optim.html

# Wow I was expecting a lot of options but couldn't imagine there were so many. I tried
# the ExponentialLR() scheduler (i.e. "decays the learning rate of each parameter group
# by gamma every epoch"); it only needs a gamma parameter, and no step size (as far
# as I understood, the lr is adapted at each scheduler step, which in this notebook
# means at each minibatch), I picked a gamma close to 1 so that
# the lr decay is not too extreme (0.99 and 0.999). Also, I went back to SGD for
# comparability.
# Surprisingly, the two values make a huge difference! I mean ... exponential
# decay is as tricky as it is fast, but still interesting to observe how small changes
# in one single parameter produce large and qualitative changes in the output (by
# qualitative I mean that, in practice and given all the other metaparameters, one
# model doesn't work, and one does)

# %% Create the model
def gen_model(initial_learning_rate,gamma=0.999):
    """Build a fresh classifier with SGD and an exponential learning rate decay.

    This redefinition replaces the earlier StepLR-based gen_model for the
    cells that follow.

    Parameters
    ----------
    initial_learning_rate : float
        Starting learning rate of the SGD optimizer.
    gamma : float, optional
        Multiplicative decay applied to the learning rate at every call of
        scheduler.step() (default: 0.999). The training loop in this notebook
        steps the scheduler once per minibatch, so values only slightly below
        1 already produce a marked decay over a full training run.

    Returns
    -------
    ANN, loss_fun, optimizer, scheduler
        The 2 -> 8 -> 8 -> 3 network, a cross-entropy loss, the SGD optimizer
        and the ExponentialLR scheduler attached to it.
    """

    class model(nn.Module):
        def __init__(self):
            super().__init__()

            # Architecture
            self.input  = nn.Linear(2,8)
            self.hid1   = nn.Linear(8,8)
            self.output = nn.Linear(8,3)

        # Forward propagation (raw class scores, no softmax: CrossEntropyLoss
        # is applied directly to these outputs)
        def forward(self,x):
            x = F.relu(self.input(x))
            x = F.relu(self.hid1(x))
            x = self.output(x)

            return x

    # Model instance
    ANN = model()

    # Loss function and optimizer (set learning rate decay)
    loss_fun  = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(ANN.parameters(),lr=initial_learning_rate)

    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,gamma=gamma)

    return ANN,loss_fun,optimizer,scheduler


In [None]:
# %% Explore learning rate decay parameter

# Throw-away network, only needed so the optimizer has parameters to manage
ANN = gen_model(initial_learning_rate=0.01)[0]

# Dedicated optimizer and exponential scheduler (lr multiplied by gamma at
# every scheduler step)
optimizer = torch.optim.SGD(ANN.parameters(),lr=0.01)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,gamma=0.999)

# Print the learning rate in effect at every simulated minibatch
# (no training happens here, the scheduler is simply stepped)
for epoch_i in range(3):
    for batch_num in range(10):

        lr_now = scheduler.get_last_lr()[0]
        print(f'Batch {batch_num+1}, epoch {epoch_i}: LR = {lr_now}')
        scheduler.step()


In [None]:
# %% ( same training function )


In [None]:
# %% Sanity check

# Train once with learning rate decay switched on and once switched off, and
# plot the learning rate recorded after every minibatch for each run
for decay_on,lr_title,lr_fname in (
        (True, 'Learning rate should change',    'figure102_leraning_rate_decay_extra2.png'),
        (False,'Learning rate should not change','figure103_leraning_rate_decay_extra2.png') ):

    train_acc,test_acc,losses,ANN,current_LR = train_model(0.01,decay_on)

    phi = ( 1 + np.sqrt(5) ) / 2
    fig = plt.figure(figsize=(4*phi,4))

    plt.plot(current_LR)
    plt.title(lr_title)
    plt.xlabel('Number of minibatches')
    plt.ylabel('Learning rate')

    # Save, display, then download through the Colab helper
    plt.savefig(lr_fname)

    plt.show()

    files.download(lr_fname)


In [None]:
# %% Parametric experiment over learning rate decay

# Same initial learning rate, trained with decay ("dyn") ...
( train_acc_dyn, test_acc_dyn, losses_dyn,
  ANN, current_LR ) = train_model(initial_learning_rate=0.01,toggle_lr_decay=True)

# ... and without decay ("stat"); ANN and current_LR end up holding this last run
( train_acc_stat, test_acc_stat, losses_stat,
  ANN, current_LR ) = train_model(initial_learning_rate=0.01,toggle_lr_decay=False)


In [None]:
# %% Plotting

# Golden-ratio figure
phi = ( 1 + np.sqrt(5) ) / 2
fig = plt.figure(figsize=(6*phi,6))

# Blue: with learning rate decay; orange: without. Solid: train; dashed: test
for acc_series,line_style,line_color,line_label in (
        (train_acc_dyn, '-', 'tab:blue',  'Dyn: Train'),
        (test_acc_dyn,  '--','tab:blue',  'Dyn: Test'),
        (train_acc_stat,'-', 'tab:orange','Stat: Train'),
        (test_acc_stat, '--','tab:orange','Stat: Test') ):
    plt.plot(acc_series,line_style,color=line_color,label=line_label)

plt.xlabel('Training epochs')
plt.ylabel('Accuracy (%)')
plt.title('Accuracy over epochs')
plt.legend()

# Save, display, then download through the Colab helper
plt.savefig('figure104_leraning_rate_decay_extra2.png')

plt.show()

files.download('figure104_leraning_rate_decay_extra2.png')
