In [None]:
# %% Deep learning - Section 11.112
#    Code challenge 17: optimizers and MNIST

#    1) Start from code from video 11.107, 11.111, and 10.101
#    2) Vary systematically the optimizer (SGD, RMSprop, Adam)
#    3) Vary systematically the learning rate (0.0001 to 0.1 in 6 log steps)
#    4) Plot final accuracy from train and test, against the learning rates, for
#       the three optimizers (avg accuracy over last 10 epochs)

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F
import pandas              as pd
import scipy.stats         as stats
import time

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Data

# Load data: hand the path straight to numpy so that it opens AND closes the
# file itself (a handle created with a bare open(...) was never closed)
data = np.loadtxt('sample_data/mnist_train_small.csv',delimiter=',')

# Split labels (first column) from the data (all remaining columns)
labels = data[:,0]
data   = data[:,1:]

# Normalise data by its maximum value (original range is (0,255))
data_norm = data / np.max(data)


In [None]:
# %% Create train and test datasets

# Tensors: float32 for the features, int64 for the class labels
data_tensor   = torch.tensor(data_norm,dtype=torch.float32)
labels_tensor = torch.tensor(labels,dtype=torch.long)

# Hold out 10% of the samples as test data (scikit-learn split)
split = train_test_split(data_tensor,labels_tensor,test_size=0.1)
train_data,test_data,train_labels,test_labels = split

# Wrap features and labels into PyTorch datasets
train_data = TensorDataset(train_data,train_labels)
test_data  = TensorDataset(test_data,test_labels)

# DataLoaders: shuffled mini-batches for training (incomplete last batch is
# dropped), one single batch containing every test sample for testing
batch_size   = 32
train_loader = DataLoader(train_data,shuffle=True,drop_last=True,batch_size=batch_size)
test_loader  = DataLoader(test_data,batch_size=len(test_data))


In [None]:
# %% Function to generate the model
#    Keep the flexible loop over model depth/breadth, add flexibility over the optimizer

def gen_model(nUnits,nLayers,drop_rate,optimizer_alg,learning_rate):
    """Build a feed-forward MNIST classifier with its loss and optimizer.

    nUnits        : number of units in the input layer and in each hidden layer
    nLayers       : number of hidden layers
    drop_rate     : dropout probability applied after every ReLU
    optimizer_alg : name of a class in torch.optim (e.g. 'SGD', 'RMSprop', 'Adam')
    learning_rate : learning rate handed to the optimizer

    Returns the tuple (model, loss function, optimizer).
    """

    class mnist_FFN(nn.Module):
        """784 -> nUnits -> (nLayers x nUnits) -> 10 fully connected network."""

        def __init__(self,nUnits,nLayers,dropout_rate):
            super().__init__()

            # Hyperparameters needed again in forward()
            self.nLayers      = nLayers
            self.dropout_rate = dropout_rate

            # Layers are stored by name: input, hidden0..hidden{n-1}, output
            self.layers = nn.ModuleDict()
            self.layers['input'] = nn.Linear(784,nUnits)
            for layer_idx in range(nLayers):
                self.layers[f'hidden{layer_idx}'] = nn.Linear(nUnits,nUnits)
            self.layers['output'] = nn.Linear(nUnits,10)

        def forward(self,x):

            # Every layer except the output one is followed by ReLU + dropout
            # (dropout is only active while the module is in training mode)
            stage_names = ['input'] + [f'hidden{k}' for k in range(self.nLayers)]
            for stage in stage_names:
                x = self.layers[stage](x)
                x = F.relu(x)
                x = F.dropout(x, p=self.dropout_rate, training=self.training)

            # The output layer returns raw scores (no activation)
            return self.layers['output'](x)

    # Model instance
    ANN = mnist_FFN(nUnits,nLayers,drop_rate)

    # Loss function
    loss_fun = nn.CrossEntropyLoss()

    # Optimizer, looked up by name in torch.optim
    optimizer = getattr(torch.optim,optimizer_alg)(ANN.parameters(),lr=learning_rate)

    return ANN,loss_fun,optimizer


In [None]:
# %% Generate an instance of the model and check it
#    Try 'SGD', 'RMSprop', and 'Adam'

nUnitsPerLayer = 10
nLayers        = 2
optim          = 'Adam'
lr             = 0.01

model,loss_fun,optimizer = gen_model(nUnitsPerLayer,nLayers,0.25,optim,lr)
print(model)

# Print the optimizer that is actually attached to `model`. `optim` stays a
# string, so this cell can be re-run safely: rebinding it to an optimizer
# object would make the next gen_model call fail in getattr.
print(optimizer)


In [None]:
# %% Run the model to check its internal consistency

# Random input: 6 samples with 784 features each
tmpx = torch.randn(6,784)

# Forward pass through the model
y = model(tmpx)

# Show the output shape, an empty line, and then the output itself
print(y.shape,'',y,sep='\n')


In [None]:
# %% Function to train the model

def train_model(nUnits,nLayers,drop_rate,optimizer_alg,learning_rate,num_epochs=60):
    """Create a new model with gen_model and train it on the global train_loader.

    nUnits, nLayers, drop_rate, optimizer_alg, learning_rate
                  : forwarded unchanged to gen_model
    num_epochs    : number of passes over train_loader (default 60)

    Returns (train_acc, test_acc, losses, ANN): three lists with one Python
    float per epoch (accuracies in %, loss averaged over the batches) and the
    trained model.
    """

    # Model instance, loss function and optimizer
    ANN,loss_fun,optimizer = gen_model(nUnits,nLayers,drop_rate,optimizer_alg,learning_rate)

    # Initialise the per-epoch logs
    losses    = []
    train_acc = []
    test_acc  = []

    # Loop over epochs
    for epoch_i in range(num_epochs):

        # Loop over training batches
        batch_acc  = []
        batch_loss = []

        for X,y in train_loader:

            # Forward propagation and loss
            yHat = ANN(X)
            loss = loss_fun(yHat,y)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Loss and accuracy from this batch, stored as plain floats so the
            # averages below do not depend on implicit tensor -> array casts
            batch_loss.append(loss.item())

            matches = (torch.argmax(yHat,dim=1) == y).float()
            batch_acc.append(100 * matches.mean().item())

        losses.append( float(np.mean(batch_loss)) )
        train_acc.append( float(np.mean(batch_acc)) )

        # Test accuracy on the first batch served by test_loader, with the
        # model in evaluation mode (dropout switched off) and no gradients
        ANN.eval()

        with torch.no_grad():
            X,y = next(iter(test_loader))
            yHat = ANN(X)
            test_matches = (torch.argmax(yHat,dim=1) == y).float()
            test_acc.append( 100 * test_matches.mean().item() )

        # Back to training mode for the next epoch
        ANN.train()

    return train_acc,test_acc,losses,ANN


In [None]:
# %% Test the whole setting

# Small architecture, no dropout, Adam with a learning rate of 0.01
nUnitsPerLayer,nLayers,drop_rate = 10,2,0
optim,lr                         = 'Adam',0.01

train_acc,test_acc,losses,ANN = train_model(nUnitsPerLayer,nLayers,drop_rate,optim,lr)

# Accuracy at the last epoch: train first, test second
print(train_acc[-1],test_acc[-1],sep='\n')


In [None]:
# %% Parametric experiment

# Takes ~28 mins (with only 2 layers, 60 units each)
num_layers     = np.linspace(2,2,1,dtype=int)
num_units      = np.linspace(60,60,1,dtype=int)
drop_rate      = 0
optimizers     = ['SGD','RMSprop','Adam']
learning_rates = np.logspace(-4,-1,6)

# Output matrices, one cell per (optimizer, learning rate, units, layers)
shape = tuple(len(axis) for axis in (optimizers,learning_rates,num_units,num_layers))
performance_train,performance_test,training_times = (np.zeros(shape) for _ in range(3))

# Walk through every cell of the grid (last axis varies fastest, exactly like
# four nested loops would)
for cell in np.ndindex(shape):

    opt_i,lr_i,unit_i,layer_i = cell
    optimizer = optimizers[opt_i]
    lr        = learning_rates[lr_i]
    n_units   = num_units[unit_i]
    n_layers  = num_layers[layer_i]

    # Train one model and time it
    start_time = time.time()
    train_acc,test_acc,losses,ANN = train_model(n_units,n_layers,drop_rate,optimizer,lr)
    duration = time.time() - start_time

    # Final performance = average accuracy over the last 10 epochs
    performance_train[cell] = np.mean(train_acc[-10:])
    performance_test[cell]  = np.mean(test_acc[-10:])
    training_times[cell]    = duration


In [None]:
# %% Plotting

# Architecture to visualise
selected_units  = 60
selected_layers = 2

# Position of that architecture along the units and layers axes
unit_idx  = list(num_units).index(selected_units)
layer_idx = list(num_layers).index(selected_layers)

# Two side-by-side panels, golden-ratio proportions
phi = (1 + np.sqrt(5)) / 2
fig,axs = plt.subplots(1,2,figsize=(1.5*6*phi,6))

# One colour per optimizer
cmaps = plt.cm.plasma(np.linspace(0.2,0.9,len(optimizers)))

# Left panel: train accuracy, right panel: test accuracy
panels = [(performance_train,'Training accuracy'),
          (performance_test, 'Test accuracy')]

for ax,(performance,title) in zip(axs,panels):
    for i,opt in enumerate(optimizers):
        ax.plot(learning_rates,performance[i,:,unit_idx,layer_idx],'-o',color=cmaps[i],label=opt)
    ax.set_title(title)
    ax.set_xlabel('Learning rate')
    ax.set_ylabel('Accuracy (%)')
    ax.set_xscale('log')
    ax.legend(title='Optimizer')
    ax.grid(True)
    ax.set_ylim(5,105)

plt.suptitle(f'Performance\n(units={selected_units}, layers={selected_layers})',fontsize=14)
plt.tight_layout()

# Save to disk, show, then download the saved file (Colab)
plt.savefig('figure47_code_challenge_17.png')

plt.show()

files.download('figure47_code_challenge_17.png')


In [None]:
# %% Plotting

# Two side-by-side panels, golden-ratio proportions
phi = (1 + np.sqrt(5)) / 2
fig, axs = plt.subplots(1,2,figsize=(1.5*6*phi,6))

# One colour per learning rate
cmaps = plt.cm.plasma(np.linspace(0.2,0.9,len(learning_rates)))

# Left panel: train accuracy, right panel: test accuracy
panels = [(performance_train,'Training accuracy'),
          (performance_test, 'Test accuracy')]

for ax,(performance,title) in zip(axs,panels):
    for j, lr in enumerate(learning_rates):
        ax.plot(optimizers,performance[:,j,unit_idx,layer_idx],'-o',color=cmaps[j],label=f'lr={lr:.4f}')
    ax.set_title(title)
    ax.set_xlabel('Optimizer')
    ax.set_ylabel('Accuracy (%)')
    ax.legend(title='Learning rate')
    ax.grid(True)
    ax.set_ylim(10,105)

plt.suptitle(f'Performance\n(units={selected_units}, layers={selected_layers})',fontsize=14)
plt.tight_layout()

# Save to disk, show, then download the saved file (Colab)
plt.savefig('figure48_code_challenge_17.png')

plt.show()

files.download('figure48_code_challenge_17.png')


In [None]:
# %% Exercise 1
#    Replace the learning rate factor with L2 regularization. Does L2 regularization help all optimization
#    methods in the same way?

# Adding an L2 regularisation factor of 0.001 doesn't help the performance much,
# and the models perform pretty much the same.


In [None]:
# %% Ex. 1 - Continue ...

def gen_model(nUnits,nLayers,drop_rate,optimizer_alg,learning_rate,l2_lambda=0):
    """Build a feed-forward MNIST classifier, its loss and an L2-regularised optimizer.

    nUnits        : number of units in the input layer and in each hidden layer
    nLayers       : number of hidden layers
    drop_rate     : dropout probability applied after every ReLU
    optimizer_alg : name of a class in torch.optim (e.g. 'SGD', 'RMSprop', 'Adam')
    learning_rate : learning rate handed to the optimizer
    l2_lambda     : value passed as the optimizer's weight_decay; defaults to 0
                    so that 5-argument calls written for the earlier version
                    of this function keep working

    Returns the tuple (model, loss function, optimizer).
    """

    class mnist_FFN(nn.Module):
        def __init__(self,nUnits,nLayers,dropout_rate):
            super().__init__()

            # Dictionary to store the layers
            self.layers  = nn.ModuleDict()
            self.nLayers = nLayers

            # Dropout
            self.dropout_rate = dropout_rate

            # Architecture: input, hidden0..hidden{n-1}, output
            self.layers['input'] = nn.Linear(784,nUnits)
            for i in range(nLayers):
                self.layers[f'hidden{i}'] = nn.Linear(nUnits,nUnits)
            self.layers['output'] = nn.Linear(nUnits,10)

        # Forward propagation
        def forward(self,x):

            x = F.relu(self.layers['input'](x))
            x = F.dropout(x, p=self.dropout_rate, training=self.training)

            for i in range(self.nLayers):
                x = F.relu(self.layers[f'hidden{i}'](x))
                x = F.dropout(x, p=self.dropout_rate, training=self.training)

            # Raw scores, no activation on the output layer
            x = self.layers['output'](x)

            return x

    # Model instance and loss function
    ANN      = mnist_FFN(nUnits,nLayers,drop_rate)
    loss_fun = nn.CrossEntropyLoss()

    # Optimizer (selected by name), with L2 regularisation through weight_decay
    opti_fun  = getattr( torch.optim,optimizer_alg )
    optimizer = opti_fun(ANN.parameters(),lr=learning_rate,weight_decay=l2_lambda)

    return ANN,loss_fun,optimizer


def train_model(nUnits,nLayers,drop_rate,optimizer_alg,learning_rate,l2_lambda=0,num_epochs=60):
    """Create a new model with gen_model and train it on the global train_loader.

    nUnits, nLayers, drop_rate, optimizer_alg, learning_rate, l2_lambda
                  : forwarded unchanged to gen_model (l2_lambda defaults to 0)
    num_epochs    : number of passes over train_loader (default 60)

    Returns (train_acc, test_acc, losses, ANN): three lists with one Python
    float per epoch (accuracies in %, loss averaged over the batches) and the
    trained model.
    """

    # Model instance, loss function and optimizer
    ANN,loss_fun,optimizer = gen_model(nUnits,nLayers,drop_rate,optimizer_alg,learning_rate,l2_lambda)

    # Initialise the per-epoch logs
    losses    = []
    train_acc = []
    test_acc  = []

    for epoch_i in range(num_epochs):

        batch_acc  = []
        batch_loss = []

        for X,y in train_loader:

            # Forward propagation and loss
            yHat = ANN(X)
            loss = loss_fun(yHat,y)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Loss and accuracy from this batch, stored as plain floats so the
            # averages below do not depend on implicit tensor -> array casts
            batch_loss.append(loss.item())

            matches = (torch.argmax(yHat,dim=1) == y).float()
            batch_acc.append(100 * matches.mean().item())

        losses.append( float(np.mean(batch_loss)) )
        train_acc.append( float(np.mean(batch_acc)) )

        # Test accuracy on the first batch served by test_loader, with the
        # model in evaluation mode (dropout switched off) and no gradients
        ANN.eval()

        with torch.no_grad():
            X,y = next(iter(test_loader))
            yHat = ANN(X)
            test_matches = (torch.argmax(yHat,dim=1) == y).float()
            test_acc.append( 100 * test_matches.mean().item() )

        # Back to training mode for the next epoch
        ANN.train()

    return train_acc,test_acc,losses,ANN


In [None]:
# %% Ex. 1 - Continue ...

# Takes ~36 mins (with only 2 layers, 60 units each)
num_layers     = np.linspace(2,2,1,dtype=int)
num_units      = np.linspace(60,60,1,dtype=int)
drop_rate      = 0
optimizers     = ['SGD','RMSprop','Adam']
learning_rates = np.logspace(-4,-1,6)
l2_lambda      = 1e-3

# Output matrices, one cell per (optimizer, learning rate, units, layers)
shape = tuple(len(axis) for axis in (optimizers,learning_rates,num_units,num_layers))
performance_train,performance_test,training_times = (np.zeros(shape) for _ in range(3))

# Walk through every cell of the grid (last axis varies fastest, exactly like
# four nested loops would)
for cell in np.ndindex(shape):

    opt_i,lr_i,unit_i,layer_i = cell
    optimizer = optimizers[opt_i]
    lr        = learning_rates[lr_i]
    n_units   = num_units[unit_i]
    n_layers  = num_layers[layer_i]

    # Train one L2-regularised model and time it
    start_time = time.time()
    train_acc,test_acc,losses,ANN = train_model(n_units,n_layers,drop_rate,optimizer,lr,l2_lambda)
    duration = time.time() - start_time

    # Final performance = average accuracy over the last 10 epochs
    performance_train[cell] = np.mean(train_acc[-10:])
    performance_test[cell]  = np.mean(test_acc[-10:])
    training_times[cell]    = duration


In [None]:
# %% Exercise 2
#    We previously observed the batch normalization boosted model performance. Does it help here as well?
#    (Note: Best to pick just one learning rate for this experiment.)

# It does, and it does so incredibly well, as now the performance reaches
# virtually ceiling for any optimizer and learning rate (here shown just 0.0001
# and 0.1 for the sake of time)


In [None]:
# %% Ex. 2 - Continue ...

def gen_model(nUnits, nLayers, drop_rate, optimizer_alg, learning_rate, l2_lambda=0, doBN=True):
    """Build a feed-forward MNIST classifier with optional batch normalisation.

    nUnits        : number of units in the input layer and in each hidden layer
    nLayers       : number of hidden layers
    drop_rate     : dropout probability applied after every ReLU
    optimizer_alg : name of a class in torch.optim (e.g. 'SGD', 'RMSprop', 'Adam')
    learning_rate : learning rate handed to the optimizer
    l2_lambda     : value passed as the optimizer's weight_decay; defaults to 0
                    so that 5-argument calls written for the first version of
                    this function keep working
    doBN          : if True, a BatchNorm1d layer follows the input layer and
                    every hidden layer (applied before the ReLU)

    Returns the tuple (model, loss function, optimizer).
    """

    class mnist_FFN(nn.Module):
        def __init__(self, nUnits, nLayers, dropout_rate, doBN):
            super().__init__()

            self.nLayers = nLayers
            self.dropout_rate = dropout_rate
            self.doBN = doBN

            # Linear layers and (optionally) their batch-norm companions,
            # stored under matching names
            self.layers = nn.ModuleDict()
            self.bns    = nn.ModuleDict() if doBN else None

            self.layers['input'] = nn.Linear(784, nUnits)
            if self.doBN:
                self.bns['input'] = nn.BatchNorm1d(nUnits)

            for i in range(nLayers):
                self.layers[f'hidden{i}'] = nn.Linear(nUnits, nUnits)
                if self.doBN:
                    self.bns[f'hidden{i}'] = nn.BatchNorm1d(nUnits)

            # The output layer has no batch norm
            self.layers['output'] = nn.Linear(nUnits, 10)

        def forward(self, x):
            # Order within each stage: linear -> (batch norm) -> ReLU -> dropout
            x = self.layers['input'](x)
            if self.doBN:
                x = self.bns['input'](x)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout_rate, training=self.training)

            for i in range(self.nLayers):
                x = self.layers[f'hidden{i}'](x)
                if self.doBN:
                    x = self.bns[f'hidden{i}'](x)
                x = F.relu(x)
                x = F.dropout(x, p=self.dropout_rate, training=self.training)

            # Raw scores, no activation on the output layer
            x = self.layers['output'](x)
            return x

    # Model instance
    ANN = mnist_FFN(nUnits, nLayers, drop_rate, doBN)

    # Loss function and optimizer (selected by name, L2 through weight_decay)
    loss_fun = nn.CrossEntropyLoss()
    opti_fun = getattr(torch.optim, optimizer_alg)
    optimizer = opti_fun(ANN.parameters(), lr=learning_rate, weight_decay=l2_lambda)

    return ANN, loss_fun, optimizer


def train_model(nUnits,nLayers,drop_rate,optimizer_alg,learning_rate,l2_lambda=0,doBN=True,num_epochs=60):
    """Create a new model with gen_model and train it on the global train_loader.

    nUnits, nLayers, drop_rate, optimizer_alg, learning_rate
                  : forwarded unchanged to gen_model
    l2_lambda     : forwarded to gen_model; defaults to 0 so that 5-argument
                    calls elsewhere in the notebook do not raise a TypeError
    doBN          : forwarded to gen_model as keyword; defaults to True, the
                    value that used to be hard-coded here
    num_epochs    : number of passes over train_loader (default 60)

    Returns (train_acc, test_acc, losses, ANN): three lists with one Python
    float per epoch (accuracies in %, loss averaged over the batches) and the
    trained model.
    """

    # Model instance, loss function and optimizer
    ANN,loss_fun,optimizer = gen_model(nUnits,nLayers,drop_rate,optimizer_alg,learning_rate,l2_lambda,doBN=doBN)

    # Initialise the per-epoch logs
    losses    = []
    train_acc = []
    test_acc  = []

    for epoch_i in range(num_epochs):

        batch_acc  = []
        batch_loss = []

        for X,y in train_loader:

            # Forward propagation and loss
            yHat = ANN(X)
            loss = loss_fun(yHat,y)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Loss and accuracy from this batch, stored as plain floats so the
            # averages below do not depend on implicit tensor -> array casts
            batch_loss.append(loss.item())

            matches = (torch.argmax(yHat,dim=1) == y).float()
            batch_acc.append(100 * matches.mean().item())

        losses.append( float(np.mean(batch_loss)) )
        train_acc.append( float(np.mean(batch_acc)) )

        # Test accuracy on the first batch served by test_loader, with the
        # model in evaluation mode and no gradients
        ANN.eval()

        with torch.no_grad():
            X,y = next(iter(test_loader))
            yHat = ANN(X)
            test_matches = (torch.argmax(yHat,dim=1) == y).float()
            test_acc.append( 100 * test_matches.mean().item() )

        # Back to training mode for the next epoch
        ANN.train()

    return train_acc,test_acc,losses,ANN


In [None]:
# %% Ex. 2 - Continue ...

# Takes ~12 mins (with only 2 layers, 60 units each, only 2 lr)
num_layers     = np.linspace(2,2,1,dtype=int)
num_units      = np.linspace(60,60,1,dtype=int)
drop_rate      = 0
optimizers     = ['SGD','RMSprop','Adam']
learning_rates = np.logspace(-4,-1,2)
l2_lambda      = 0

# Output matrices, one cell per (optimizer, learning rate, units, layers)
shape = tuple(len(axis) for axis in (optimizers,learning_rates,num_units,num_layers))
performance_train,performance_test,training_times = (np.zeros(shape) for _ in range(3))

# Walk through every cell of the grid (last axis varies fastest, exactly like
# four nested loops would)
for cell in np.ndindex(shape):

    opt_i,lr_i,unit_i,layer_i = cell
    optimizer = optimizers[opt_i]
    lr        = learning_rates[lr_i]
    n_units   = num_units[unit_i]
    n_layers  = num_layers[layer_i]

    # Train one model and time it
    start_time = time.time()
    train_acc,test_acc,losses,ANN = train_model(n_units,n_layers,drop_rate,optimizer,lr,l2_lambda)
    duration = time.time() - start_time

    # Final performance = average accuracy over the last 10 epochs
    performance_train[cell] = np.mean(train_acc[-10:])
    performance_test[cell]  = np.mean(test_acc[-10:])
    training_times[cell]    = duration


In [None]:
# %% Exercise 3
#    The dataset contains 20,000 images. Do we need that many to get good accuracy? Pick one combination of
#    optimizer and learning rate, and train the model using only 2,000 images. Do you still get >95% accuracy?
#    How about 200 images?

# Surprisingly (or not), the models with RMSprop still work quite well, while
# SGD suffers the most, especially for smaller and smaller learning rates


In [None]:
# %% Ex. 3 - Continue ...

# Load data: hand the path straight to numpy so that it opens AND closes the
# file itself (a handle created with a bare open(...) was never closed)
data = np.loadtxt('sample_data/mnist_train_small.csv',delimiter=',')

# Split labels from data
labels = data[:,0]
data   = data[:,1:]

# Normalise data (original range is (0,255))
data_norm = data / np.max(data)

# Set number of samples
n_samples = 200

# Shuffle and slice (fixed seed, so the same subset is drawn on every run)
np.random.seed(99)
shuffler = np.random.permutation(data_norm.shape[0])

data_norm = data_norm[shuffler[:n_samples]]
labels    = labels[shuffler[:n_samples]]

# Rebuild the datasets and loaders from the reduced data. train_model reads the
# global train_loader/test_loader, so without this step the experiment would
# keep training on the loaders created from the full dataset.
data_tensor   = torch.tensor(data_norm).float()
labels_tensor = torch.tensor(labels).long()

train_data,test_data,train_labels,test_labels = train_test_split(data_tensor,labels_tensor,test_size=0.1)

train_data = TensorDataset(train_data,train_labels)
test_data  = TensorDataset(test_data,test_labels)

# batch_size is the value set when the loaders were first created
train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True,drop_last=True)
test_loader  = DataLoader(test_data,batch_size=test_data.tensors[0].shape[0])


In [None]:
# %% Ex. 3 - Continue ...

# Takes ~2 mins for 2000 images, and ~15 secs for 200 (with only 2 layers, 60 units each)
num_layers     = np.linspace(2,2,1,dtype=int)
num_units      = np.linspace(60,60,1,dtype=int)
drop_rate      = 0
optimizers     = ['SGD','RMSprop','Adam']
learning_rates = np.logspace(-4,-1,6)

# No L2 penalty in this exercise. The value is still passed explicitly because
# the train_model definition in force at this point (the Ex. 2 one) takes
# l2_lambda as an argument; calling it with 5 arguments raised a TypeError.
# NOTE(review): that train_model builds the network with doBN=True, so batch
# normalisation is active in this experiment - confirm that this is intended.
l2_lambda      = 0

# Preallocate output matrices
shape             = (len(optimizers),len(learning_rates),len(num_units),len(num_layers))
performance_train = np.zeros(shape)
performance_test  = np.zeros(shape)
training_times    = np.zeros(shape)

# Buckle up, here's the experiment!
for opt_i, optimizer in enumerate(optimizers):
    for lr_i, lr in enumerate(learning_rates):
        for unit_i, n_units in enumerate(num_units):
            for layer_i, n_layers in enumerate(num_layers):

                start_time = time.time()

                train_acc,test_acc,losses,ANN = train_model(
                                                            n_units,
                                                            n_layers,
                                                            drop_rate,
                                                            optimizer,
                                                            lr,
                                                            l2_lambda)

                duration = time.time() - start_time

                # Final performance = average accuracy over the last 10 epochs
                performance_train[opt_i, lr_i, unit_i, layer_i] = np.mean(train_acc[-10:])
                performance_test[opt_i, lr_i, unit_i, layer_i]  = np.mean(test_acc[-10:])
                training_times[opt_i, lr_i, unit_i, layer_i]    = duration
