In [None]:
# %% Deep learning - Section 10.100
#    Optimizer comparison

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F
import pandas              as pd
import scipy.stats         as stats
import time

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Create data

# Cluster size and spread (scale applied to the standard-normal noise)
n_by_clust = 300
blurring   = 1

# Cluster centres, given as [dimension 1, dimension 2]
A = [1,1]
B = [5,1]
C = [4,4]

# Sample each cluster: first all dimension-1 values, then all dimension-2
# values, each one scattered around the matching centre coordinate
a = [ centre + blurring*np.random.randn(n_by_clust) for centre in A ]
b = [ centre + blurring*np.random.randn(n_by_clust) for centre in B ]
c = [ centre + blurring*np.random.randn(n_by_clust) for centre in C ]

# Class labels 0, 1, 2, one run of n_by_clust per cluster (same order as the data)
labels_np = np.repeat([0.,1.,2.],n_by_clust)

# Stack the clusters side by side, then transpose to samples-by-dimensions
data_np = np.concatenate((a,b,c),axis=1).T

# PyTorch tensors: float features, integer (long) class labels for CCE
data   = torch.tensor(data_np,dtype=torch.float32)
labels = torch.tensor(labels_np,dtype=torch.long)


In [None]:
# %% Plotting

# Figure with a golden-ratio aspect (width = phi * height)
phi = ( 1 + np.sqrt(5) ) / 2
fig = plt.figure(figsize=(6*phi,6))

# One marker per cluster, drawn in label order so the default colours match
for group_i,marker in enumerate(['s','o','^']):
    members = np.where(labels==group_i)[0]
    plt.plot(data[members,0],data[members,1],marker,alpha=.75)

# Decorations
plt.title('Some clusters')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.grid()

# Write the figure to disk before showing it, then hand it to the Colab
# download helper
figure_name = 'figure76_optimizer_comparison.png'
plt.savefig(figure_name)

plt.show()

files.download(figure_name)


In [None]:
# %% Split into train and test data

# Hold out 10% of the samples for testing (split done by scikit-learn)
train_data,test_data,train_labels,test_labels = train_test_split(data,labels,test_size=0.1)

# Pair features with labels in PyTorch datasets (the right-hand side is
# evaluated first, so it still sees the raw tensors from the split)
train_data,test_data = ( TensorDataset(train_data,train_labels),
                         TensorDataset(test_data,test_labels) )

# Loaders: shuffled mini-batches for training (incomplete last batch dropped),
# and the whole test set served as one single batch
batch_size   = 16
train_loader = DataLoader(train_data,shuffle=True,drop_last=True,batch_size=batch_size)
test_loader  = DataLoader(test_data,batch_size=len(test_data))


In [None]:
# %% Create the model

def gen_model(optimizer_alg,lr=0.1):
    """Build a fresh classifier together with its loss function and optimizer.

    Parameters
    ----------
    optimizer_alg : str
        Name of an optimizer class in `torch.optim` (e.g. 'SGD', 'RMSprop',
        'Adam'). It is looked up with `getattr`, so an unknown name raises
        AttributeError.
    lr : float, optional
        Learning rate handed to the optimizer. Defaults to 0.1, the value that
        was previously hard-coded, so existing calls behave as before.

    Returns
    -------
    ANN : nn.Module
        Newly initialised network: 2 inputs -> 8 -> 8 -> 3 outputs.
    loss_fun : nn.CrossEntropyLoss
        Loss function instance.
    optimizer : torch.optim.Optimizer
        Optimizer of the requested type, bound to the parameters of `ANN`.
    """

    class model(nn.Module):
        def __init__(self):
            super().__init__()

            # Architecture
            self.input  = nn.Linear(2,8)
            self.hid1   = nn.Linear(8,8)
            self.output = nn.Linear(8,3)

        # Forward propagation
        def forward(self,x):
            x = F.relu(self.input(x))
            x = F.relu(self.hid1(x))

            # No softmax here: the raw output-layer values are returned
            x = self.output(x)

            return x

    # Model instance
    ANN = model()

    # Loss function and optimizer (optimizer class fetched by name)
    loss_fun  = nn.CrossEntropyLoss()
    opti_fun  = getattr( torch.optim,optimizer_alg )
    optimizer = opti_fun(ANN.parameters(),lr=lr)

    return ANN,loss_fun,optimizer


In [None]:
# %% Test momentum

# Build a throwaway model only to inspect the optimizer's settings
# (swap 'Adam' for 'SGD' or 'RMSprop' to compare their parameters);
# the optimizer is the last element of the tuple returned by gen_model
optim = gen_model('Adam')[-1]
print(optim)


In [None]:
# %% Function to train the model

def train_model(optimizer_alg,num_epochs=50):
    """Train a fresh model with the requested optimizer and track its metrics.

    The model, loss function and optimizer come from `gen_model`. Training
    batches are read from the module-level `train_loader`; after every epoch
    the model is evaluated on the first batch of the module-level
    `test_loader` (which this notebook configures to hold the whole test set).

    Parameters
    ----------
    optimizer_alg : str
        Optimizer name forwarded to `gen_model` (e.g. 'SGD', 'RMSprop', 'Adam').
    num_epochs : int, optional
        Number of passes over the training data. Defaults to 50, the value
        that was previously hard-coded.

    Returns
    -------
    train_acc : list of float
        Per-epoch training accuracy (%), averaged over the batches.
    test_acc : list of float
        Per-epoch test accuracy (%).
    losses : list of float
        Per-epoch training loss, averaged over the batches.
    ANN : nn.Module
        The trained model (left in eval mode).
    """

    # Model instance
    ANN,loss_fun,optimizer = gen_model(optimizer_alg)

    # Initialise
    losses    = []
    train_acc = []
    test_acc  = []

    # Epochs loop
    for epoch_i in range(num_epochs):

        # Train mode on
        ANN.train()

        # Initialise and loop over batches
        batch_losses = []
        batch_acc    = []

        for X,y in train_loader:

            # Forward propagation and loss
            yHat = ANN(X)
            loss = loss_fun(yHat,y)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Loss and accuracy from this batch, stored as plain Python
            # floats so the lists hold no tensors and average cleanly
            batch_losses.append(loss.item())

            matches = (torch.argmax(yHat,dim=1) == y).float()  # 1 = correct
            batch_acc.append(100*matches.mean().item())        # percent

        # Average train accuracy and losses from batches
        train_acc.append(np.mean(batch_acc))
        losses.append(np.mean(batch_losses))

        # Test accuracy (eval mode, autograd off)
        ANN.eval()
        X,y = next(iter(test_loader))
        with torch.no_grad():
            yHat = ANN(X)
        test_matches = (torch.argmax(yHat,dim=1) == y).float()
        test_acc.append(100*test_matches.mean().item())

    return train_acc,test_acc,losses,ANN


In [None]:
# %% Function to plot the results

def plot_results(optimizer_alg):
    """Draw a 2x2 summary figure for the most recently trained model.

    Only the optimizer name is passed in (it is used for the panel titles).
    Everything else is read from module-level names that must already exist:
    `ANN` (trained model), `data` and `labels` (full dataset), and `losses`,
    `train_acc`, `test_acc` (per-epoch metrics from training).

    Panels: losses, train/test accuracy, accuracy by group, and a scatterplot
    of the data with misclassified samples marked by red crosses.

    The figure is always saved as 'figure77_optimizer_comparison.png' and then
    passed to `files.download`, so repeated calls write the same file name.

    Parameters
    ----------
    optimizer_alg : str
        Optimizer name shown in the panel titles.
    """

    # Compute accuracy over entire dataset (train and test) ...
    # Inference only, so autograd is switched off (as in the training
    # function's evaluation step); the predictions themselves are unchanged
    with torch.no_grad():
        yHat = ANN(data)
    predictions = torch.argmax(yHat,dim=1)
    accuracy    = (predictions==labels).float()
    total_acc   = torch.mean(100*accuracy).item()

    # ... and accuracy by group
    accuracy_by_group = np.zeros(3)
    for i in range(3):
        accuracy_by_group[i] = 100*torch.mean(accuracy[labels==i]).item()

    # Create the figure
    phi = ( 1 + np.sqrt(5) ) / 2
    fig,ax = plt.subplots(2,2,figsize=(6*phi,6))

    # Plot the loss function
    ax[0,0].plot(losses)
    ax[0,0].set_ylabel('Loss')
    ax[0,0].set_xlabel('Epoch')
    ax[0,0].set_title(f'{optimizer_alg}: Losses')

    # Plot the accuracy functions
    ax[0,1].plot(train_acc,label='Train')
    ax[0,1].plot(test_acc,label='Test')
    ax[0,1].set_ylabel('Accuracy (%)')
    ax[0,1].set_xlabel('Epoch')
    ax[0,1].set_title(f'{optimizer_alg}: Accuracy')
    ax[0,1].legend()

    # Plot overall accuracy by group (y-axis zoomed around the observed range)
    ax[1,0].bar(range(3),accuracy_by_group)
    ax[1,0].set_ylim([np.min(accuracy_by_group)-5,np.max(accuracy_by_group)+5])
    ax[1,0].set_xticks([0,1,2])
    ax[1,0].set_xlabel('Group')
    ax[1,0].set_ylabel('Accuracy (%)')
    ax[1,0].set_title(f'{optimizer_alg}: Accuracy by group')

    # Scatterplot of correct and incorrect labeled data
    colorShapes = [ 's','o','^' ]
    for i in range(3):
        ax[1,1].plot(data[labels==i,0],data[labels==i,1],
                    colorShapes[i],alpha=.3,label=f'Group {i}')

        # Overlay a red cross on every misclassified sample of this group
        idxErr = (accuracy==0) & (labels==i)
        ax[1,1].plot(data[idxErr,0],data[idxErr,1],'rx')

    ax[1,1].set_title(f'{optimizer_alg}: Total accuracy: {total_acc:.2f}%')
    ax[1,1].set_xlabel('Dimension 1')
    ax[1,1].set_ylabel('Dimension 2')
    ax[1,1].legend()

    plt.tight_layout()

    plt.savefig('figure77_optimizer_comparison.png')

    plt.show()

    files.download('figure77_optimizer_comparison.png')


In [None]:
# %% Test model once

# Single run with one optimizer. The results are bound at module level
# because plot_results reads train_acc, test_acc, losses and ANN from there
optimizer_type = 'Adam'

(train_acc,
 test_acc,
 losses,
 ANN) = train_model(optimizer_type)

plot_results(optimizer_type)


In [None]:
# %% Parametric experiment over three optimizers

# One summary string per optimizer
performance = []

for optimizer in ('SGD','RMSprop','Adam'):

    # Train from scratch, then draw the diagnostic figure; the results are
    # bound at module level because plot_results reads them from there
    train_acc,test_acc,losses,ANN = train_model(optimizer)
    plot_results(optimizer)

    # Summarise each curve by its average over the last 10 epochs
    train,test = ( np.mean(curve[-10:]) for curve in (train_acc,test_acc) )

    performance.append(f'{optimizer}: train {train:.1f}%, test {test:.1f}%')

print(performance)


In [None]:
# %% Exercise 1
#    Accuracy seems to be different between the different qwerties categories. Is this consistent across the optimizers?
#    Or does it seem like some optimizers are better at some categories? How do you interpret your answer, and what does
#    it indicate about metaparameters of DL and their effects on performance?

# There are differences across optimizers; I'd guess they are mostly due to
# randomness in the training, but it's clear that the distribution of the data
# plays a role too; for example, in this case there is more overlap between
# groups 1 and 2, and indeed the classification accuracy is lower for either of
# those two categories


In [None]:
# %% Exercise 2
#    The results here indicate that all three optimizers perform roughly equally well, but SGD needs more training. Is
#    that still the case with a smaller or larger learning rate?

# Trying with lr = 0.001 and lr = 0.1. With lr = 0.001 SGD can't keep up with the
# learning and the model basically classifies only some of the data accurately,
# while the performance of RMSprop and Adam is still optimal. With lr = 0.1 all
# the models perform quite well, with SGD even slightly better; interestingly, in
# this case the output of Adam is a bit bumpy, and I'm not sure whether that is
# related to the losses, because the scale of the losses is still quite small


In [None]:
# %% Exercise 3
#    The method of showing the final performance (printing out a list) is... not very pretty. What kind of visualization
#    do you think would better illustrate the performances across the optimizers? Code it!

# A bar plot should do the trick

# Raw data strings, one per optimizer, in the form
# '<name>: train <xx.x>%, test <yy.y>%'
results = performance

# Extract info
optimizers = []
train_acc  = []
test_acc   = []

for r in results:
    # '<name>: train <xx.x>%, test <yy.y>%' -> name, then the two numbers
    parts  = r.split(':')
    name   = parts[0]
    values = parts[1].split(',')
    train  = float(values[0].strip().split()[1].strip('%'))
    test   = float(values[1].strip().split()[1].strip('%'))

    optimizers.append(name)
    train_acc.append(train)
    test_acc.append(test)

# Bar plot
x = np.arange(len(optimizers))
width = 0.35

phi = ( 1 + np.sqrt(5) ) / 2
fig,ax = plt.subplots(figsize=(6*phi,6))

ax.grid(axis='y',linestyle='--',alpha=0.6)
bars1 = ax.bar(x-width/2,train_acc,width,label='Train accuracy',zorder=2)
bars2 = ax.bar(x+width/2,test_acc,width,label='Test accuracy',zorder=2)

# Labels and formatting
ax.set_ylabel('Accuracy (%)')
ax.set_title('Train and test accuracy by optimizer')
ax.set_xticks(x)
ax.set_xticklabels(optimizers)

# Keep the 85-100% window when every bar rises above 85%; otherwise lower the
# floor (never below 0) so that a poorly performing optimizer is still drawn
# and annotated instead of falling outside the axes
all_acc = train_acc + test_acc
y_floor = 85
if all_acc and min(all_acc) <= y_floor:
    y_floor = max(0,np.floor(min(all_acc))-5)
ax.set_ylim(y_floor,100)
ax.legend()

# Annotate bars with their height, slightly above the top edge
for bar in [*bars1,*bars2]:
    height = bar.get_height()
    ax.annotate(f'{height:.1f}%',
                xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3),
                textcoords="offset points",
                ha='center',va='bottom')

plt.tight_layout()

plt.savefig('figure86_optimizer_comparison_extra3.png')

plt.show()

files.download('figure86_optimizer_comparison_extra3.png')
