In [None]:
# %% Deep learning - Section 10.101
#    Code challenge 12: optimizer and ... something

#    1) Start from code from video 10.100
#    2) Compare performance of the three optimizers using a range of lr
#    3) Plot test accuracy (avg last 10 epochs) against learning rates
#    4) Use log spaced lr (from 0.0001 to 0.1, 20 vals)

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F
import pandas              as pd
import scipy.stats         as stats
import time

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Create data

# Reproducibility: seed the global NumPy generator (used for the cluster
# noise below) and the global PyTorch generator, so that a fresh
# "Restart & Run All" regenerates exactly the same clusters
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# General params (samples per cluster and std of the Gaussian noise)
n_by_clust = 300
blurring   = 1

# Centroids ([x,y] coordinates of each cluster centre)
A = [1,1]
B = [5,1]
C = [4,4]

# Generate data (for each cluster: a list with the x and the y coordinates)
a = [ A[0]+np.random.randn(n_by_clust)*blurring, A[1]+np.random.randn(n_by_clust)*blurring ]
b = [ B[0]+np.random.randn(n_by_clust)*blurring, B[1]+np.random.randn(n_by_clust)*blurring ]
c = [ C[0]+np.random.randn(n_by_clust)*blurring, C[1]+np.random.randn(n_by_clust)*blurring ]

# Labels (0, 1, 2 for clusters a, b, c; same order as the data matrix)
labels_np = np.hstack(( np.zeros((n_by_clust)),
                        np.ones((n_by_clust)),
                        2*np.ones((n_by_clust)) ))

# Data matrix (clusters stacked side by side, then transposed to samples x 2)
data_np = np.hstack((a,b,c)).T

# Data into PyTorch tensors (long format for CCE)
data   = torch.tensor(data_np).float()
labels = torch.tensor(labels_np).long()


In [None]:
# %% Plotting

# Golden-ratio shaped figure
phi = ( 1 + np.sqrt(5) ) / 2
fig = plt.figure(figsize=(6*phi,6))

# One scatter per cluster label (0, 1, 2), each drawn with its own marker
for cluster_id,marker in enumerate(('s','o','^')):
    rows = np.where(labels==cluster_id)[0]
    plt.plot(data[rows,0],data[rows,1],marker,alpha=.75)

# Decorations
plt.title('Some clusters')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.grid()

# Save to disk, display, then download the saved file (Colab)
plt.savefig('figure87_code_challenge_12.png')

plt.show()

files.download('figure87_code_challenge_12.png')


In [None]:
# %% Split into train and test data

# Hold out 10% of the samples as test set (scikit-learn)
train_data,test_data,train_labels,test_labels = train_test_split(data,labels,test_size=.1)

# Wrap each partition (samples + labels) into a PyTorch dataset
train_data,test_data = TensorDataset(train_data,train_labels),TensorDataset(test_data,test_labels)

# DataLoader objects: the test set goes into one single batch, the training
# set into shuffled mini-batches (a last incomplete batch is dropped)
batch_size   = 16
test_loader  = DataLoader(test_data,batch_size=len(test_data))
train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True,drop_last=True)


In [None]:
# %% Create the model

def gen_model(optimizer_alg,learning_rate):
    """
    Build a new network together with its loss function and optimizer.

    Parameters
    ----------
    optimizer_alg : str
        Name of an optimizer class in torch.optim (e.g. 'SGD', 'RMSprop', 'Adam').
    learning_rate : float
        Learning rate passed to the optimizer as `lr`.

    Returns
    -------
    tuple
        (model instance, nn.CrossEntropyLoss instance, optimizer instance)
    """

    class model(nn.Module):
        """Fully connected net: 2 inputs -> 8 -> 8 -> 3 outputs, ReLU in between."""

        def __init__(self):
            super().__init__()

            # Layers
            self.input  = nn.Linear(2,8)
            self.hid1   = nn.Linear(8,8)
            self.output = nn.Linear(8,3)

        def forward(self,x):
            # ReLU after the first two layers, raw values from the output layer
            hidden = F.relu(self.input(x))
            hidden = F.relu(self.hid1(hidden))
            return self.output(hidden)

    # Fresh network
    net = model()

    # Look the optimizer class up by name and bind it to the network parameters
    optimizer_cls = getattr(torch.optim,optimizer_alg)

    return net, nn.CrossEntropyLoss(), optimizer_cls(net.parameters(),lr=learning_rate)


In [None]:
# %% Test momentum and lr

# Print the optimizer settings; swap in 'SGD', 'RMSprop', or 'Adam'
optim = gen_model(optimizer_alg='Adam',learning_rate=0.01)[-1]
print(optim)


In [None]:
# %% Function to train the model

def train_model(optimizer_alg,learning_rate,num_epochs=50):
    """
    Train a fresh model from gen_model and track its performance per epoch.

    The mini-batches come from the global `train_loader`; after every epoch
    the model is evaluated on the first batch of the global `test_loader`.

    Parameters
    ----------
    optimizer_alg : str
        Optimizer name, forwarded to gen_model.
    learning_rate : float
        Learning rate, forwarded to gen_model.
    num_epochs : int, optional
        Number of training epochs (default 50, the previously hard-coded value).

    Returns
    -------
    train_acc : list of float
        Per-epoch average of the batch accuracies (percent).
    test_acc : list of float
        Per-epoch accuracy on the test batch (percent).
    losses : list of float
        Per-epoch average of the batch losses.
    ANN : nn.Module
        The trained model.
    """

    # Model instance
    ANN,loss_fun,optimizer = gen_model(optimizer_alg,learning_rate)

    # Initialise
    losses    = []
    train_acc = []
    test_acc  = []

    # Epochs loop
    for _ in range(num_epochs):

        # Train mode on
        ANN.train()

        # Initialise and loop over batches
        batch_losses = []
        batch_acc    = []

        for X,y in train_loader:

            # Forward propagation and loss
            yHat = ANN(X)
            loss = loss_fun(yHat,y)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Loss and accuracy from this batch, stored as plain Python
            # floats (.item()) so that the NumPy averages below never have
            # to convert tensors implicitly
            batch_losses.append(loss.item())

            matches = (torch.argmax(yHat,axis=1) == y).float()  # booleans -> numbers
            batch_acc.append( (100*torch.mean(matches)).item() )  # average and percent

        # Average train accuracy and losses from batches
        train_acc.append(np.mean(batch_acc))
        losses.append(np.mean(batch_losses))

        # Test accuracy (eval mode, autograd off)
        ANN.eval()
        X,y = next(iter(test_loader))
        with torch.no_grad():
            yHat = ANN(X)
        test_acc.append( (100*torch.mean((torch.argmax(yHat,axis=1)==y).float())).item() )

    return train_acc,test_acc,losses,ANN


In [None]:
# %% Parametric experiment over three optimizer and learning rates

# Takes ~6 mins
optimizers        = ['SGD','RMSprop','Adam']
learning_rates    = np.logspace(-4,-1,20)
performance_train = np.zeros((len(optimizers),len(learning_rates)))
performance_test  = np.zeros((len(optimizers),len(learning_rates)))

# Walk the (optimizer, learning rate) grid row by row; every cell stores the
# mean accuracy over the last 10 epochs of one training run
for i,j in np.ndindex(performance_train.shape):

    optimizer,lr = optimizers[i],learning_rates[j]
    train_acc,test_acc,losses,ANN = train_model(optimizer,lr)

    performance_train[i,j] = np.mean(train_acc[-10:])
    performance_test[i,j]  = np.mean(test_acc[-10:])


In [None]:
# %% Plotting

# Training accuracy of each optimizer across learning rates
phi = ( 1 + np.sqrt(5) ) / 2
fig = plt.figure(figsize=(6*phi,6))

# One line per optimizer (rows of performance_train)
for optimizer,accuracies in zip(optimizers,performance_train):
    plt.plot(learning_rates,accuracies,'o-',alpha=.75,label=f'{optimizer}')

# Learning rates are log spaced, so use a log x axis
plt.xscale('log')
plt.title('Optimizers over learning rates - Training')
plt.xlabel('Learning rates')
plt.ylabel('Mean accuracy (last 10 epochs)')
plt.legend()
plt.grid()

# Save to disk, display, then download the saved file (Colab)
plt.savefig('figure88_code_challenge_12.png')

plt.show()

files.download('figure88_code_challenge_12.png')


In [None]:
# %% Plotting

# Test accuracy of each optimizer across learning rates
phi = ( 1 + np.sqrt(5) ) / 2
fig = plt.figure(figsize=(6*phi,6))

# One line per optimizer (rows of performance_test)
for optimizer,accuracies in zip(optimizers,performance_test):
    plt.plot(learning_rates,accuracies,'o-',alpha=.75,label=f'{optimizer}')

# Learning rates are log spaced, so use a log x axis
plt.xscale('log')
plt.title('Optimizers over learning rates - Test')
plt.xlabel('Learning rates')
plt.ylabel('Mean accuracy (last 10 epochs)')
plt.legend()
plt.grid()

# Save to disk, display, then download the saved file (Colab)
plt.savefig('figure89_code_challenge_12.png')

plt.show()

files.download('figure89_code_challenge_12.png')


In [None]:
# %% Exercise 1
#    Which optimizer is best for quick learning? Re-run the code but average accuracy in the FIRST 10 training epochs
#    instead of the FINAL 10 training epochs.

# The picture is similar: RMSprop and Adam also learn faster, presumably
# again thanks to the adaptive lr that these two algorithms implement
# intrinsically

# Takes ~6 mins
optimizers        = ['SGD','RMSprop','Adam']
learning_rates    = np.logspace(-4,-1,20)
performance_train = np.zeros((len(optimizers),len(learning_rates)))
performance_test  = np.zeros((len(optimizers),len(learning_rates)))

# Same (optimizer, learning rate) grid as before, but every cell now stores
# the mean accuracy over the first 10 epochs of the training run
for i,j in np.ndindex(performance_train.shape):

    optimizer,lr = optimizers[i],learning_rates[j]
    train_acc,test_acc,losses,ANN = train_model(optimizer,lr)

    performance_train[i,j] = np.mean(train_acc[:10])
    performance_test[i,j]  = np.mean(test_acc[:10])


In [None]:
# %% Exercise 2
#    Similar question but for batch size. Try using batch sizes ranging from 2**4 (16) to 2**7 (128).

# What is this? A meta-parametric experiment! I set it up with a (monstrous)
# triple for loop to also run over some batch sizes. As one would expect,
# smaller batch sizes produce higher accuracies in general, but there's
# probably also an interaction with the effect of optimizer, with basic SGD
# performing worse than RMSprop or Adam. In other words, RMSprop and Adam are
# more robust to variations in batch sizes, and (assuming the data make it
# appropriate) one could use larger batch sizes to save up memory and
# computation time without too many concerns (?)

# Parametric experiment over three optimizer, learning rates and batch sizes
# (new train/test split, wrapped into PyTorch datasets)
train_data,test_data,train_labels,test_labels = train_test_split(data,labels,test_size=0.1)

train_data = TensorDataset(train_data,train_labels)
test_data  = TensorDataset(test_data,test_labels)

# Test loader with the whole test set in one batch
test_loader  = DataLoader(test_data,batch_size=test_data.tensors[0].shape[0])

# Takes ~11 mins
optimizers        = ['SGD','RMSprop','Adam']
learning_rates    = np.logspace(-4,-1,20)
batch_sizes       = [16,32,64,128]
performance_train = np.zeros((len(optimizers),len(learning_rates),len(batch_sizes)))
performance_test  = np.zeros((len(optimizers),len(learning_rates),len(batch_sizes)))

for i,optimizer in enumerate(optimizers):
    for j,lr in enumerate(learning_rates):
        for k,batch_size in enumerate(batch_sizes):

            # train_model reads the global train_loader, so it is rebuilt
            # here for every batch size before training
            train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True,drop_last=True)
            train_acc,test_acc,losses,ANN = train_model(optimizer,lr)

            # Mean accuracy over the last 10 epochs; the test matrix was
            # allocated but never filled before
            performance_train[i,j,k] = np.mean(train_acc[-10:])
            performance_test[i,j,k]  = np.mean(test_acc[-10:])


In [None]:
# %% Plotting

# Train data: one panel per optimizer, one line per batch size
phi = ( 1 + np.sqrt(5) ) / 2
fig,ax = plt.subplots(1,len(optimizers),figsize=(2*6*phi,6),squeeze=False)
ax = ax.ravel()  # always a 1D array of axes, whatever the number of panels

line_styles = ['-', '--', '-.', ':']
cmaps = plt.cm.plasma(np.linspace(.1,.9,len(batch_sizes)))

# performance_train is indexed as [optimizer, learning rate, batch size]
for panel_i,optimizer in enumerate(optimizers):

    # Same line style, colour and transparency for a given batch size in
    # every panel (the transparency used to be set for the first panel only)
    for i,batch_size in enumerate(batch_sizes):
        style = line_styles[i % len(line_styles)]
        ax[panel_i].plot(learning_rates,performance_train[panel_i,:,i],style+'o',color=cmaps[i],alpha=.75,label=f'{optimizer} - Batch size: {batch_size}')

    # Panel decorations, set once per panel rather than once per line
    ax[panel_i].set_title(optimizer)
    ax[panel_i].set_xlabel('Learning rates')
    ax[panel_i].set_ylabel('Mean accuracy (last 10 epochs)')
    ax[panel_i].set_xscale('log')
    ax[panel_i].legend()
    ax[panel_i].grid(True)

plt.suptitle('Optimizers over learning rates - Train')

# Save to disk, display, then download the saved file (Colab)
plt.savefig('figure91_code_challenge_12_extra2.png')

plt.show()

files.download('figure91_code_challenge_12_extra2.png')
