In [None]:
# %% Deep learning - Section 10.98
#    SGD with momentum

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F
import pandas              as pd
import scipy.stats         as stats
import time

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Create data

# General params
n_by_clust = 300
blurring   = 1

# Centroids (x,y) of the three clusters
A, B, C = [1,1], [5,1], [4,4]

# Generate data: for every cluster, one noisy vector per dimension (x first,
# then y), each made of n_by_clust samples scattered around the centroid
a = [ coord + np.random.randn(n_by_clust)*blurring for coord in A ]
b = [ coord + np.random.randn(n_by_clust)*blurring for coord in B ]
c = [ coord + np.random.randn(n_by_clust)*blurring for coord in C ]

# Labels (0, 1, 2), each repeated once per sample of the matching cluster
labels_np = np.repeat( np.arange(3.), n_by_clust )

# Data matrix (samples by dimensions)
data_np = np.hstack((a,b,c)).T

# Data into PyTorch tensors (long format for CCE)
data   = torch.tensor(data_np).float()
labels = torch.tensor(labels_np).long()


In [None]:
# %% Plotting

phi = ( 1 + np.sqrt(5) ) / 2
fig = plt.figure(figsize=(6*phi,6))

# One marker style per cluster, drawn in label order (0, 1, 2)
for cluster_id,marker in enumerate(['s','o','^']):
    members = np.where(labels==cluster_id)[0]
    plt.plot(data[members,0],data[members,1],marker,alpha=.75)

# Figure cosmetics
plt.title('Some clusters')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.grid()

# Save to disk before showing, then fetch the file from the Colab runtime
plt.savefig('figure69_sgd_momentum.png')

plt.show()

files.download('figure69_sgd_momentum.png')


In [None]:
# %% Split into train and test data

# Hold out 10% of the samples for testing (scikit-learn)
train_data,test_data,train_labels,test_labels = train_test_split(
    data,
    labels,
    test_size=0.1 )

# Wrap each partition (samples + labels) into a PyTorch dataset
train_data,test_data = ( TensorDataset(train_data,train_labels),
                         TensorDataset(test_data,test_labels) )

# DataLoader objects: shuffled mini-batches for training (incomplete last batch
# dropped), and one single batch holding all the test samples
batch_size   = 16
test_loader  = DataLoader(test_data,batch_size=len(test_data))
train_loader = DataLoader(train_data,
                          batch_size=batch_size,
                          shuffle=True,
                          drop_last=True)


In [None]:
# %% Create the model

def gen_model(momentum,lr=0.01):
    """
    Build a fresh classifier together with its loss function and optimizer.

    Parameters
    ----------
    momentum : float
        Momentum value handed to `torch.optim.SGD`.
    lr : float, optional
        Learning rate handed to `torch.optim.SGD`; defaults to 0.01.

    Returns
    -------
    ANN : nn.Module
        New, untrained network (2 inputs -> 8 -> 8 -> 3 outputs).
    loss_fun : nn.CrossEntropyLoss
        Loss function.
    optimizer : torch.optim.SGD
        Optimizer bound to the parameters of `ANN`.
    """

    class model(nn.Module):
        def __init__(self):
            super().__init__()

            # Architecture
            self.input  = nn.Linear(2,8)
            self.hid1   = nn.Linear(8,8)
            self.output = nn.Linear(8,3)

        # Forward propagation
        def forward(self,x):
            x = F.relu(self.input(x))
            x = F.relu(self.hid1(x))

            # Raw outputs, no activation applied on the last layer
            x = self.output(x)

            return x

    # Model instance
    ANN = model()

    # Loss function and optimizer (note the extra input into optimizer)
    loss_fun  = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(ANN.parameters(),lr=lr,momentum=momentum)

    return ANN,loss_fun,optimizer


In [None]:
# %% Test momentum

# The optimizer is the last of the three objects returned by gen_model; printing
# it shows its settings
optim = gen_model(momentum=.9)[-1]
print(optim)


In [None]:
# %% Function to train the model

# Epochs
num_epochs = 50

def train_model(momentum):
    """
    Train a fresh model for `num_epochs` epochs with the given momentum.

    Reads the module-level `num_epochs`, `train_loader` and `test_loader`, and
    builds the model through `gen_model(momentum)`.

    Parameters
    ----------
    momentum : float
        Momentum value forwarded to `gen_model`.

    Returns
    -------
    train_acc : list of float
        Per-epoch training accuracy (%), averaged over the mini-batches.
    test_acc : list of float
        Per-epoch accuracy (%) on the first batch yielded by `test_loader`.
    losses : list of float
        Per-epoch training loss, averaged over the mini-batches.
    ANN : nn.Module
        The trained model.
    """

    # Model instance
    ANN,loss_fun,optimizer = gen_model(momentum)

    # Initialise
    losses    = []
    train_acc = []
    test_acc  = []

    # Epochs loop
    for epoch_i in range(num_epochs):

        # Train mode on
        ANN.train()

        # Initialise and loop over batches
        batch_losses = []
        batch_acc    = []

        for X,y in train_loader:

            # Forward propagation and loss
            yHat = ANN(X)
            loss = loss_fun(yHat,y)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Compute loss and accuracy from this batch; both are stored as
            # plain Python floats (.item()), so that the NumPy averaging below
            # does not depend on implicit tensor-to-array conversion
            batch_losses.append(loss.item())

            matches     = torch.argmax(yHat,dim=1) == y   # booleans
            matches_num = matches.float()                 # convert to numbers
            acc_percent = 100*matches_num.mean().item()   # average and percent
            batch_acc.append(acc_percent)

        # Average train accuracy and losses from batches
        train_acc.append(np.mean(batch_acc))
        losses.append(np.mean(batch_losses))

        # Test accuracy (turn autograd off)
        ANN.eval()
        X,y = next(iter(test_loader))
        with torch.no_grad():
            yHat = ANN(X)
        test_matches = (torch.argmax(yHat,dim=1) == y).float()
        test_acc.append( 100*test_matches.mean().item() )

    return train_acc,test_acc,losses,ANN


In [None]:
# %% Functions for 1D smoothing filter

# Improved for edge effects - adaptive window
def smooth_adaptive(x,k):
    """
    Smooth a sequence with a centred moving average whose window is clipped at
    the edges, so that the first and last samples are averaged over fewer points
    instead of being padded.

    Parameters
    ----------
    x : sequence or array of numbers
        Values to smooth, iterated along the first axis.
    k : int
        Nominal window length. The window spans `k//2` samples on each side of
        the current one, so away from the edges an odd `k` averages `k` samples
        and an even `k` averages `k+1` samples.

    Returns
    -------
    smoothed : np.ndarray
        Smoothed values, same shape as `x`. Floating-point (and complex) input
        keeps its dtype, anything else (e.g., integers, booleans) is promoted
        to float so that the averages are not truncated.

    Raises
    ------
    ValueError
        If `k` is negative (the windows would be empty).
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # np.zeros_like on integer/boolean input would produce an integer/boolean
    # output and silently truncate every mean, hence the promotion to float
    values = np.asarray(x)
    if not np.issubdtype(values.dtype,np.inexact):
        values = values.astype(float)

    smoothed = np.zeros_like(values)
    half_k   = k // 2
    n        = len(values)

    for i in range(n):
        # Window limits clipped to the valid index range
        start       = max(0, i-half_k)
        end         = min(n, i+half_k + 1)
        smoothed[i] = np.mean(values[start:end])

    return smoothed


In [None]:
# %% Parametric experiment over momenta

momenta = [0,0.5,0.9,0.95,0.999]
results = np.zeros((num_epochs,len(momenta),3))

# Test all momentum vals on the same data; the last axis of results stores, in
# order, the smoothed train accuracy, test accuracy, and losses
for i,momentum in enumerate(momenta):
    train_acc,test_acc,losses,ANN = train_model(momentum)

    for k,curve in enumerate((train_acc,test_acc,losses)):
        results[:,i,k] = smooth_adaptive(curve,5)


In [None]:
# %% Plotting

# Notice how momenta between 0.9 and 0.95 tend to produce the better learning
# process; lower values are still fine but slower, but higher (~1) make the
# learning collapse

phi = ( 1 + np.sqrt(5) ) / 2
fig,ax = plt.subplots(1,3,figsize=(1.5*6*phi,6))

# One panel per stored metric, one line per momentum value
panel_titles = ('Train','Test','Losses')

for i,panel_title in enumerate(panel_titles):
    ax[i].plot(results[:,:,i])
    ax[i].legend(momenta)
    ax[i].set_xlabel('Epochs')
    ax[i].grid()
    ax[i].set_title(panel_title)

    # The first two panels show accuracies, the last one shows the losses
    if i < 2:
        ax[i].set_ylabel('Accuracy (%)')
        ax[i].set_ylim([20,100])
    else:
        ax[i].set_ylabel('Loss')
        ax[i].set_ylim([0,2])

# Save to disk before showing, then fetch the file from the Colab runtime
plt.savefig('figure70_sgd_momentum.png')

plt.show()

files.download('figure70_sgd_momentum.png')


In [None]:
# %% Exercise 1
#    Now that you see the results across a broad range of beta (momentum) parameters, try re-running the experiment
#    using a narrower range. For example, you don't need to test b=0 or b=.999.

# Indeed there is a general improvement for higher momenta, if ranging between
# 0.5 and 0.95

# Re-run (momenta from 0.55 to 0.95 in steps of 0.05)
momenta = np.arange(0.55,0.96,0.05)
results = np.zeros((num_epochs,len(momenta),3))

for i,momentum in enumerate(momenta):
    train_acc,test_acc,losses,ANN = train_model(momentum)

    # Last axis: smoothed train accuracy, test accuracy, and losses
    for k,curve in enumerate((train_acc,test_acc,losses)):
        results[:,i,k] = smooth_adaptive(curve,5)

# Re-plot
phi = ( 1 + np.sqrt(5) ) / 2
fig,ax = plt.subplots(1,3,figsize=(1.5*6*phi,6))

# One colour per momentum value, sampled along the plasma colormap
cmaps        = plt.cm.plasma(np.linspace(.1,.9,len(momenta)))
panel_titles = ('Train','Test','Losses')

for i,panel_title in enumerate(panel_titles):

    # Lines are added in increasing momentum order within each panel
    for j,momentum_val in enumerate(momenta):
        ax[i].plot(results[:,j,i], color=cmaps[j], alpha=0.8, label=f"{momentum_val:.2f}")

    ax[i].legend(title='Momenta',fontsize=9)
    ax[i].set_xlabel('Epochs')
    ax[i].grid()
    ax[i].set_title(panel_title)

    # The first two panels show accuracies, the last one shows the losses
    if i < 2:
        ax[i].set_ylabel('Accuracy (%)')
        ax[i].set_ylim([20,100])
    else:
        ax[i].set_ylabel('Loss')
        ax[i].set_ylim([0,2])

# Save to disk before showing, then fetch the file from the Colab runtime
plt.savefig('figure71_sgd_momentum_extra1.png')

plt.show()

files.download('figure71_sgd_momentum_extra1.png')


In [None]:
# %% Exercise 2
#    The beta parameter multiplies the learning rate in the computation (see formula in slides). That means that these
#    results will interact with the learning rate. Repeat the experiment using a different learning rate.

# Trying with lr = 0.001 and lr = 0.1; with a smaller lr, the gradual effect of
# higher momenta becomes more evident, while with a larger lr, the effect is
# reversed and the momentum is actually detrimental (i.e., the basic SGD seems
# more stable); not sure how to explain this to myself, besides that bigger steps
# might interact with the momentum and make the gradient jump here and there
# without settling in one minimum (?)


In [None]:
# %% Exercise 3
#    If you wanted to test the relationship between momentum and learning rate in a full parametric experiment, how would
#    you set it up? Would you store the loss/accuracy over all epochs?

# One can slightly modify the functions to generate and train the model, and then
# run a nested loop to go over changing lrs and momenta. One technically does
# not need the loss/accuracy over all epochs for visualisation (only the final
# accuracies and losses). See code below.
# As for the interpretation, one can see that there are two main 'corners' of the
# parameter space, where the (final) accuracy/losses are lower/higher: 1) high
# lr and large momentum, and 2) low lr and small momentum. Everything in-between
# looks like a fairly happy parameter flatland.


In [None]:
# %% Create the model

def gen_model(momentum,lr=0.01):
    """
    Build a fresh classifier together with its loss function and optimizer.

    This redefinition replaces the earlier one-argument `gen_model`; `lr`
    defaults to the value that version hard-coded (0.01), so that calls passing
    only the momentum keep working after this cell has been run.

    Parameters
    ----------
    momentum : float
        Momentum value handed to `torch.optim.SGD`.
    lr : float, optional
        Learning rate handed to `torch.optim.SGD`; defaults to 0.01.

    Returns
    -------
    ANN : nn.Module
        New, untrained network (2 inputs -> 8 -> 8 -> 3 outputs).
    loss_fun : nn.CrossEntropyLoss
        Loss function.
    optimizer : torch.optim.SGD
        Optimizer bound to the parameters of `ANN`.
    """

    class model(nn.Module):
        def __init__(self):
            super().__init__()

            # Architecture
            self.input  = nn.Linear(2,8)
            self.hid1   = nn.Linear(8,8)
            self.output = nn.Linear(8,3)

        # Forward propagation
        def forward(self,x):
            x = F.relu(self.input(x))
            x = F.relu(self.hid1(x))

            # Raw outputs, no activation applied on the last layer
            x = self.output(x)

            return x

    # Model instance
    ANN = model()

    # Loss function and optimizer (note the extra input into optimizer)
    loss_fun  = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(ANN.parameters(),lr=lr,momentum=momentum)

    return ANN,loss_fun,optimizer


In [None]:
# %% Function to train the model

# Epochs
num_epochs = 50

def train_model(momentum,lr=0.01):
    """
    Train a fresh model for `num_epochs` epochs with the given momentum and
    learning rate.

    Reads the module-level `num_epochs`, `train_loader` and `test_loader`, and
    builds the model through `gen_model(momentum,lr)`. This redefinition
    replaces the earlier one-argument `train_model`; `lr` defaults to 0.01 (the
    learning rate used by that version), so that calls passing only the momentum
    keep working after this cell has been run.

    Parameters
    ----------
    momentum : float
        Momentum value forwarded to `gen_model`.
    lr : float, optional
        Learning rate forwarded to `gen_model`; defaults to 0.01.

    Returns
    -------
    train_acc : list of float
        Per-epoch training accuracy (%), averaged over the mini-batches.
    test_acc : list of float
        Per-epoch accuracy (%) on the first batch yielded by `test_loader`.
    losses : list of float
        Per-epoch training loss, averaged over the mini-batches.
    ANN : nn.Module
        The trained model.
    """

    # Model instance
    ANN,loss_fun,optimizer = gen_model(momentum,lr)

    # Initialise
    losses    = []
    train_acc = []
    test_acc  = []

    # Epochs loop
    for epoch_i in range(num_epochs):

        # Train mode on
        ANN.train()

        # Initialise and loop over batches
        batch_losses = []
        batch_acc    = []

        for X,y in train_loader:

            # Forward propagation and loss
            yHat = ANN(X)
            loss = loss_fun(yHat,y)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Compute loss and accuracy from this batch; both are stored as
            # plain Python floats (.item()), so that the NumPy averaging below
            # does not depend on implicit tensor-to-array conversion
            batch_losses.append(loss.item())

            matches     = torch.argmax(yHat,dim=1) == y   # booleans
            matches_num = matches.float()                 # convert to numbers
            acc_percent = 100*matches_num.mean().item()   # average and percent
            batch_acc.append(acc_percent)

        # Average train accuracy and losses from batches
        train_acc.append(np.mean(batch_acc))
        losses.append(np.mean(batch_losses))

        # Test accuracy (turn autograd off)
        ANN.eval()
        X,y = next(iter(test_loader))
        with torch.no_grad():
            yHat = ANN(X)
        test_matches = (torch.argmax(yHat,dim=1) == y).float()
        test_acc.append( 100*test_matches.mean().item() )

    return train_acc,test_acc,losses,ANN


In [None]:
# %% Parametric experiment over momenta and learning rates

# Takes ~8 mins
# Parameter grid: 9 momenta (0.55 to 0.95) by 20 log-spaced learning rates
momenta = np.arange(0.55,0.96,0.05)
lrs     = np.logspace(-3,-1,20)
results = np.zeros((num_epochs,len(momenta),len(lrs),3))

# One training run per (momentum, learning rate) pair; the last axis of results
# stores, in order, the train accuracy, test accuracy, and losses (unsmoothed)
for i,momentum in enumerate(momenta):
    for j,lr in enumerate(lrs):
        train_acc,test_acc,losses,ANN = train_model(momentum,lr)

        for k,curve in enumerate((train_acc,test_acc,losses)):
            results[:,i,j,k] = curve


In [None]:
# %% Plotting

from mpl_toolkits.mplot3d import Axes3D
from matplotlib           import cm

# Meshgrid of momenta and learning rates ('ij' indexing, so that rows follow the
# momenta and columns the learning rates, like the first two axes of the slices
# taken from results below)
M, LR = np.meshgrid(momenta, lrs, indexing='ij')

# Get final epoch values (momenta by learning rates)
final_train = results[-1, :, :, 0]
final_test  = results[-1, :, :, 1]
final_loss  = results[-1, :, :, 2]

# Plot
phi = ( 1 + np.sqrt(5) ) / 2
fig = plt.figure(figsize=(1.5*6*phi, 6))

# The matrices to plot get their own name: binding them to `data` would
# overwrite the dataset tensor created at the top of the notebook, and break any
# re-run of the cells that plot or split it
titles  = ['Final Train Accuracy','Final Test Accuracy','Final Loss']
panels  = [final_train,final_test,final_loss]
zlabels = ['Accuracy (%)','Accuracy (%)','Losses']

for i in range(3):
    ax   = fig.add_subplot(1,3,i+1,projection='3d')
    surf = ax.plot_surface(M,LR,panels[i],cmap='plasma',edgecolors='none',linewidth=0,antialiased=True,alpha=0.9)

    ax.set_xlabel('Momentum')
    ax.set_ylabel('Learning Rate')
    ax.set_zlabel(zlabels[i])
    ax.set_title(titles[i])
    fig.colorbar(surf,ax=ax,shrink=0.4,aspect=10,pad=0.2)

fig.tight_layout()

# Save to disk before showing, then fetch the file from the Colab runtime
plt.savefig('figure74_sgd_momentum_extra3.png')

plt.show()

files.download('figure74_sgd_momentum_extra3.png')


In [None]:
# %% More plotting

phi = (1 + np.sqrt(5)) / 2
fig,axes = plt.subplots(1,3,figsize=(1.75*6*phi,6))

# The matrices to plot get their own name: binding them to `data` would
# overwrite the dataset tensor created at the top of the notebook, and break any
# re-run of the cells that plot or split it
titles = ["Final Train Accuracy", "Final Test Accuracy", "Final Loss"]
panels = [final_train, final_test, final_loss]
cmap   = "plasma"

# Loop over the axes array under a separate name (`axes`), so that the loop
# variable `ax` does not shadow the array being iterated
for i,ax in enumerate(axes):

    # Image settings shared by the three panels
    im_kwargs = dict(origin='lower',
                     cmap=cmap,
                     interpolation='bicubic',
                     aspect='auto')

    # Accuracy panels share a fixed colour range; the loss panel is autoscaled
    if i < 2:
        im_kwargs.update(vmin=30,vmax=100)

    im = ax.imshow(panels[i],**im_kwargs)

    # Rows are momenta and columns are learning rates
    ax.set_xticks(np.arange(len(lrs)))
    ax.set_xticklabels([f"{lr:.3f}" for lr in lrs], rotation=45, ha="right")
    ax.set_yticks(np.arange(len(momenta)))
    ax.set_yticklabels([f"{m:.2f}" for m in momenta])

    ax.set_xlabel("Learning Rate")
    ax.set_ylabel("Momentum")
    ax.set_title(titles[i])

    cbar = fig.colorbar(im,ax=ax,shrink=0.6,pad=0.03)
    cbar.ax.tick_params(labelsize=9)

plt.tight_layout()

# Save to disk before showing, then fetch the file from the Colab runtime
plt.savefig('figure75_sgd_momentum_extra3.png')

plt.show()

files.download('figure75_sgd_momentum_extra3.png')
