In [None]:
# %% Deep learning - Section 15.150
#    Learning-related changes in weights

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [31]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F
import pandas              as pd
import scipy.stats         as stats
import sklearn.metrics     as skm
import time

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from scipy.stats                      import zscore
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
# Ask the inline matplotlib backend to render figures as SVG
set_matplotlib_formats('svg')
# Apply matplotlib's 'default' style sheet to all subsequent figures
plt.style.use('default')


In [2]:
# %% Data

# Load data (handing the path to NumPy lets it open *and close* the file by
# itself, instead of leaving a dangling handle from a bare open() call)
data = np.loadtxt('sample_data/mnist_train_small.csv',delimiter=',')

# Split labels (first column) from data (all remaining columns)
labels = data[:,0]
data   = data[:,1:]

# Normalise data by its maximum value (original range is (0,255))
data_norm = data / np.max(data)


In [3]:
# %% Create train and test datasets

# Cast features to float tensors and labels to integer (long) tensors
data_tensor   = torch.tensor(data_norm).float()
labels_tensor = torch.tensor(labels).long()

# Hold out 10% of the samples as test data (scikit-learn split)
train_data,test_data,train_labels,test_labels = train_test_split(
    data_tensor,labels_tensor,test_size=0.1)

# Wrap each split into a PyTorch dataset
train_data = TensorDataset(train_data,train_labels)
test_data  = TensorDataset(test_data,test_labels)

# Build the loaders: shuffled mini-batches for training (incomplete last batch
# dropped), and one single batch holding the whole test set
batch_size   = 32
train_loader = DataLoader(train_data,shuffle=True,drop_last=True,batch_size=batch_size)
test_loader  = DataLoader(test_data,batch_size=len(test_data))


In [59]:
# %% Model class

def gen_model(lr=0.001,weight_decay=0.0):
    """
    Build a fresh feed-forward MNIST classifier with its loss and optimizer.

    Parameters
    ----------
    lr : float, optional
        Learning rate of the SGD optimizer (default 0.001, kept small for
        illustration purposes).
    weight_decay : float, optional
        Weight-decay (L2 penalty) coefficient handed to the SGD optimizer
        (default 0.0, i.e. no regularisation).

    Returns
    -------
    ANN : nn.Module
        Untrained network (784 -> 64 -> 32 -> 32 -> 10) with ReLU after every
        layer except the output one, which returns raw scores.
    loss_fun : nn.CrossEntropyLoss
        Loss function.
    optimizer : torch.optim.SGD
        Optimizer bound to the parameters of ANN.
    """

    class mnist_FFN(nn.Module):
        def __init__(self):
            super().__init__()

            # Architecture
            self.input   = nn.Linear(784,64)
            self.hidden1 = nn.Linear(64,32)
            self.hidden2 = nn.Linear(32,32)
            self.output  = nn.Linear(32,10)

        # Forward propagation
        def forward(self,x):

            x = F.relu(self.input(x))
            x = F.relu(self.hidden1(x))
            x = F.relu(self.hidden2(x))

            # No activation on the output layer
            return self.output(x)

    # Generate model instance
    ANN = mnist_FFN()

    # Loss function
    loss_fun = nn.CrossEntropyLoss()

    # Optimizer (plain SGD)
    optimizer = torch.optim.SGD(ANN.parameters(),lr=lr,weight_decay=weight_decay)

    return ANN,loss_fun,optimizer


In [54]:
# %% Function to train the model

def train_model(ANN,loss_fun,optimizer,num_epochs=60):
    """
    Train the model and track how much each weight matrix changes per epoch.

    Mini-batches are drawn from the notebook-level `train_loader`; test
    accuracy is measured once per epoch on the first batch returned by the
    notebook-level `test_loader`.

    Parameters
    ----------
    ANN : nn.Module
        Model to train (updated in place).
    loss_fun : callable
        Loss function, called as loss_fun(predictions, labels).
    optimizer : torch.optim.Optimizer
        Optimizer bound to the parameters of ANN.
    num_epochs : int, optional
        Number of training epochs (default 60).

    Returns
    -------
    train_acc : list of float
        Mean batch accuracy (%) for each epoch.
    test_acc : list of float
        Test accuracy (%) for each epoch.
    losses : list of float
        Mean batch loss for each epoch.
    ANN : nn.Module
        The trained model (same object that was passed in).
    weight_euclidian : np.ndarray, shape (num_epochs, number of weight matrices)
        Frobenius norm of the difference between the weights at the start and
        at the end of each epoch, one column per weight matrix.
    weight_cond_num : np.ndarray, same shape as weight_euclidian
        Condition number of each weight matrix at the end of each epoch.
    pre_w : list of np.ndarray
        Weight matrices as they were at the start of the last epoch (empty
        list if num_epochs is 0).
    """

    def weight_snapshot():
        # Independent NumPy copies of every weight matrix, in layer order
        return [ p.detach().cpu().numpy().copy()
                 for pname,p in ANN.named_parameters() if 'weight' in pname ]

    # Parameters, initialise vars
    num_layers = len(weight_snapshot())

    losses    = []
    train_acc = []
    test_acc  = []

    weight_euclidian = np.zeros((num_epochs,num_layers))
    weight_cond_num  = np.zeros((num_epochs,num_layers))

    # Defined up front so that the return value exists even with zero epochs
    pre_w = []

    # Loop over epochs
    for epoch_i in range(num_epochs):

        # Store weights for each layer ("pre-training")
        pre_w = weight_snapshot()

        # Loop over training batches
        batch_acc  = []
        batch_loss = []

        for X,y in train_loader:

            # Forward propagation and loss
            yHat = ANN(X)
            loss = loss_fun(yHat,y)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Loss and accuracy from this batch (stored as plain Python floats
            # rather than tensors, so NumPy and matplotlib can consume them)
            batch_loss.append(loss.item())

            matches = (torch.argmax(yHat,axis=1) == y).float()
            batch_acc.append(100 * torch.mean(matches).item())

        losses.append( np.mean(batch_loss) )
        train_acc.append( np.mean(batch_acc) )

        # Test accuracy (evaluation mode, no gradient tracking)
        ANN.eval()

        with torch.no_grad():
            X,y  = next(iter(test_loader))
            yHat = ANN(X)
            test_acc.append( 100 * torch.mean((torch.argmax(yHat,axis=1)==y).float()).item() )

        ANN.train()

        # Get weights for each layer ("post-training") and compute matrix change
        # measures (Euclidean distance/Frobenius norm, and condition number)
        for layer_i,post_w in enumerate(weight_snapshot()):
            weight_euclidian[epoch_i,layer_i] = np.linalg.norm( pre_w[layer_i]-post_w,ord='fro' )
            weight_cond_num[epoch_i,layer_i]  = np.linalg.cond( post_w )

    return train_acc,test_acc,losses,ANN,weight_euclidian,weight_cond_num,pre_w


In [60]:
# %% Model instance and fitting

# Build a fresh model, then train it and unpack every tracked quantity
ANN,loss_fun,optimizer = gen_model()
(train_acc,test_acc,losses,ANN,
 weight_euclidian,weight_cond_num,pre_w) = train_model(ANN,loss_fun,optimizer)


In [None]:
# %% Plotting

# Get layer names (parameter names with the trailing '.weight' stripped)
layer_name = [ pname[:-7] for pname,_ in ANN.named_parameters() if 'weight' in pname ]

phi = (1 + np.sqrt(5)) / 2
fig,ax = plt.subplots(1,3,figsize=(2*phi*5,5))

# One colour per layer, i.e. per column of the weight-change matrices
c_i  = weight_euclidian.shape[1]
cmap = plt.cm.plasma(np.linspace(0.2,0.9,c_i))

# Accuracies
ax[0].plot(train_acc)
ax[0].plot(test_acc)
ax[0].set_xlabel('Epochs')
ax[0].set_ylabel('Accuracy (%)')
ax[0].set_title('Accuracy')
ax[0].legend(['Train','Test'])

# Frobenius norm (column i is drawn i-th, so that curve, colour and legend
# entry all refer to the same layer)
for i in range(c_i):
    ax[1].plot(weight_euclidian[:,i],color=cmap[i])
ax[1].set_xlabel('Epochs')
ax[1].set_title('Frobenius norm')
ax[1].legend(layer_name)

# Condition numbers (same column-to-layer correspondence as above)
for i in range(c_i):
    ax[2].plot(weight_cond_num[:,i],color=cmap[i])
ax[2].set_xlabel('Epochs')
ax[2].set_title('Condition number')
ax[2].legend(layer_name)
ax[2].set_ylim([0,20])

plt.savefig('figure43_weight_changes.png')
plt.show()
files.download('figure43_weight_changes.png')


In [None]:
# %% Plotting

# Compare the epoch-to-epoch change in train accuracy with the weight change
# averaged over layers (both z-scored to remove scaling and offset)

phi = (1 + np.sqrt(5)) / 2
fig,ax = plt.subplots(1,figsize=(phi*5,5))

# Draw both traces on the single axes created above
ax.plot(zscore(np.diff(train_acc)),label="diff(train_acc)")
ax.plot(zscore(np.mean(weight_euclidian,axis=1)),label='Weight change')
ax.legend()
ax.set_title('Change in weights by change in accuracy')
ax.set_xlabel('Epoch')

# Save, show, and download the figure
fig.savefig('figure44_weight_changes.png')
plt.show()
files.download('figure44_weight_changes.png')


In [53]:
# %% Exercise 1
#    Re-run the training and visualization with L2 regularization (lambda=.01). Does that have a major noticeable effect?

# Not much, maybe the learning is a bit smoother ? If anything the Frobenius
# norm distribution seems to have a larger variance

# Model class with L2
def gen_model(lr=0.001,weight_decay=0.01):
    """
    Build a fresh feed-forward MNIST classifier trained with L2 regularisation.

    Parameters
    ----------
    lr : float, optional
        Learning rate of the SGD optimizer (default 0.001).
    weight_decay : float, optional
        Weight-decay (L2 penalty) coefficient handed to the SGD optimizer
        (default 0.01, the lambda requested by the exercise).

    Returns
    -------
    ANN : nn.Module
        Untrained network (784 -> 64 -> 32 -> 32 -> 10) with ReLU after every
        layer except the output one, which returns raw scores.
    loss_fun : nn.CrossEntropyLoss
        Loss function.
    optimizer : torch.optim.SGD
        Optimizer bound to the parameters of ANN.
    """

    class mnist_FFN(nn.Module):
        def __init__(self):
            super().__init__()

            # Architecture
            self.input   = nn.Linear(784,64)
            self.hidden1 = nn.Linear(64,32)
            self.hidden2 = nn.Linear(32,32)
            self.output  = nn.Linear(32,10)

        # Forward propagation
        def forward(self,x):

            x = F.relu(self.input(x))
            x = F.relu(self.hidden1(x))
            x = F.relu(self.hidden2(x))

            # No activation on the output layer
            return self.output(x)

    # Model instance, loss function, and SGD optimizer with weight decay
    ANN       = mnist_FFN()
    loss_fun  = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(ANN.parameters(),lr=lr,weight_decay=weight_decay)

    return ANN,loss_fun,optimizer


In [58]:
# %% Exercise 2
#    Then try with L1 regularization. (Hint: you might want to copy code from DUDL_overfitting_L1regu).

# Similar, but the Frobenius norm distribution is somewhat pointier; remarkable
# given that L1 reg. penalises the sum of absolute weight values, while L2 reg.
# penalises the sum of squared weights (i.e. L1 puts a cost on any w that is not
# zero, while L2 puts more cost on larger weights)

# Function to train the model with L1
def train_model(ANN,loss_fun,optimizer,num_epochs=60,L1_lambda=0.001):
    """
    Train the model with an L1 penalty and track per-epoch weight changes.

    The penalty added to every batch loss is L1_lambda times the sum of the
    absolute values of all non-bias parameters, divided by the number of those
    parameters. Mini-batches are drawn from the notebook-level `train_loader`;
    test accuracy is measured once per epoch on the first batch returned by
    the notebook-level `test_loader`.

    Parameters
    ----------
    ANN : nn.Module
        Model to train (updated in place).
    loss_fun : callable
        Loss function, called as loss_fun(predictions, labels).
    optimizer : torch.optim.Optimizer
        Optimizer bound to the parameters of ANN.
    num_epochs : int, optional
        Number of training epochs (default 60).
    L1_lambda : float, optional
        Scaling of the L1 penalty (default 0.001).

    Returns
    -------
    train_acc : list of float
        Mean batch accuracy (%) for each epoch.
    test_acc : list of float
        Test accuracy (%) for each epoch.
    losses : list of float
        Mean batch loss (L1 penalty included) for each epoch.
    ANN : nn.Module
        The trained model (same object that was passed in).
    weight_euclidian : np.ndarray, shape (num_epochs, number of weight matrices)
        Frobenius norm of the difference between the weights at the start and
        at the end of each epoch, one column per weight matrix.
    weight_cond_num : np.ndarray, same shape as weight_euclidian
        Condition number of each weight matrix at the end of each epoch.
    pre_w : list of np.ndarray
        Weight matrices as they were at the start of the last epoch (empty
        list if num_epochs is 0).
    """

    def weight_snapshot():
        # Independent NumPy copies of every weight matrix, in layer order
        return [ p.detach().cpu().numpy().copy()
                 for pname,p in ANN.named_parameters() if 'weight' in pname ]

    def penalised_params():
        # Every parameter that is not a bias takes part in the L1 penalty
        return [ p for pname,p in ANN.named_parameters() if 'bias' not in pname ]

    # Parameters, initialise vars
    num_layers = len(weight_snapshot())

    losses    = []
    train_acc = []
    test_acc  = []

    weight_euclidian = np.zeros((num_epochs,num_layers))
    weight_cond_num  = np.zeros((num_epochs,num_layers))

    # Count weight number (used to average the L1 penalty)
    n_weights = sum(p.numel() for p in penalised_params())

    # Defined up front so that the return value exists even with zero epochs
    pre_w = []

    # Loop over epochs
    for epoch_i in range(num_epochs):

        # Store weights for each layer ("pre-training")
        pre_w = weight_snapshot()

        # Loop over training batches
        batch_acc  = []
        batch_loss = []

        for X,y in train_loader:

            # Forward propagation and loss
            yHat = ANN(X)
            loss = loss_fun(yHat,y)

            # Add L1 regularisation (skipped if there is nothing to penalise,
            # which would otherwise divide by zero)
            if n_weights > 0:
                L1_term = sum(torch.sum(torch.abs(p)) for p in penalised_params())
                loss    = loss + (L1_lambda*L1_term)/n_weights

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Loss and accuracy from this batch (stored as plain Python floats
            # rather than tensors, so NumPy and matplotlib can consume them)
            batch_loss.append(loss.item())

            matches = (torch.argmax(yHat,axis=1) == y).float()
            batch_acc.append(100 * torch.mean(matches).item())

        losses.append( np.mean(batch_loss) )
        train_acc.append( np.mean(batch_acc) )

        # Test accuracy (evaluation mode, no gradient tracking)
        ANN.eval()

        with torch.no_grad():
            X,y  = next(iter(test_loader))
            yHat = ANN(X)
            test_acc.append( 100 * torch.mean((torch.argmax(yHat,axis=1)==y).float()).item() )

        ANN.train()

        # Get weights for each layer ("post-training") and compute matrix change
        # measures (Euclidean distance/Frobenius norm, and condition number)
        for layer_i,post_w in enumerate(weight_snapshot()):
            weight_euclidian[epoch_i,layer_i] = np.linalg.norm( pre_w[layer_i]-post_w,ord='fro' )
            weight_cond_num[epoch_i,layer_i]  = np.linalg.cond( post_w )

    return train_acc,test_acc,losses,ANN,weight_euclidian,weight_cond_num,pre_w
