In [None]:
# %% Deep learning - Section 15.145
#    Code challenge 21: weight variance inits
#
#    1) Start from code from video 13.144 (mnist dataset)
#    2) Initialise the weights to be normally distributed random numbers, and
#       vary parametrically the std between 0.0001 and 10 in 25 log steps
#    3) Plot the training accuracy averaged over the last 3 epochs per std step
#    4) Plot a histogram of all the post-training weights (i.e. one histogram
#       per std step)

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [2]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F
import pandas              as pd
import scipy.stats         as stats
import sklearn.metrics     as skm
import time

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')
plt.style.use('default')


In [3]:
# %% Data

# Load data: the path is handed straight to np.loadtxt so that NumPy opens AND
# closes the file itself (a bare open() passed inline is never closed)
data = np.loadtxt('sample_data/mnist_train_small.csv',delimiter=',')

# Split labels from data (first column = labels, remaining columns = inputs)
labels = data[:,0]
data   = data[:,1:]

# Normalise data by its maximum value (original range is (0,255))
data_norm = data / np.max(data)


In [4]:
# %% Create train and test datasets

# Tensors: float32 inputs, int64 labels
data_tensor   = torch.tensor(data_norm).to(torch.float32)
labels_tensor = torch.tensor(labels).to(torch.int64)

# Hold out 10% of the samples for testing (scikit-learn split)
train_data,test_data,train_labels,test_labels = train_test_split(
    data_tensor,
    labels_tensor,
    test_size=0.1,
)

# Wrap each partition in a PyTorch dataset
train_data = TensorDataset(train_data,train_labels)
test_data  = TensorDataset(test_data,test_labels)

# Batch iterators: shuffled mini-batches for training (incomplete last batch
# dropped), one single full-size batch for testing
batch_size   = 32
train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True,drop_last=True)
test_loader  = DataLoader(test_data,batch_size=len(test_data))


In [5]:
# %% Model class

def gen_model():
    """Build a fresh feed-forward classifier together with its loss and optimizer.

    Returns:
        tuple: (ANN, loss_fun, optimizer) where ANN is a new network instance,
        loss_fun is nn.CrossEntropyLoss and optimizer is Adam (lr=0.01) bound
        to the parameters of ANN.
    """

    class mnist_FFN(nn.Module):
        """Fully connected network with layer sizes 784 -> 64 -> 32 -> 32 -> 10."""

        def __init__(self):
            super().__init__()

            # Layers (created in input-to-output order)
            self.input   = nn.Linear(784,64)
            self.hidden1 = nn.Linear(64,32)
            self.hidden2 = nn.Linear(32,32)
            self.output  = nn.Linear(32,10)

        def forward(self,x):
            # ReLU follows every layer but the last one, whose raw output is returned
            for layer in (self.input,self.hidden1,self.hidden2):
                x = F.relu(layer(x))

            return self.output(x)

    # New network, its loss function and an Adam optimizer over its parameters
    net = mnist_FFN()

    return net, nn.CrossEntropyLoss(), torch.optim.Adam(net.parameters(),lr=0.01)


In [8]:
# %% Function to train the model

def train_model(ANN,loss_fun,optimizer,num_epochs=10,train_batches=None,test_batches=None):
    """Train ANN and track per-epoch loss plus train/test accuracy.

    Args:
        ANN: model to train; its parameters are updated in place.
        loss_fun: called as loss_fun(yHat,y), must return a scalar loss tensor.
        optimizer: optimizer bound to the parameters of ANN.
        num_epochs: number of passes over the training batches (default 10).
        train_batches: iterable of (X,y) training batches; when None the
            notebook-level `train_loader` is used.
        test_batches: iterable of (X,y) test batches, of which only the first
            one is evaluated after each epoch; when None the notebook-level
            `test_loader` is used.

    Returns:
        tuple: (train_acc, test_acc, losses, ANN). The three lists hold one
        plain number per epoch (accuracies in %, mean batch loss); ANN is the
        same model object that was passed in.
    """

    # Fall back to the notebook-level loaders when none are passed explicitly
    if train_batches is None:
        train_batches = train_loader
    if test_batches is None:
        test_batches = test_loader

    losses    = []
    train_acc = []
    test_acc  = []

    # Loop over epochs
    for epoch_i in range(num_epochs):

        # Loop over training batches
        batch_acc  = []
        batch_loss = []

        for X,y in train_batches:

            # Forward propagation and loss
            yHat = ANN(X)
            loss = loss_fun(yHat,y)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Loss and accuracy from this batch, stored as Python floats so
            # that np.mean below never has to average torch tensors
            batch_loss.append(loss.item())

            matches  = (torch.argmax(yHat,axis=1) == y).float()
            accuracy = 100 * torch.mean(matches)
            batch_acc.append(accuracy.item())

        losses.append( np.mean(batch_loss) )
        train_acc.append( np.mean(batch_acc) )

        # Test accuracy on the first test batch (eval mode, no gradients)
        ANN.eval()

        with torch.no_grad():
            X,y = next(iter(test_batches))
            yHat = ANN(X)
            test_acc.append( (100*torch.mean((torch.argmax(yHat,axis=1)==y).float())).item() )

        # Back to training mode for the next epoch
        ANN.train()

    return train_acc,test_acc,losses,ANN


In [7]:
# %% Functions for 1D smoothing filter

# Improved for edge effects - adaptive window
def smooth_adaptive(x,k):
    """Moving-average filter whose window shrinks near the edges.

    Each output sample is the mean of the input over the window
    [i - k//2, i + k//2], clipped to the valid index range, so no padding is
    needed at the boundaries.

    Args:
        x: sequence or array to smooth (iterated along its first axis).
        k: nominal window length; the half-width actually used is k // 2.

    Returns:
        np.ndarray with the shape of x. Floating-point (or complex) input keeps
        its dtype; any other dtype (e.g. integers) is returned as float64 so
        the window means are not truncated.
    """
    x = np.asarray(x)

    # np.zeros_like(x) alone would inherit an integer dtype and silently
    # truncate every mean written into it
    out_dtype = x.dtype if np.issubdtype(x.dtype,np.inexact) else np.float64
    smoothed  = np.zeros_like(x,dtype=out_dtype)
    half_k    = k // 2

    for i in range(len(x)):
        start       = max(0, i-half_k)
        end         = min(len(x), i+half_k + 1)
        smoothed[i] = np.mean(x[start:end])

    return smoothed


In [70]:
# %% Train models with various weight variance initialisation

# Std grid (25 log-spaced values from 1e-4 to 10) and result containers
stds             = np.logspace(np.log10(1e-4),np.log10(10),25)
train_acc        = []
saved_params     = []
saved_params_pre = []

# One full training run per std (takes ~8 mins)
for std in stds:

    # Brand-new model, loss function and optimizer
    ANN,loss_fun,optimizer = gen_model()

    # Overwrite every weight and bias with draws from a zero-mean normal
    # distribution with the current std
    for name,param in ANN.named_parameters():
        if ("weight" in name) or ("bias" in name):
            param.data.normal_(mean=0.0,std=std)

    # Snapshot of the parameters right after initialisation
    saved_params_pre.append(copy.deepcopy(ANN.state_dict()))

    # Train, then keep the train accuracy averaged over the last 3 epochs
    train_acc_i,test_acc_i,losses_i,ANN_i = train_model(ANN,loss_fun,optimizer)
    train_acc.append(np.mean(train_acc_i[-3:]))

    # Snapshot of the parameters after training
    saved_params.append(copy.deepcopy(ANN_i.state_dict()))


In [None]:
# %% Plotting

# Train accuracy (mean over the last 3 epochs) of every model against the std
# used to initialise it; the figure uses a golden-ratio aspect
phi = (1 + np.sqrt(5)) / 2
fig = plt.figure(figsize=(phi*6,6))

plt.plot(stds,train_acc,'s-',color='tab:red')
plt.xscale('log')

# Dotted reference lines at 20% and 80% accuracy
plt.axhline(y=20,color='grey',linestyle=':',linewidth=0.8)
plt.axhline(y=80,color='grey',linestyle=':',linewidth=0.8)

# Axis label previously read "Standar deviation" (typo in the rendered figure)
plt.legend(["Accuracy over models"],loc="center left")
plt.xlabel("Standard deviation for weight initialisation")
plt.ylabel("Average accuracy over last 3 epochs (%)")
plt.title("Models' accuracy over inits")

# Save to disk, show inline, then download the saved file through Colab
plt.savefig('figure14_code_challenge_21.png')
plt.show()
files.download('figure14_code_challenge_21.png')


In [None]:
# %% Plotting

# Containers: for every model, row 0 = bin centres, row 1 = densities
n_bins    = 80
n_nodels  = len(saved_params)
hist_data = np.zeros((n_nodels,2,n_bins))

# Histogram of the trained weights of each model (biases are left out)
for i,model_dict in enumerate(saved_params):

    # Flatten every weight tensor and chain them into one long vector
    all_weights = np.concatenate([
        param_tensor.detach().numpy().flatten()
        for param_name,param_tensor in model_dict.items()
        if "weight" in param_name
    ])

    # Density histogram restricted to the range (-2,2)
    counts,bin_edges = np.histogram(all_weights,bins=n_bins,range=(-2,2),density=True)

    # Midpoint of every bin
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

    # Store
    hist_data[i,0,:] = bin_centers
    hist_data[i,1,:] = counts

# One colour per std, sampled from the plasma colormap
cmap = plt.cm.plasma(np.linspace(0.2,0.9,len(stds)))

phi = (1 + np.sqrt(5)) / 2
fig = plt.figure(figsize=(phi*7,7))

# One line per model; colours are assigned in reverse colormap order
for i in range(len(stds)):
    centres,density = hist_data[i]
    plt.plot(centres,density,color=cmap[-(i+1)],lw=1)

plt.title('Histogram of post-training weights by std initialisation')
plt.xlabel('Weight value')
plt.ylabel('Count (normalised)')
plt.legend(np.round(stds,4),bbox_to_anchor=(1,1),loc='upper left')

# Save to disk, show inline, then download the saved file through Colab
plt.savefig('figure15_code_challenge_21.png')
plt.show()
files.download('figure15_code_challenge_21.png')


In [None]:
# %% Exercise 1
#    Are you sure we calculated the standard deviations correctly? Immediately after the code that initializes the weights,
#    write some more code that gets all of the weights (across all layers), compute the standard deviation, and then print
#    out the desired and actual standard deviations. Note that they won't correspond exactly, due to sampling variability.
#    Also note that because this is just a sanity check, you don't actually need to train the model; just verify that the
#    weights have been correctly calculated.

# Measured std of the pre-training weights of every model (biases excluded)
actual_stds = []

for model_dict in saved_params_pre:

    # All weight tensors of this model, flattened and joined into one vector
    all_weights = np.concatenate([
        param_tensor.detach().numpy().flatten()
        for param_name,param_tensor in model_dict.items()
        if "weight" in param_name
    ])
    actual_stds.append(np.std(all_weights))

# Largest absolute gap between target and measured std, rounded for the title
max_mismatch = round(np.max(np.abs(stds-actual_stds)),5)

# Target (large green markers) vs measured (small red markers) on log-log axes
phi = (1 + np.sqrt(5)) / 2
fig = plt.figure(figsize=(phi*6,6))

plt.plot(stds,stds,'o',label='Target std',color='tab:green',markersize=10)
plt.plot(stds,actual_stds,'o',color='tab:red',markersize=5,label='Measured std')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Desired std for weight initialization')
plt.ylabel('Measured std of initialized weights')
plt.title(f'Check initialization accuracy\n(max mismatch = {max_mismatch})')
plt.legend()

# Save to disk, show inline, then download the saved file through Colab
plt.savefig('figure16_code_challenge_21_extra1.png')
plt.show()
files.download('figure16_code_challenge_21_extra1.png')


In [None]:
# %% Exercise 2
#    Here we used torch.randn to assign the weights. randn creates Gaussian random numbers with a mean of 0, and thus
#    the weights were initialized with both positive and negative values. Try running the experiment again using
#    torch.rand, which creates uniformly distributed numbers between 0 and 1.
#    NOTE: Specifying the standard deviation of a uniform distribution is slightly more involved compared to a normal
#    distribution. See https://math.stackexchange.com/a/140081 for instructions.

# Single run with uniform(0,1) parameters; the result containers are reset first
train_acc        = []
saved_params     = []
saved_params_pre = []

ANN,loss_fun,optimizer = gen_model()

# Every weight and bias is redrawn uniformly between 0 and 1
for name,param in ANN.named_parameters():
    if ("weight" in name) or ("bias" in name):
        param.data.uniform_(0,1)

# Snapshot of the parameters before training
saved_params_pre.append(copy.deepcopy(ANN.state_dict()))

# Train and keep the train accuracy averaged over the last 3 epochs
train_acc_i,test_acc_i,losses_i,ANN_i = train_model(ANN,loss_fun,optimizer)
train_acc.append(np.mean(train_acc_i[-3:]))

# Snapshot of the parameters after training
saved_params.append(copy.deepcopy(ANN_i.state_dict()))

print(f"Accuracy with uniform distribution = {train_acc[-1]:.5f} %")


In [109]:
# %% Exercise 2
#    Continue ...

# Same as above (uniform) but matching the stds from the normal distributions

# Note: matching the std of a uniform distribution on [lo,hi] to a target std :
#       > var_u = (hi-lo)^2 / 12
#       > var_u = std^2
# we want an interval symmetric around zero, i.e. [-a,a] :
#       > std^2 = (2a)^2 / 12 = a^2 / 3
#       > a = sqrt(3) * std

# Re-run with uniform distribution (takes ~8 mins); containers are reset first
stds             = np.logspace(np.log10(1e-4),np.log10(10),25)
train_acc        = []
saved_params     = []
saved_params_pre = []

for std in stds:

    ANN,loss_fun,optimizer = gen_model()

    # Half-width of the symmetric interval that yields the target std
    a = np.sqrt(3) * std

    # Every weight and bias is redrawn uniformly from [-a,a]
    for name,param in ANN.named_parameters():
        if ("weight" in name) or ("bias" in name):
            param.data.uniform_(-a,a)

    # Snapshot of the parameters before training
    saved_params_pre.append(copy.deepcopy(ANN.state_dict()))

    # Train and keep the train accuracy averaged over the last 3 epochs
    train_acc_i,test_acc_i,losses_i,ANN_i = train_model(ANN,loss_fun,optimizer)
    train_acc.append(np.mean(train_acc_i[-3:]))

    # Snapshot of the parameters after training
    saved_params.append(copy.deepcopy(ANN_i.state_dict()))
