In [None]:
# %% Deep learning - Section 15.143
#    A surprising demo of weight initialisations

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [1]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F
import pandas              as pd
import scipy.stats         as stats
import sklearn.metrics     as skm
import time

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
# Ask the inline backend to emit figures in SVG format
set_matplotlib_formats('svg')
# Apply matplotlib's 'default' style sheet
plt.style.use('default')


In [2]:
# %% Data

# Load data. The path is handed to NumPy directly so that it opens AND closes
# the file itself (a manually opened handle was previously left unclosed).
data = np.loadtxt('sample_data/mnist_train_small.csv',delimiter=',')

# Split labels (first column) from the remaining columns
labels = data[:,0]
data   = data[:,1:]

# Normalise data (original range is (0,255))
data_norm = data / np.max(data)


In [3]:
# %% Create train and test datasets

# Tensors: float32 for the features, int64 for the class labels
data_tensor   = torch.tensor(data_norm).to(torch.float32)
labels_tensor = torch.tensor(labels).to(torch.int64)

# Hold out 10% of the samples as test data (scikit-learn split)
train_data,test_data,train_labels,test_labels = train_test_split(
    data_tensor,labels_tensor,test_size=0.1)

# Pair features with labels in PyTorch datasets
train_data = TensorDataset(train_data,train_labels)
test_data  = TensorDataset(test_data,test_labels)

# Loaders: shuffled mini-batches (incomplete last batch dropped) for training,
# a single batch holding the whole test set for evaluation
batch_size   = 32
train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True,drop_last=True)
test_loader  = DataLoader(test_data,batch_size=len(test_data))


In [7]:
# %% Model class

def gen_model():
    """Build a fresh feed-forward classifier together with its loss and optimizer.

    Returns:
        tuple: (model, loss function, optimizer). The model is a fully
        connected 784-64-32-32-10 network, the loss is nn.CrossEntropyLoss
        and the optimizer is Adam (lr=0.01) over the model's parameters.
    """

    class mnist_FFN(nn.Module):
        """Four linear layers; ReLU follows every layer except the last."""

        def __init__(self):
            super().__init__()

            # Architecture (in units -> out units)
            self.input   = nn.Linear(784,64)
            self.hidden1 = nn.Linear(64,32)
            self.hidden2 = nn.Linear(32,32)
            self.output  = nn.Linear(32,10)

        def forward(self,x):
            # Rectified layers first, then the un-activated output layer
            for layer in (self.input,self.hidden1,self.hidden2):
                x = F.relu(layer(x))
            return self.output(x)

    # New instance on every call, so parameters are freshly initialised
    net = mnist_FFN()

    return net, nn.CrossEntropyLoss(), torch.optim.Adam(net.parameters(),lr=0.01)


In [None]:
# %% Explore weights

# Only the network is of interest here; loss function and optimizer are dropped
temp_net = gen_model()[0]

# Message first, then the weight matrix of hidden layer 1 on the next line
print('\nHave a look at the weights from hidden layer 1 (note the fresh randomisation at each iteration):\n',
      temp_net.hidden1.weight.data,
      sep='\n')

In [17]:
# %% Function to train the model

def train_model(ANN,loss_fun,optimizer,num_epochs=10,train_dl=None,test_dl=None):
    """Train a classifier on mini-batches and record its performance per epoch.

    Args:
        ANN: network to train (called as ANN(X)); its parameters are updated
            in place through `optimizer`.
        loss_fun: callable mapping (model output, labels) to a scalar loss.
        optimizer: optimizer bound to ANN's parameters.
        num_epochs: number of passes over the training batches (default 10).
        train_dl: iterable of (X,y) training batches. When None, the
            notebook-level `train_loader` is used.
        test_dl: iterable of (X,y) test batches; only its first batch is
            evaluated each epoch. When None, the notebook-level `test_loader`
            is used.

    Returns:
        tuple: (train_acc, test_acc, losses, ANN). The first three are lists
        with one plain float per epoch: mean training accuracy (%), test
        accuracy (%) and mean training loss. ANN is the same object passed in.
    """

    # Fall back to the loaders defined at notebook level
    if train_dl is None:
        train_dl = train_loader
    if test_dl is None:
        test_dl = test_loader

    # Initialise vars
    losses    = []
    train_acc = []
    test_acc  = []

    # Loop over epochs
    for _ in range(num_epochs):

        # Loop over training batches
        batch_acc  = []
        batch_loss = []

        for X,y in train_dl:

            # Forward propagation and loss
            yHat = ANN(X)
            loss = loss_fun(yHat,y)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Loss and accuracy from this batch; .item() stores plain floats
            # so the history lists never hold tensors
            batch_loss.append(loss.item())

            matches = (torch.argmax(yHat,dim=1) == y).float()
            batch_acc.append(100 * matches.mean().item())

        losses.append( np.mean(batch_loss) )
        train_acc.append( np.mean(batch_acc) )

        # Test accuracy (evaluation mode, no gradient tracking)
        ANN.eval()

        with torch.no_grad():
            X,y  = next(iter(test_dl))
            yHat = ANN(X)
            test_matches = (torch.argmax(yHat,dim=1) == y).float()
            test_acc.append( 100 * test_matches.mean().item() )

        # Back to training mode for the next epoch
        ANN.train()

    return train_acc,test_acc,losses,ANN


In [19]:
# %% Functions for 1D smoothing filter

# Improved for edge effects - adaptive window
def smooth_adaptive(x,k):
    """Smooth a 1D sequence with a centred moving average that shrinks at the edges.

    Each output sample is the mean of the input samples within k//2 positions
    on either side of it (so up to 2*(k//2)+1 samples); near the boundaries the
    window is clipped to the available samples instead of being padded.

    Args:
        x: sequence of numbers (list, array, ...).
        k: window size parameter; k//2 samples are taken on each side.

    Returns:
        numpy.ndarray: float array with the same length as x.
    """
    # Work on a float array: with integer input the output buffer would
    # otherwise inherit an integer dtype and silently truncate the means
    x        = np.asarray(x,dtype=float)
    smoothed = np.zeros_like(x)
    half_k   = k // 2

    for i in range(len(x)):
        # Clip the window to the valid index range
        start       = max(0, i-half_k)
        end         = min(len(x), i+half_k + 1)
        smoothed[i] = np.mean(x[start:end])

    return smoothed


In [18]:
# %% Run the model with the default PyTorch settings

# Baseline: parameters are left exactly as gen_model() created them
ANN,loss_fun,optimizer = gen_model()

# Train; the returned histories are reused by the comparison plots below
train_acc,test_acc,losses,ANN = train_model(ANN=ANN,loss_fun=loss_fun,optimizer=optimizer)


In [None]:
# %% Plotting

# Golden ratio, used as the figure's width/height ratio
phi = (1 + np.sqrt(5)) / 2
fig,ax = plt.subplots(1,figsize=(phi*6,6))

ax.plot(smooth_adaptive(train_acc,2),label='Train accuracy')
ax.plot(smooth_adaptive(test_acc,2),label='Test accuracy')

# The labels above are only rendered once a legend is requested
ax.legend()
ax.set_title("Training and test set accuracy\n(standard weight initialisation)")
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy (%)")

plt.tight_layout()

# Save to disk, show inline, then download the saved file
plt.savefig('figure1_weight_init_demo.png')
plt.show()
files.download('figure1_weight_init_demo.png')


In [None]:
# %% Set all the weights of (only) hidden layer 1 to zero

# Fresh model instance
ANN_zero,loss_fun,optimizer = gen_model()

# Set weights to zero. The template for zeros_like must be the weight tensor:
# the layer object itself (nn.Linear) has no `.data` attribute.
ANN_zero.hidden1.weight.data = torch.zeros_like(ANN_zero.hidden1.weight.data)
print(ANN_zero.hidden1.weight.data)

# Run the model with one layer initialised to zero
train_acc_z,test_acc_z,losses_z,ANN_zero = train_model(ANN_zero,loss_fun,optimizer)


In [None]:
# %% Plotting

phi = (1 + np.sqrt(5)) / 2
fig,ax = plt.subplots(1,figsize=(phi*6,6))

# One entry per curve, in legend order:
# (values to smooth, list that sets the x-axis length, line style, colour)
curve_specs = (
    (train_acc,  train_acc,  '-','tab:blue'),
    (test_acc,   train_acc,  ':','tab:blue'),
    (train_acc_z,train_acc_z,'-','tab:orange'),
    (test_acc_z, train_acc_z,':','tab:orange'),
)
for values,x_ref,line_style,colour in curve_specs:
    ax.plot(range(len(x_ref)),smooth_adaptive(values,2),line_style,color=colour)

ax.legend(['Train default','Test default','Train hid_1=zero','Test hid_1=zero'])
ax.set(
    title="Training and test set accuracy\n(standard weight initialisation vs. one layer set to zero)",
    xlabel="Epoch",
    ylabel="Accuracy (%)",
)

plt.tight_layout()

# Save to disk, show inline, then download the saved file
plt.savefig('figure2_weight_init_demo.png')
plt.show()
files.download('figure2_weight_init_demo.png')


In [None]:
# %% Check the weights values after training

print('Weights values after training:\n')
print(ANN_zero.hidden1.weight.data)

# Plotting
phi = (1 + np.sqrt(5)) / 2
fig,ax = plt.subplots(1,figsize=(phi*6,6))

# 30-bin histograms of the hidden layer 1 weights, drawn as lines through the
# bin centres (mid-points of consecutive bin edges)
y,x = np.histogram(ANN.hidden1.weight.data.flatten(),30)
ax.plot((x[1:]+x[:-1])/2,y,color='tab:blue',label='Default weights')  # label typo ('Deafult') fixed

y,x = np.histogram(ANN_zero.hidden1.weight.data.flatten(),30)
ax.plot((x[1:]+x[:-1])/2,y,color='tab:red',label='hid_1=zero')

ax.legend()
ax.set_xlim(-1.5,1.5)

ax.set_title("Histogram of weights values after training")
ax.set_xlabel("Weight value")
ax.set_ylabel("Count")

# Save to disk, show inline, then download the saved file
plt.savefig('figure3_weight_init_demo.png')
plt.show()
files.download('figure3_weight_init_demo.png')


In [None]:
# %% Set all the weights of all the layers to zero

# Fresh model instance
ANN_all_zero,loss_fun,optimizer = gen_model()

# Swap every parameter tensor (weights and biases alike) for zeros
for param in ANN_all_zero.parameters():
    param.data = torch.zeros_like(param.data)

# Plot a few select parameters to confirm (y-axis offset for visibility)
phi = (1 + np.sqrt(5)) / 2
fig = plt.figure(figsize=(phi*6,6))

# (vertical offset, parameter, colour), in legend order
shifted_params = (
    (0,ANN_all_zero.hidden1.weight,'tab:blue'),
    (1,ANN_all_zero.hidden2.weight,'tab:orange'),
    (2,ANN_all_zero.hidden1.bias,  'tab:green'),
)
for offset,parameter,colour in shifted_params:
    plt.plot(offset+parameter.data.flatten(),'x',color=colour)

plt.legend(['hidden1.weight','hidden2.weight','hidden1.bias'])
plt.title("Model's parameters set to zero")
plt.xlabel('Parameter index')
plt.ylabel("Parameter value (shifted for viz.)")
plt.ylim([-1,3])

# Save to disk, show inline, then download the saved file
plt.savefig('figure4_weight_init_demo.png')
plt.show()
files.download('figure4_weight_init_demo.png')

# Train the model whose parameters were all initialised to zero
train_acc_all_z,test_acc_all_z,losses_all_z,ANN_all_zero = train_model(ANN_all_zero,loss_fun,optimizer)


In [None]:
# %% Plotting

phi = (1 + np.sqrt(5)) / 2
fig,ax = plt.subplots(1,figsize=(phi*6,6))

# One entry per curve, in legend order:
# (values to smooth, list that sets the x-axis length, line style, colour)
curve_specs = (
    (train_acc,      train_acc,      '-','tab:blue'),
    (test_acc,       train_acc,      ':','tab:blue'),
    (train_acc_all_z,train_acc_all_z,'-','tab:orange'),
    (test_acc_all_z, test_acc_all_z, ':','tab:orange'),
)
for values,x_ref,line_style,colour in curve_specs:
    ax.plot(range(len(x_ref)),smooth_adaptive(values,2),line_style,color=colour)

ax.legend(['Train default','Test default','Train all zero','Test all zero'])
ax.set(
    title="Training and test set accuracy\n(standard weight initialisation vs. all layers set to zero)",
    xlabel="Epoch",
    ylabel="Accuracy (%)",
)

plt.tight_layout()

# Save to disk, show inline, then download the saved file
plt.savefig('figure5_weight_init_demo.png')
plt.show()
files.download('figure5_weight_init_demo.png')


In [None]:
# %% Check the weights values after training

# Plotting
phi = (1 + np.sqrt(5)) / 2
fig,ax = plt.subplots(1,figsize=(phi*6,6))

# 30-bin histograms of the hidden layer 1 weights, drawn as lines through the
# bin centres (mid-points of consecutive bin edges)
y,x = np.histogram(ANN.hidden1.weight.data.flatten(),30)
ax.plot((x[1:]+x[:-1])/2,y,color='tab:blue',label='Default weights')  # label typo ('Deafult') fixed

# This curve comes from the model with ALL parameters zeroed, so it is labelled
# accordingly (it was previously mislabelled 'hid_1=zero')
y,x = np.histogram(ANN_all_zero.hidden1.weight.data.flatten(),30)
ax.plot((x[1:]+x[:-1])/2,y,color='tab:red',label='All zero')

ax.legend()
ax.set_xlim(-1.5,1.5)

ax.set_title("Histogram of weights values after training")
ax.set_xlabel("Weight value")
ax.set_ylabel("Count")

# Save to disk, show inline, then download the saved file
plt.savefig('figure6_weight_init_demo.png')
plt.show()
files.download('figure6_weight_init_demo.png')


In [63]:
# %% Maybe it's because they are all zeros? What about another constant?

# Fresh model instance
ANN_all_one,loss_fun,optimizer = gen_model()

# Set all weights to one
for p in ANN_all_one.named_parameters():
    p[1].data = torch.zeros_like(p[1].data) + 1

# Run the model with all layers initialised to one
train_acc_all_one,test_acc_all_one,losses_all_one,ANN_all_one = train_model(ANN_all_one,loss_fun,optimizer)


In [None]:
# %% Plotting

phi = (1 + np.sqrt(5)) / 2
fig,ax = plt.subplots(1,figsize=(phi*6,6))

# Baseline (blue) against the all-one initialisation (orange)
ax.plot(range(len(train_acc)),smooth_adaptive(train_acc,2),'-',color='tab:blue')
ax.plot(range(len(train_acc)),smooth_adaptive(test_acc,2),':',color='tab:blue')

ax.plot(range(len(train_acc_all_one)),smooth_adaptive(train_acc_all_one,2),'-',color='tab:orange')
ax.plot(range(len(test_acc_all_one)),smooth_adaptive(test_acc_all_one,2),':',color='tab:orange')

# Legend entries follow the plotting order; the orange curves are the all-ONE
# model (they were previously labelled 'all zero')
ax.legend(['Train default','Test default','Train all one','Test all one'])
ax.set_title("Training and test set accuracy\n(standard weight initialisation vs. all layers set to one)")
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy (%)")

plt.tight_layout()

# Save to disk, show inline, then download the saved file
plt.savefig('figure7_weight_init_demo.png')
plt.show()
files.download('figure7_weight_init_demo.png')


In [None]:
# %% Exercise 1
#    You saw that initializing only the weights in fc1 still allowed for good performance, while having all weights be
#    the same value led to HORRIBLE performance. Try setting all weights to ones and all biases to zeros (across all
#    layers). Does that allow for learning? If so, how does it compare to the baseline model performance?

# Fresh model instance
ANN_extra1,loss_fun,optimizer = gen_model()

# Weights -> 1, biases -> 0; any parameter matching neither name is left alone
for name, param in ANN_extra1.named_parameters():
    is_weight = "weight" in name
    if not is_weight and "bias" not in name:
        continue
    param.data.fill_(1.0 if is_weight else 0.0)

# Plot
phi = (1 + np.sqrt(5)) / 2
fig = plt.figure(figsize=(phi*6,6))

# (vertical offset, parameter, colour), in legend order
shifted_params = (
    ( 0.1,ANN_extra1.hidden1.weight,'tab:blue'),
    (-0.1,ANN_extra1.hidden2.weight,'tab:orange'),
    ( 0.0,ANN_extra1.hidden1.bias,  'tab:green'),
)
for offset,parameter,colour in shifted_params:
    plt.plot(offset+parameter.data.flatten(),'x',color=colour)

plt.legend(['hidden1.weight','hidden2.weight','hidden1.bias'])
plt.title("Model's parameters set to zero or one")
plt.xlabel('Parameter index')
plt.ylabel("Parameter value (shifted for viz.)")
plt.ylim([-1,3])

# Save to disk, show inline, then download the saved file
plt.savefig('figure8_weight_init_demo_extra1.png')
plt.show()
files.download('figure8_weight_init_demo_extra1.png')

# Train the model with weights initialised to one and biases to zero
train_acc_extra1,test_acc_extra1,losses_extra1,ANN_extra1 = train_model(ANN_extra1,loss_fun,optimizer)


In [None]:
# %% Exercise 1
#    Continue ...

phi = (1 + np.sqrt(5)) / 2
fig,ax = plt.subplots(1,figsize=(phi*6,6))

# One entry per curve, in legend order:
# (values to smooth, list that sets the x-axis length, line style, colour)
curve_specs = (
    (train_acc,       train_acc,       '-','tab:blue'),
    (test_acc,        train_acc,       ':','tab:blue'),
    (train_acc_extra1,train_acc_extra1,'-','tab:orange'),
    (test_acc_extra1, test_acc_extra1, ':','tab:orange'),
)
for values,x_ref,line_style,colour in curve_specs:
    ax.plot(range(len(x_ref)),smooth_adaptive(values,2),line_style,color=colour)

ax.legend(['Train default','Test default','Train extra 1','Test extra 1'])
ax.set(
    title="Training and test set accuracy\n(standard weight initialisation vs. weight manipulation)",
    xlabel="Epoch",
    ylabel="Accuracy (%)",
)

plt.tight_layout()

# Save to disk, show inline, then download the saved file
plt.savefig('figure9_weight_init_demo_extra1.png')
plt.show()
files.download('figure9_weight_init_demo_extra1.png')


In [None]:
# %% Exercise 2
#    Now try setting all the weights from all layers to zeros, but leave the bias terms with their initial random values.

# Fresh model instance
ANN_extra2,loss_fun,optimizer = gen_model()

# Set all weights to ZERO (as the exercise asks; they were previously filled
# with ones) and leave the biases at their random initial values
for name, param in ANN_extra2.named_parameters():
    if "weight" in name:
        param.data.zero_()

# Plot a few select parameters to confirm (y-axis offset for visibility)
phi = (1 + np.sqrt(5)) / 2
fig = plt.figure(figsize=(phi*6,6))

# Whole-unit offsets keep the zeroed weights visually apart from the biases
plt.plot(0+ANN_extra2.hidden1.weight.data.flatten(),'x',color='tab:blue')
plt.plot(1+ANN_extra2.hidden2.weight.data.flatten(),'x',color='tab:orange')
plt.plot(2+ANN_extra2.hidden1.bias.data.flatten(),'x',color='tab:green')

plt.legend(['hidden1.weight','hidden2.weight','hidden1.bias'])
plt.xlabel('Parameter index')
plt.ylim([-1,3])
plt.ylabel("Parameter value (shifted for viz.)")
plt.title("Model's weights set to zero (biases left random)")

# Save to disk, show inline, then download the saved file
plt.savefig('figure10_weight_init_demo_extra2.png')
plt.show()
files.download('figure10_weight_init_demo_extra2.png')

# Run the model with all weights initialised to zero and random biases
train_acc_extra2,test_acc_extra2,losses_extra2,ANN_extra2 = train_model(ANN_extra2,loss_fun,optimizer)


In [None]:
# %% Exercise 2
#    Continue ...

phi = (1 + np.sqrt(5)) / 2
fig,ax = plt.subplots(1,figsize=(phi*6,6))

# One entry per curve, in legend order:
# (values to smooth, list that sets the x-axis length, line style, colour)
curve_specs = (
    (train_acc,       train_acc,       '-','tab:blue'),
    (test_acc,        train_acc,       ':','tab:blue'),
    (train_acc_extra2,train_acc_extra2,'-','tab:orange'),
    (test_acc_extra2, test_acc_extra2, ':','tab:orange'),
)
for values,x_ref,line_style,colour in curve_specs:
    ax.plot(range(len(x_ref)),smooth_adaptive(values,2),line_style,color=colour)

ax.legend(['Train default','Test default','Train extra 2','Test extra 2'])
ax.set(
    title="Training and test set accuracy\n(standard weight initialisation vs. weight manipulation)",
    xlabel="Epoch",
    ylabel="Accuracy (%)",
)

plt.tight_layout()

# Save to disk, show inline, then download the saved file
plt.savefig('figure11_weight_init_demo_extra2.png')
plt.show()
files.download('figure11_weight_init_demo_extra2.png')


In [None]:
# %% Exercise 3
#    Finally, the opposite of #2: Set all bias terms to zero and leave the weights random. Make a plot of test accuracy
#    performance for the baseline model, weights=0, and biases=0.

# Fresh model instance
ANN_extra3,loss_fun,optimizer = gen_model()

# Zero every bias; the weights keep their random initial values
for name, param in ANN_extra3.named_parameters():
    if "bias" in name:
        param.data.zero_()

# Plot a few select parameters to confirm (y-axis offset for visibility)
phi = (1 + np.sqrt(5)) / 2
fig = plt.figure(figsize=(phi*6,6))

plt.plot(1.0+ANN_extra3.hidden1.weight.data.flatten(),'x',color='tab:blue')
plt.plot(2.0+ANN_extra3.hidden2.weight.data.flatten(),'x',color='tab:orange')
plt.plot(0.0+ANN_extra3.hidden1.bias.data.flatten(),'x',color='tab:green')

plt.legend(['hidden1.weight','hidden2.weight','hidden1.bias'])
plt.xlabel('Parameter index')
plt.ylim([-1,3])
plt.ylabel("Parameter value (shifted for viz.)")
# Title now describes this cell's setup (it previously read "set to zero or one")
plt.title("Model's biases set to zero (weights left random)")

# Save to disk, show inline, then download the saved file
plt.savefig('figure12_weight_init_demo_extra3.png')
plt.show()
files.download('figure12_weight_init_demo_extra3.png')

# Run the model with all biases initialised to zero and random weights
train_acc_extra3,test_acc_extra3,losses_extra3,ANN_extra3 = train_model(ANN_extra3,loss_fun,optimizer)


In [None]:
# %% Exercise 3
#    Continue ...

phi = (1 + np.sqrt(5)) / 2
fig,ax = plt.subplots(1,figsize=(phi*6,6))

# One entry per curve, in legend order:
# (values to smooth, list that sets the x-axis length, line style, colour)
curve_specs = (
    (train_acc,       train_acc,       '-','tab:blue'),
    (test_acc,        train_acc,       ':','tab:blue'),
    (train_acc_extra2,train_acc_extra2,'-','tab:orange'),
    (test_acc_extra2, test_acc_extra2, ':','tab:orange'),
    (train_acc_extra3,train_acc_extra3,'-','tab:red'),
    (test_acc_extra3, test_acc_extra3, ':','tab:red'),
)
for values,x_ref,line_style,colour in curve_specs:
    ax.plot(range(len(x_ref)),smooth_adaptive(values,2),line_style,color=colour)

ax.legend(['Train default','Test default','Train extra 2','Test extra 2','Train extra 3','Test extra 3'])
ax.set(
    title="Training and test set accuracy\n(standard weight initialisation vs. weight manipulation)",
    xlabel="Epoch",
    ylabel="Accuracy (%)",
)

plt.tight_layout()

# Save to disk, show inline, then download the saved file
plt.savefig('figure13_weight_init_demo_extra3.png')
plt.show()
files.download('figure13_weight_init_demo_extra3.png')