In [None]:
# %% Deep learning - Section 12.118
#    Data size and network size

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F
import pandas              as pd
import scipy.stats         as stats
import time

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Function to generate data

def gen_data(n_per_cluster,batch_size=8,train_size=0.9):
    """
    Generate three 2D Gaussian clusters and wrap them in train/test DataLoaders.

    Every cluster holds `n_per_cluster` points drawn with `np.random.randn`
    around one of the centroids (1,1), (5,1) and (4,4); the clusters are
    labelled 0, 1 and 2 in that order.

    Parameters
    ----------
    n_per_cluster : int
        Number of points per cluster (3*n_per_cluster points in total).
    batch_size : int, optional
        Mini-batch size of the training loader (default 8, the value that was
        previously hard-coded).
    train_size : float or int, optional
        Forwarded to `train_test_split` (default 0.9, the value that was
        previously hard-coded).

    Returns
    -------
    dict
        'data'       : float tensor of shape (3*n_per_cluster, 2)
        'labels'     : long tensor of shape (3*n_per_cluster,)
        'train_data' : shuffled DataLoader over the training split; the last
                       incomplete batch is dropped, so a training split smaller
                       than `batch_size` yields no batches at all
        'test_data'  : DataLoader returning the whole test split as one batch
    """

    # Centroids (one per class; the list position is the class label)
    centroids = [ [1,1],[5,1],[4,4] ]

    # Data: one (2, n_per_cluster) array per cluster. The x coordinates are
    # drawn before the y coordinates, cluster by cluster, so a seeded run
    # produces the same points as the original hand-unrolled version.
    clusters = []
    for centre_x,centre_y in centroids:
        clusters.append(np.array([ centre_x+np.random.randn(n_per_cluster),
                                   centre_y+np.random.randn(n_per_cluster) ]))

    # Labels (0,...,0,1,...,1,2,...,2)
    labels_np = np.repeat(np.arange(len(centroids)),n_per_cluster)

    # Concatenate data into a matrix, put it in a dictionary and convert to pytorch tensors
    data_np = np.hstack(clusters).T

    output = {}
    output['data']   = torch.tensor(data_np).float()
    output['labels'] = torch.tensor(labels_np).long()

    # Split data with scikitlearn
    train_x,test_x, train_y,test_y = train_test_split(output['data'],output['labels'],train_size=train_size)

    # Convert to PyTorch Datasets
    train_set = TensorDataset(train_x,train_y)
    test_set  = TensorDataset(test_x,test_y)

    # Convert to dataloader objects (the test loader serves everything in one batch)
    output['train_data'] = DataLoader(train_set,batch_size=batch_size,shuffle=True,drop_last=True)
    output['test_data']  = DataLoader(test_set,batch_size=len(test_set))

    return output


In [None]:
# %% Test data function

# Small dataset (50 points per cluster) to eyeball the generator's output
some_data = gen_data(50)

data   = some_data['data']
labels = some_data['labels']

for tensor in (data,labels):
    print(tensor.shape)

# Plotting: one marker style per class, figure sized with the golden ratio
phi = ( 1 + np.sqrt(5) ) / 2
fig = plt.figure(figsize=(6*phi,6))

for class_i,marker in enumerate(('s','o','^')):
    rows = np.where(labels==class_i)[0]
    plt.plot(data[rows,0],data[rows,1],marker,alpha=.75)

plt.gca().set(title='Some data',xlabel='Dimension 1',ylabel='Dimension 2')

# Save before showing, then hand the file to the browser (Colab helper)
plt.savefig('figure1_data_size_network_size.png')

plt.show()

files.download('figure1_data_size_network_size.png')


In [None]:
# %% Model class

def gen_model(n_units,n_layers):
    """
    Build a fully connected classifier (2 inputs, 3 outputs) with its loss and optimizer.

    The network has one input layer (2 -> n_units), `n_layers - 1` hidden
    layers (n_units -> n_units) and one output layer (n_units -> 3). A ReLU
    follows the input layer and every hidden layer; the output layer returns
    raw scores, which is what `nn.CrossEntropyLoss` expects.

    Parameters
    ----------
    n_units : int
        Width of the input layer and of every hidden layer.
    n_layers : int
        Number of ReLU-activated layers (input layer included); values below 1
        behave like 1 because no hidden layer is created.

    Returns
    -------
    network : nn.Module
    loss_function : nn.CrossEntropyLoss
    optimizer : torch.optim.SGD (lr=0.01) over the network's parameters
    """

    class model(nn.Module):
        def __init__(self,n_units,n_layers):
            super().__init__()

            # Dictionary to store layers
            self.layers   = nn.ModuleDict()
            self.n_layers = n_layers

            # Architecture
            self.layers['input'] = nn.Linear(2,n_units)

            for i in range(1,n_layers):
                self.layers[f'hidden_{i}'] = nn.Linear(n_units,n_units)

            self.layers['output'] = nn.Linear(n_units,3)

        # Forward propagation
        def forward(self,x):

            # The input layer needs its own nonlinearity: without it, the input
            # layer and whatever linear layer follows collapse into a single
            # affine map (and a model with n_layers=1 is purely linear)
            x = F.relu(self.layers['input'](x))

            for i in range(1,self.n_layers):
                x = F.relu(self.layers[f'hidden_{i}'](x))

            # No activation here: the loss function works on raw scores
            return self.layers['output'](x)

    # Generate model instance
    network = model(n_units,n_layers)

    # Loss function
    loss_function = nn.CrossEntropyLoss()

    # Optimizer (SGD to slow down training and appreciate parametrisation)
    optimizer = torch.optim.SGD(network.parameters(),lr=0.01)

    return network,loss_function,optimizer


In [None]:
# %% Test model function on random data

n_units  = 12
n_layers = 3

network,loss_function,optimizer = gen_model(n_units,n_layers)
print(network)

# Ten random 2D points, just to check that a forward pass runs and returns one
# row of three class scores per point. The variable is deliberately not called
# `input`: that would shadow the builtin input() for the rest of the session.
test_input = torch.rand(10,2)
print(network(test_input))


In [None]:
# %% Function to train the model

def train_model(n_units,n_layers,train_loader=None,test_loader=None,num_epochs=50):
    """
    Train a fresh model from `gen_model` and track loss and accuracy per epoch.

    Parameters
    ----------
    n_units, n_layers : int
        Forwarded to `gen_model`.
    train_loader : iterable of (X, y) batches, optional
        Training batches. When omitted, the notebook-level variable
        `train_data` is used, so existing two-argument calls still work.
    test_loader : iterable of (X, y) batches, optional
        Test batches; only the first batch is evaluated each epoch. When
        omitted, the notebook-level variable `test_data` is used.
    num_epochs : int, optional
        Number of passes over the training loader (default 50).

    Returns
    -------
    train_acc : tensor (num_epochs,)
        Mean training accuracy (%) over the batches of each epoch.
    test_acc : tensor (num_epochs,)
        Test accuracy (%) after each epoch.
    losses : tensor (num_epochs,)
        Mean training loss over the batches of each epoch.
    network : nn.Module
        The trained model.
    """

    # Fall back on the loaders defined at notebook level
    if train_loader is None:
        train_loader = train_data
    if test_loader is None:
        test_loader = test_data

    # Model instance
    network,loss_function,optimizer = gen_model(n_units,n_layers)

    # Preallocate variables
    losses    = torch.zeros(num_epochs)
    train_acc = torch.zeros(num_epochs)
    test_acc  = torch.zeros(num_epochs)

    # Training loop
    for epoch_i in range(num_epochs):

        # Batches loop
        batch_acc  = []
        batch_loss = []

        for X,y in train_loader:

            # Forward prop and loss
            yHat = network(X)
            loss = loss_function(yHat,y)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Compute loss and accuracy for this batch
            batch_loss.append(loss.item())
            batch_acc.append(100*torch.mean((torch.argmax(yHat,dim=1)==y).float()).item())

        # Compute loss and accuracy for the epoch
        losses[epoch_i]    = np.mean(batch_loss)
        train_acc[epoch_i] = np.mean(batch_acc)

        # Test accuracy: evaluation mode and no gradient tracking while
        # predicting, then back to training mode for the next epoch
        network.eval()
        X,y = next(iter(test_loader))
        with torch.no_grad():
            yHat = network(X)

        test_acc[epoch_i] = 100*torch.mean((torch.argmax(yHat,dim=1)==y).float()).item()
        network.train()

    return train_acc,test_acc,losses,network


In [None]:
# %% Test the entire machinery with some data

# Loaders are stored at notebook level because train_model picks them up from there
some_data  = gen_data(200)
train_data,test_data = ( some_data[key] for key in ('train_data','test_data') )

# Single run: 60 units, n_layers = 1
train_acc,test_acc,losses,model = train_model(60,1)

# Plotting (golden-ratio figure, widened 1.5x to host the two panels)
phi = ( 1 + np.sqrt(5) ) / 2
fig,ax = plt.subplots(1,2,figsize=(1.5*6*phi,6))

# Left panel: loss per epoch
ax[0].plot(losses.detach())
ax[0].set(xlabel='Epoch',ylabel='Loss',title='Losses')

# Right panel: train and test accuracy per epoch
for curve,curve_name in zip((train_acc,test_acc),('Train accuracy','Test accuracy')):
    ax[1].plot(curve,label=curve_name)
ax[1].set(xlabel='Epoch',ylabel='Accuracy (%)',title='Accuracy')
ax[1].legend()

# Save before showing, then hand the file to the browser (Colab helper)
plt.savefig('figure2_data_size_network_size.png')

plt.show()

files.download('figure2_data_size_network_size.png')


In [None]:
# %% Parametric experiment

# Parametrically vary depth of the model, while keeping the number of units
# constant; parametrically vary the amount of data as well


In [None]:
# %% Before the experiment, configure and confirm the metaparameters

# Experiment settings: a fixed budget of units is spread over a varying number
# of layers, and the number of points per cluster is varied as well
total_nodes    = 80
layers_range   = [ 1,5,10,20 ]
n_datapoints   = np.arange(50,551,50)

# Legend entries for later plotting (filled in below, one per architecture)
legend = []

# Build each architecture once to report its size
for layers in layers_range:

    # Units per layer for this depth, and a throwaway model instance
    units_by_layer = int(total_nodes/layers)
    network        = gen_model(units_by_layer,layers)[0]

    # Trainable parameter count (see also lecture ANNs: depth vs. breadth)
    sizes    = [ p.numel() for p in network.parameters() if p.requires_grad ]
    n_params = np.sum(sizes)

    architecture = (layers,units_by_layer,n_params)
    legend.append( '%s layers, %s units, %s parameters' %architecture )
    print('This model will have %s layers, each with %s units, totalling %s parameters' %architecture)


In [None]:
# %% Proper experiment

# Results matrix: (data sizes) x (depths) x (0: test accuracy, 1: training loss)
results = np.zeros(( len(n_datapoints),len(layers_range),2 ))

# Takes ~6 mins
for data_i,datapoints in enumerate(n_datapoints):

    # Create data (note: same data for each layer manipulation!). The dict gets
    # its own name so that the `data` tensor created when testing gen_data is
    # not silently replaced by an object of a different kind.
    dataset    = gen_data(datapoints)

    # train_model reads these two notebook-level names
    train_data = dataset['train_data']
    test_data  = dataset['test_data']

    # Loop over layers
    for layer_i,layers in enumerate(layers_range):

        units_by_layer = int(total_nodes/layers)
        train_acc,test_acc,losses,model = train_model(units_by_layer,layers)

        # Average of last 5 accuracies and losses, both stored as plain floats
        results[data_i,layer_i,0] = torch.mean(test_acc[-5:]).item()
        results[data_i,layer_i,1] = torch.mean(losses[-5:]).item()


In [None]:
# %% Plotting

phi = ( 1 + np.sqrt(5) ) / 2
fig,ax = plt.subplots(1,2,figsize=(1.5*6*phi,6))

# One colour per depth, sampled from the plasma colormap
cmaps = plt.cm.plasma(np.linspace(0.2,0.9,len(layers_range)))

# Panel description: (axis, index along the last axis of `results`, y label, title)
panels = [ (ax[0],1,'Loss','Losses'),
           (ax[1],0,'Accuracy (%)','Test accuracy') ]

for panel,result_i,y_label,panel_title in panels:

    # One line per depth, plotted against the number of data points
    for layer_i,line_color in enumerate(cmaps):
        panel.plot(n_datapoints,results[:,layer_i,result_i],'o-',label=legend[layer_i],color=line_color,alpha=0.5)

    panel.set_ylabel(y_label)
    panel.set_xlabel('Number of data points')
    panel.set_title(panel_title)
    panel.legend(legend)

# Save before showing, then hand the file to the browser (Colab helper)
plt.savefig('figure3_data_size_network_size.png')

plt.show()

files.download('figure3_data_size_network_size.png')


In [None]:
# %% Exercise 1
#    The model learns faster and better with the Adam optimizer. In fact, I intentionally used SGD here to make the
#    model worse for this demonstration! Change the optimizer to Adam. What do you think is a good learning rate?
#    More importantly: Do the conclusions of this experiment hold for the Adam optimizer?

# The learning rate is relatively "irrelevant" (stress on the quotes), because
# with strategies like adding momentum, RMSprop or Adam, the lr is made
# adaptive based on some parameters. As for the conclusions (the experiment
# takes a bit longer; ~11 mins), there are some quantitative changes, in the
# sense that the 3rd deepest model performs better, but the overall conclusions
# do not change: for these data, shallow models do better than deep models


In [None]:
# %% Exercise 2
#    Add a timer to the experiment loop. Does the training duration relate to the number of layers or the number
#    of parameters?

# The training duration seems to nicely depend on both the amount of data and the
# number of layers; the number of trainable parameters, however, doesn't seem
# to be directly related to training time

# Experiment with timer (with SGD)
# results : (data sizes) x (depths) x (0: test accuracy, 1: training loss)
# timings : (data sizes) x (depths), seconds spent in train_model
# n_params: (data sizes) x (depths), parameter count of the trained model
results  = np.zeros(( len(n_datapoints),len(layers_range),2 ))
timings  = np.zeros(( len(n_datapoints),len(layers_range) ))
n_params = np.zeros(( len(n_datapoints),len(layers_range) ))

for data_i,datapoints in enumerate(n_datapoints):

    # Same data for every depth. The dict gets its own name so that the `data`
    # tensor created when testing gen_data is not replaced by a dict.
    dataset    = gen_data(datapoints)

    # train_model reads these two notebook-level names
    train_data = dataset['train_data']
    test_data  = dataset['test_data']

    for layer_i,layers in enumerate(layers_range):

        units_by_layer = int(total_nodes/layers)

        # perf_counter is the clock meant for measuring short durations; the
        # measured span covers everything train_model does, model creation included
        start_time = time.perf_counter()

        train_acc,test_acc,losses,model = train_model(units_by_layer,layers)

        elapsed_time = time.perf_counter() - start_time
        timings[data_i,layer_i] = elapsed_time

        # Average of last 5 accuracies and losses, both stored as plain floats
        results[data_i,layer_i,0] = torch.mean(test_acc[-5:]).item()
        results[data_i,layer_i,1] = torch.mean(losses[-5:]).item()
        n_params[data_i,layer_i]  = sum(p.numel() for p in model.parameters())

        print(f"Data: {datapoints}, Layers: {layers}, Time: {elapsed_time:.2f}s")

# Plotting: heatmap of training time, rows = data sizes, columns = architectures

# Golden ratio for the figure size, defined here as in the other plotting cells
# so that this cell does not depend on one of them having been run
phi = ( 1 + np.sqrt(5) ) / 2

# Column labels: parameter count (taken from the first row of n_params) and depth
x_labels = []

for j,layers in enumerate(layers_range):

    num_params = int(n_params[0,j])
    x_labels.append(f'{num_params:,}\n({layers} layers)')

# Row labels: number of points per cluster
y_labels = [str(n) for n in n_datapoints]

fig, ax = plt.subplots(figsize=(phi*6,6))
im = ax.imshow(timings,cmap='plasma',aspect='auto')

ax.set_xticks(range(len(x_labels)))
ax.set_xticklabels(x_labels,rotation=45,ha='right')
ax.set_xlabel('Number of parameters and of layers')

ax.set_yticks(range(len(y_labels)))
ax.set_yticklabels(y_labels)
ax.set_ylabel('Number of datapoints')

cbar = plt.colorbar(im,ax=ax)
cbar.set_label('Training time (s)',rotation=270,labelpad=15)

# Annotate every cell with its timing; white text on the lower half of the
# colour scale, black text on the upper half (threshold computed once)
half_max = np.max(timings) * 0.5

for i in range(len(n_datapoints)):
    for j in range(len(layers_range)):
        val = timings[i,j]
        text_color = 'white' if val < half_max else 'black'
        ax.text(j,i,f'{val:.0f} s',ha='center',va='center',color=text_color)

ax.set_title('Training time for number of parameters, layers and datapoints')
plt.tight_layout()

plt.savefig('figure5_data_size_network_size_extra2.png')

plt.show()

files.download('figure5_data_size_network_size_extra2.png')


In [None]:
# %% Exercise 3
#    Do the two deepest models eventually learn if you increase the number of training epochs? (Note: because this
#    question is only about the deepest models and because training time will increase, you need only test the two
#    models, not all four.)

# Tried with 200 training epochs (takes ~26 min) and SGD. There is a slight
# improvement: the 3rd deepest model reaches a good performance in a couple of
# runs with a relatively large amount of data. That said, more iterations or a
# better optimizer would definitely help
