In [None]:
# %% Deep learning - Section 12.122
#    Data noise augmentation (with dev set and test set)

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F
import pandas              as pd
import scipy.stats         as stats
import time

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Function to get the data (train, dev, test)

# Load data and normalise
# Pass the path itself, so that NumPy opens AND closes the file (the previous
# open(...) handle was never closed)
data_all       = np.loadtxt('sample_data/mnist_train_small.csv',delimiter=',')

# Column 0 stores the labels: rescale only the pixel columns by their overall max
data_all[:,1:] = data_all[:,1:] / np.max(data_all[:,1:])

# Function
def get_dataset(n,double_data=False,batch_size=20,dev_size=0.1,noise_range=0.5):
    """
    Build train and dev loaders from the first n rows of `data_all`, and a test
    set from all the remaining rows.

    Parameters
    ----------
    n : int
        Number of rows (from the top of `data_all`) shared by train and dev sets.
    double_data : bool
        If True, a noisy copy of every training image is appended to the
        training set (labels are duplicated accordingly). Dev and test sets
        are never augmented.
    batch_size : int
        Mini-batch size of the training loader.
    dev_size : float or int
        Passed as `test_size` to `train_test_split` (share of the n rows held
        out as dev set).
    noise_range : float
        The added noise is uniform in [0,noise_range).

    Returns
    -------
    train_loader : DataLoader, shuffled mini-batches of the training set
    dev_loader   : DataLoader, the whole dev set in a single batch
    test_dataset : tuple (test_data,test_labels) of tensors
    """

    # Separate labels (1st column) from pixels, and keep only the first n rows
    labels = data_all[:n,0]
    data   = data_all[:n,1:]

    # Convert to tensor
    data_T   = torch.tensor(data).float()
    labels_T = torch.tensor(labels).long()

    # Split data with scikitlearn
    train_data,dev_data, train_labels,dev_labels = train_test_split(data_T,labels_T,test_size=dev_size)

    # Make a noisy copy of the train data only after splitting, to avoid twin
    # items in train and dev sets; then normalise each image individually to
    # its own max (the floor on the max avoids 0/0 = NaN for an all-zero image)
    if double_data:
        train_data_noisy = train_data + torch.rand_like(train_data)*noise_range
        train_data       = torch.cat((train_data,train_data_noisy),dim=0)
        image_max        = train_data.max(dim=1,keepdim=True)[0].clamp(min=1e-12)
        train_data       = train_data / image_max
        train_labels     = torch.cat((train_labels,train_labels),dim=0)

    # PyTorch datasets
    train_set = TensorDataset(train_data,train_labels)
    dev_set   = TensorDataset(dev_data,dev_labels)

    # DataLoader objects (the dev set goes in one single batch)
    train_loader = DataLoader(train_set,batch_size=batch_size,shuffle=True)
    dev_loader   = DataLoader(dev_set,batch_size=len(dev_set))

    # Get fresh test set from unused data (basically all the remaining data)
    test_data   = torch.tensor(data_all[n:,1:]).float()
    test_labels = torch.tensor(data_all[n:,0]).long()

    return train_loader,dev_loader,(test_data,test_labels)


In [None]:
# %% Test the data function and visualise

# Function
train_loader,dev_loader,test_dataset = get_dataset(12,True)

# Plot (images taken from one mini-batch of the training loader)
images, _ = next(iter(train_loader))
img = images.detach()

phi = (1 + np.sqrt(5)) / 2

# Keep the array of axes under its own name, so that the loop variable does not
# overwrite it
fig,axs = plt.subplots(3,4,figsize=(phi*6,6))
for i,ax in enumerate(axs.flatten()):
    # Each row of img is a flattened image: reshape the tensor itself to 28x28
    ax.imshow(img[i].reshape(28,28).numpy(),cmap='gray')
    ax.axis('off')

plt.suptitle('Some data and some augmented noisy data')

plt.savefig('figure18_data_noise_augmentation_devset_test_set.png')

plt.show()

files.download('figure18_data_noise_augmentation_devset_test_set.png')


In [None]:
# %% Model class

def gen_model():
    """
    Create a fresh, untrained network together with its loss function and its
    optimizer.

    Returns
    -------
    ANN       : instance of the feed-forward network (784 -> 64 -> 32 -> 32 -> 10)
    loss_fun  : nn.CrossEntropyLoss
    optimizer : SGD over the network parameters, learning rate 0.01
    """

    class model(nn.Module):
        """Fully connected network with ReLU after every layer but the last."""

        def __init__(self):
            super().__init__()

            # Layers, from input to output
            self.input  = nn.Linear(784,64)
            self.hid1   = nn.Linear(64,32)
            self.hid2   = nn.Linear(32,32)
            self.output = nn.Linear(32,10)

        def forward(self,x):
            # Input and hidden layers are followed by a ReLU ...
            for layer in (self.input,self.hid1,self.hid2):
                x = F.relu(layer(x))

            # ... while the output layer returns its raw activations
            return self.output(x)

    # Network first (the optimizer needs its parameters), then loss and optimizer
    net = model()

    return net,nn.CrossEntropyLoss(),torch.optim.SGD(net.parameters(),lr=0.01)


In [None]:
# %% Function to train the model

def train_model(num_epochs=50):
    """
    Train a fresh network from `gen_model()` and track its performance.

    The function reads `train_loader` and `dev_loader` from the module scope,
    so the loaders returned by `get_dataset()` have to be bound to those names
    before every call.

    Parameters
    ----------
    num_epochs : int
        Number of training epochs (few, to keep some variability in
        performance).

    Returns
    -------
    train_acc : tensor (num_epochs), mean mini-batch accuracy (%) per epoch
    dev_acc   : tensor (num_epochs), accuracy (%) on the first batch of
                `dev_loader` after each epoch
    losses    : tensor (num_epochs), mean mini-batch loss per epoch
    ANN       : the trained model, left in train mode
    """

    # Fresh model instance
    ANN,loss_fun,optimizer = gen_model()

    # Preallocate vars
    losses    = torch.zeros(num_epochs)
    train_acc = torch.zeros(num_epochs)
    dev_acc   = torch.zeros(num_epochs)

    # Loop over epochs
    for epoch_i in range(num_epochs):

        # Loop over training data batches
        batch_acc  = []
        batch_loss = []

        for X,y in train_loader:

            # Forward pass, backpropagation, and optimizer step
            yHat = ANN(X)
            loss = loss_fun(yHat,y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Loss and accuracy from this batch, stored as plain Python floats
            # so that np.mean never has to convert torch tensors implicitly
            batch_loss.append(loss.item())
            batch_acc.append( 100*torch.mean((torch.argmax(yHat,dim=1)==y).float()).item() )

        losses[epoch_i]    = np.mean(batch_loss)
        train_acc[epoch_i] = np.mean(batch_acc)

        # Dev set accuracy (eval mode, no gradient tracking)
        ANN.eval()

        with torch.no_grad():
            X,y = next(iter(dev_loader))
            yHat = ANN(X)
            dev_acc[epoch_i] = 100*torch.mean((torch.argmax(yHat,dim=1)==y).float())

        # Back to train mode for the next epoch
        ANN.train()

    return train_acc,dev_acc,losses,ANN


In [None]:
# %% Parametric experiment on amount of data

# Parameters and preallocated vars
sample_sizes   = np.arange(500,4001,500)
results_single = torch.zeros((len(sample_sizes),3))
results_double = torch.zeros((len(sample_sizes),3))

# Run the experiment (takes ~3 mins)
for i,sample_size in enumerate(sample_sizes):

    # Same procedure for non-doubled (False) and doubled (True) data; only the
    # augmentation flag and the table collecting the results change
    for double_data,results in ((False,results_single),(True,results_double)):

        # train_model() reads train_loader and dev_loader from the module scope,
        # so they need to be (re)bound right before training; the 3rd output is
        # the (data,labels) test tuple, named test_dataset as everywhere else
        train_loader,dev_loader,test_dataset = get_dataset(sample_size,double_data)
        train_acc,dev_acc,losses,ANN         = train_model()

        # Get results (average over the last 5 epochs)
        results[i,0] = torch.mean(train_acc[-5:]).item()
        results[i,1] = torch.mean(dev_acc[-5:]).item()
        results[i,2] = torch.mean(losses[-5:]).item()


In [None]:
# %% Plotting

phi = (1 + np.sqrt(5)) / 2
fig,ax = plt.subplots(1,3,figsize=(1.5*6*phi,6))

# One panel per column of the results tables
titles    = ['Train accuracy','Dev. set accuracy','Losses']
yaxlabels = ['Accuracy','Accuracy','Losses']

for i,(panel,title,ylabel) in enumerate(zip(ax,titles,yaxlabels)):

    # Original vs. doubled data, as a function of the sample size
    panel.plot(sample_sizes,results_single[:,i],'s-',label='Original')
    panel.plot(sample_sizes,results_double[:,i],'s-',label='Doubled')

    # Labels, legend and grid
    panel.set(ylabel=ylabel,title=title,xlabel='Unique sample size')
    panel.legend()
    panel.grid('on')

    # The two accuracy panels (not the losses) share the same y-range
    if i<2:
        panel.set_ylim([20,102])

plt.tight_layout()

plt.savefig('figure19_data_noise_augmentation_devset_test_set.png')

plt.show()

files.download('figure19_data_noise_augmentation_devset_test_set.png')


In [None]:
# %% Re-run model with n = 500

sample_size = 500

# Original data
# (train_model() reads train_loader and dev_loader from the module scope, so
# each get_dataset() call has to come right before its own train_model() call)
train_loader,dev_loader,test_dataset = get_dataset(sample_size,False)
train_acc_O,dev_acc_O,losses_O,ANN_O = train_model()

# Augmented data
# (the loaders are overwritten here; only the outputs with suffix _O keep the
# results of the original-data model)
train_loader,dev_loader,test_dataset = get_dataset(sample_size,True)
train_acc_A,dev_acc_A,losses_A,ANN_A = train_model()


In [None]:
# %% Test on actual test data

# Test data
X,y = test_dataset

# Pure inference: switch both models to eval mode and do not track gradients
# (otherwise autograd builds a graph over the whole test set for nothing)
ANN_O.eval()
ANN_A.eval()

with torch.no_grad():

    # Original data
    yHat_O     = ANN_O(X)
    test_acc_O = 100*torch.mean((torch.argmax(yHat_O,axis=1)==y).float())

    # Augmented data
    yHat_A     = ANN_A(X)
    test_acc_A = 100*torch.mean((torch.argmax(yHat_A,axis=1)==y).float())

# Print output (train and dev set accuracies are those of the last epoch)
print(f'ORIGINAL MODEL (N={sample_size}):\n  Train: {train_acc_O[-1]:.2f}%, devset: {dev_acc_O[-1]:.2f}%, test: {test_acc_O:.2f}%\n')
print(f'AUGMENTED MODEL (N={sample_size}):\n  Train: {train_acc_A[-1]:.2f}%, devset: {dev_acc_A[-1]:.2f}%, test: {test_acc_A:.2f}%')


In [None]:
# %% Exercise 0
#    Could we have added Gaussian noise instead of uniform noise?

# Sure, adapt the code and re-run:
# > train_data_noisy = train_data + torch.randn(train_data.shape)*0.2
# The results are similar for Gaussian noise scaled by 1/5 (one can try to
# change the noise level, but without scaling it down the std of the Gaussian
# would be as large as the range of the normalised data)


In [None]:
# %% Exercise 1
#    It looks like noise augmentation only helped for small sample sizes (<1000). Write code to run a new experiment that
#    reproduces this experiment 10 times, but only using sample sizes [500,1000,2000]. Then make a plot showing the increase
#    in devset accuracy for all 10 runs. That will help determine whether our finding above was a quirk of sampling
#    variability or a meaningful effect.

# Assuming I understood the exercise right, this parametric experiment shows that
# for low sample sizes the model underperforms, but there is also more variability.
# With more data the accuracy increases and the variability also shrinks considerably.
# Also, the augmented model shows better performance and lower variability even
# with less data.


In [None]:
# %% Exercise 1 - continue ...
#    Run experiment for multiple sample sizes and repeat 10 times each

# Parameters
sample_sizes = [500,1000,2000]
reps         = 10

# Preallocate vars
mean_acc_O = np.zeros(len(sample_sizes))
std_acc_O  = np.zeros(len(sample_sizes))
mean_acc_A = np.zeros(len(sample_sizes))
std_acc_A  = np.zeros(len(sample_sizes))

# Test accuracies of every single run (one list per sample size)
all_accs_O = [[] for _ in sample_sizes]
all_accs_A = [[] for _ in sample_sizes]

# Go with the experiment! (takes ~6 mins)
for i,sample_size in enumerate(sample_sizes):
    print(f'\n<<< SAMPLE SIZE: {sample_size} >>>')

    for run in range(reps):
        print(f'\n--- Run {run+1}/{reps} ---')

        # Original data (train_model() reads the loaders from the module scope)
        train_loader,dev_loader,test_dataset = get_dataset(sample_size,False)
        train_acc_O,dev_acc_O,losses_O,ANN_O = train_model()

        # Augmented data
        train_loader,dev_loader,test_dataset = get_dataset(sample_size,True)
        train_acc_A,dev_acc_A,losses_A,ANN_A = train_model()

        # Test data
        X,y = test_dataset

        # Pure inference: eval mode and no gradient tracking; accuracies are
        # stored as plain floats, ready for NumPy and matplotlib
        ANN_O.eval()
        ANN_A.eval()

        with torch.no_grad():

            # Original model accuracy
            yHat_O     = ANN_O(X)
            test_acc_O = 100*torch.mean((torch.argmax(yHat_O,axis=1)==y).float()).item()

            # Augmented model accuracy
            yHat_A     = ANN_A(X)
            test_acc_A = 100*torch.mean((torch.argmax(yHat_A,axis=1)==y).float()).item()

        all_accs_O[i].append(test_acc_O)
        all_accs_A[i].append(test_acc_A)

        # Print for this run
        print(f'Original  - Train: {train_acc_O[-1]:.2f}%, Dev: {dev_acc_O[-1]:.2f}%, Test: {test_acc_O:.2f}%')
        print(f'Augmented - Train: {train_acc_A[-1]:.2f}%, Dev: {dev_acc_A[-1]:.2f}%, Test: {test_acc_A:.2f}%')

    # Store mean and std
    mean_acc_O[i] = np.mean(all_accs_O[i])
    std_acc_O[i]  = np.std(all_accs_O[i])
    mean_acc_A[i] = np.mean(all_accs_A[i])
    std_acc_A[i]  = np.std(all_accs_A[i])

    # Print average results
    print(f'\n>>> AVERAGE RESULTS FOR N={sample_size} <<<')
    print(f'Original  - Mean ± std test accuracy: {mean_acc_O[i]:.2f}% ± {std_acc_O[i]:.2f}%')
    print(f'Augmented - Mean ± std test accuracy: {mean_acc_A[i]:.2f}% ± {std_acc_A[i]:.2f}%\n\n')


In [None]:
# %% Exercise 1 - continue ...
#    Plotting

phi = (1 + np.sqrt(5)) / 2
fig = plt.figure(figsize=(6*phi,6))

# Sample sizes as an array, so that the two curves can be shifted apart along x
x      = np.array(sample_sizes)
offset = 10

# Mean ± std for each sample size (original to the left, augmented to the right)
for shift,means,stds,fmt,label in ((-offset,mean_acc_O,std_acc_O,'s-','Original'),
                                   (+offset,mean_acc_A,std_acc_A,'o-','Augmented')):
    plt.errorbar(x+shift,means,yerr=stds,fmt=fmt,label=label,capsize=5)

# Single runs on top, with some Gaussian jitter along x around each group
jitter = 10
for i,(sample_size,accs_O,accs_A) in enumerate(zip(sample_sizes,all_accs_O,all_accs_A)):
    xs_O = np.random.normal(loc=sample_size-30,scale=jitter,size=reps)
    xs_A = np.random.normal(loc=sample_size+30,scale=jitter,size=reps)
    plt.scatter(xs_O,accs_O,color='tab:blue',alpha=0.3,s=40)
    plt.scatter(xs_A,accs_A,color='tab:orange',alpha=0.3,s=40)

# Titles
plt.suptitle('Test Accuracy')
plt.title(f'Mean ± Std and individual runs (n = {reps})')

# Axes
plt.xlabel('Sample size')
plt.ylabel('Test Accuracy (%)')
plt.xticks(sample_sizes,list(map(str,sample_sizes)))
plt.ylim([20,100])

# Grid, legend and layout
plt.grid(True)
plt.legend()
plt.tight_layout()

plt.savefig('figure22_data_noise_augmentation_devset_test_set_extra1.png')

plt.show()

files.download('figure22_data_noise_augmentation_devset_test_set_extra1.png')


In [None]:
# %% Exercise 2
#    Immediately after loading in the MNIST data (top of the script), there is the following code:
#    dataFull[:,1:] = dataFull[:,1:] / np.max(dataFull)
#    This is different from the corresponding normalization code in the previous MNIST videos. Do you need the [:,1:]?
#    What happens when you change that line to: dataFull = dataFull / np.max(dataFull)
#    Can you still train the model?

# I'd say the indexing is needed because we want to normalise the pixels in the
# images, ranging [0,255], not the labels, stored in the 1st column. In this
# regard I was wondering whether one should normalise as:
#
# > data_all[:,1:] = data_all[:,1:] / np.max(data_all[:,1:])
#
# I mean, here the overall max is 255 (256-1) so no problems in practice, but
# maybe it's less confusing
# That said, if we normalise as:
#
# > dataFull = dataFull / np.max(dataFull)
#
# we'll normalise also the labels, which then range [0,9/255]; in this script
# that is a problem, because get_dataset() casts the labels with .long(), so
# every label is truncated to 0 and the model can only "learn" a single class


In [None]:
# %% Exercise 3
#    We augmented the data using noisy copies. Another idea is to augment the dataset using color-flipped copies. Thus,
#    the numbers will be black on a white background. Try it and see how the results compare to the noise-added version!

# Surprisingly or not, there is still an effect, and the augmented model performs
# a bit better. I'm not sure whether that was expected or not, so happy
# to hear any consideration on that.
# Intuitively I'd say that since we are adding the inverse images only in the
# training data, this kind of augmentation is quite irrelevant, because then in
# the dev set or in the test set we only have "regular" images, and the image
# inversion doesn't really help in adding noise (i.e., the data distribution
# is still the same). On the other hand, learning to classify also inverted images
# should reduce overfitting, and thus improve classification on the dev set.


In [None]:
# %% Exercise 3 - continue ...
#    Modify data function

# %% Function to get the data (train, dev, test)

# Load data and normalise
# Pass the path itself, so that NumPy opens AND closes the file (the previous
# open(...) handle was never closed)
data_all       = np.loadtxt('sample_data/mnist_train_small.csv',delimiter=',')

# Column 0 stores the labels: rescale only the pixel columns by their overall max
data_all[:,1:] = data_all[:,1:] / np.max(data_all[:,1:])

# Function
def get_dataset(n,double_data=False,augmentation='invert'):
    """
    Build train and dev loaders from the first n rows of `data_all`, and a test
    set from all the remaining rows (Exercise 3 version: the training set is
    augmented with colour-flipped copies).

    Parameters
    ----------
    n : int
        Number of rows (from the top of `data_all`) shared by train and dev sets.
    double_data : bool
        If True, an augmented copy of every training image is appended to the
        training set (labels are duplicated accordingly). Dev and test sets
        are never augmented.
    augmentation : str
        'invert' (default) appends colour-flipped copies (1 - image);
        'noise' appends copies with uniform noise in [0,0.5) added, as in the
        earlier version of this function.

    Returns
    -------
    train_loader : DataLoader, shuffled mini-batches of size 20
    dev_loader   : DataLoader, the whole dev set in a single batch
    test_dataset : tuple (test_data,test_labels) of tensors

    Raises
    ------
    ValueError
        If double_data is True and `augmentation` is not 'invert' or 'noise'.
    """

    # Separate labels (1st column) from pixels, and keep only the first n rows
    labels = data_all[:n,0]
    data   = data_all[:n,1:]

    # Convert to tensor
    data_T   = torch.tensor(data).float()
    labels_T = torch.tensor(labels).long()

    # Split data with scikitlearn
    train_data,dev_data, train_labels,dev_labels = train_test_split(data_T,labels_T,test_size=0.1)

    # Augment the train data only after splitting, to avoid twin items in train
    # and dev sets
    if double_data:

        if augmentation=='invert':
            # Pixels were rescaled by their overall max when loading the data,
            # so 1-x flips the intensities
            train_data_aug = 1 - train_data
        elif augmentation=='noise':
            train_data_aug = train_data + torch.rand_like(train_data)/2
        else:
            raise ValueError(f"augmentation must be 'invert' or 'noise', got {augmentation!r}")

        # Normalise each image individually to its own max (the floor on the
        # max avoids 0/0 = NaN for an all-zero image)
        train_data   = torch.cat((train_data,train_data_aug),dim=0)
        image_max    = train_data.max(dim=1,keepdim=True)[0].clamp(min=1e-12)
        train_data   = train_data / image_max
        train_labels = torch.cat((train_labels,train_labels),dim=0)

    # PyTorch datasets
    train_set = TensorDataset(train_data,train_labels)
    dev_set   = TensorDataset(dev_data,dev_labels)

    # DataLoader objects (the dev set goes in one single batch)
    batch_size   = 20
    train_loader = DataLoader(train_set,batch_size=batch_size,shuffle=True)
    dev_loader   = DataLoader(dev_set,batch_size=len(dev_set))

    # Get fresh test set from unused data (basically all the remaining data)
    test_data   = torch.tensor(data_all[n:,1:]).float()
    test_labels = torch.tensor(data_all[n:,0]).long()

    return train_loader,dev_loader,(test_data,test_labels)
