In [None]:
# %% Deep learning - Section 12.121
#    Data oversampling in MNIST

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F
import pandas              as pd
import scipy.stats         as stats
import time

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Function to get the data

# Load data (the path is handed directly to np.loadtxt, which opens and closes
# the file itself; wrapping it in a bare open() left the file handle unclosed)
data_all = np.loadtxt('sample_data/mnist_train_small.csv',delimiter=',')

# Function
def get_dataset(n,double_data=False,batch_size=20):
    """
    Build train and dev-set DataLoaders from the first `n` rows of the global
    array `data_all`.

    Parameters
    ----------
    n : int
        Number of rows taken from the top of `data_all`.
    double_data : bool, optional
        If true, the training samples (and labels) are concatenated with an
        exact copy of themselves after the train/dev split. Default: False.
    batch_size : int, optional
        Mini-batch size of the training loader. Default: 20.

    Returns
    -------
    train_loader : DataLoader
        Shuffled mini-batches of the (optionally doubled) training data.
    test_loader : DataLoader
        Held-out 10% of the data, delivered as one single batch.
    """

    # First column holds the labels (i.e., numbers IDs), the remaining columns
    # hold the data; keep only the first n rows
    labels = data_all[:n,0]
    data   = data_all[:n,1:]

    # Normalize to [0,1]
    data_norm = data / np.max(data)

    # Convert to tensor
    data_T   = torch.tensor(data_norm).float()
    labels_T = torch.tensor(labels).long()

    # Split data with scikitlearn
    train_data,test_data, train_labels,test_labels = train_test_split(data_T,labels_T,test_size=0.1)

    # Make an exact copy of the train data. This is done on purpose after the
    # split: doubling the whole dataset before splitting would place copies of
    # the same items in both the train and the dev set
    if double_data:
        train_data   = torch.cat((train_data,train_data),dim=0)
        train_labels = torch.cat((train_labels,train_labels),dim=0)

    # PyTorch datasets
    train_data = TensorDataset(train_data,train_labels)
    test_data  = TensorDataset(test_data,test_labels)

    # DataLoader objects (the held-out data goes in one batch)
    train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True)
    test_loader  = DataLoader(test_data,batch_size=test_data.tensors[0].shape[0])

    return train_loader,test_loader


In [None]:
# %% Test the data function

# Do not double data (double_data=False): print the train and dev set shapes
r,t = get_dataset(200,False)
print(r.dataset.tensors[0].shape)
print(t.dataset.tensors[0].shape)

# Do double data (double_data=True): the train set should now have twice as
# many rows, while the dev set should keep the same size
r,t = get_dataset(200,True)
print(r.dataset.tensors[0].shape)
print(t.dataset.tensors[0].shape)


In [None]:
# %% Model class

def gen_model():
    """
    Create a fresh (untrained) feedforward classifier together with its loss
    function and its optimizer.

    Returns
    -------
    ANN : nn.Module
        Network with layers 784 -> 64 -> 32 -> 32 -> 10.
    loss_fun : nn.CrossEntropyLoss
        Loss function.
    optimizer : torch.optim.Adam
        Adam optimizer over the network parameters (lr=0.01).
    """

    class model(nn.Module):
        """Fully connected net: three ReLU layers followed by a linear output."""

        def __init__(self):
            super().__init__()

            # Layers, in the order they are applied in forward()
            self.input  = nn.Linear(784,64)
            self.hid1   = nn.Linear(64,32)
            self.hid2   = nn.Linear(32,32)
            self.output = nn.Linear(32,10)

        def forward(self,x):
            # ReLU after every layer but the last one
            for layer in (self.input,self.hid1,self.hid2):
                x = F.relu(layer(x))

            # The output layer returns its raw (non-activated) values
            return self.output(x)

    # Bundle a new network with its loss function and optimizer
    net = model()

    return net, nn.CrossEntropyLoss(), torch.optim.Adam(net.parameters(),lr=0.01)


In [None]:
# %% Function to train the model

def train_model(num_epochs=50):
    """
    Train a fresh model (from `gen_model`) and track its performance.

    Note: the data are NOT passed as arguments; the function reads the
    variables `train_loader` and `test_loader` from the enclosing (notebook)
    namespace, so both must be defined before calling it.

    Parameters
    ----------
    num_epochs : int, optional
        Number of training epochs (kept low to keep some variability in
        performance). Default: 50.

    Returns
    -------
    train_acc : torch.Tensor
        Per-epoch training accuracy (%), averaged over the mini-batches.
    test_acc : torch.Tensor
        Per-epoch accuracy (%) on the first batch of `test_loader`.
    losses : torch.Tensor
        Per-epoch training loss, averaged over the mini-batches.
    ANN : nn.Module
        The trained model (left in train mode).
    """

    # Fresh model instance
    ANN,loss_fun,optimizer = gen_model()

    # Preallocate vars
    losses    = torch.zeros(num_epochs)
    train_acc = torch.zeros(num_epochs)
    test_acc  = torch.zeros(num_epochs)

    # Loop over epochs
    for epoch_i in range(num_epochs):

        # Loop over training data batches
        batch_acc  = []
        batch_loss = []

        for X,y in train_loader:

            # Forward pass, backpropagation, and optimizer step
            yHat = ANN(X)
            loss = loss_fun(yHat,y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Loss and accuracy from this batch, stored as plain Python floats
            # so that np.mean does not have to convert a list of tensors
            batch_loss.append(loss.item())
            batch_acc.append( 100*torch.mean((torch.argmax(yHat,dim=1)==y).float()).item() )

        losses[epoch_i]    = float(np.mean(batch_loss))
        train_acc[epoch_i] = float(np.mean(batch_acc))

        # Test accuracy (eval mode, no gradient tracking)
        ANN.eval()

        with torch.no_grad():
            X,y = next(iter(test_loader))
            yHat = ANN(X)
            test_acc[epoch_i] = 100*torch.mean((torch.argmax(yHat,dim=1)==y).float())

        # Back to train mode for the next epoch
        ANN.train()

    return train_acc,test_acc,losses,ANN


In [None]:
# %% Test the whole setting

# One run on 5000 samples, without doubling the data
train_loader,test_loader      = get_dataset(5000,False)
train_acc,test_acc,losses,ANN = train_model()

# Plot the results in a two-panel figure (phi is the golden ratio, used to
# set the width of the figure)
phi = (1 + np.sqrt(5)) / 2
fig,ax = plt.subplots(1,2,figsize=(1.5*6*phi,6))

# Left panel: loss over epochs
ax[0].plot(losses)
ax[0].set(xlabel='Epochs',ylabel='Loss',ylim=[0,3],title='Model loss')

# Right panel: train and test accuracy over epochs
ax[1].plot(train_acc,label='Train')
ax[1].plot(test_acc,label='Test')
ax[1].set(xlabel='Epochs',
          ylabel='Accuracy (%)',
          ylim=[20,102],
          title=f'Final model test accuracy: {test_acc[-1]:.2f}%')
ax[1].legend()

# Save the figure, show it, and download the saved file (Colab)
plt.savefig('figure11_data_oversampling_mnist.png')

plt.show()

files.download('figure11_data_oversampling_mnist.png')


In [None]:
# %% Parametric experiment on amount of data

# Parameters and preallocated vars (one row per sample size; columns are
# train accuracy, test accuracy, and loss)
sample_sizes   = np.arange(500,4001,500)
results_single = torch.zeros((len(sample_sizes),3))
results_double = torch.zeros((len(sample_sizes),3))

# Run the experiment (takes ~3 mins)
for i,sample_size in enumerate(sample_sizes):

    # Same pipeline twice per sample size: non-doubled data first, doubled
    # data second
    for double_data,results in ((False,results_single),(True,results_double)):

        train_loader,test_loader      = get_dataset(sample_size,double_data)
        train_acc,test_acc,losses,ANN = train_model()

        # Get results (average over the last 5 epochs)
        for col,metric in enumerate((train_acc,test_acc,losses)):
            results[i,col] = torch.mean(metric[-5:]).item()


In [None]:
# %% Plotting

# Three-panel figure (phi is the golden ratio, used to set the figure width)
phi = (1 + np.sqrt(5)) / 2
fig,ax = plt.subplots(1,3,figsize=(1.5*6*phi,6))

# Axes and title labels (one entry per column of the results matrices)
titles    = ['Train accuracy','Dev. set accuracy','Losses']
yaxlabels = ['Accuracy','Accuracy','Losses']

# One panel per result column
for i,(panel,title,yaxlabel) in enumerate(zip(ax,titles,yaxlabels)):

    # Original vs. doubled data
    panel.plot(sample_sizes,results_single[:,i],'s-',label='Original')
    panel.plot(sample_sizes,results_double[:,i],'s-',label='Doubled')

    # Make it nicer
    panel.set(ylabel=yaxlabel,title=title,xlabel='Unique sample size')
    panel.legend()
    panel.grid('on')

    # Same y-limits for the two accuracy panels
    if i<2:
        panel.set_ylim([20,102])

plt.tight_layout()

# Save the figure, show it, and download the saved file (Colab)
plt.savefig('figure12_data_oversampling_mnist.png')

plt.show()

files.download('figure12_data_oversampling_mnist.png')


In [None]:
# %% Exercise 1
#    Notice that we're using the "test_dataset" multiple times, which really means that it's the devset,
#    aka hold-out set, and not a true TEST set. A real test set gets evaluated only once. Modify the code
#    to create a test set, using images in dataFull that are not in dataNorm. Note that you don't need
#    to re-run the entire experiment; you only need to train two models (and save their 'net' outputs), so that
#    you can run the test data through (make sure to normalize the test data!). Then you can evaluate the test
#    performance relative to train and devset from those two models.

# A bit over-engineered here, but I recoded almost everything to avoid going mad
# over the variable names. Now, I guess here the test set doesn't really give
# insanely different results, because we didn't really tweak a lot of
# parameters to optimise the training with the dev set (e.g., architecture,
# regularisations, and any other metaparameters)


In [None]:
# %% Exercise 1 - continue ...
#    Function to get the data (train, dev, test)

# Load data (the path is handed directly to np.loadtxt, which opens and closes
# the file itself; wrapping it in a bare open() left the file handle unclosed)
data_all = np.loadtxt('sample_data/mnist_train_small.csv',delimiter=',')

# Function
def get_dataset(n,double_data=False,batch_size=20):
    """
    Build train, dev, and test DataLoaders from the global array `data_all`.

    Train and dev sets come from the first `n` rows; the test set comes from
    the rows right after them, and has as many samples as the dev set.

    Parameters
    ----------
    n : int
        Number of rows taken from the top of `data_all` for train + dev.
    double_data : bool, optional
        If true, the training samples (and labels) are concatenated with an
        exact copy of themselves after the train/dev split. Default: False.
    batch_size : int, optional
        Mini-batch size of the training loader. Default: 20.

    Returns
    -------
    train_loader : DataLoader
        Shuffled mini-batches of the (optionally doubled) training data.
    dev_loader : DataLoader
        Held-out 10% of the first `n` rows, delivered as one single batch.
    test_loader : DataLoader
        Rows taken after the first `n`, delivered as one single batch.

    Raises
    ------
    ValueError
        If fewer rows remain after the first `n` than the test set needs.
    """

    # First column holds the labels (i.e., numbers IDs), the remaining columns
    # hold the data; keep only the first n rows
    labels = data_all[:n,0]
    data   = data_all[:n,1:]

    # Normalize to [0,1]; the scaling constant is kept so that the test data
    # can be scaled exactly like the train/dev data
    norm_const = np.max(data)
    data_norm  = data / norm_const

    # Convert to tensor
    data_T   = torch.tensor(data_norm).float()
    labels_T = torch.tensor(labels).long()

    # Split data with scikitlearn
    train_data,dev_data, train_labels,dev_labels = train_test_split(data_T,labels_T,test_size=0.1)

    # Make an exact copy of the train data. This is done on purpose after the
    # split: doubling the whole dataset before splitting would place copies of
    # the same items in both the train and the dev set
    if double_data:
        train_data   = torch.cat((train_data,train_data),dim=0)
        train_labels = torch.cat((train_labels,train_labels),dim=0)

    # PyTorch datasets
    train_data = TensorDataset(train_data,train_labels)
    dev_data   = TensorDataset(dev_data,dev_labels)

    # DataLoader objects (the dev set goes in one batch)
    train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True)
    dev_loader   = DataLoader(dev_data,batch_size=dev_data.tensors[0].shape[0])

    # Get actual test set from unused rows (same size as the dev set)
    n_test         = dev_data.tensors[0].shape[0]
    remaining_data = data_all[n:]
    if remaining_data.shape[0] < n_test:
        raise ValueError("Not enough data for an actual test set of this size.")

    # Scale with the train/dev constant (not with the max of the whole array,
    # which also includes the label column)
    test_labels = remaining_data[:n_test,0]
    test_data   = remaining_data[:n_test,1:] / norm_const

    test_data   = torch.tensor(test_data).float()
    test_labels = torch.tensor(test_labels).long()

    test_data   = TensorDataset(test_data,test_labels)
    test_loader = DataLoader(test_data,batch_size=n_test)

    return train_loader,dev_loader,test_loader

# Test the data function

# Do not double data (double_data=False): print train, dev, and test set shapes
r,d,t = get_dataset(200,False)
print(r.dataset.tensors[0].shape)
print(d.dataset.tensors[0].shape)
print(t.dataset.tensors[0].shape)

# Do double data (double_data=True): the train set should now have twice as
# many rows, while dev and test sets should keep the same size
r,d,t = get_dataset(200,True)
print(r.dataset.tensors[0].shape)
print(d.dataset.tensors[0].shape)
print(t.dataset.tensors[0].shape)


In [None]:
# %% Exercise 1 - continue ...
#    Model class

def gen_model():
    """
    Create a fresh (untrained) feedforward classifier together with its loss
    function and its optimizer.

    Returns
    -------
    ANN : nn.Module
        Network with layers 784 -> 64 -> 32 -> 32 -> 10.
    loss_fun : nn.CrossEntropyLoss
        Loss function.
    optimizer : torch.optim.SGD
        SGD optimizer over the network parameters (lr=0.01).
    """

    class model(nn.Module):
        """Fully connected net: three ReLU layers followed by a linear output."""

        def __init__(self):
            super().__init__()

            # Layers, in the order they are applied in forward()
            self.input  = nn.Linear(784,64)
            self.hid1   = nn.Linear(64,32)
            self.hid2   = nn.Linear(32,32)
            self.output = nn.Linear(32,10)

        def forward(self,x):
            # ReLU after every layer but the last one
            for layer in (self.input,self.hid1,self.hid2):
                x = F.relu(layer(x))

            # The output layer returns its raw (non-activated) values
            return self.output(x)

    # Bundle a new network with its loss function and optimizer
    net = model()

    return net, nn.CrossEntropyLoss(), torch.optim.SGD(net.parameters(),lr=0.01)


In [None]:
# %% Exercise 1 - continue ...
#    Function to train the model

def train_model(num_epochs=50):
    """
    Train a fresh model (from `gen_model`) and track its performance.

    Note: the data are NOT passed as arguments; the function reads the
    variables `train_loader` and `dev_loader` from the enclosing (notebook)
    namespace, so both must be defined before calling it.

    Parameters
    ----------
    num_epochs : int, optional
        Number of training epochs (kept low to keep some variability in
        performance). Default: 50.

    Returns
    -------
    train_acc : torch.Tensor
        Per-epoch training accuracy (%), averaged over the mini-batches.
    dev_acc : torch.Tensor
        Per-epoch accuracy (%) on the first batch of `dev_loader`.
    losses : torch.Tensor
        Per-epoch training loss, averaged over the mini-batches.
    ANN : nn.Module
        The trained model (left in train mode).
    """

    # Fresh model instance
    ANN,loss_fun,optimizer = gen_model()

    # Preallocate vars
    losses    = torch.zeros(num_epochs)
    train_acc = torch.zeros(num_epochs)
    dev_acc   = torch.zeros(num_epochs)

    # Loop over epochs
    for epoch_i in range(num_epochs):

        # Loop over training data batches
        batch_acc  = []
        batch_loss = []

        for X,y in train_loader:

            # Forward pass, backpropagation, and optimizer step
            yHat = ANN(X)
            loss = loss_fun(yHat,y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Loss and accuracy from this batch, stored as plain Python floats
            # so that np.mean does not have to convert a list of tensors
            batch_loss.append(loss.item())
            batch_acc.append( 100*torch.mean((torch.argmax(yHat,dim=1)==y).float()).item() )

        losses[epoch_i]    = float(np.mean(batch_loss))
        train_acc[epoch_i] = float(np.mean(batch_acc))

        # Dev set accuracy (eval mode, no gradient tracking)
        ANN.eval()

        with torch.no_grad():
            X,y = next(iter(dev_loader))
            yHat = ANN(X)
            dev_acc[epoch_i] = 100*torch.mean((torch.argmax(yHat,dim=1)==y).float())

        # Back to train mode for the next epoch
        ANN.train()

    return train_acc,dev_acc,losses,ANN


In [None]:
# %% Exercise 1 - continue ...
#    Parametric experiment on amount of data

# Parameters and preallocated vars (one row per sample size; columns are
# train accuracy, dev accuracy, loss, and test accuracy)
sample_sizes   = np.arange(500,4001,500)
results_single = torch.zeros((len(sample_sizes),4))
results_double = torch.zeros((len(sample_sizes),4))

# Run the experiment (takes ~3 mins)
for i,sample_size in enumerate(sample_sizes):

    # Same pipeline twice per sample size: non-doubled data first, doubled
    # data second
    for double_data,results in ((False,results_single),(True,results_double)):

        train_loader,dev_loader,test_loader = get_dataset(sample_size,double_data)
        train_acc,dev_acc,losses,ANN        = train_model()

        # Evaluate on real test set (once, after training)
        ANN.eval()
        X_test,y_test = next(iter(test_loader))
        with torch.no_grad():
            yHat_test = ANN(X_test)
            test_acc  = 100*torch.mean((yHat_test.argmax(1)==y_test).float())

        # Get results (train/dev/loss averaged over the last 5 epochs)
        for col,metric in enumerate((train_acc,dev_acc,losses)):
            results[i,col] = torch.mean(metric[-5:]).item()
        results[i,3] = (test_acc).item()


In [None]:
# %% Exercise 1 - continue ...
#    Plotting

# Three-panel figure (phi is the golden ratio, used to set the figure width)
phi = (1 + np.sqrt(5)) / 2
fig,ax = plt.subplots(1,3,figsize=(1.5*6*phi,6))

# Axes and title labels (one entry per panel)
titles    = ['Train accuracy','Dev. and test set accuracy','Losses']
yaxlabels = ['Accuracy','Accuracy','Losses']

# One panel per result column (train accuracy, dev accuracy, loss)
for i,(panel,title,yaxlabel) in enumerate(zip(ax,titles,yaxlabels)):

    # Plot the train and dev set info
    panel.plot(sample_sizes,results_single[:,i],'s-',label='Original')
    panel.plot(sample_sizes,results_double[:,i],'s-',label='Doubled')

    # Plot test accuracy (fourth column) only in the middle plot
    if i == 1:
        panel.plot(sample_sizes,results_single[:,3],'o:',color='tab:blue',label='Original (test)')
        panel.plot(sample_sizes,results_double[:,3],'o:',color='tab:orange',label='Doubled (test)')

    # Make it nicer
    panel.set(ylabel=yaxlabel,title=title,xlabel='Unique sample size')
    panel.legend()
    panel.grid('on')

    # Same y-limits for the two accuracy panels
    if i<2:
        panel.set_ylim([20,102])

plt.tight_layout()

# Save the figure, show it, and download the saved file (Colab)
plt.savefig('figure15_data_oversampling_mnist_extra1.png')

plt.show()

files.download('figure15_data_oversampling_mnist_extra1.png')


In [None]:
# %% Exercise 2
#    We've previously discovered that Adam can outperform SGD on the MNIST dataset. I used SGD here on purpose
#    to make performance worse (!) so we could test for effects of oversampling. Re-run the experiment using
#    Adam to see whether you still get the same effects.

# With Adam the model reaches a ceiling performance for basically any
# configuration, a bit better for the doubled data on the dev set accuracy,
# but it's probably due to the fact that we doubled the data before splitting
# into train and dev sets, so some data in the train set are also in the dev
# set, which is a problem. Indeed, if one doubles only the train data after the
# split, the picture becomes more coherent and less suspicious
