In [None]:
# %% Deep learning - Section 12.123
#    Data feature augmentation

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F
import pandas              as pd
import scipy.stats         as stats
import time

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Data

# Points per cluster and standard deviation of the Gaussian noise around each centroid
n_clust = 300
blur    = 1

# Cluster centroids as [x,y]
A = [ 1,1 ]
B = [ 5,1 ]
C = [ 4,3 ]

# Sample every cluster as [x-values, y-values]; the draws happen cluster by
# cluster, x before y
a,b,c = ( [ centre[0]+np.random.randn(n_clust)*blur,
            centre[1]+np.random.randn(n_clust)*blur ] for centre in (A,B,C) )

# True labels (0, 1, 2), one run of n_clust values per cluster
labels_np = np.repeat([0.,1.,2.],n_clust)

# Concatenate the clusters side by side, then transpose to (points x 2)
data_np = np.concatenate((a,b,c),axis=1).T

# Convert to PyTorch tensors (float features, integer class labels)
data   = torch.tensor(data_np,dtype=torch.float32)
labels = torch.tensor(labels_np,dtype=torch.long)

# Plotting (with distance from origin); figure sides follow the golden ratio
phi = (1 + np.sqrt(5)) / 2
fig = plt.figure(figsize=(phi*6,6))

# One colour per cluster
cmaps = plt.cm.plasma(np.linspace(0.2,0.9,len(np.unique(labels))))

# Faint line from the origin to every data point
for i,(point,lab) in enumerate(zip(data,labels)):
    plt.plot([0,point[0]],[0,point[1]],color=cmaps[lab],alpha=.2)

# Data points, one marker shape per cluster
for k,marker in enumerate([ 's','o','^' ]):
    in_cluster = labels==k
    plt.plot(data[in_cluster,0],data[in_cluster,1],marker,color=cmaps[k],alpha=.5)

plt.grid(color=[.9,.9,.9])
plt.title('Some data')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')

plt.savefig('figure25_data_feature_augmentation.png')

plt.show()

files.download('figure25_data_feature_augmentation.png')


In [None]:
# %% A note

# The native data only have 2 dimensions (x,y), here we are going to use the
# distance from origin as a new feature and see if it helps the model


In [None]:
# %% Compute Euclidean distance of datapoints from origin

# Distance of every (x,y) point from the origin
dist2origin = torch.sqrt( data[:,0]**2 + data[:,1]**2 )

# Plotting
phi = (1 + np.sqrt(5)) / 2
fig = plt.figure(figsize=(phi*6,6))

# Same colours and marker shapes as in the scatter plot of the raw data
cluster_ids = np.unique(labels)
cmaps       = plt.cm.plasma(np.linspace(0.2,0.9,len(cluster_ids)))
markers     = [ 's','o','^' ]

for i,lab in enumerate(cluster_ids):
    idx = labels==lab

    # Horizontal jitter so the points do not overlap; sized from the number of
    # selected points rather than assuming every cluster holds n_clust points
    jitter = torch.randn(int(idx.sum()))/10

    plt.plot(labels[idx]+jitter,dist2origin[idx],markers[i % len(markers)],color=cmaps[i])

plt.xticks([0,1,2],labels=['Cluster 1','Cluster 2','Cluster 3'])
plt.ylabel('Euclidean distance (a.u.)')
plt.title('Distance from origin')

plt.savefig('figure26_data_feature_augmentation.png')

plt.show()

files.download('figure26_data_feature_augmentation.png')


In [None]:
# %% Add new feature to data set

# Append the distance as an extra column next to the native features
data_aug = torch.cat( (data,dist2origin[:,None]),dim=1 )

# Shapes before and after the augmentation
print(data.shape,data_aug.shape,sep='\n')
print()

# Content before and after the augmentation
print(data,data_aug,sep='\n')


In [None]:
# %% Create train and test datasets

# Random 90/10 split of the augmented data with scikit-learn; the result is
# ordered as (train features, test features, train labels, test labels)
split = train_test_split(data_aug,labels,train_size=0.9)
train_labels,test_labels = split[2],split[3]

# Pair features and labels into PyTorch Datasets
train_data = TensorDataset(split[0],train_labels)
test_data  = TensorDataset(split[1],test_labels)

# Dataloaders: shuffled mini-batches for training (incomplete last batch
# dropped), the whole test set as a single batch
batch_size   = 15
train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True,drop_last=True)
test_loader  = DataLoader(test_data,batch_size=len(test_data))


In [None]:
# %% Model class

def gen_model(use_extra_feature=False):
    """Build a fresh classifier together with its loss function and optimiser.

    Parameters
    ----------
    use_extra_feature : bool
        If truthy the network takes 3 input features; otherwise it takes 2 and
        only the first two columns of whatever is passed to `forward` are used.

    Returns
    -------
    tuple
        (network, nn.CrossEntropyLoss instance, SGD optimiser with lr=0.001)
    """

    # Width of the input layer depends on whether the extra feature is used
    n_inputs = 3 if use_extra_feature else 2

    class model(nn.Module):
        def __init__(self):
            super().__init__()

            # Input -> hidden -> output (3 output units, one per class)
            self.input  = nn.Linear(n_inputs,8)
            self.hid    = nn.Linear(8,8)
            self.output = nn.Linear(8,3)

        def forward(self,x):

            # Without the extra feature, keep only the first two columns
            # (add a print(x.shape) around this line to inspect the widths)
            inputs = x if use_extra_feature else x[:,:2]

            hidden = F.relu(self.hid(F.relu(self.input(inputs))))

            # Raw scores; no softmax is applied here
            return self.output(hidden)

    # Model instance, loss function, and optimiser
    ANN = model()

    return ANN,nn.CrossEntropyLoss(),torch.optim.SGD(ANN.parameters(),lr=0.001)


In [None]:
# %% Test model

# Smoke test: push one training batch through a freshly built model of each
# kind to check that the forward pass runs with and without the extra feature
for header,flag in ( ('Using augmented data set :',True),
                     ('\nNot using augmented data set :',False) ):
    print(header)
    ANN = gen_model(use_extra_feature=flag)[0]
    ANN(next(iter(train_loader))[0])


In [None]:
# %% Function to train the model

def train_model(use_extra_feature=False):
    """Train a fresh model for 200 epochs and track its performance.

    Batches come from the module-level `train_loader`; after every epoch the
    model is evaluated on the first batch of the module-level `test_loader`.

    Parameters
    ----------
    use_extra_feature : bool
        Forwarded to `gen_model`.

    Returns
    -------
    tuple
        (train_acc, test_acc, losses, ANN): three 1-D tensors with one entry
        per epoch (accuracies in percent) and the trained model.
    """

    def percent_correct(logits,targets):
        # Share of rows whose highest-scoring class equals the target, in percent
        hits = torch.argmax(logits,axis=1)==targets
        return 100*torch.mean(hits.float()).item()

    # Number of epochs and model instance
    num_epochs = 200
    ANN,loss_function,optimizer = gen_model(use_extra_feature)

    # Per-epoch records
    losses    = torch.zeros(num_epochs)
    train_acc = torch.zeros(num_epochs)
    test_acc  = torch.zeros(num_epochs)

    for epoch_i in range(num_epochs):

        batch_loss,batch_acc = [],[]

        for X,y in train_loader:

            # Forward pass and loss
            yHat = ANN(X)
            loss = loss_function(yHat,y)

            # Backward pass and parameter update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Book-keeping for this batch
            batch_loss.append(loss.item())
            batch_acc.append(percent_correct(yHat,y))

        # Epoch summary: mean over the batches
        losses[epoch_i]    = np.mean(batch_loss)
        train_acc[epoch_i] = np.mean(batch_acc)

        # Test accuracy: evaluation mode and no gradient tracking, then back to
        # training mode for the next epoch
        ANN.eval()
        X_test,y_test = next(iter(test_loader))
        with torch.no_grad():
            test_logits = ANN(X_test)

        test_acc[epoch_i] = percent_correct(test_logits,y_test)
        ANN.train()

    return train_acc,test_acc,losses,ANN


In [None]:
# %% Function to plot the results

def plot_results(fname='figure27_data_feature_augmentation.png'):
    """Plot losses, accuracies, per-cluster accuracy and the misclassified points.

    Reads the module-level `ANN`, `data_aug`, `labels`, `losses`, `train_acc`
    and `test_acc`, so it must be called after those have been assigned.

    Parameters
    ----------
    fname : str
        File the figure is saved to and then downloaded from. The default keeps
        the original file name; pass a different name per call to avoid
        overwriting the figure of a previous call.
    """

    # Predictions over the entire data set (train + test); no gradients are
    # needed for evaluation, so skip building the autograd graph
    with torch.no_grad():
        yHat = ANN(data_aug)
    preds = torch.argmax(yHat,axis=1)
    acc   = (preds==labels).float()

    # Clusters present in the labels, with one colour per cluster
    clusters   = np.unique(labels)
    n_clusters = len(clusters)
    cmaps      = plt.cm.plasma(np.linspace(0.2,0.9,n_clusters))
    color_shapes = [ 's','o','^' ]

    # Accuracy by cluster
    acc_by_clust = np.zeros(n_clusters)
    for i,lab in enumerate(clusters):
        acc_by_clust[i] = 100*torch.mean(acc[labels==int(lab)]).item()

    # Plotting
    phi = (1 + np.sqrt(5)) / 2
    fig,ax = plt.subplots(2,2,figsize=(phi*6,6))

    # plot the loss function
    ax[0,0].plot(losses)
    ax[0,0].set_ylabel('Loss')
    ax[0,0].set_xlabel('epoch')
    ax[0,0].set_title('Losses')

    # plot the accuracy functions
    ax[0,1].plot(train_acc,label='Train')
    ax[0,1].plot(test_acc,label='Test')
    ax[0,1].set_ylabel('Accuracy (%)')
    ax[0,1].set_xlabel('Epoch')
    ax[0,1].set_title(f'Accuracy ({test_acc[-1].item():.2f}%)')
    ax[0,1].legend()

    # plot overall accuracy by group, bars coloured like the clusters
    ax[1,0].bar(range(n_clusters),acc_by_clust,color=cmaps)
    ax[1,0].set_ylim([np.min(acc_by_clust)-5,np.max(acc_by_clust)+5])
    ax[1,0].set_xticks(range(n_clusters))
    ax[1,0].set_xlabel('Group')
    ax[1,0].set_ylabel('Accuracy (%)')
    ax[1,0].set_title('Accuracy by group')

    # scatterplot of correct and incorrect labeled data
    for i,lab in enumerate(clusters):
        in_cluster = labels==int(lab)

        # plot all data points
        ax[1,1].plot(data_aug[in_cluster,0],data_aug[in_cluster,1],
                     color_shapes[i % len(color_shapes)],color=cmaps[i],alpha=.3,label=f'Clust. {i}')

        # cross-out the incorrect ones
        idxErr = (acc==0) & in_cluster
        ax[1,1].plot(data_aug[idxErr,0],data_aug[idxErr,1],'rx',alpha=0.6)

    ax[1,1].set_title('All groups')
    ax[1,1].set_xlabel('Dimension 1')
    ax[1,1].set_ylabel('Dimension 2')
    ax[1,1].legend()

    plt.tight_layout()

    plt.savefig(fname)

    plt.show()

    files.download(fname)


In [None]:
# %% Test model without augmented data

# Train on the two native features only; plot_results() reads the variables
# assigned here
results = train_model(use_extra_feature=False)
train_acc,test_acc,losses,ANN = results

print('Final accuracy: %.2f%%' % test_acc[-1].item())
plot_results()


In [None]:
# %% Test model with augmented data

# Train with the distance-to-origin feature included; plot_results() reads the
# variables assigned here
results = train_model(use_extra_feature=True)
train_acc,test_acc,losses,ANN = results

print('Final accuracy: %.2f%%' % test_acc[-1].item())
plot_results()


In [None]:
# %% Run parametric experiment

# Train both model variants several times and compare the two distributions of
# final test accuracies with a t-test, to check whether the augmented data make
# a difference; if they don't, prefer the simpler model

# Spoiler alert : the added feature is fully determined by the two native
# features (distance = sqrt(x^2 + y^2)), so it carries no new information

# Run
repetitions       = 10
acc_not_augmented = np.zeros(repetitions)
acc_augmented     = np.zeros(repetitions)

for i in range(repetitions):

    # Within each repetition the plain model is trained first, then the augmented one
    for use_extra,store in ( (False,acc_not_augmented),(True,acc_augmented) ):
        store[i] = train_model(use_extra)[1][-1]

# One row per repetition: [not augmented, augmented]
print( np.round(np.column_stack((acc_not_augmented,acc_augmented)),2) )

# Stats
ttest = stats.ttest_ind(acc_not_augmented,acc_augmented)
t,p   = ttest
print('\nT-test for independent samples :')
print(f't = {t:.2f}, p = {p:.3f}')


In [None]:
# %% Exercise 1
#    Add code to the "distance-to-origin" plot (top of the script) so the color and shape of the dots matches those
#    used in the previous qwerties plot. Also, change the colors of the bars in the barplots to match the qwerties.

# Ended up already implementing this part


In [None]:
# %% Exercise 2
#    If you increase the learning rate, or increase the number of epochs, or use Adam instead of SGD, you'll find that
#    both datasets lead to equally good -- and high -- performance. Based on the graph of the data, do you think it's
#    even possible to reach 100% accuracy? What does this tell you about ceiling effects in DL?

# Even just switching to Adam speeds up learning dramatically, so that the
# performance reaches its ceiling very quickly (~85/87%); however, even adding
# more epochs - in case there was some hidden local minimum - doesn't really
# improve the performance, and by looking at the data plot (see also above), it
# seems quite obvious that the model will never be able to classify the regions
# of overlap with 100% accuracy, at least not based only on the x and y
# coordinates and the Euclidean distance (and even if it happened, it would be a
# lucky chance every now and then).
# To prove this conceptual point, one can try to make the data more segregated,
# for example by reducing the blur. Now even the basic model, with augmented or
# non-augmented data, SGD and 200 epochs, performs nearly perfectly (besides
# the same stubborn point).
