In [None]:
# %% Deep learning - Section 8.64
#    Cross-validation - Manual separation

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F

from google.colab                     import files
from torchsummary                     import summary
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Import Iris dataset

iris = sns.load_dataset('iris')

# Features: the first four columns of the dataframe, converted to a float tensor
feature_columns = iris.columns[0:4]
data = torch.tensor(iris[feature_columns].values).float()

# Labels: encode each species name as an integer class (the vector starts as
# all zeros, then every species overwrites its own rows)
species_codes = {'setosa': 0, 'versicolor': 1, 'virginica': 2}
labels = torch.zeros(len(data),dtype=torch.long)
for species_name,species_code in species_codes.items():
    labels[iris.species==species_name] = species_code


In [None]:
# %% Separate data into train set and test set (no devset)

# Proportion of training examples
prop_training = .8
num_training  = int(len(labels)*prop_training)

# Preallocate boolean vector to select data and labels
# (True = sample goes to the train set, False = sample goes to the test set)
train_test_bool = np.zeros(len(labels),dtype=bool)

# Naive selection, kept for illustration only: this is NOT the correct way to
# select samples, because it simply takes the first 80% of the rows
train_test_bool[range(num_training)] = True

# Use a random selection instead: reset the mask, then draw the training
# indices at random without replacement (so no index is picked twice)
train_test_bool = np.zeros(len(labels),dtype=bool)
item4training   = np.random.choice(len(labels),num_training,replace=False)
train_test_bool[item4training] = True

# Report the actual count separately from the mask itself
print(f'Number of training examples: {train_test_bool.sum()}')
print(f'Training mask (True = train, False = test): \n{train_test_bool}')


In [None]:
# %% Test whether the selection is balanced

# The labels are coded 0/1/2, so the mean label is used as a quick summary of
# the class mix in each subset
mean_all   = labels.float().mean()
mean_train = labels[train_test_bool].float().mean()
mean_test  = labels[~train_test_bool].float().mean()

print(f'Average of all data: {mean_all}')             # 1 by definition
print(f'Average of training data: {mean_train:.4f}')  # Should be ~1
print(f'Average of test data: {mean_test:.4f}')       # Should be ~1


In [None]:
# %% Inspect sizes

# Shapes of the full dataset, the training rows and the test rows (in that order)
for subset in (data, data[train_test_bool,:], data[~train_test_bool,:]):
    print(subset.shape)


In [None]:
# %% Generate the model

# Architecture: 4 input features -> 64 -> 64 -> 3 output units, with a ReLU
# after each of the first two linear layers
ann_layers = [
    nn.Linear(4,64),   # input layer
    nn.ReLU(),         # a.f.
    nn.Linear(64,64),  # hidden layer
    nn.ReLU(),         # a.f.
    nn.Linear(64,3),   # output layer
]
ANNiris = nn.Sequential(*ann_layers)

# Loss function
loss_fun = nn.CrossEntropyLoss()

# Optimizer (plain SGD over all the model parameters)
optimizer = torch.optim.SGD(params=ANNiris.parameters(),lr=0.01)


In [None]:
# %% Train the model (tracking the loss and the training accuracy per epoch)

num_epochs = 1000

# The train/test mask does not change during training, so slice the training
# subset once instead of re-indexing the full tensors at every epoch
train_data   = data[train_test_bool,:]
train_labels = labels[train_test_bool]

# Initialise losses
losses      = torch.zeros(num_epochs)
ongoing_acc = []

# Loop over epochs
for epoch_i in range(num_epochs):

    # Forward propagation
    yHat = ANNiris(train_data)

    # Compute accuracy (% of samples whose argmax output matches the label);
    # keep it as a plain Python float
    epoch_acc = 100*torch.mean( (torch.argmax(yHat,axis=1)==train_labels).float() )
    ongoing_acc.append( epoch_acc.item() )

    # Compute loss
    loss = loss_fun(yHat,train_labels)

    # Store only the number: assigning the `loss` tensor itself would attach
    # `losses` to the autograd graph and accumulate history across epochs
    losses[epoch_i] = loss.item()

    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


In [None]:
# %% Compute train and test accuracies

# These forward passes are for evaluation only, so disable gradient tracking
with torch.no_grad():

    # Final forward pass using "training data"
    predictions = ANNiris(data[train_test_bool,:])
    train_acc   = 100*torch.mean( (torch.argmax(predictions,axis=1)==labels[train_test_bool]).float() )

    # Final forward pass using "test data"
    predictions = ANNiris(data[~train_test_bool,:])
    test_acc    = 100*torch.mean( (torch.argmax(predictions,axis=1)==labels[~train_test_bool]).float() )

# Print
print(f'Final train accuracy: {train_acc:.2f}')
print(f'Final test accuracy: {test_acc:.2f}')


In [None]:
# %% Inspect losses and accuracies

figure_file = 'figure1_cross_validation_manual.png'

fig,ax = plt.subplots(1,2,figsize=(12,4))

# One panel per curve: (values to plot, y-axis label, panel title)
panels = [
    (losses.detach(), 'Loss',     'Losses over epochs'),
    (ongoing_acc,     'Accuracy', 'Accuracy over epochs'),
]

for axis,(curve,y_label,panel_title) in zip(ax,panels):
    axis.plot(curve)
    axis.set_ylabel(y_label)
    axis.set_xlabel('Epoch')
    axis.set_title(panel_title)

# Write the figure to disk, display it, then hand the saved file to
# google.colab's `files.download`
plt.savefig(figure_file)

plt.show()

files.download(figure_file)


In [None]:
# %% Exercise 1
#    Randomly assigning data samples to be in the train vs test phase produced a statistical balance, but it was
#    not perfect. Write an algorithm that will guarantee a balance of flower types while also randomly assigning
#    samples to be in train vs. test.

# Select 80% of data from each label class
# (True = sample goes to the train set, False = sample goes to the test set)
train_test_bool = np.zeros(len(labels),dtype=bool)

for label_i in labels.unique():

    # Row indices of the samples that belong to this class, and how many of
    # them go to the train set
    class_idx          = np.where(labels==label_i)[0]
    num_class_training = int(len(class_idx)*prop_training)

    # Draw this class's training samples at random, without replacement
    item4training = np.random.choice(class_idx,num_class_training,replace=False)
    train_test_bool[item4training] = True

# Report the actual count separately from the mask itself
print(f'Number of training examples: {train_test_bool.sum()}')
print(f'Training mask (True = train, False = test): \n{train_test_bool}')

# Test whether the selection is balanced
print(f'Average of all data: {torch.mean(labels.float())}') # 1 by definition
print(f'Average of training data: {torch.mean(labels[train_test_bool].float()):.4f}') # Should be 1
print(f'Average of test data: {torch.mean(labels[~train_test_bool].float()):.4f}')    # Should be 1


In [None]:
# %% Exercise 2
#    Revert the code to its original form -- with the strong imbalance in flower types. Then train the model. What are
#    the train and test accuracies? Compute the accuracy separately for each type of flower to see whether the model
#    learned some categories, or whether it performed equally on all three categories. Are you surprised at the results?

# Compute train and test accuracies (re-run code above with biased selection)

def print_label_accuracies(pred_labels,true_labels):
    """
    Print the accuracy (in %) separately for each of the labels 0, 1 and 2.

    Parameters
    ----------
    pred_labels : tensor of predicted class indices, one per sample
    true_labels : tensor of true class indices, same length as `pred_labels`

    A label with no samples in `true_labels` is reported as 'n/a' instead of
    averaging over an empty selection (which would give nan).
    """
    for label_i in [0,1,2]:
        label_idx = true_labels == label_i

        if not label_idx.any():
            print(f'Accuracy for label {label_i}: n/a (no samples in this split)')
            continue

        label_acc = 100 * torch.mean((pred_labels[label_idx] == label_i).float())
        print(f'Accuracy for label {label_i}: {label_acc:.2f}%')


# Final forward pass using "training data" (evaluation only, so no gradients)
with torch.no_grad():
    predictions = ANNiris(data[train_test_bool,:])
pred_labels = torch.argmax(predictions, axis=1)

# Recompute the overall accuracy here so that it always matches the current
# mask and model, rather than a value left over from an earlier cell
train_acc = 100 * torch.mean((pred_labels == labels[train_test_bool]).float())

print(f'Final train accuracy: {train_acc:.2f}%')
print_label_accuracies(pred_labels,labels[train_test_bool])

print( )

# Final forward pass using "test data" (evaluation only, so no gradients)
with torch.no_grad():
    predictions = ANNiris(data[~train_test_bool,:])
pred_labels = torch.argmax(predictions, axis=1)

test_acc = 100 * torch.mean((pred_labels == labels[~train_test_bool]).float())

print(f'Final test accuracy: {test_acc:.2f}%')
print_label_accuracies(pred_labels,labels[~train_test_bool])

# The train accuracy is lower, as one would expect because there are fewer examples
# in the training dataset; in the test accuracy, labels 0 and 1 are basically absent,
# so the whole accuracy is given by the ability to discriminate label 2 alone