In [None]:
# %% Deep learning - Section 8.65
#    Cross-validation - Scikitlearn

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F

from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% A tangent from the code

# The 80/20 split we have been using so far is not a golden rule; in fact, it's rather
# arbitrary. Ideally you want to use as much data as possible for training while
# still having enough data for the devset and the test set; what counts as "enough" is
# left to the judgement of the user, and it depends on the nature and size of the dataset.
# For larger datasets, for instance, one can also use more extreme splits such as 98/2


In [None]:
# %% Import Iris dataset

# Fetch the iris table through seaborn (a pandas DataFrame)
iris = sns.load_dataset('iris')

# Features: the first four columns of the table, as a float tensor
data = torch.tensor(iris.iloc[:,0:4].values).float()

# Targets: one integer code per species, in the order listed below
# (rows matching none of the names keep the initial value 0)
labels = torch.zeros(len(data),dtype=torch.long)
for code,species_name in enumerate(('setosa','versicolor','virginica')):
    labels[iris.species==species_name] = code


In [None]:
# %% How to use train_test_split()


In [None]:
# Fake dataset

# 10x4 matrix built by broadcasting a column of tens against the row [1,2,3,4]:
# row r (0-based) is [1,2,3,4] + 10*(r+1), so each sample is easy to recognise
fake_data   = 10*np.arange(1,11).reshape(-1,1) + np.array([1,2,3,4])

# Boolean labels: False for the first five samples, True for the last five
fake_labels = np.arange(10) >= 5

# Data and labels separated by one empty line
print(fake_data,fake_labels,sep='\n\n')


In [None]:
# %% Use scikitlearn to split data

# Note that the 3rd parameter can be specified as "test_size" or "train_size", be mindful
train_data,test_data,train_labels,test_labels = train_test_split(fake_data,fake_labels,test_size=0.2)

# Print sizes and data; notice how the samples are shuffled before being split.
# If you don't want the shuffle, specify shuffle=False as option, but you would do that
# only if the data are already randomised
print(f'Training data size: {train_data.shape}')
print(f'Test data size: {test_data.shape}\n')

# Each set is printed under its own header (the test set used to be shown as "Training data")
print(f'Training data: \n{train_data}')
print(f'Test data: \n{test_data}')


In [None]:
# %% Now back to the model


In [None]:
# %% Function to generate the model

def gen_model():
    """Build a fresh iris classifier together with its loss and optimizer.

    Returns
    -------
    ANNiris : nn.Sequential
        Fully connected network 4 -> 64 -> 64 -> 3, with a ReLU after each of
        the first two linear layers and no activation on the output layer.
    loss_fun : nn.CrossEntropyLoss
        Loss applied to the network output.
    optimizer : torch.optim.SGD
        SGD over the network parameters with lr=0.01.
    """

    # Layer stack, listed in forward order
    layers = [
        nn.Linear(4,64),
        nn.ReLU(),
        nn.Linear(64,64),
        nn.ReLU(),
        nn.Linear(64,3),
    ]
    net = nn.Sequential(*layers)

    # Loss and optimizer tied to this particular network instance
    criterion = nn.CrossEntropyLoss()
    sgd       = torch.optim.SGD(net.parameters(),lr=0.01)

    return net,criterion,sgd


In [None]:
# %% Function to train the model

# Global parameters
num_epochs = 200

# Function
def train_model(train_proportion):
    """Train the global model on a random train/test split of the iris data.

    Relies on module-level state: `data`, `labels`, `ANNiris`, `loss_fun`,
    `optimizer` and `num_epochs` must be defined before the call (i.e. load
    the data and call gen_model() first). The model is updated in place.

    Parameters
    ----------
    train_proportion : float or int
        Forwarded to train_test_split() as `train_size`.

    Returns
    -------
    train_acc : list of float
        Training accuracy (%) per epoch, computed from the forward pass made
        before that epoch's parameter update.
    test_acc : list of float
        Test accuracy (%) per epoch, computed after that epoch's update.
    """

    # Per-epoch accuracy histories
    train_acc = []
    test_acc  = []

    # Split train and test data
    # 1) Split before entering iterations (otherwise it's overfitting!)
    # 2) Specify train_size
    x_train,x_test,y_train,y_test = train_test_split(data,labels,train_size=train_proportion)

    # Loop over epochs
    for epoch_i in range(num_epochs):

        # Forward propagation
        yHat = ANNiris(x_train)

        # Compute loss
        loss = loss_fun(yHat,y_train)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Training accuracy
        train_acc.append( 100*torch.mean((torch.argmax(yHat,axis=1)==y_train).float()).item() )

        # Test accuracy at this epoch; evaluation only, so no autograd graph is needed
        with torch.no_grad():
            pred_labels = torch.argmax(ANNiris(x_test),axis=1)
        test_acc.append( 100*torch.mean((pred_labels==y_test).float()).item() )

    return train_acc,test_acc


In [None]:
# %% Test the model once

# Fresh network, loss and optimizer; train_model() reads these as globals
ANNiris,loss_fun,optimizer = gen_model()

# Single training run with 80% of the samples in the training set
train_acc,test_acc = train_model(train_proportion=0.8)


In [None]:
# %% Plotting

# One panel: per-epoch accuracy of the single run, train vs test
fig = plt.figure(figsize=(10,5))

plt.plot(train_acc,'o-',alpha=.8,label='Train')
plt.plot(test_acc,'s-',alpha=.8,label='Test')
plt.legend()

plt.title('Train/test example')
plt.xlabel('Epochs')
plt.ylabel('Accuracy (%)')

# Write the figure to disk before showing it, then hand the file to the browser (Colab)
plt.savefig('figure3_cross_validation_scikitlearn.png')

plt.show()

files.download('figure3_cross_validation_scikitlearn.png')


In [None]:
# %% Parametric experiment on training data proportion

# Parameters: 15 evenly spaced training proportions between .2 and .95, and one
# row of per-epoch accuracies per proportion
train_prop_vec = np.linspace(.2,.95,15)
all_train_acc  = np.zeros((len(train_prop_vec),num_epochs))
all_test_acc   = np.zeros_like(all_train_acc)

# One independent training run per proportion
for i,train_prop in enumerate(train_prop_vec):

    # Start from a newly generated model (train_model() uses these globals)
    ANNiris,loss_fun,optimizer = gen_model()

    # Train on a split of the requested size
    train_acc,test_acc = train_model(train_prop)

    # Keep both accuracy curves in row i
    all_train_acc[i,:] = train_acc
    all_test_acc[i,:]  = test_acc


In [None]:
# Plotting

fig,ax = plt.subplots(1,2,figsize=(13,5))

# Both panels share the same image settings: epochs on x, training proportion on y
# (first proportion at the top), colours clipped to the 50-90% accuracy range
for panel,acc_matrix,panel_title in zip(ax,
                                        (all_train_acc,all_test_acc),
                                        ('Training accuracy','Test accuracy')):
    p = panel.imshow(acc_matrix,aspect='auto',cmap='jet',vmin=50,vmax=90,
                     extent=[0,num_epochs,train_prop_vec[-1],train_prop_vec[0]])
    panel.set_title(panel_title)
    panel.set_xlabel('Epochs')
    panel.set_ylabel('Training size proportion')

# After the loop p is the test-accuracy image; the colorbar is attached to its panel
fig.colorbar(p,ax=ax[1])

plt.savefig('figure4_cross_validation_scikitlearn.png')

plt.show()

files.download('figure4_cross_validation_scikitlearn.png')


In [None]:
# %% Exercise 1
#    The images above suggest that the training proportion doesn't really affect learning success (for this data and this
#    model). Does increasing the number of epochs to 1000 change the conclusion? How about with a lr=.001?

# As one would expect, increasing the number of epochs lets the model learn for longer, and the accuracy reaches a
# ceiling; decreasing the learning rate, on the other hand, does not allow the model to reach an optimal solution before
# the training epochs are over (assuming they are set back to 200)


In [None]:
# %% Exercise 2
#    According to the help doc for train_test_split(), the train_size input can be either a float between 0.0 and 1.0, or
#    an int. Here we only used float inputs to indicate the proportion of the data used for training. Modify the code to
#    specify the training size as an integer corresponding to the number of samples.

# Here it is enough to replace the proportion vector with a vector of absolute sample
# counts, and then run the parametric experiment as usual:

# 15 integer training-set sizes, from 30 up to the number of samples minus 8
train_abs_sample = np.linspace(start=30,stop=len(data)-8,num=15,dtype=int)
