In [None]:
# %% Deep learning - Section 8.66
#    Cross-validation - DataLoader

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Import Iris dataset

# Load the Iris measurements (a pandas DataFrame) through seaborn
iris = sns.load_dataset('iris')

# The first four columns hold the numeric features; store them as a float tensor
feature_columns = iris.columns[0:4]
data = torch.tensor(iris[feature_columns].values).float()

# Encode each species name as an integer class label (0, 1, 2)
labels = torch.zeros(len(data),dtype=torch.long)
for class_idx,species_name in enumerate(['setosa','versicolor','virginica']):
    labels[iris.species==species_name] = class_idx


In [None]:
# %% How to use DataLoader


In [None]:
# Fake dataset

# Every row is [1,2,3,4] shifted by a multiple of ten (10 for the first row, 100 for the last)
base_rows   = np.tile( np.array([1,2,3,4]),(10,1) )
row_offsets = np.tile( 10*np.arange(1,11),(4,1) ).T
fake_data   = base_rows + row_offsets

# Boolean labels: False for the first five rows, True for the last five
fake_labels = np.arange(10)>=5

# Show the data, an empty separator line, and then the labels
print(fake_data,'',fake_labels,sep='\n')


In [None]:
# %% Use DataLoader to shuffle the data

# Wrap the raw array in a DataLoader that shuffles the rows
fake_data_LDr = DataLoader(dataset=fake_data,shuffle=True)

# Inspect the loader: its batch size first, then the object itself
print(fake_data_LDr.batch_size)
print(fake_data_LDr)
print()

# The samples only become visible by iterating over the loader
for sample_idx,sample in enumerate(fake_data_LDr):
  print(sample_idx,sample,sample.shape)

# Note that the labels are still nowhere to be seen!


In [None]:
# %% Create a Dataset that contains data and labels

# Merge data and labels into a Dataset object
fake_Dataset = TensorDataset(torch.tensor(fake_data),torch.tensor(fake_labels))

print(fake_Dataset.tensors)
print()

# Now create another DataLoader object to shuffle the data
fake_data_LDr = DataLoader(fake_Dataset, shuffle=True)

# Print data. The loop variables are deliberately NOT named `data`/`labels`:
# a loop at notebook (global) scope rebinds those names, which would overwrite
# the Iris tensors loaded earlier and break the later split of the real data
for fake_sample,fake_label in fake_data_LDr:
  print(fake_sample,fake_label)


In [None]:
# %% Use scikitlearn to then split the data

# Hold out 20% of the fake samples (together with their labels) as a test set
fake_split = train_test_split(fake_data,fake_labels,test_size=0.2)
train_data,test_data,train_labels,test_labels = fake_split

# Re-wrap each partition (features + labels) as a PyTorch Dataset
train_data = TensorDataset( torch.tensor(train_data),torch.tensor(train_labels) )
test_data  = TensorDataset( torch.tensor(test_data),torch.tensor(test_labels) )

# Build the DataLoaders; only the training loader gets an explicit batch size
train_data_LDr = DataLoader(dataset=train_data,batch_size=4)
test_data_LDr  = DataLoader(dataset=test_data)


In [None]:
# %% Examine the contents of the DataLoader

# Walk through both loaders in turn, printing every batch together with its labels
for header,loader in ( ('Training data:',train_data_LDr),('Test data:',test_data_LDr) ):
    print(header)
    for batch,label in loader:
        print(batch,label)
        print()


In [None]:
# %% Now back to the actual data


In [None]:
# %% Split with scikitlearn

# Shuffle the Iris samples and keep 80% of them for training
iris_split = train_test_split(data,labels,train_size=0.8,shuffle=True)
train_data,test_data,train_labels,test_labels = iris_split

# Pair features with labels in PyTorch Datasets
train_data = TensorDataset(train_data,train_labels)
test_data  = TensorDataset(test_data,test_labels)

# Mini-batches for training; the whole test set is delivered as one single batch
batch_size     = 12
n_test_samples = test_data.tensors[0].shape[0]
train_data_LDr = DataLoader(dataset=train_data,batch_size=batch_size,shuffle=True)
test_data_LDr  = DataLoader(dataset=test_data,batch_size=n_test_samples)


In [None]:
# %% Check sizes of data batches

# For each loader (train first, then test): print the shape of every batch,
# followed by a blank line and the contents of the last batch that was drawn
for loader in (train_data_LDr,test_data_LDr):
  for X,y in loader:
    print(X.shape,y.shape)

  print()
  print(X,y)


In [None]:
# %% Function to generate the model

def gen_model():
    """
    Build a fresh network together with its loss function and optimizer.

    Returns
    -------
    tuple
        (model, loss function, optimizer), where the model is an
        nn.Sequential made of Linear(4,64) -> ReLU -> Linear(64,64) -> ReLU
        -> Linear(64,3), the loss function is an nn.CrossEntropyLoss
        instance, and the optimizer is SGD (lr=0.01) over the model's
        parameters.
    """

    # Layers listed in forward order: input -> hidden -> output
    layers = [
        nn.Linear(4,64),
        nn.ReLU(),
        nn.Linear(64,64),
        nn.ReLU(),
        nn.Linear(64,3),
    ]
    net = nn.Sequential(*layers)

    # Cross-entropy loss
    criterion = nn.CrossEntropyLoss()

    # Stochastic gradient descent on all of the network's parameters
    sgd = torch.optim.SGD(net.parameters(),lr=0.01)

    return net,criterion,sgd


In [None]:
# %% Function to train the model (in mini batches)

# Global parameters
num_epochs = 500

# Function
def train_model(model=None,criterion=None,opt=None,train_loader=None,test_loader=None,n_epochs=None):
    """
    Train a classifier in mini-batches and track its performance per epoch.

    Every argument is optional: an argument left as None falls back to the
    corresponding notebook-level variable (ANNiris, loss_fun, optimizer,
    train_data_LDr, test_data_LDr, num_epochs), so a bare train_model() call
    behaves as before.

    Parameters
    ----------
    model : callable, optional
        Maps a batch of features to per-class scores (one row per sample).
    criterion : callable, optional
        Loss function called as criterion(scores, targets).
    opt : optimizer, optional
        Must provide zero_grad() and step() and be bound to the model's
        parameters.
    train_loader : iterable of (X, y) batches with a len(), optional
        Training batches; iterated once per epoch.
    test_loader : iterable of (X, y) batches, optional
        Only its first batch is used to compute the test accuracy.
    n_epochs : int, optional
        Number of passes over train_loader.

    Returns
    -------
    train_acc : list of float
        Training accuracy (%) per epoch, averaged over the batches.
    test_acc : list of float
        Test accuracy (%) per epoch.
    losses : torch.Tensor of shape (n_epochs,)
        Training loss per epoch, averaged over the batches. The tensor carries
        no autograd history.
    """

    # Fall back to the notebook-level objects for anything not passed in
    if model is None:
        model = ANNiris
    if criterion is None:
        criterion = loss_fun
    if opt is None:
        opt = optimizer
    if train_loader is None:
        train_loader = train_data_LDr
    if test_loader is None:
        test_loader = test_data_LDr
    if n_epochs is None:
        n_epochs = num_epochs

    # Preallocate losses
    losses = torch.zeros(n_epochs)

    # Initialise accuracies as empty
    train_acc = []
    test_acc  = []

    # Loop over epochs
    for epoch_i in range(n_epochs):

        # Loop over batches
        batch_acc  = []
        batch_loss = torch.zeros(1)
        for X,y in train_loader:

            # Forward propagation
            yHat = model(X)

            # Compute loss
            loss = criterion(yHat,y)

            # Accumulate a detached copy: summing the loss tensor itself would
            # keep the autograd graph of every batch alive and make the
            # returned `losses` tensor require grad
            batch_loss += loss.detach()

            # Backpropagation
            opt.zero_grad()
            loss.backward()
            opt.step()

            # Train accuracy for this batch
            batch_acc.append( 100*torch.mean((torch.argmax(yHat,axis=1)==y).float()).item() )

        # Compute average train accuracy and loss for this epoch (average of batch_loss)
        train_acc.append( np.mean(batch_acc) )
        losses[epoch_i] = batch_loss/len(train_loader)

        # Compute test accuracy (just one test batch); no gradients are needed
        # for evaluation, so skip building the graph
        with torch.no_grad():
            X,y = next(iter(test_loader))
            pred_labels = torch.argmax( model(X),axis=1 )
        test_acc.append( 100*torch.mean((pred_labels==y).float()).item() )

    return train_acc,test_acc,losses


In [None]:
# %% Test the model once

# Build a fresh model / loss function / optimizer triplet
(ANNiris,loss_fun,optimizer) = gen_model()

# Train it once, keeping the per-epoch accuracies and losses for the plots below
(train_acc,test_acc,losses) = train_model()


In [None]:
# %% Plotting

# Accuracy curves for the training and the test set, one marker per epoch
fig,ax = plt.subplots(figsize=(10,5))

ax.plot(train_acc,'o-',alpha=.75)
ax.plot(test_acc,'s-',alpha=.75)
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy (%)')
ax.legend(['Train','Test'])
ax.set_title('Train/test example')

# Save the figure to disk before displaying it
accuracy_figure = 'figure8_cross_validation_dataloader.png'
plt.savefig(accuracy_figure)

plt.show()

# Hand the saved file over for download (uses the google.colab `files` helper)
files.download(accuracy_figure)


In [None]:
# %% Exercise 1
#    Read the help doc for the train_test_split() function, in particular to understand what the 'shuffle' option does.
#    What is the default value? Run the code again, switching the shuffling off. How does that affect model performance? Why?

# The default is True. With shuffle=False the split is always performed in the same way: the first n% of the
# samples (in their original order) become the training set and the remaining ones the test set. In this case
# the test set is always made up of samples with label "2" only and, as one would imagine, the model does not
# perform well, because it is trained mostly on labels "0" and "1" and then asked to classify label "2".


In [None]:
# %% Exercise 2
#    The model training loop does not keep track of the losses. Modify the code to store the loss value on each epoch, include
#    it as an output of the training function, and then make a plot of the training losses. Try to do it without looking
#    at other code files!

# Training loss per epoch
fig,ax = plt.subplots(figsize=(10,5))

ax.plot(losses.detach(),'o-',alpha=.75)
ax.set_xlabel('Epochs')
ax.set_ylabel('Losses')
ax.legend(['Losses'])
ax.set_title('Losses over epochs')

# Save the figure to disk before displaying it
losses_figure = 'figure10_cross_validation_dataloader_extra2.png'
plt.savefig(losses_figure)

plt.show()

# Hand the saved file over for download (uses the google.colab `files` helper)
files.download(losses_figure)
