In [None]:
# %% Deep learning - Section 9.78
#    Batch training in action

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Import Iris dataset

# Load the dataset through seaborn
iris = sns.load_dataset('iris')

# The first four columns are taken as the features and stored as a float tensor
feature_columns = iris.columns[0:4]
data = torch.tensor(iris[feature_columns].values).float()

# Encode the species names as integer class labels, in the order listed below
# (any row whose species is not listed keeps the initial value 0)
labels = torch.zeros(len(data),dtype=torch.long)
for class_index,species_name in enumerate(['setosa','versicolor','virginica']):
    labels[iris.species==species_name] = class_index


In [None]:
# Plotting

# Draw the DataFrame with markers only (no connecting lines)
feature_ax = iris.plot(marker='o',linestyle='none',figsize=(12,6))

feature_ax.set_xlabel('Sample number')
feature_ax.set_ylabel('Value')
feature_ax.set_title('Iris dataset features')

# Save the figure before showing it, then offer the saved file for download
figure_name = 'figure34_batching_regularisation.png'
plt.savefig(figure_name)

plt.show()

files.download(figure_name)


In [None]:
# %% Split into train and test data

# Fixed seed so that the train/test partition and the mini-batch order are the
# same after every "Restart & Run All" (change the value to draw another split)
split_seed = 42

# Split with scikitlearn
train_data,test_data,train_labels,test_labels = train_test_split(
    data,labels,test_size=0.2,random_state=split_seed)

# Convert into PyTorch datasets
train_data = TensorDataset(train_data,train_labels)
test_data  = TensorDataset(test_data,test_labels)

# Convert into DataLoader objects (test data are not partitioned, we don't regularise in testing)
# Try size 4, 16, and 32
batch_size   = 16

# The training loader shuffles with its own seeded generator, so the shuffling
# is reproducible without touching the global torch random state
train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True,drop_last=True,
                          generator=torch.Generator().manual_seed(split_seed))

# A single batch holding the whole test set
test_loader  = DataLoader(test_data,batch_size=len(test_data))


In [None]:
# %% Check sizes of data batches

# One printed line per mini-batch: shape of the feature tensor, then shape of the label tensor.
# Toggle 'drop_last' in the DataLoader to see what happens to the size and number of the last mini-batch
for X,y in train_loader:
    feature_shape,label_shape = X.shape,y.shape
    print(feature_shape,label_shape)


In [None]:
# %% Function to generate the model

def gen_model(n_features=4,n_hidden=64,n_classes=3,lr=0.0005):
    """
    Build a new feed-forward classifier together with its loss and optimizer.

    Called without arguments it reproduces the original fixed setup
    (4 inputs, two hidden layers of 64 units, 3 outputs, lr=0.0005).

    Parameters
    ----------
    n_features : int
        Number of input features.
    n_hidden : int
        Number of units in each of the two hidden layers.
    n_classes : int
        Number of output units (one per class).
    lr : float
        Learning rate of the SGD optimizer (small by default, for
        illustration purpose).

    Returns
    -------
    ANN : nn.Sequential
        Linear -> ReLU -> Linear -> ReLU -> Linear stack.
    loss_fun : nn.CrossEntropyLoss
        Loss function instance.
    optimizer : torch.optim.SGD
        Optimizer bound to the parameters of ANN.
    """

    # Architecture
    ANN = nn.Sequential(
             nn.Linear(n_features,n_hidden),
             nn.ReLU(),
             nn.Linear(n_hidden,n_hidden),
             nn.ReLU(),
             nn.Linear(n_hidden,n_classes))

    # Loss function
    loss_fun = nn.CrossEntropyLoss()

    # Optimizer
    optimizer = torch.optim.SGD(ANN.parameters(),lr=lr)

    return ANN,loss_fun,optimizer


In [None]:
# %% Function to train the model

# Parameters
num_epochs = 500

def train_model(model=None,loss_fn=None,opt=None,train_dl=None,test_dl=None,n_epochs=None):
    """
    Train a classifier with mini-batches and track its performance per epoch.

    Every argument is optional. Anything left as None falls back on the
    notebook-level object of the same role (ANN, loss_fun, optimizer,
    train_loader, test_loader, num_epochs), so a bare `train_model()` call
    behaves as before, while passing the objects explicitly removes the
    dependency on which cells happened to run last.

    Parameters
    ----------
    model : torch.nn.Module, optional
        Network to train (updated in place through the optimizer).
    loss_fn : callable, optional
        Called as loss_fn(model_output, labels); must return a scalar tensor.
    opt : torch.optim.Optimizer, optional
        Optimizer holding the parameters of `model`.
    train_dl : iterable of (X, y) batches, optional
        Training mini-batches, iterated once per epoch.
    test_dl : iterable of (X, y) batches, optional
        Test data; only its first batch is evaluated each epoch.
    n_epochs : int, optional
        Number of passes over `train_dl`.

    Returns
    -------
    train_acc : list
        Per-epoch training accuracy (%), averaged over the mini-batches.
    test_acc : list
        Per-epoch accuracy (%) on the first batch of `test_dl`.
    losses : list
        Per-epoch training loss, averaged over the mini-batches.
    """

    # Resolve the fall-backs (the globals are only looked up when actually needed)
    model    = ANN          if model    is None else model
    loss_fn  = loss_fun     if loss_fn  is None else loss_fn
    opt      = optimizer    if opt      is None else opt
    train_dl = train_loader if train_dl is None else train_dl
    test_dl  = test_loader  if test_dl  is None else test_dl
    n_epochs = num_epochs   if n_epochs is None else n_epochs

    # Initialise accuracies
    train_acc = []
    test_acc  = []
    losses    = []

    # Loop over epochs
    for epoch_i in range(n_epochs):

        batch_acc  = []
        batch_loss = []

        for X,y in train_dl:

            # Forward propagation and loss
            yHat = model(X)
            loss = loss_fn(yHat,y)

            # Only now do backpropagation
            opt.zero_grad()
            loss.backward()
            opt.step()

            # Batch training accuracy (computed from the pre-update predictions)
            batch_acc.append( 100*torch.mean((torch.argmax(yHat,axis=1)==y).float()).item() )
            batch_loss.append(loss.item())

        # Average accuracy and loss over the mini-batches of this epoch
        train_acc.append(np.mean(batch_acc))
        losses.append(np.mean(batch_loss))

        # Test accuracy; no gradients are needed here, so skip building the autograd graph
        X,y = next(iter(test_dl))
        with torch.no_grad():
            pred_labels = torch.argmax(model(X),axis=1)
        test_acc.append(  100*torch.mean((pred_labels==y).float()).item() )

    # Function output
    return train_acc,test_acc,losses


In [None]:
# %% Test the model

# Create a new network with its loss function and optimizer
ANN, loss_fun, optimizer = gen_model()

# Train it and keep the per-epoch accuracies and losses for plotting
train_acc, test_acc, losses = train_model()


In [None]:
# %% Plotting

fig,ax = plt.subplots(1,2,figsize=(15,5))
loss_ax,acc_ax = ax

# Left panel: training loss per epoch
loss_ax.plot(losses,'^-')
loss_ax.set_title(f'Losses with minibatch size = {batch_size}')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')

# Right panel: train and test accuracy per epoch, drawn in legend order
for accuracy_series,line_format in ((train_acc,'o-'),(test_acc,'s-')):
    acc_ax.plot(accuracy_series,line_format)
acc_ax.legend(['Train','Test'])
acc_ax.set_title(f'Accuracy with minibatch size = {batch_size}')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy (%)')
acc_ax.set_ylim([27,103])

# Save the figure before showing it, then offer the saved file for download
figure_name = 'figure35_batching_regularisation.png'
plt.savefig(figure_name)

plt.show()

files.download(figure_name)


In [None]:
# %% Exercise 1
#    Is there a relationship between the test_size parameter in train_test_split(), the batchsize parameter in DataLoader,
#    and the length of test_data? Think of your answer first, then test it in code, by creating new dataloader objects
#    with varying test_size parameters.
#    Hint: You can use the code 'len(test_data.dataset.tensors[1])', which returns the length of the labels vector.
#    (In this notebook the test DataLoader is called 'test_loader', while 'test_data' is the TensorDataset itself.)

# Changing the proportion of test data changes how many training samples are left, and therefore the number
# of mini-batches for a given mini-batch size; a large proportion of test data combined with a large
# mini-batch size, for example, risks dropping a larger share of the training data (if drop_last=True)

# %% Modified split

# Try different test_size values
train_data,test_data,train_labels,test_labels = train_test_split(data,labels,test_size=0.2)

# Wrap both partitions as PyTorch datasets
train_data,test_data = TensorDataset(train_data,train_labels),TensorDataset(test_data,test_labels)

# Try different sizes
batch_size   = 16
test_loader  = DataLoader(test_data,batch_size=len(test_data))
train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True,drop_last=True)

# Check length of label vector and batch sizes
print(len(test_labels))
for X,y in train_loader:
    feature_shape,label_shape = X.shape,y.shape
    print(feature_shape,label_shape)


In [None]:
# %% Exercise 2
#    Let's say you didn't care about the three types of irises; you only want a model that labels a flower as setosa or
#    "other." What would you have to change in the data and in the model to make this work?

# A few minimal changes are enough to make the model classify 'setosa' vs 'others'; mainly, one
# needs to change the labels and the number of output nodes

# %% Modify labels and output nodes

iris = sns.load_dataset('iris')
data = torch.tensor(iris.iloc[:,0:4].values).float()

# Every label starts at 0 (which is the code for setosa); the two other species are both recoded as 1
labels = torch.zeros(len(data),dtype=torch.long)
for other_species in ('versicolor','virginica'):
    labels[iris.species==other_species] = 1

def gen_model(n_features=4,n_hidden=64,n_classes=2,lr=0.0005):
    """
    Build a new feed-forward classifier for the setosa-vs-other problem,
    together with its loss and optimizer.

    Called without arguments it reproduces the fixed setup of this exercise
    (4 inputs, two hidden layers of 64 units, 2 outputs, lr=0.0005).

    Parameters
    ----------
    n_features : int
        Number of input features.
    n_hidden : int
        Number of units in each of the two hidden layers.
    n_classes : int
        Number of output units (one per class).
    lr : float
        Learning rate of the SGD optimizer (small by default, for
        illustration purpose).

    Returns
    -------
    ANN : nn.Sequential
        Linear -> ReLU -> Linear -> ReLU -> Linear stack.
    loss_fun : nn.CrossEntropyLoss
        Loss function instance.
    optimizer : torch.optim.SGD
        Optimizer bound to the parameters of ANN.
    """

    # Architecture
    ANN = nn.Sequential(
             nn.Linear(n_features,n_hidden),
             nn.ReLU(),
             nn.Linear(n_hidden,n_hidden),
             nn.ReLU(),
             nn.Linear(n_hidden,n_classes))

    # Loss function
    loss_fun = nn.CrossEntropyLoss()

    # Optimizer
    optimizer = torch.optim.SGD(ANN.parameters(),lr=lr)

    return ANN,loss_fun,optimizer


In [None]:
# %% Exercise 3
#    In the course section "More on data," you will learn that unbalanced designs can be problematic for DL models (an
#    unbalanced design means that there is an uneven distribution of samples in different categories). Does the
#    modification in #2 produce an unbalanced design? To find out, count the number of data labels that are 0 (setosa) or
#    1 (not setosa).

# Yes. The previous model was designed to classify 3 labels and the data were divided into 3 balanced
# categories; merging 2 of those categories together produces an unbalanced design

# Print the shape of the label subset for each class code (0 = setosa, 1 = not setosa)
for class_code in (0,1):
    print(labels[labels==class_code].shape)
