In [None]:
# %% Deep learning - Section 9.79
#    The importance of equal batch sizes

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Generate some synthetic data

# Parameters: samples per cluster, angle sweep (0 to 4*pi, i.e. two full turns),
# and the radii of the inner (r1) and outer (r2) rings
n_clust = 200
th = np.linspace(0,4*np.pi,n_clust)
r1 = 10
r2 = 15

# Unit-circle coordinates shared by both clusters
cos_th = np.cos(th)
sin_th = np.sin(th)

# Data: each cluster is an [x,y] pair of noisy ring coordinates. Cluster "a"
# gets the larger noise (std 3) on x, cluster "b" gets it on y.
a = [ r1*cos_th + 3*np.random.randn(n_clust),
      r1*sin_th +   np.random.randn(n_clust) ]
b = [ r2*cos_th +   np.random.randn(n_clust),
      r2*sin_th + 3*np.random.randn(n_clust) ]

# Stack into an (observations x features) matrix, with a matching column of
# labels (0 for cluster "a", 1 for cluster "b")
data_np   = np.hstack((a,b)).T
labels_np = np.vstack(( np.zeros((n_clust,1)),np.ones((n_clust,1)) ))

# Convert to float32 PyTorch tensors
data   = torch.tensor(data_np,dtype=torch.float32)
labels = torch.tensor(labels_np,dtype=torch.float32)


In [None]:
# %% Plotting

# Row indices of the observations that belong to each cluster
rows_0 = np.where(labels==0)[0]
rows_1 = np.where(labels==1)[0]

fig,ax = plt.subplots(figsize=(6,6))

# Squares for label 0, circles for label 1
ax.plot(data[rows_0,0],data[rows_0,1],'s')
ax.plot(data[rows_1,0],data[rows_1,1],'o')
ax.set_title("Some nonlinear data clusters")
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')

# Write the figure to disk before displaying it
fig.savefig('figure39_batching_regularisation_batch_size.png')

plt.show()

# Hand the saved file to the Colab download helper
files.download('figure39_batching_regularisation_batch_size.png')


In [None]:
# %% Split into train and test data

# Split with scikitlearn, holding out 10% of the observations for testing
X_train,X_test,train_labels,test_labels = train_test_split(data,labels,test_size=0.1)

# Convert into PyTorch datasets (features paired with their labels)
train_data = TensorDataset(X_train,train_labels)
test_data  = TensorDataset(X_test,test_labels)

# Batch sizes: fixed for training; for testing, the number of test samples
# minus 20, so the test set is not delivered as one single batch
batch_size_tr = 16
batch_size_tt = len(test_data)-20

# Convert into DataLoader objects; training batches are reshuffled every epoch
# and an incomplete final training batch is dropped
train_loader  = DataLoader(train_data,batch_size=batch_size_tr,shuffle=True,drop_last=True)
test_loader   = DataLoader(test_data,batch_size=batch_size_tt)


In [None]:
# %% Check sizes of data batches

# Print the shapes of the feature and label tensors of every test batch
for batch in test_loader:
    X,y = batch
    print(X.shape,y.shape)


In [None]:
# %% Model class

class model_class(nn.Module):
    """Fully connected network with layer sizes 2 -> 128 -> 128 -> 1.

    The input and hidden layers are followed by a ReLU; the output layer is
    linear, so the model returns raw values (no sigmoid is applied here).
    """

    def __init__(self):
        super().__init__()

        # Layers (created in input -> hidden -> output order)
        n_in,n_units,n_out = 2,128,1
        self.input  = nn.Linear(n_in,n_units)
        self.hidden = nn.Linear(n_units,n_units)
        self.output = nn.Linear(n_units,n_out)

    # Forward propagation
    def forward(self,x):
        """Pass `x` through both ReLU layers and return the output-layer values."""

        for layer in (self.input,self.hidden):
            x = F.relu(layer(x))

        return self.output(x)


In [None]:
# %% Function to generate the model

def gen_model():
    """Create a fresh model together with its loss function and optimizer.

    Returns
    -------
    tuple
        `(ANN, loss_fun, optimizer)`: a new `model_class` instance, an
        `nn.BCEWithLogitsLoss` instance, and a `torch.optim.SGD` optimizer
        over the model's parameters with a learning rate of 0.01.
    """

    # Generate instance of model class
    net = model_class()

    # Loss function and optimizer, returned alongside the model
    return net,nn.BCEWithLogitsLoss(),torch.optim.SGD(net.parameters(),lr=0.01)


In [None]:
# %% Function to train the model

# Parameters
num_epochs = 500

def train_model(ANN,loss_fun,optimizer):
    """Train `ANN` and record the train and test accuracy after every epoch.

    The function reads the module-level `num_epochs`, `train_loader` and
    `test_loader`, so those must be defined before it is called.

    Parameters
    ----------
    ANN : callable model
        Maps a batch of inputs to raw outputs; a value > 0 counts as a
        prediction of class 1.
    loss_fun : callable
        Called as `loss_fun(yHat, y)`; must return a tensor supporting
        `.backward()`.
    optimizer : optimizer
        Provides `zero_grad()` and `step()` for the parameters of `ANN`.

    Returns
    -------
    train_acc : list of float
        Per-epoch mean of the training-batch accuracies (in %).
    test_acc : list of float
        Per-epoch mean of the test-batch accuracies (in %).
    """

    # Initialise accuracies
    train_acc = []
    test_acc  = []

    # Loop over epochs
    for epoch_i in range(num_epochs):

        # Loop over data batches
        batch_acc = []
        for X,y in train_loader:

            # Forward propagation, loss, and backpropagation
            yHat = ANN(X)
            loss = loss_fun(yHat,y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Batch accuracy (computed from the pre-update forward pass)
            batch_acc.append( 100*torch.mean(((yHat>0)==y).float()).item() )

        # Average batch accuracy
        train_acc.append(np.mean(batch_acc))

        # Test accuracy (but averaging over 2 batches only!)
        # The mean over batches is unweighted, so it equals the accuracy over
        # all test samples only when every test batch has the same size.
        # Evaluation needs no gradients, so the forward passes run under
        # no_grad to avoid building an autograd graph.
        tmp_acc = []
        with torch.no_grad():
            for X,y in test_loader:
                yHat = ANN(X)
                tmp_acc.append( 100*torch.mean(((yHat>0)==y).float()).item() )

        test_acc.append(np.mean(tmp_acc))

    # Output
    return train_acc,test_acc


In [None]:
# %% Test model

# Build a fresh model / loss function / optimizer triplet, then train it and
# keep the per-epoch accuracies for plotting
ANN,loss_fun,optimizer = gen_model()
train_acc,test_acc = train_model(ANN=ANN,loss_fun=loss_fun,optimizer=optimizer)


In [None]:
# %% Plotting

fig,ax = plt.subplots(figsize=(10,5))

# Per-epoch accuracies: squares for train, circles for test
ax.plot(train_acc,'s')
ax.plot(test_acc,'o')
ax.set(xlabel='Epochs',
       ylabel='Accuracy (%)',
       title='Train and test accuracy - (un)balanced test batches')
ax.legend(['Train','Test'])

# Write the figure to disk before displaying it
fig.savefig('figure40_batching_regularisation_batch_size.png')

plt.show()

# Hand the saved file to the Colab download helper
files.download('figure40_batching_regularisation_batch_size.png')
