In [None]:
# %% Deep learning - Section 10.87
#    Batch normalisation in practice

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F
import pandas              as pd
import scipy.stats         as stats
import time

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Load and prepare data

# Load the semicolon-separated red-wine quality table
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
data = pd.read_csv(url,sep=';')

# Remove some outliers (see lec. 82 for why). The .copy() gives an independent
# frame, so the column assignments below do not write into a slice of the
# original table (this is what triggers pandas' SettingWithCopyWarning)
data = data[data['total sulfur dioxide']<200].copy()

# Z-score all the variables but quality
cols2zscore = data.keys()
cols2zscore = cols2zscore.drop('quality')

for col in cols2zscore:
    mean_val  = np.mean(data[col])
    std_val   = np.std(data[col]) # np.std defaults to ddof=0 (population std)
    data[col] = (data[col] - mean_val) / std_val

# Binarise quality: 1 when quality > 5, 0 otherwise
data['boolean_quality'] = (data['quality']>5).astype(int)

# Convert from pandas dataframe to PyTorch tensor
data_t = torch.tensor( data[cols2zscore].values ).float()
labels = torch.tensor( data['boolean_quality'].values ).float()

print(f'Data shape: {data_t.shape}')
print(f'Labels shape: {labels.shape}')

# Labels stay float (the training cell uses BCEWithLogitsLoss) and get a trailing
# singleton dimension so they are a 2D column, like the model's single-unit output
labels = labels[:,None]
print(f'Proper labels shape: {labels.shape}')


In [None]:
# %% Split into train and test data

# Mini-batch size used for training
batch_size = 32

# Hold out 10% of the samples for testing (scikit-learn split)
train_data,test_data,train_labels,test_labels = train_test_split(
    data_t,
    labels,
    test_size=0.1,
)

# Pair features with labels as PyTorch datasets
train_data = TensorDataset(train_data,train_labels)
test_data  = TensorDataset(test_data,test_labels)

# Training batches are shuffled and the last incomplete batch is dropped;
# the test loader serves the whole test set as a single batch
train_loader = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)
test_loader = DataLoader(dataset=test_data,batch_size=len(test_data))


In [None]:
# %% Model class

class ANN_with_BN(nn.Module):
    """Fully connected network (11 -> 16 -> 32 -> 20 -> 1) with switchable batchnorm.

    When batchnorm is enabled it is applied to the input of each hidden layer,
    i.e. before the weighted sum and its ReLU. The network returns the output of
    the final linear layer as is (no sigmoid is applied here).
    """

    def __init__(self):
        super().__init__()

        # Architecture and batch normalisation
        self.input  = nn.Linear(11,16)

        self.hid1   = nn.Linear(16,32)
        self.bn1    = nn.BatchNorm1d(16) # sized for the 16 features entering hid1
        self.hid2   = nn.Linear(32,20)
        self.bn2    = nn.BatchNorm1d(32) # sized for the 32 features entering hid2

        self.output = nn.Linear(20,1)

    def forward(self,x,doBN=True):
        """Forward propagation.

        Parameters
        ----------
        x : tensor with 11 features in its last dimension
        doBN : bool, apply the batchnorm layers when True (default)

        Returns
        -------
        Tensor with one output value per sample.
        """

        # Input (already normalised if data normalised)
        x = F.relu( self.input(x) )

        # Hidden layers: (optional batchnorm,) weighted sum, activation function.
        # Both settings share this single path so that batchnorm is the ONLY
        # difference between them; the previous no-batchnorm branch applied ReLU
        # before each hidden layer and left hid2 -> output without a nonlinearity.
        if doBN:
            x = self.bn1(x)
        x = F.relu( self.hid1(x) )

        if doBN:
            x = self.bn2(x)
        x = F.relu( self.hid2(x) )

        # Output
        return self.output(x)


In [None]:
# %% Function to train the model

# Parameters
num_epochs = 1000

def train_model(doBN=True,model=None,epochs=None):
    """Train a network with mini-batch SGD and track its performance per epoch.

    Batches come from the notebook-level `train_loader`; after every epoch the
    model is evaluated on the first batch of `test_loader`.

    Parameters
    ----------
    doBN : bool
        Forwarded to the model's forward pass to switch batchnorm on or off.
    model : nn.Module or None
        Network to train. When None, the notebook-level `ANN` is used (the
        original behaviour); passing it explicitly avoids training whatever
        happens to be bound to that global name.
    epochs : int or None
        Number of epochs. When None, the notebook-level `num_epochs` is used.

    Returns
    -------
    train_acc : list of per-epoch mean training accuracy (%), averaged over batches
    test_acc  : list of per-epoch test accuracy (%)
    losses    : list of per-epoch mean training loss, averaged over batches
    """

    # Fall back to the notebook globals only when no explicit value is given
    net      = ANN if model is None else model
    n_epochs = num_epochs if epochs is None else epochs

    # Loss function and optimizer
    loss_fun  = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.SGD(net.parameters(),lr=0.01)

    # Initialise losses
    train_acc = []
    test_acc  = []
    losses    = []

    # Loop over epochs
    for epoch_i in range(n_epochs):

        # Switch training mode on (batchnorm then uses batch statistics)
        net.train()

        batch_acc  = []
        batch_loss = []

        for X,y in train_loader:

            # Forward propagation and loss (with batchnorm arg)
            yHat = net(X,doBN)
            loss = loss_fun(yHat,y)

            # Only now do backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Batch training accuracy; the outputs are logits, so > 0 means class 1
            batch_acc.append( 100*torch.mean(((yHat>0) == y).float()).item() )
            batch_loss.append(loss.item())

        # Average accuracy from batch
        train_acc.append(np.mean(batch_acc))
        losses.append(np.mean(batch_loss))

        # Test accuracy (eval mode: batchnorm uses its running statistics)
        net.eval()
        X,y = next(iter(test_loader))
        with torch.no_grad():
            yHat = net(X,doBN)
        test_acc.append( 100*torch.mean(((yHat>0) == y).float()).item() )

    # Function output
    return train_acc,test_acc,losses


In [None]:
# %% Test the model

# Train one fresh network with batchnorm and one without (takes ~3 mins).
# train_model reads the network from the global name ANN, so it is rebound to a
# new instance before each run.
runs = {}
for use_bn in (True,False):
    ANN          = ANN_with_BN()
    runs[use_bn] = train_model(doBN=use_bn)

train_acc_BN,test_acc_BN,losses_BN       = runs[True]
train_acc_noBN,test_acc_noBN,losses_noBN = runs[False]


In [None]:
# %% Functions for 1D smoothing filter

# Improved for edge effects - adaptive window
def smooth_adaptive(x,k):
    """Centred moving average whose window shrinks near the edges.

    Each output sample is the mean of the input samples within k//2 positions on
    either side, clipped to the array bounds. The full window is therefore
    2*(k//2)+1 samples wide (an even k behaves like k+1).

    Parameters
    ----------
    x : sequence or array of numbers
    k : int, nominal window length (non-negative)

    Returns
    -------
    numpy array with the same length as x, always of a floating (or complex) dtype.

    Raises
    ------
    ValueError if k is negative (the windows would be empty).
    """
    if k < 0:
        raise ValueError(f'k must be non-negative, got {k}')

    # np.zeros_like would inherit an integer dtype from integer input and
    # silently truncate every window mean, so promote such input to float first
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.inexact):
        x = x.astype(float)

    smoothed = np.empty_like(x)
    half_k   = k // 2

    for i in range(len(x)):
        start       = max(0, i-half_k)
        end         = min(len(x), i+half_k + 1)
        smoothed[i] = np.mean(x[start:end])

    return smoothed


In [None]:
# %% Plotting

phi = ( 1 + np.sqrt(5) ) / 2
fig,ax = plt.subplots(1,3,figsize=(17,5))

# One panel per metric: (title, curve with batchnorm, curve without batchnorm)
panels = [
    ('Losses',         losses_BN,    losses_noBN),
    ('Train accuracy', train_acc_BN, train_acc_noBN),
    ('Test accuracy',  test_acc_BN,  test_acc_noBN),
]

# Every curve is smoothed with the same 20-sample adaptive window before plotting
for axis,(title,with_bn,without_bn) in zip(ax,panels):
    axis.plot(smooth_adaptive(with_bn,20),label='With batchnorm')
    axis.plot(smooth_adaptive(without_bn,20),label='Without batchnorm')
    axis.set_title(title)
    axis.legend()

# Save before showing, then hand the file to the browser via the Colab helper
plt.savefig('figure18_batch_normalisation.png')

plt.show()

files.download('figure18_batch_normalisation.png')


In [None]:
# %% Exercise 1
#    BatchNorm1d takes several additional inputs, including one called "momentum." You will learn what this parameter
#    means later in this section, but briefly: it is a smoothing parameter that helps stabilize and improve learning
#    weights. The default value is .1. What happens when you change it to .001? How about setting it to zero?

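# Momentum sets how quickly the running mean/variance (the statistics batchnorm
# uses in eval mode) follow the statistics of each training batch. With a
# momentum of 0 they are never updated from their initial values, so the
# normalisation applied at test time is off and the test accuracy collapses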

# %% Modified model class

class ANN_with_BN(nn.Module):
    """Exercise 1 variant: same network, with a configurable batchnorm momentum.

    Layout is 11 -> 16 -> 32 -> 20 -> 1. When batchnorm is enabled it is applied
    to the input of each hidden layer, before the weighted sum and its ReLU.

    Parameters
    ----------
    momentum : float
        Passed to both BatchNorm1d layers. Defaults to 0.001, the value this
        exercise uses; pass another value (e.g. 0) to try the other settings
        without editing the class.
    """

    def __init__(self,momentum=0.001):
        super().__init__()

        # Architecture and batch normalisation
        self.input  = nn.Linear(11,16)

        self.hid1   = nn.Linear(16,32)
        self.bn1    = nn.BatchNorm1d(16,momentum=momentum)
        self.hid2   = nn.Linear(32,20)
        self.bn2    = nn.BatchNorm1d(32,momentum=momentum)

        self.output = nn.Linear(20,1)

    def forward(self,x,doBN=True):
        """Forward propagation; `doBN` switches the batchnorm layers on or off.

        Returns the output of the final linear layer (one value per sample,
        no sigmoid applied here).
        """

        # Input (already normalised if data normalised)
        x = F.relu( self.input(x) )

        # Hidden layers: (optional batchnorm,) weighted sum, activation function.
        # Both settings share this single path so that batchnorm is the ONLY
        # difference between them; the previous no-batchnorm branch applied ReLU
        # before each hidden layer and left hid2 -> output without a nonlinearity.
        if doBN:
            x = self.bn1(x)
        x = F.relu( self.hid1(x) )

        if doBN:
            x = self.bn2(x)
        x = F.relu( self.hid2(x) )

        # Output
        return self.output(x)


In [None]:
# %% Exercise 2
#    In the lecture, I said to apply batchnorm to the data *before* applying the nonlinearity (here, relu). This is also
#    implemented in the code above. However, this is discussed in the field and online, and not everyone agrees. Modify
#    the code to apply batch normalization *after* applying relu. Does that make a big difference? Thinking about the
#    math, which order makes more sense to you? (Don't worry, it's OK to disagree with me!)

# No substantial differences in the performance of the model

# %% Modified model class

class ANN_with_BN(nn.Module):
    """Exercise 2 variant: batchnorm applied AFTER each hidden layer's ReLU.

    Layout is 11 -> 16 -> 32 -> 20 -> 1. Because normalisation follows the
    activation, the batchnorm layers are sized for the hidden layers' outputs
    (32 and 20 features) rather than their inputs.
    """

    def __init__(self):
        super().__init__()

        # Architecture and batch normalisation
        self.input  = nn.Linear(11,16)

        self.hid1   = nn.Linear(16,32)
        self.bn1    = nn.BatchNorm1d(32) # normalises the 32 activations leaving hid1
        self.hid2   = nn.Linear(32,20)
        self.bn2    = nn.BatchNorm1d(20) # normalises the 20 activations leaving hid2

        self.output = nn.Linear(20,1)

    def forward(self,x,doBN=True):
        """Forward propagation; `doBN` switches the batchnorm layers on or off.

        Returns the output of the final linear layer (one value per sample,
        no sigmoid applied here).
        """

        # Input (already normalised if data normalised)
        x = F.relu( self.input(x) )

        # Hidden layers: weighted sum, activation function (, optional batchnorm).
        # Both settings share this single path so that batchnorm is the ONLY
        # difference between them; the previous no-batchnorm branch applied ReLU
        # before each hidden layer and left hid2 -> output without a nonlinearity.
        x = F.relu( self.hid1(x) )
        if doBN:
            x = self.bn1(x)

        x = F.relu( self.hid2(x) )
        if doBN:
            x = self.bn2(x)

        # Output
        return self.output(x)
