## Multilayer Perceptron (MLP) based on Pytorch (1)

Multi-class classification problem - using an MLP with a configurable number of hidden neurons - with a configurable number of classes (up to 10). The notebook selects the chosen classes from the (Fashion-)MNIST dataset, splits the data into a train and a test part, normalises it and then trains a classifier using softmax.

Both datasets consist of images with 28x28 = 784 pixels each. The features are the pixel values of the images.

You can choose MNIST or Fashion-MNIST data in cell [2]

We use the PyTorch nn-library, which provides all required layer types and in particular the Sequential container used to set up an MLP; see [torch.nn](https://pytorch.org/docs/stable/nn.html).

In [None]:
import torch
import torchvision
import numpy as np
import matplotlib.pyplot as plt
import time

from utils import read_data, plot_img, plot_tiles, plot_error, plot_cost, plot_parameter_hist

In [None]:
#the data is read from ../week1/data (see storage_path below)
#the first argument selects the dataset: MNIST or fashionMNIST
x, y, labels_map = read_data(
    'fashionMNIST',
    storage_path='../week1/data',
)

In [None]:
#show the first image of the loaded dataset
plot_img(x[0])

In [None]:
#overview plot: rows x cols image tiles are appended to one picture
rows, cols = 8, 18
#the figure size can be adjusted here
fig_size = [8, 8]

plot_tiles(x, rows, cols, fig_size)

In [None]:
#class to inspect, chosen from 0..9
digit = 0

#tile only the images carrying the chosen label, then print its name
plot_tiles(
    x[y == digit],
    rows,
    cols,
    fig_size,
)
print(labels_map[digit])

In [None]:
#select the classes for your training and test set, choose the train/test split and do the normalisation
def prepare_data(classes, train_size=0.8, min_max_normalise=1, flatten=0):
    """
    prepare the data for training

    Uses the notebook-level variables x (images), y (labels) and labels_map
    (label index -> name); they must exist before this function is called.

    Arguments:
    classes -- classes to use for training, as 1-D tensor or list of ints (at least two classes must be given);
               the i-th entry of classes is mapped to the new label i, independent of the order given
    train_size -- fraction of the selected samples used for the train set (the rest is the test set)
    min_max_normalise -- whether to do min-max-normalisation to [-1, 1] (1) or rescaling by the maximum (0)
    flatten -- whether to flatten each image to a single row (=1); otherwise the image shape is kept

    Returns:
    x_train, x_test, y_train, y_test -- randomly split, normalised features (float) and relabelled targets
    """
    #accept plain lists as well as tensors
    classes = torch.as_tensor(classes)

    if len(classes) < len(labels_map):
        for label in classes:
            print('labels chosen are: %r' % labels_map[label.item()])

    ind_sel = torch.isin(y, classes)

    #boolean indexing already returns a copy; features are converted to float
    x_sel = x[ind_sel].to(torch.float)
    y_orig = y[ind_sel]

    #replace the labels such that they are in successive order
    #the comparison is always done against the ORIGINAL labels; remapping in place
    #would merge classes whenever a new label equals a not yet processed old label
    y_sel = torch.zeros_like(y_orig)
    for new_label, old_label in enumerate(classes):
        y_sel[y_orig == old_label] = new_label

    #do train and test split (y is given back as simple vector)
    num_samples = x_sel.shape[0]
    max_train_ind = int(train_size*num_samples)
    indices = torch.randperm(num_samples)
    train_ind, test_ind = indices[:max_train_ind], indices[max_train_ind:]

    x_train, x_test = x_sel[train_ind], x_sel[test_ind]
    y_train, y_test = y_sel[train_ind], y_sel[test_ind]

    #perform normalisation; the statistics are taken from the train set only
    #and then applied to both train and test set
    xmax, xmin = torch.max(x_train), torch.min(x_train)

    if min_max_normalise:
        x_train = 2*(x_train - xmin) / (xmax - xmin) - 1
        x_test = 2*(x_test - xmin) / (xmax - xmin) - 1
    else:
        x_train = x_train / xmax
        x_test = x_test / xmax

    if flatten:
        x_train = x_train.reshape([x_train.shape[0], -1])
        x_test = x_test.reshape([x_test.shape[0], -1])

    return x_train, x_test, y_train, y_test

### Class MiniBatches

Splits the given dataset (`x: features` and `y: labels`) into individual batches of size `batch_size` (a value of `0` will return the full batch). The total number of batches available in an epoch is returned with method `number_of_batches()`. Each call to `next()` will return a new batch in the given format: `{'x_batch': x_batch, 'y_batch': y_batch}`

In [None]:
class MiniBatches:
    """
    obtains x- and y-data in the constructor and returns a sample of batch_size with each call to next()

    The samples are shuffled once in the constructor. Samples that do not fill a
    complete batch at the end of the permutation are not returned.
    """
    def __init__(self, x, y, batch_size):
        """
        constructor

        Arguments:
        x/y -- data (first dimension is the sample index, x needs at least two dimensions)
        batch_size -- size of batch (0 means one single batch); a value larger than the
                      number of samples is treated as one single batch as well

        Raises:
        ValueError -- if batch_size is negative
        """
        if batch_size and batch_size < 0:
            raise ValueError('batch_size must be >= 0, got %r' % batch_size)

        self.x = x
        self.y = y
        m = x.shape[0]
        #random order in which the samples are handed out
        self.indices = torch.randperm(m)
        self.n = x.shape[1]

        if not batch_size:
            self.batch_size = m
            self.mb = 1
        else:
            #clamp to the number of samples: otherwise the integer division below
            #gives zero batches and an epoch would silently do nothing
            self.batch_size = min(batch_size, m)
            self.mb = m // self.batch_size if m else 0

        #index of the batch returned by the following call to next()
        self.ib = 0

    def number_of_batches(self):
        """
        returns the number of batches available in one epoch
        """
        return self.mb

    def next(self):
        """
        returns the next batch as dictionary {'x_batch': x_batch, 'y_batch': y_batch}

        calling next() more often than number_of_batches() returns the left over
        samples (if any) and empty batches afterwards
        """
        first = self.ib * self.batch_size
        it = self.indices[first:first + self.batch_size]
        x_batch = self.x[it, :]
        y_batch = self.y[it]
        self.ib += 1

        return {'x_batch': x_batch, 'y_batch': y_batch}

### Class NeuralNetwork

This class constructs a Multilayer Perceptron with a configurable number of hidden layers. The cost function is the cross-entropy (CE). The method $propagate()$ returns the prediction $$ \hat{y}^{(i)}=h_\theta(\mathbf{x}^{(i)}) $$ on the input data (which can be an $n \times 784$ matrix of $n$ images) and $back\_propagate()$ determines the gradients of the cost function with respect to the parameters (weights and biases of all layers) $$ \nabla_{\mathbf{\theta}} J(\mathbf{\theta}) $$
The method $gradient\_descend()$ finally corrects the parameters of all layers with a step in the negative gradient direction, weighted with the learning rate $\alpha$.

In [None]:
class NeuralNetwork:
    """
    MLP class handling the layers and doing all propagation and back propagation steps
    all hidden layers are dense (with Sigmoid activation); the last layer is a dense layer
    giving the logits, the softmax is part of the cross entropy cost function
    """
    def __init__(self, list_num_neurons):
        """
        constructor

        Arguments:
        list_num_neurons -- list of layer sizes including in- and output layer
                            (two entries give a model without hidden layer)
        """
        num_hidden = len(list_num_neurons) - 2

        self.model = torch.nn.Sequential()
        #first construct dense layers
        for i0 in range(num_hidden):
            self.model.add_module('dense' + str(i0), torch.nn.Linear(list_num_neurons[i0], list_num_neurons[i0+1]))
            self.model.add_module('act' + str(i0), torch.nn.Sigmoid())

        #finally add softmax layer
        #we don't require activation function because it is included (for numerical reasons) in the cross
        #entropy cost below; alternative is logSoftmax together with NLLLoss cost function
        #the index is taken from num_hidden (not from the loop variable, which does not exist
        #if there is no hidden layer)
        self.model.add_module('dense' + str(num_hidden), torch.nn.Linear(list_num_neurons[-2], list_num_neurons[-1]))

        #define the cost function
        self.cost_fn = torch.nn.CrossEntropyLoss(reduction='mean')

        #used to save results; one row [train cost, train error, val cost, val error] per call to append_result
        self.result_data = torch.tensor([])

        #we keep a global step counter, thus that optimise can be called
        #several times with different settings
        self.epoch_counter = 0

    def propagate(self, x):
        """
        calculates the function estimation based on current parameters

        Arguments:
        x -- input data accepted by the first layer of the model

        Returns:
        output of the last dense layer (logits, no softmax applied)
        """
        return self.model(x)

    def back_propagate(self, cost):
        """
        calculates the gradients of the given cost with respect to all model parameters
        this function must be performed AFTER the corresponding propagate step

        Arguments:
        cost -- scalar cost as returned by cost_funct
        """
        #set gradient values to zero
        self.model.zero_grad()

        cost.backward()

    def cost_funct(self, y_pred, y):
        """
        calculates the cross entropy cost function (mean over the samples)

        Arguments:
        y_pred -- output of propagate
        y -- expected class indices
        """
        return self.cost_fn(y_pred, y)

    def gradient_descend(self, alpha):
        """
        does the gradient descend based on results from last back_prop step with learning rate alpha
        """
        with torch.no_grad():
            for param in self.model.parameters():
                #parameters without gradient did not take part in the last backward pass
                if param.grad is not None:
                    param -= alpha * param.grad

    def calc_error(self, y_pred, y):
        """
        get error information

        Returns:
        fraction of samples for which the class with the largest output differs from y
        """
        m = y.shape[0]

        y_pred_argmax = torch.argmax(y_pred, dim=1)
        error = torch.sum(y != y_pred_argmax) / m

        return error

    def append_result(self):
        """
        append cost and error data to output array

        Returns:
        tensor of shape (1, 4) with [train cost, train error, validation cost, validation error]
        """
        #pure evaluation: no gradient information is required here
        with torch.no_grad():
            # determine cost and error functions for train and validation data
            y_pred_train = self.propagate(self.data['x_train'])
            y_pred_val = self.propagate(self.data['x_val'])

            res_data = torch.tensor([[self.cost_funct(y_pred_train, self.data['y_train']).item(),
                                      self.calc_error(y_pred_train, self.data['y_train']).item(),
                                      self.cost_funct(y_pred_val, self.data['y_val']).item(),
                                      self.calc_error(y_pred_val, self.data['y_val']).item()]])

        self.result_data = torch.cat((self.result_data, res_data), 0)

        #increase epoch counter here (used for plot routines below)
        self.epoch_counter += 1

        return res_data

    def optimise(self, data, epochs, alpha, batch_size=0, debug=0):
        """
        performs epochs number of gradient descend steps and appends result to output array

        Arguments:
        data -- dictionary with NORMALISED data (keys 'x_train', 'y_train', 'x_val', 'y_val')
        epochs -- number of epochs
        alpha -- learning rate
        batch_size -- size of batches (1 = SGD, 0 = batch, 1 < .. < n = mini-batch)
        debug -- integer value; get info on gradient descend step every debug-step (0 -> no output)
        """
        #access to data from other methods
        self.data = data

        #stays None if this call neither saves the initial state nor runs an epoch
        res_data = None

        # save results before 1st step
        if self.epoch_counter == 0:
            res_data = self.append_result()

        for i0 in range(epochs):
            #measure time for one epoch
            start = time.time()
            # create batches for each epoch
            batches = MiniBatches(self.data['x_train'], self.data['y_train'], batch_size)
            #set model to training mode
            self.model.train()
            for _ in range(batches.number_of_batches()):
                batch = batches.next()
                #do prediction
                y_pred = self.propagate(batch['x_batch'])
                #determine the loss
                cost = self.cost_funct(y_pred, batch['y_batch'])
                #determine the gradients
                self.back_propagate(cost)
                #do the correction step
                self.gradient_descend(alpha)

            #save result
            self.model.eval()
            res_data = self.append_result()

            #end of time measurement
            end = time.time()

            if debug and i0 % debug == 0:
                print('result after %d epochs (dt=%1.2f s), train: cost %.5f, error %.5f ; validation: cost %.5f, error %.5f'
                      % (self.epoch_counter-1, end-start, res_data[0, 0].item(), res_data[0, 1].item(),
                         res_data[0, 2].item(), res_data[0, 3].item()))

        if debug:
            if res_data is None:
                #nothing was computed in this call (epochs == 0 on a trained model):
                #report the most recent saved result
                res_data = self.result_data[-1:]
            print('result after %d epochs, train: cost %.5f, error %.5f ; validation: cost %.5f, error %.5f'
                  % (self.epoch_counter-1, res_data[0, 0].item(), res_data[0, 1].item(),
                     res_data[0, 2].item(), res_data[0, 3].item()))
                        
            

### Sample execution of Neural Network

The cell below shows how to use the class NeuralNetwork and how to perform the optimisation. The training and validation data are given as a dictionary in the call to the method $optimise()$. The classes (from 2 to 10) can be chosen via the `classes` list. This method can be called several times in a row with different arguments.

In [None]:
#classes taking part in training and testing
classes = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

x_train, x_test, y_train, y_test = prepare_data(
    classes,
    train_size=0.8,
    min_max_normalise=1,
    flatten=1,
)

#the last part of the train set is held back as validation data
validation_size = 0.2
valid_ind = int(x_train.shape[0]*(1-validation_size))

#dictionary in the layout expected by NeuralNetwork.optimise
data = {
    'x_train': x_train[:valid_ind],
    'y_train': y_train[:valid_ind],
    'x_val': x_train[valid_ind:],
    'y_val': y_train[valid_ind:],
}

#network layout: input size, one hidden layer with 100 neurons, one output per class
size_in = x_train.shape[1]
size_out = len(classes)
list_num_neurons = [size_in, 100, size_out]
NNet = NeuralNetwork(list_num_neurons)

#hyperparameters of the training
epochs = 40
batchsize = 16
learning_rate = 0.05
NNet.optimise(data, epochs=epochs, alpha=learning_rate, batch_size=batchsize, debug=5)


plot_error(NNet)
plot_cost(NNet)

plot_parameter_hist(NNet)

#predicted class = index of the largest network output; collect the misclassified test images
y_pred = NNet.propagate(x_test).argmax(dim=1)
false_classifications = x_test[y_pred.ne(y_test)]

print('test error rate: %.2f %% out of %d' % (100*false_classifications.shape[0]/y_pred.shape[0], y_pred.shape[0]))
print(false_classifications.shape)


In [None]:
#analyse false classified training or test images
y_pred = torch.argmax(NNet.propagate(x_test), axis=1)
misclassified = y_pred != y_test
false_classifications = x_test[misclassified]

print('test error rate: %.2f %% out of %d' % (100*false_classifications.shape[0]/y_pred.shape[0], y_pred.shape[0]))
print(false_classifications.shape)

#append rows x cols tiles of digits
rows = 7
cols = 8
#figure size can be set
fig_size = [8,8]

plot_tiles(false_classifications.reshape([-1,28,28]), rows, cols, fig_size)

#print the correct labels (for FashionMNIST)
#at most rows*cols labels are shown; if there are fewer misclassified images the
#remaining positions are filled with -1
#everything stays a torch tensor: padding via numpy would give an array without .to()
num_tiles = rows*cols
false_classifications_y = y_test[misclassified][:num_tiles].to(torch.int32)
num_missing = num_tiles - false_classifications_y.shape[0]
if num_missing > 0:
    padding = false_classifications_y.new_full((num_missing,), -1)
    false_classifications_y = torch.cat((false_classifications_y, padding))
print(false_classifications_y.reshape([cols,rows]).T)

In [None]:
#visualise weights of the first layer

#a torch.nn.Linear layer stores its weight as (out_features, in_features): each row is the
#weight vector of one neuron, so the number of weight vectors is shape[0]
print('we have %r weight vectors in layer [0]' % NNet.model[0].weight.shape[0])
print('choose a suitable combination of rows and cols below to plot them')

rows = 5
cols = 20
#figure size can be set
fig_size = [14,6]

#each weight vector (one value per input pixel) is shown as a 28x28 image
plot_tiles(NNet.model[0].weight.detach().reshape([-1,28,28]), rows, cols, fig_size)