### Neural network training code

This code contains the Network class, which handles defining and training the neural networks we use. In particular, this class contains routines for feed-forward, back-propagation, stochastic gradient descent, network evaluation and more. For this basic version of the neural network code I mostly followed the online tutorial "Neural Networks and Deep Learning" by Michael A. Nielsen, which was invaluable for understanding the algorithms behind stochastic gradient descent and backpropagation, as well as how to implement them in Python code.

In [1]:
#Importing standard libraries
import random
import numpy as np

#Miscellaneous functions mostly used in the Network class.
#Sigmoid activation function.
def sigmoid(z):
    """Return the logistic sigmoid 1/(1+exp(-z)) of z.

    z may be a scalar or a numpy array; for arrays the function is applied
    element-wise (np.exp is element-wise), so the result has the same shape.
    """
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator

#Derivative of the sigmoid function; the chain rule shows that it can be expressed as follows in terms of itself.
def sigmoid_prime(z):
    """Return the derivative of the sigmoid function evaluated at z.

    Uses the identity sigmoid'(z) = sigmoid(z)*(1 - sigmoid(z)), so the
    derivative is expressed through the sigmoid value itself.
    """
    s = sigmoid(z)  #Evaluated once and reused for both factors.
    return s*(1-s)
    
#Function that takes an integer j and returns a 10-entry vector with a 1 in the j^th position and 0 everywhere else.
def vectorized_result(j, num_classes=10):
    """Return a one-hot column vector with a 1.0 in position j.

    Parameters:
        j: index of the entry to set to 1.0 (e.g. a digit label).
        num_classes: length of the vector; defaults to 10 so existing calls
            of the form vectorized_result(j) behave exactly as before.

    Returns:
        A numpy array of shape (num_classes, 1) that is zero everywhere
        except at row j.

    Raises:
        IndexError: if j is out of range for a vector of length num_classes.
    """
    e = np.zeros((num_classes, 1))
    e[j] = 1.0
    return e

In [2]:
#Libraries for importing data sets.
import pickle  
import gzip

#Code for loading the MNIST data into the python runtime. 
def load_data(path='mnist.pkl.gz'):
    """Load the training and test splits from a gzipped pickle file.

    The file is expected to unpickle to a 3-tuple
    (training_data, validation_data, test_data). The validation split is
    read but deliberately discarded.

    Parameters:
        path: location of the gzipped pickle; defaults to 'mnist.pkl.gz' in
            the current working directory, as in the original code.

    Returns:
        The tuple (training_data, test_data).

    Raises:
        OSError: if the file cannot be opened or is not valid gzip data.
        ValueError: if the unpickled object is not a 3-element sequence.
    """
    #NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
    #The `with` block guarantees the file is closed even if unpickling fails.
    with gzip.open(path, 'rb') as f:
        #encoding='bytes' is kept from the original so Python-2-era pickles can be read.
        training_data, _validation_data, test_data = pickle.load(f, encoding='bytes')
    return (training_data, test_data) #The validation data is not used, so only training and test data are returned.

#Code for organising the training and test data so that its ready for training use.
def load_data_wrapper():
    """Return (training_data, test_data) arranged as lists of (input, label) pairs.

    Every input is reshaped to a (784, 1) column vector. The two lists differ
    only in how the label is stored:
      * training_data pairs carry the label as a one-hot vector built by
        vectorized_result, which is the form the cost derivative needs;
      * test_data pairs carry the label unchanged, exactly as stored in the
        data set, which is the form used when comparing against argmax.
    """
    #Fetching the raw (inputs, labels) pairs for both splits via load_data().
    raw_train, raw_test = load_data()

    #Training split: column-vector inputs paired with vectorised labels.
    train_vectors = [np.reshape(image, (784, 1)) for image in raw_train[0]]
    train_targets = [vectorized_result(label) for label in raw_train[1]]
    training_data = list(zip(train_vectors, train_targets))

    #Test split: column-vector inputs paired with the labels as stored.
    test_vectors = [np.reshape(image, (784, 1)) for image in raw_test[0]]
    test_data = list(zip(test_vectors, raw_test[1]))

    return (training_data, test_data)

In [7]:
#Network class: this is the heart of the code as it contains all functions used in defining and training the neural networks.
#This class contains routines such as feed-forward, back-propagation, stochastic gradient descent, network evaluation and more.
#The entries of `layers' denote the number of nodes you want in each layer.
class Network(object):
    """Fully connected feed-forward network with sigmoid activations, trained by mini-batch SGD.

    Attributes:
        num_layers: number of layers, including the input layer.
        layers: the `layers` argument as passed to the constructor.
        weights: list of num_layers-1 weight matrices; weights[i] maps the
            activations of layer i to the weighted inputs of layer i+1.
        biases: list of num_layers-1 bias column vectors; biases[i] belongs to
            layer i+1 (the input layer has no biases).
    """

    def __init__(self, layers):
        """Create a network with randomly initialised weights and biases.

        Parameters:
            layers: sequence whose entries give the number of nodes in each
                layer. An entry may also be a two-element list, in which case
                the product of its two elements is used as the size of that
                layer when it is the layer being fed into.
                NOTE(review): a list entry is only handled when sizing the
                receiving layer; layers[i] is used unchanged as the fan-in,
                so a list entry anywhere but the last position appears to
                fail inside np.random.randn - confirm before relying on it.
        """
        #Initialising some useful parameters:
        self.num_layers = len(layers) #Number of layers.
        self.layers = layers #Array storing the number of nodes per layer.
        #Initialising storage for our weights and biases.
        self.weights = [None for layer in layers[:-1]]
        self.biases = [None for layer in layers[1:]]

        #Looping over each pair of adjacent layers to initialise the parameters between them.
        for i in range(self.num_layers-1):
            if isinstance(layers[i+1], list):
                #A two-element list entry: the layer size is the product of its entries.
                next_layer_size = layers[i+1][0]*layers[i+1][1]
            else:
                #Otherwise the size is simply prescribed by the value of layers[i+1].
                next_layer_size = layers[i+1]
            #Biases are drawn before weights so that seeded runs reproduce the same parameters as before.
            self.biases[i] = np.array(np.random.randn(next_layer_size, 1), dtype=np.float32)
            #Weights are scaled by 1/sqrt(fan-in) so the initial weighted inputs do not grow with layer width.
            self.weights[i] = np.array(np.random.randn(next_layer_size, layers[i])/np.sqrt(layers[i]), dtype=np.float32)

    def feedforward(self, a):
        """Return the output activations of the network for the input activations `a`.

        `a` must be compatible with the matrix product weights[0] @ a, e.g. a
        column vector with one row per input node.
        """
        #biases and weights both have num_layers-1 entries: one pair per layer after the input layer.
        for b, w in zip(self.biases, self.weights):
            #Weighted input of the layer followed by the sigmoid activation function.
            a = sigmoid(w@a+b)
        return a

    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data = None):
        """Train the network with mini-batch stochastic gradient descent.

        Parameters:
            training_data: iterable of (x, y) pairs accepted by backprop.
            epochs: number of full passes over the training data.
            mini_batch_size: number of examples per gradient step.
            eta: learning rate.
            test_data: optional sequence of (x, label) pairs. When provided
                (and non-empty) the network is evaluated on it before
                training and after every epoch.

        Returns:
            If test_data is provided, a list of epochs+1 accuracies in percent
            (index 0 is the untrained network); otherwise None.
        """
        #Shuffling a private copy so that the caller's list is not reordered as a side effect.
        #This also allows any iterable (e.g. a zip object) to be passed in.
        training_data = list(training_data)
        n = len(training_data)

        #Recording the size of the test data and performing an initial evaluation on the untrained network.
        if test_data:
            n_test = len(test_data)
            accuracies = [100*self.evaluate(test_data)/n_test]

        #Looping over each epoch (1 epoch = 1 full run through the training data).
        for j in range(epochs):
            #Creating the mini-batches from a fresh random ordering.
            random.shuffle(training_data)
            mini_batches = [training_data[k:k+mini_batch_size] for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                #Performing a single gradient descent step for each mini-batch.
                self.mini_batch_gradient_descent(mini_batch, eta, n)
            #If test data is provided the network evaluates its performance after every epoch.
            if test_data:
                accuracies.append(100*self.evaluate(test_data)/n_test)
        if test_data: return accuracies

    def mini_batch_gradient_descent(self, mini_batch, eta, n):
        """Apply one gradient descent step using the average gradient over `mini_batch`.

        Parameters:
            mini_batch: sequence of (x, y) training pairs.
            eta: learning rate.
            n: total number of training examples. Currently unused; kept so
                that existing callers keep working.

        The gradients of all examples are computed with the same (pre-update)
        parameters and summed; the weights and biases are then updated exactly
        once by eta/len(mini_batch) times that sum.
        """
        #An empty mini-batch carries no gradient information, so there is nothing to update.
        if not mini_batch:
            return
        #Initialising storage for the gradients of our cost function C wrt the network's weights and biases.
        grad_b = [np.zeros(b.shape) for b in self.biases]
        grad_w = [np.zeros(w.shape) for w in self.weights]
        #Accumulating the per-example gradients over the whole mini-batch.
        for x, y in mini_batch:
            delta_grad_b, delta_grad_w = self.backprop(x, y)
            grad_b = [gb+dgb for gb, dgb in zip(grad_b, delta_grad_b)]
            grad_w = [gw+dgw for gw, dgw in zip(grad_w, delta_grad_w)]
        #Update step for our learnable parameters. This must happen after the loop: updating inside
        #it would apply partial gradient sums repeatedly and change the weights mid-batch.
        grad_coeff = eta/len(mini_batch)
        self.weights = [w-grad_coeff*gw for w, gw in zip(self.weights, grad_w)]
        self.biases = [b-grad_coeff*gb for b, gb in zip(self.biases, grad_b)]
        return

    def backprop(self, x, y):
        """Return the gradient of the cost for a single training pair (x, y).

        Parameters:
            x: input activations for the first layer.
            y: desired output activations, in the same shape as the network output.

        Returns:
            A tuple (grad_b, grad_w) of lists laid out like self.biases and
            self.weights, holding the partial derivatives of the cost with
            respect to each bias vector and weight matrix.
        """
        #Feed-forward part: store every layer's weighted input z^l and activation a^l for the backward pass.
        activation = x #The inputs act as the first layer's activation vector.
        activations = [activation]
        zs = []
        for b, w in zip(self.biases, self.weights):
            z = w@activation+b
            activation = sigmoid(z)
            activations.append(activation)
            zs.append(z)

        #Every slot below is assigned in the backward pass, so placeholders suffice (no zero arrays needed).
        grad_b = [None]*len(self.biases)
        grad_w = [None]*len(self.weights)

        #Backward part: first the error vector delta^L of the final layer.
        delta = self.cost_derivative(activations[-1], y)*sigmoid_prime(zs[-1])
        #Then walk backwards one layer at a time; l counts layers from the end (l=1 is the output layer).
        for l in range(1, self.num_layers):
            #Using the current layer's delta to obtain its bias and weight gradients.
            grad_b[-l] = delta
            grad_w[-l] = delta@activations[-l-1].T   #Outer product of delta with the previous layer's activations.
            #Propagating delta to the previous layer, unless that layer is the input layer.
            if l != self.num_layers-1:
                delta = (self.weights[-l].T@delta)*sigmoid_prime(zs[-l-1])
        return (grad_b, grad_w)

    def evaluate(self, test_data):
        """Return how many (x, label) pairs in test_data the network classifies correctly.

        The predicted class is the index of the largest output activation.
        """
        test_results = [(np.argmax(self.feedforward(x)), y) for (x, y) in test_data]
        return sum(int(x == y) for (x, y) in test_results)

    def cost_derivative(self, output_activations, y):
        """Return the derivative of the L^2 (quadratic) cost wrt the output activations."""
        return (output_activations-y)

In [8]:
#Creating a function for creating plots of networks' accuracies during training.
import matplotlib.pyplot as plt
def plot_accuracies(accuracies, color, yrange, single_epoch = False):
    """Plot a network's test accuracy over the course of training.

    Parameters:
        accuracies: sequence of accuracies in percent.
        color: matplotlib colour used for the bars or the line.
        yrange: [ymin, ymax] limits for the y-axis.
        single_epoch: if true, draw a bar chart against the number of training
            examples seen, using the fixed x positions 0, 2000, ..., 50000;
            `accuracies` must then contain one value per position (26 values).
            Otherwise draw a line plot with one point per epoch, starting at
            epoch 0.
    """
    plt.figure(figsize=(8,5))
    if single_epoch:
        barwidth = 1500
        training_examples_seen = np.arange(0, 50001, 2000)
        plt.bar(training_examples_seen, accuracies, barwidth, color=color, edgecolor='black')
        plt.xlabel("Training examples seen", fontsize = 14)
        #Padding the x-limits by one bar width so the first and last bars are fully visible.
        plt.xlim([0-barwidth, 50000+barwidth])
    else:
        epochs = np.arange(0,len(accuracies))
        plt.plot(epochs, accuracies, color=color)
        plt.xlabel("Training epochs (i.e. runs over the training set)", fontsize = 14)
        plt.xlim([0, len(accuracies)-1])
        plt.grid()
    #Adding a black horizontal line to mark the 10% accuracy obtainable via random chance.
    #The line is drawn wider than either x-range; the x-limits set above crop it to the plot window.
    plt.plot([-5000, 55000], [10, 10], 'k', linewidth=1.5)
    plt.ylim(yrange)
    #Labelling axes and framing plot window.
    plt.ylabel("% accuracy on unseen test data", fontsize = 14)
    plt.show()
    
#Creating a function for creating a combined plot of three networks' accuracies.
def plot_multi_accuracies(accuracy_array, colors, legend, yrange):
    """Plot the test accuracies of several networks on one set of axes.

    Parameters:
        accuracy_array: sequence of accuracy sequences (in percent), one per
            network. All are plotted against the same epoch axis, whose length
            is taken from the first sequence, so they must have equal lengths.
        colors: one matplotlib colour per accuracy sequence.
        legend: legend labels, in the same order as accuracy_array.
        yrange: [ymin, ymax] limits for the y-axis.

    Raises:
        IndexError: if accuracy_array is empty.
    """
    #The epoch axis is derived from the argument itself rather than from a notebook global.
    last_epoch = len(accuracy_array[0])-1
    epochs = np.arange(0, last_epoch+1)
    plt.figure(figsize=(8,4.5))
    plt.grid()
    for (accuracy, color) in zip(accuracy_array, colors):
        plt.plot(epochs, accuracy, color=color)
    #The legend is created before the reference line is drawn, so its labels are matched to the accuracy curves only.
    plt.legend(legend, loc="best", fontsize=12)
    #Adding a black horizontal line to mark the 10% accuracy obtainable via random chance.
    plt.plot([0, last_epoch], [10, 10], 'k', linewidth=1.5)
    #Labelling axes and framing plot window.
    plt.xlim([0, last_epoch])
    plt.ylim(yrange)
    plt.xlabel("Training epochs", fontsize = 14)
    plt.ylabel("% accuracy on unseen test data", fontsize = 14)
    plt.show()

In [51]:
#Loading the training and test data.
#load_data_wrapper() returns two lists of (input, label) pairs: (training_data, test_data).
training_data, test_data = load_data_wrapper()
#Here I also define small and medium versions of the training and test data that can be used for faster but less accurate training.
#random.sample draws without replacement, so no example appears twice within a subset.
#The subsets are drawn independently of each other and differ from run to run unless the `random` module is seeded first.
training_data_small = random.sample(training_data, 500)
training_data_medium = random.sample(training_data, 5000)
test_data_small = random.sample(test_data, 100)
test_data_medium = random.sample(test_data, 1000)