In [1]:
import math
import numpy as np

from sklearn.datasets import fetch_openml
import tensorflow as tf


In [2]:
# Download MNIST from OpenML as NumPy arrays, then split it into training and test sets.

mnist = fetch_openml('mnist_784', version=1, as_frame=False, parser='liac-arff')

X = mnist['data']
y = mnist['target']

# The first 60000 examples form the training set, the remainder the test set.
# Dividing by 255 rescales the pixel values (presumably from 0-255 to 0-1 -- confirm against the dataset).
X_train = X[:60000] / 255
X_test = X[60000:] / 255
y_train = y[:60000]
y_test = y[60000:]

In [3]:
# Split the 70000 examples into training (first 60000) and test (rest) sets.
# The *_list names hold the NumPy versions; the label arrays are what the
# scoring cells pass to accuracy().
X_train_list = X[:60000] / 255
X_test_list = X[60000:] / 255
y_train_list = y[:60000]
y_test_list = y[60000:]

# Wrap the scaled images as float32 tensors so they match the dtype of the weights.
X_train, X_test = (
    tf.constant(images, dtype=tf.dtypes.float32)
    for images in (X_train_list, X_test_list)
)

# Define Network

Network is fully connected with 3 layers:

    Input layer of 784 (28 x 28) nodes
    Hidden layer of 64 nodes
    Output layer of 10 nodes.

Weights are randomised (biases have been removed from the network).

$$ w[0] = \begin{pmatrix} w_{1\ 1} & w_{1\ 2} &... & w_{1\ 784} \\ 
                       w_{2\ 1} & w_{2\ 2} & ... & w_{2\ 784}\\ 
                      ... & ... & ... & ...\\ 
                       w_{64\ 1} & w_{64\ 2} & ... & w_{64\ 784}\end{pmatrix} $$

$$w[1] = \begin{pmatrix} w_{1\ 1} & w_{1\ 2} &... & w_{1\ 64} \\ 
                       w_{2\ 1} & w_{2\ 2} & ... & w_{2\ 64}\\ 
                      ... & ... & ... & ...\\ 
                       w_{10\ 1} & w_{10\ 2} & ... & w_{10\ 64}\end{pmatrix} $$



In [4]:
layer_nodes = (784, 64, 10)

# Fixed seed so the initial weights are the same on every Restart & Run All.
SEED = 42

# Weights are drawn uniformly from [-INIT_LIMIT, INIT_LIMIT).
INIT_LIMIT = 0.085

# A dedicated generator keeps the initialisation independent of TensorFlow's
# global random state and makes re-running this cell idempotent.
weight_rng = tf.random.Generator.from_seed(SEED)

# One weight matrix per pair of consecutive layers, shaped
# (nodes in next layer, nodes in this layer). Kept as a list because
# train() replaces its elements in place.
network_weights = [
    weight_rng.uniform(
        shape=(nodes_out, nodes_in),
        minval=-INIT_LIMIT,
        maxval=INIT_LIMIT,
        dtype=tf.dtypes.float32,
    )
    for nodes_in, nodes_out in zip(layer_nodes[:-1], layer_nodes[1:])
]
  


# Feed Forward

The input matrix has one column per image


$$ input\ s = \begin{pmatrix} pixel_{1\ 1} & pixel_{1\ 2} &... & pixel_{1\ 1000} \\ 
                       pixel_{2\ 1} & pixel_{2\ 2} & ... & pixel_{2\ 1000}\\ 
                      ... & ... & ... & ...\\ 
                       pixel_{784\ 1} & pixel_{784\ 2} & ... & pixel_{784\ 1000}\end{pmatrix} $$
$$\eta_j =  \sum_{i=1}^{784}s_{i}w_{ij}$$


$$ hidden\ h_{j} = \sigma( \eta_j) \ \ \ \ \ \  \sigma(z) = \frac{1} {1 + e^{-z}}$$

$$ output\ \ S(z_i) = \frac{e^{z_{i}}}{\sum_{j=1}^{10} e^{z_{j}}} \ \ \ for\ i=1,2,\dots,10  \ \ \ \ z_{j} = \sum_{i=1}^{64}h_{i}w_{ij}\ \ \ $$


In [5]:
def feedforward(input_vector):
    """Propagate a batch of inputs through the network.

    Uses the module-level ``network_weights`` list: every weight matrix except
    the last is followed by a sigmoid, the last one by a softmax.

    Args:
        input_vector: 2-D tensor with one example per row. It is transposed
            first, so every returned tensor has one example per column.

    Returns:
        list of tensors ``[transposed input, hidden activations..., output]``,
        one entry per layer.

    Raises:
        ValueError: if the internal helper is asked for an unknown activation.
    """

    def activation(matrix, kind="relu"):
        """Apply the activation function named by ``kind`` to ``matrix``."""

        if kind == "relu":
            # Element-wise; the built-in max() cannot compare a whole tensor.
            return tf.nn.relu(matrix)

        if kind == "sigmoid":
            return 1 / (1 + tf.exp(-matrix))

        if kind == "softmax":
            # Subtracting the per-column maximum leaves the result unchanged
            # mathematically but keeps exp() from overflowing.
            shifted = matrix - tf.reduce_max(matrix, axis=0, keepdims=True)
            exponentials = tf.exp(shifted)
            return exponentials / tf.reduce_sum(exponentials, axis=0, keepdims=True)

        raise ValueError(f"Unspecified activation: {kind!r}")

    hidden = tf.transpose(input_vector)

    layers = [hidden]

    for weights in network_weights[:-1]:
        hidden = activation(tf.matmul(weights, hidden), 'sigmoid')
        layers.append(hidden)

    output = activation(tf.matmul(network_weights[-1], hidden), 'softmax')
    layers.append(output)

    return layers


In [6]:
def accuracy(X, y):
    """Return the fraction of examples in ``X`` that the network classifies correctly.

    Args:
        X: batch of inputs accepted by ``feedforward`` (one example per row).
        y: labels for those examples; each must be convertible to ``int``.
            Labels beyond the number of examples are ignored.

    Returns:
        float in [0, 1]: correct predictions divided by the number of examples.

    Raises:
        ValueError: if ``X`` holds no examples or ``y`` has fewer labels than
            ``X`` has examples.
    """
    # The output layer has one column per example and one row per class.
    probabilities = np.asarray(feedforward(X)[-1])
    n_examples = probabilities.shape[1]

    if n_examples == 0:
        raise ValueError("accuracy() needs at least one example")

    labels = np.asarray(y[:n_examples]).astype(int)

    if labels.shape[0] != n_examples:
        raise ValueError(
            f"expected {n_examples} labels but received {labels.shape[0]}"
        )

    # Predicted class = row index of the largest probability in each column.
    predicted = np.argmax(probabilities, axis=0)

    return float(np.mean(predicted == labels))

## Pre-training

On an untrained network accuracy should be around 10%!

In [7]:
# Baseline: test-set accuracy (as a percentage) before any training.
print('Pre-training ', accuracy(X_test, y_test_list) * 100)

Pre-training  9.42


## Calculate Neuron Activations

layers[] is a list of 3 tensors corresponding to the input, the hidden and the output neurons.

## Gradients, Targets, Deltas and Weight Updates

$$ \sigma'(\eta_{i}) = h_i(1-h_i)$$

The target vector is a one-hot vector, and represents the correct probability distribution of the classification.

$$ target\  t_k = \begin{pmatrix} 0 & 1 & ...\\ 0 & 0 & ... \\ 1 & 0 & ...\\0 & 0 & ... \\  ... &... &...\end{pmatrix} $$

The cross entropy increases with the distance between two probability distributions.
$$Cross\ Entropy\ Error\ CE = -\sum_k t_k \log o_k$$
Output Layer Deltas:
$$\delta_k =  \frac{\partial }{\partial z_k} CE = -(t_k - o_k)$$
Hidden Layer Deltas
$$\delta_j = (\sum_k\delta_kw_{jk})h_j(1-h_j) $$

Weight Changes
$$\Delta w_{ij} = -\delta_j h_i $$


In [8]:
def train(input_vector_image, input_vector_labels, epsilon=0.4):
    """Run one gradient-descent step on a mini-batch.

    Feeds the batch forward, back-propagates the output error through every
    weight matrix and replaces the entries of the module-level
    ``network_weights`` list with the updated weights.

    Args:
        input_vector_image: batch of inputs accepted by ``feedforward``
            (one example per row).
        input_vector_labels: class label for each example; each must be
            convertible to ``int`` and index a row of the output layer.
        epsilon: learning rate applied to the output error.

    Returns:
        None. ``network_weights`` is modified in place.
    """
    batch_size = len(input_vector_image)

    if batch_size == 0:
        # Nothing to learn from; also avoids dividing by zero below.
        return

    layers = feedforward(input_vector_image)
    output = layers[-1]

    # One-hot targets: one row per class, one column per example.
    labels = np.asarray(input_vector_labels).astype(int)
    np_target = np.zeros((output.shape[0], len(labels)))
    np_target[labels, np.arange(len(labels))] = 1.0
    target = tf.constant(np_target, dtype=tf.dtypes.float32)

    # (output - target) is the error signal at the output layer. Scaling by
    # -epsilon here means the products below are already descent steps that
    # can simply be added to the weights.
    delta = -epsilon * (output - target)

    # Walk backwards through the weight matrices. Every update is computed
    # from the weights as they were before this step; they are only replaced
    # once all updates are known.
    weight_updates = [None] * len(network_weights)

    for index in reversed(range(len(network_weights))):

        weight_updates[index] = tf.matmul(delta, layers[index], transpose_b=True)

        if index > 0:
            # Sigmoid derivative expressed through the layer's activations.
            sigmoid_gradient = layers[index] * (1 - layers[index])
            delta = tf.matmul(network_weights[index], delta, transpose_a=True) * sigmoid_gradient

    # Average the summed per-example updates over the batch.
    for index, update in enumerate(weight_updates):
        network_weights[index] = network_weights[index] + (1 / batch_size) * update



# Minibatch Training 

Loop over 5 epochs, processing 32 items at a time!

In [9]:
epochs = 5
batch_size = 32

# Each epoch walks the training set in order, one consecutive slice of
# batch_size examples at a time; train() updates the weights after each slice.
for epoch in range(epochs):
    for batch_start in range(0, len(X_train), batch_size):
        batch_end = batch_start + batch_size
        train(X_train[batch_start:batch_end], y_train[batch_start:batch_end])






In [10]:
# Test-set accuracy (as a percentage) after the training loop has run.
print('Post-training ', accuracy(X_test, y_test_list) * 100)

Post-training  96.77
