# Digit Multiclass Classification From Scratch

In [1]:
import numpy as np 
from keras.datasets import mnist
import timeit

## Import and preprocess the mnist data

In [2]:
from keras.utils import to_categorical

(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

train_images = train_images.reshape((60000, 28*28))
train_images = train_images.astype('float32') / 255 

test_images = test_images.reshape((10000, 28*28))
test_images = test_images.astype('float32') / 255 

train_labels = to_categorical(train_labels)
test_labels = to_categorical(test_labels)

## Initialisation

- random initialisation of weights and biases
- 1 hidden layer (512 neurons)
- dense layers

In [4]:
def initialise(in_n: int ,hidden_n: int , out_n: int):
    w1 = np.random.rand(hidden_n, in_n) - 0.5 # weights range (-1/2, 1/2)
    w2 = np.random.rand(out_n, hidden_n) - 0.5
    b1 = np.zeros((hidden_n, 1)) # initialise biases to 0
    b2 = np.zeros((out_n, 1))
    return w1, b1, w2, b2

w1, b1, w2, b2 = initialise(784, 512, 10)

print(w1.shape)
print(w2.shape)
print(b1.shape)
print(b2.shape)

(512, 784)
(10, 512)
(512, 1)
(10, 1)


## Activations, Derivatives of Activations

- ReLU used for hidden layer
- softmax used for output

In [5]:
# ReLU
def relu(z):
    return np.maximum(z, 0)

# Derivative of ReLu
def relu_prime(z):
    return relu(z) > 0

# softmax
def soft_max(z):
    e_z = np.exp(z - np.max(z))
    return e_z / e_z.sum()

# derivative of softmax
def softmax_derivative(softmax):
    s = softmax.reshape(-1,1)
    return np.diagflat(s) - np.dot(s, s.T)

## Forward Pass

In [6]:
def forward_pass(a0, w1, b1, w2, b2):
    z1 = np.dot(w1,a0) + b1
    a1 = relu(z1)
    z2 = np.dot(w2, a1) + b2
    a2 = soft_max(z2)
    return z1, a1, z2, a2

## Compute the Gradrients (Back Propagation)

In [7]:
def compute_gradients(a2, y, a1, w2, z1, a0):
    de2 = a2 - y
    db2 = de2
    dw2 = np.dot(de2, a1.T)
    da1 = np.dot(w2.T, de2)
    de1 = da1 * relu_prime(z1)
    db1 = de1
    dw1 = np.dot(de1, a0.T)
    return dw1, db1, dw2, db2

## Updating the Weights and Biases

In [21]:
def update(w1, b1, w2, b2, dw1, db1, dw2, db2, lr):
    w1 = w1 - lr*dw1
    b1 = b1 - lr*db1
    w2 = w2 - lr*dw2
    b2 = b2 - lr*db2
    return w1, b1, w2, b2 

## Loss Function (cross-entropy)

In [33]:
def crossentropy(a, y):
    assert a.shape == y.shape
    n = a.shape[0]
    entropy = 0
    for i in range(n):
        entropy += y[i] * np.log(a[i])
    
    return (-1*entropy)[0]


## Training

In [30]:
# learning rate
lr = 0.01
epochs = 5

w1, b1, w2, b2 = initialise(784, 512, 10)

for i in range(epochs):
    loss = 0
    for j in range(60000):
        a0 = train_images[j].reshape(784, 1)
        y = train_labels[j].reshape(10,1)
        z1, a1, z2, a2 = forward_pass(a0, w1, b1, w2, b2)
        dw1, db1, dw2, db2 = compute_gradients(a2, y, a1, w2, z1, a0)
        w1, b1, w2, b2 = update(w1, b1, w2, b2, dw1, db1, dw2, db2, lr)
        loss = crossentropy(a2, y)
    print(f'epoch {i+1} | loss = {loss}')


epoch 1 | loss = 0.00011885735165259896
epoch 2 | loss = 1.792427957297521e-06
epoch 3 | loss = 2.0561736137659014e-06
epoch 4 | loss = 9.016175753876706e-06
epoch 5 | loss = 3.4418269629549524e-05


- The losses are low
- the main point of this exercise is a comparison between direct computation and (manual coding) and tensorflow(keras)
- it took 12 minutes 8.5 seconds to complete the training on 5 epochs

## Testing 

In [32]:
correct_test = 0
for i in range(10000):
    a0 = test_images[i].reshape(784, 1)
    y = test_labels[i].reshape(10, 1)
    # using the last updated weights in the training 
    z1, a1, z2, a2 = forward_pass(a0, w1, b1, w2, b2)
    if np.argmax(a2) == np.argmax(y):
        correct_test += 1

correct_train = 0
for i in range(60000):
    a0 = train_images[i].reshape(784, 1)
    y = train_labels[i].reshape(10, 1)
    # using the last updated weights in the training 
    z1, a1, z2, a2 = forward_pass(a0, w1, b1, w2, b2)
    if np.argmax(a2) == np.argmax(y):
        correct_train += 1
accuracy_train = correct_train/60000
accuracy_test = correct_test/10000

print(f'Train Accuaracy: {accuracy_train}')
print(f'Test Accuaracy: {accuracy_test}')

Train Accuaracy: 0.9951
Test Accuaracy: 0.9721


## Summary
- Train Accuracy: $0.9951$
- Test Accuracy: $0.9721$
- The test accuracy is lower than the train accuracy which suggests some overfitting 
- However the difference for this type of task is not extreme and the test accuracy is high
- Perhaps altering the learning rate will yield a more generalisable model but the model calculated is sufficiently generalisable