# Implementation of Backpropagation with MNIST handwritten digits
Using only NumPy for the matrix math, this three-layer (784-16-10) feedforward neural network classifies MNIST digits with an accuracy of about 85 percent.

In [None]:
from tensorflow.examples.tutorials.mnist import input_data
import numpy as np

In [None]:
# Load the MNIST dataset object used by the cells below. Labels are requested
# one-hot encoded (one_hot=True); the accuracy code relies on this by taking
# the argmax over the label axis.
# NOTE(review): presumably "MNIST_data/" is the local download/cache
# directory used by the input_data helper -- confirm.
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

In [None]:
def sigmoid_activation(x, deriv=False):
    """Return the logistic sigmoid of *x*, or its derivative.

    Both branches are evaluated through ``exp(-|x|)``, which lies in (0, 1]
    and therefore cannot overflow. The naive forms ``1 / (1 + exp(-x))`` and
    ``exp(x) / (exp(x) + 1) ** 2`` overflow for large ``|x|``; the latter
    evaluates to ``inf / inf = nan`` and would poison every gradient.

    Args:
        x: Scalar or array-like of pre-activation values.
        deriv: If False (default), return ``sigmoid(x)``. If True, return
            ``d sigmoid(x) / dx`` evaluated at ``x``.

    Returns:
        A NumPy array (0-d for scalar input) with the same shape as ``x``.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))

    if deriv:
        # sigmoid'(x) = e^{-x} / (1 + e^{-x})^2 is an even function of x, so
        # it can be written purely in terms of e^{-|x|}.
        return decay / (1 + decay) ** 2

    # For x >= 0: 1 / (1 + e^{-x}); for x < 0: e^{x} / (1 + e^{x}).
    # Both expressions are safe to evaluate everywhere, so np.where computing
    # both branches is harmless.
    return np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))

In [None]:
class NeuralNet:
    """Feedforward network with one sigmoid hidden layer and a sigmoid output.

    Parameters are trained with mini-batch gradient descent, with gradients
    obtained by backpropagation. Activations come from the module-level
    ``sigmoid_activation`` function.

    Attributes set by the constructor:
        l1weights: array of shape (n_inputs, n_hidden).
        l1biases: array of shape (n_hidden,).
        l2weights: array of shape (n_hidden, n_outputs).
        l2biases: array of shape (n_outputs,).
        learning_rate: step size used by ``gradient_descent_step``.
    """

    def __init__(self, learning_rate, n_inputs=784, n_hidden=16, n_outputs=10):
        """Initialise all parameters uniformly at random in [-1, 1).

        Args:
            learning_rate: Multiplier applied to each gradient in an update.
            n_inputs: Number of input features per sample (default 784).
            n_hidden: Number of hidden units (default 16).
            n_outputs: Number of output units (default 10).
        """
        # np.random.random draws from [0, 1); 2 * r - 1 maps that to [-1, 1).
        self.l1weights = 2 * np.random.random([n_inputs, n_hidden]) - 1
        self.l1biases = 2 * np.random.random([n_hidden]) - 1
        self.l2weights = 2 * np.random.random([n_hidden, n_outputs]) - 1
        self.l2biases = 2 * np.random.random([n_outputs]) - 1
        self.learning_rate = learning_rate

    def _forward(self, X):
        """Run the forward pass and return every intermediate value.

        Returns:
            Tuple ``(l1_z, l1_activation, l2_z, l2_activation)`` holding the
            pre-activations and activations of both layers.
        """
        l1_z = np.dot(X, self.l1weights) + self.l1biases
        l1_activation = sigmoid_activation(l1_z)

        l2_z = np.dot(l1_activation, self.l2weights) + self.l2biases
        l2_activation = sigmoid_activation(l2_z)

        return l1_z, l1_activation, l2_z, l2_activation

    def fwd_pass(self, X):
        """Return the output-layer activations for the batch ``X``."""
        return self._forward(X)[-1]

    def gradient_descent_step(self, X, Y):
        """Perform one gradient-descent update on the batch ``(X, Y)``.

        Args:
            X: Array-like batch of inputs, one sample per row.
            Y: Array-like batch of targets, broadcastable against the
                output activations.

        Raises:
            ValueError: If the batch contains no samples. Averaging over an
                empty batch would divide by zero and overwrite every
                parameter with NaN.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)

        batch_count = X.shape[0]
        if batch_count == 0:
            raise ValueError("gradient_descent_step requires a non-empty batch")

        # Forward propagation, keeping the pre-activations for the backward
        # pass.
        l1_z, l1_activation, l2_z, l2_activation = self._forward(X)

        # Gradient of the cost with respect to the output activations. For a
        # squared-error cost of the form 0.5 * sum((a - y)^2) this is a - y.
        l2_activation_grad = l2_activation - Y

        # Chain rule: delta is the gradient with respect to the
        # pre-activations, i.e. the activation gradient times da/dz.
        l2_delta = l2_activation_grad * sigmoid_activation(l2_z, deriv=True)

        # d(wx+b)/dw = x, so the weight gradient is the previous layer's
        # activations times delta, averaged over the batch.
        l2_weight_grad = np.dot(l1_activation.T, l2_delta) / batch_count

        # d(wx+b)/db = 1, so the bias gradient is the batch mean of delta.
        l2_bias_grad = np.mean(l2_delta, 0).reshape(self.l2biases.shape)

        # d(wx+b)/dx = w, which propagates delta back to the hidden
        # activations.
        l1_activation_grad = np.dot(l2_delta, self.l2weights.T)

        # Same delta / weight / bias computation for the hidden layer.
        l1_delta = l1_activation_grad * sigmoid_activation(l1_z, deriv=True)
        l1_weight_grad = np.dot(X.T, l1_delta) / batch_count
        l1_bias_grad = np.mean(l1_delta, 0).reshape(self.l1biases.shape)

        # All gradients above were computed from the pre-update parameters;
        # only now are the parameters moved against their gradients.
        self.l2weights -= self.learning_rate * l2_weight_grad
        self.l2biases -= self.learning_rate * l2_bias_grad

        self.l1weights -= self.learning_rate * l1_weight_grad
        self.l1biases -= self.learning_rate * l1_bias_grad

    def accuracy(self, X, Y):
        """Return the fraction of rows whose predicted class matches ``Y``.

        The predicted class is the argmax of the output activations and the
        true class is the argmax of the corresponding row of ``Y``.
        """
        predicted_labels = np.argmax(self.fwd_pass(X), 1)
        true_labels = np.argmax(Y, 1)
        return np.mean(np.equal(predicted_labels, true_labels).astype(int))

In [None]:
# Training hyperparameters. Each "epoch" here is a single mini-batch gradient
# step, not a full pass over the training set.
epochs = 20000
batch_size = 100
learning_rate = 0.1
display_step = 1000

net = NeuralNet(learning_rate)

# Measure accuracy on held-out samples. Drawing them from mnist.train would
# evaluate the network on data it is also trained on and overstate accuracy.
validation_xs, validation_ys = mnist.validation.next_batch(1000)

for epoch in range(epochs):
    xs, ys = mnist.train.next_batch(batch_size)
    net.gradient_descent_step(xs, ys)
    # Report progress every display_step steps.
    if epoch % display_step == 0:
        print("Epoch", epoch, "Accuracy", net.accuracy(validation_xs, validation_ys))

# Final accuracy after the last training step.
print("Epoch", epoch, "Accuracy", net.accuracy(validation_xs, validation_ys))

Epoch 0 Accuracy 0.088
Epoch 1000 Accuracy 0.372
Epoch 2000 Accuracy 0.493
Epoch 3000 Accuracy 0.565
Epoch 4000 Accuracy 0.618
Epoch 5000 Accuracy 0.652
Epoch 6000 Accuracy 0.691
Epoch 7000 Accuracy 0.728
Epoch 8000 Accuracy 0.76
Epoch 9000 Accuracy 0.774
Epoch 10000 Accuracy 0.792
Epoch 11000 Accuracy 0.802
Epoch 12000 Accuracy 0.813
Epoch 13000 Accuracy 0.825
Epoch 14000 Accuracy 0.828
Epoch 15000 Accuracy 0.833
Epoch 16000 Accuracy 0.835
Epoch 17000 Accuracy 0.838
Epoch 18000 Accuracy 0.844
Epoch 19000 Accuracy 0.848
Epoch 19999 Accuracy 0.851
