In [54]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

data = pd.read_csv('MINST_train.csv')


In [55]:
data.head(3)

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
data.pop("label")

0        1
1        0
2        1
3        4
4        0
        ..
41995    0
41996    1
41997    7
41998    6
41999    9
Name: label, Length: 42000, dtype: int64

In [57]:
data = np.array(data)

In [63]:
# Assign X & Y, and divide by 255 to scale to 0-1 and have no overflow error when doing exponential operations.
    # Divide by 255 as there are 256 possible pixel values (0->255)

# We transpose the data to match the shape the weights matrices will be in:
#   Neurons:
#   [ neuron 1      weight1, weight2, weight3 ]
#   [ neuron 2      weight1, weight2, weight3 ]
#   [ neuron 3      weight1, weight2, weight3 ]
#   [ neuron 10     weight1, weight2, weight3 ]
#
#   Pixels:
#   [ pixel1        example1, example2, example3 ]
#   [ pixel2        example1, example2, example3 ]
#   [ pixel3        example1, example2, example3 ]
#   ...
#   [ pixel748      example1, example2, example3 ]

# Data -> [ pix1, pix2, pix3 .....
#         [ example1 pixel 1, example1 pixel2, example1 pixel3 ....
#         [ example2 pixel 1, example2 pixel2, example2 pixel3 ....
X = data.T / 255

# Data.T (Transposed, or "Flipped") ->  [ pix1 example1 pixel 1, example2 pixel 1, example3 pixel1, .....
#                                       [ pix2 example1 pixel 2, example2 pixel 2, example3 pixel2, .....
#                                       [ pix3 example1 pixel 3, example2 pixel 3, example3 pixel3, .....

# Before transposing, each column to the right is a new pixel of the same example
# After transposing, each column to the right is a new example of the same pixel

Y = np.array(pd.read_csv('MINST_train.csv').pop("label")) / 255

print("X Shape (Rows, Cols)", X.shape)  # 784 pixels of a 28x28 image down the way, and 42000 image examples along to the right
print("Y Shape (Rows, Cols)", Y.shape)

(42000, 784)
X Shape (Cols, Rows) (784, 42000)
Y Shape (Cols, Rows) (42000,)


In [59]:
# A neural net is made up of inputs, weights & biases on each neuron, an activation function for each neuron, & final outputs
    # 1. Input features
    # 2. Layer(s) of Neurons: Weights, biases, and activation functions
    # 3. Output

# Training a neural network is done by propagating inputs through the layers of the network, then going backwards to optimise based on an error value.
# You do this repeatedly.

# Need weights & biases
def initialise_neural_net_parameters(n_features, n_neurons=10):
    '''
    @returns W1, b1, W2, b2
    
    - W1: Each neuron's weights for every input feature (all input features go to each neuron)
    - W2: Each neuron's weights for each of the previous layer's outputs (all inputs from last layer go to each neuron)
    - b1: bias for the first layer
    - b2: bias for the second layer
    '''

    W1 = np.random.rand(n_neurons, n_features)  # 10 rows, 784 columns -> weights for all features
                                                    # Each row is a neuron, and each neuron has 784 weights
                                                    # There are 784 weights per neuron, as each neuron receives input from each pixel in input layer
    W2 = np.random.rand(n_neurons, n_neurons)   # 10 rows, 10 columns -> weights for the 10 neurons in 2nd layer
                                                    # Each row is a neuron, and each neuron has 10 weights
                                                    # There are 10 weights per neuron as each neuron receives input from the 10 neurons in last layer

    b1 = np.random.rand(10, 1)
    b2 = np.random.rand(10, 1)

    return W1, b1, W2, b2

def ReLU(input):
    '''Linear Function which floors the lowest value at 0, else takes the positive value.'''
    return np.maximum(0, input)

def ReLU_derivative(input):
    '''Returns slope of line as 1 if greater than 0, else slope of line as 0 if <= 0'''
    return input > 0

def softmax(input):
    '''Converts a vector of numbers into a vector of probabilities based on the total (if 1,1,1, then probability of each is .33)'''
    return np.exp(input) / np.sum(np.exp(input))

# Need forward propagation to train
def forward_propagation(W1, b1, W2, b2, X):
    '''
    Calculate output of each layer
    @returns layer1WBOut, layer1ActivationOut, layer2WBOut, layer2ActivationOut
    '''
    
    # Output of a layer is ϕ((X * W) + b)
        # ϕ is activation function
        # X is matrix of inputs (we will multiply each input by weight)
        # W is weights (we will multiply each weight by each input)
        # b is bias
    
    # 1st layer uses pixel input with weights & bias
    layer1WeightBiasOutput = W1.dot(X) + b1
    layer1ActivationOutput = ReLU(layer1WeightBiasOutput)       # functions are what make a NN linear or non-linear

    # 2nd layer uses output from layer 1 as input against its weights & bias
    layer2WeightBiasOutput = W2.dot(layer1ActivationOutput) + b2
    layer2ActivationOutput = softmax(layer2WeightBiasOutput)    # functions are what make a NN linear or non-linear

    return layer1WeightBiasOutput, layer1ActivationOutput, layer2WeightBiasOutput, layer2ActivationOutput

def backward_propagation(L1WB, L1A, L2WB, L2A, W2, X, Y):
    '''
    Figures out how much each part of each layer contributed to the error
    @returns l1WErr, l1bErr, l2WErr, l2bErr
    '''
    
    # TODO: 1-hot encoding?

    # how much did each part of each layer contribute to error?
    layer2Error = L2A - Y
    layer2WeightError = (1 / Y.size) * layer2Error.dot(L1A.T)
    layer2BiasError = (1 / Y.size) * np.average(layer2Error)

    layer1Error = W2.T.dot(layer2Error) * ReLU_derivative(L1A)
    layer1WeightError = (1 / Y.size) * layer1Error.dot(X.T)
    layer1BiasError = (1 / Y.size) * np.average(layer1Error)

    return layer1WeightError, layer1BiasError, layer2WeightError, layer2BiasError

def update_neural_net_parameters(W1, b1, W2, b2, errW1, errb1, errW2, errb2, alpha):
    '''@returns new W1, b1, W2, b2'''
    W1 = W1 - alpha * errW1
    b1 = b1 - alpha * errb1

    W2 = W2 - alpha * errW2
    b2 = b2 - alpha * errb2

    return W1, b1, W2, b2

In [60]:
def get_predictions(L2A):
    # Get indices of largest values
    return np.argmax(L2A, 0)

def get_accuracy(predictions, Y):
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, Y, iterations, alpha):
    W1, b1, W2, b2 = initialise_neural_net_parameters(n_features=X.shape[0])

    for i in range(iterations):
        layer1WeightBiasOutput, layer1ActivationOutput, layer2WeightBiasOutput, layer2ActivationOutput = forward_propagation(W1, b1, W2, b2, X)
        l1WErr, l1bErr, l2WErr, l2bErr = backward_propagation(layer1WeightBiasOutput, layer1ActivationOutput, layer2WeightBiasOutput, layer2ActivationOutput,
                                                                W2, X, Y)
        W1, b1, W2, b2 = update_neural_net_parameters(W1, b1, W2, b2, l1WErr, l1bErr, l2WErr, l2bErr, alpha)
        
        if i % 10 == 0:
            print("Iteration:", i)
            predictions = get_predictions(layer2WeightBiasOutput)
            print(get_accuracy(predictions, Y))
    
    return W1, b1, W2, b2    

In [61]:
W1, b1, W2, b2 = gradient_descent(X, Y, iterations=500, alpha=0.10)

Num input features: 784
Iteration: 0
0.0


  return np.exp(input) / np.sum(np.exp(input))
  return np.exp(input) / np.sum(np.exp(input))


Iteration: 10
0.09838095238095237
Iteration: 20
0.09838095238095237
Iteration: 30
0.09838095238095237
Iteration: 40
0.09838095238095237
Iteration: 50
0.09838095238095237
Iteration: 60
0.09838095238095237
Iteration: 70
0.09838095238095237
Iteration: 80
0.09838095238095237
Iteration: 90
0.09838095238095237
Iteration: 100
0.09838095238095237
Iteration: 110
0.09838095238095237
Iteration: 120
0.09838095238095237
Iteration: 130
0.09838095238095237
Iteration: 140
0.09838095238095237
Iteration: 150
0.09838095238095237
Iteration: 160
0.09838095238095237
Iteration: 170
0.09838095238095237
Iteration: 180
0.09838095238095237
Iteration: 190
0.09838095238095237
Iteration: 200
0.09838095238095237
Iteration: 210
0.09838095238095237
Iteration: 220
0.09838095238095237
Iteration: 230
0.09838095238095237
Iteration: 240
0.09838095238095237
Iteration: 250
0.09838095238095237
Iteration: 260
0.09838095238095237
Iteration: 270
0.09838095238095237
Iteration: 280
0.09838095238095237
Iteration: 290
0.098380952380