In [9]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

data = pd.read_csv('MINST_train.csv')


In [10]:
data.head(3)

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
data.pop("label")

0        1
1        0
2        1
3        4
4        0
        ..
41995    0
41996    1
41997    7
41998    6
41999    9
Name: label, Length: 42000, dtype: int64

In [12]:
data = np.array(data)

In [13]:
# Tranposing flips columns/rows.
# You may need to transpose if you're doing a dot product of two matrices.
# Why? Because dot product of two matrices does columns in one against rows in another.
# So if the column is length 4 in m1, and row is length 3 in m2, it is invalid as non-matching lengths.

# Assign X & Y, and divide by 255 to scale to 0-1 and have no overflow error when doing exponential operations.
    # Divide by 255 as there are 256 possible pixel values (0->255)

# We transpose the data to match the shape the weights matrices will be in:
#   Neurons:
#   [ neuron 1      weight1, weight2, weight3 ]
#   [ neuron 2      weight1, weight2, weight3 ]
#   [ neuron 3      weight1, weight2, weight3 ]
#   [ neuron 10     weight1, weight2, weight3 ]
#
#   Pixels:
#   [ pixel1        example1, example2, example3 ]
#   [ pixel2        example1, example2, example3 ]
#   [ pixel3        example1, example2, example3 ]
#   ...
#   [ pixel748      example1, example2, example3 ]

# Data -> [ pix1, pix2, pix3 .....
#         [ example1 pixel 1, example1 pixel2, example1 pixel3 ....
#         [ example2 pixel 1, example2 pixel2, example2 pixel3 ....
X = data.T / 255

# Data.T (Transposed, or "Flipped") ->  [ pix1 example1 pixel 1, example2 pixel 1, example3 pixel1, .....
#                                       [ pix2 example1 pixel 2, example2 pixel 2, example3 pixel2, .....
#                                       [ pix3 example1 pixel 3, example2 pixel 3, example3 pixel3, .....

# Before transposing, each column to the right is a new pixel of the same example
# After transposing, each column to the right is a new example of the same pixel

Y = np.array(pd.read_csv('MINST_train.csv').pop("label")) / 255

print("X Shape (Rows, Cols)", X.shape)  # 784 pixels of a 28x28 image down the way, and 42000 image examples along to the right
print("Y Shape (Rows, Cols)", Y.shape)

X Shape (Rows, Cols) (784, 42000)
Y Shape (Rows, Cols) (42000,)


In [14]:
# A neural net is made up of inputs, weights for all inputs for each neuron, bias(es), an activation function for each neuron, & final outputs
    # 1. Input features
    # 2. Layer(s) of Neurons: A neuron is a set of weights for all inputs, a bias, and an activation function
    # 3. Output

# Training a neural network is done by propagating inputs through the layers of the network, then going backwards to optimise based on an error value.
# You do this repeatedly.

# Need weights & biases
def initialise_neural_net_parameters(n_features, n_neurons=10):
    '''
    @returns layer1Weights, layer1Biases, layer2Weights, layer2Biases
    
    - W1: Each neuron's weights for every input feature (all input features go to each neuron)
    - b1: biases for each neuron in the first layer
    - W2: Each neuron's weights for each of the previous layer's outputs (all inputs from last layer go to each neuron)
    - b2: biases for each in neuron the second layer
    '''

    layer1Weights = np.random.rand(n_neurons, n_features)  # 10 rows, 784 columns -> weights for all features
                                                    # Each row is a neuron, and each neuron has 784 weights
                                                    # There are 784 weights per neuron, as each neuron receives input from each pixel in input layer
    layer2Weights = np.random.rand(n_neurons, n_neurons)   # 10 rows, 10 columns -> weights for the 10 neurons in 2nd layer
                                                    # Each row is a neuron, and each neuron has 10 weights
                                                    # There are 10 weights per neuron as each neuron receives input from the 10 neurons in last layer

    layer1Biases = np.random.rand(10, 1)  # 10 rows, 1 value per row
    layer2Biases = np.random.rand(10, 1)  # 10 rows, 1 value per row

    return layer1Weights, layer1Biases, layer2Weights, layer2Biases

def ReLU(input):
    '''RELU result is 0 if input is < 0, else it is just the same as the input'''
    return np.maximum(0, input)

def ReLU_derivative(input):
    '''Returns slope of line as 1 if greater than 0, else slope of line as 0 if <= 0'''
    return input > 0

def softmax(input):
    '''Converts a vector of numbers into a vector of probabilities based on the total (if 1,1,1, then probability of each is .33)'''
    return np.exp(input) / np.sum(np.exp(input))

# Need forward propagation to train
def forward_propagation(layer1InputWeights, layer1Biases, layer2InputWeights, layer2Biases, X):
    '''
    Calculate output of each layer
    @returns layer1FunctionInput, layer1FunctionOutput, layer2FunctionInput, layer2FunctionOutput
    '''
    
    # Output of a layer is ϕ((X * W) + b)
        # ϕ is activation function
        # X is matrix of inputs (we will multiply each input by weight)
        # W is weights (we will multiply each weight by each input)
        # b is bias
    
    # 1st layer uses pixel input with weights & bias
    layer1FunctionInput = layer1InputWeights.dot(X) + layer1Biases
    layer1FunctionOutput = ReLU(layer1FunctionInput)       # functions are what make a NN linear or non-linear

    # 2nd layer uses output from layer 1 as input against its weights & bias
    layer2FunctionInput = layer2InputWeights.dot(layer1FunctionOutput) + layer2Biases
    layer2FunctionOutput = softmax(layer2FunctionInput)    # functions are what make a NN linear or non-linear

    return layer1FunctionInput, layer1FunctionOutput, layer2FunctionInput, layer2FunctionOutput

def backward_propagation(L1WB, layer1Output, L2WB, layer2Output, layer2Weights, X, Y):
    '''
    Figures out how much each part of each layer contributed to the error
    @returns l1WErr, l1bErr, l2WErr, l2bErr
    '''
    
    # TODO: 1-hot encoding?

    # how much did each part of each layer contribute to error?
    layer2Error = layer2Output - Y
    layer2WeightError = (1 / Y.size) * layer2Error.dot(layer1Output.T)
    layer2BiasError = (1 / Y.size) * np.average(layer2Error)

    layer1Error = layer2Weights.T.dot(layer2Error) * ReLU_derivative(layer1Output)
    layer1WeightError = (1 / Y.size) * layer1Error.dot(X.T)
    layer1BiasError = (1 / Y.size) * np.average(layer1Error)

    return layer1WeightError, layer1BiasError, layer2WeightError, layer2BiasError

def update_neural_net_parameters(layer1Weights, layer1Biases, layer2Weights, layer2Biases, errL1W, errL1B, errL2W, errL2B, alpha):
    '''@returns new W1, b1, W2, b2'''
    layer1Weights = layer1Weights - alpha * errL1W
    layer1Biases = layer1Biases - alpha * errL1B

    layer2Weights = layer2Weights - alpha * errL2W
    layer2Biases = layer2Biases - alpha * errL2B

    return layer1Weights, layer1Biases, layer2Weights, layer2Biases

In [15]:
def get_predictions(L2A):
    # Get indices of largest values
    return np.argmax(L2A, 0)

def get_accuracy(predictions, Y):
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, Y, iterations, alpha):
    layer1Weights, layer1Biases, layer2Weights, layer2Biases = initialise_neural_net_parameters(n_features=X.shape[0])

    for i in range(iterations):
        layer1WeightBiasOutput, layer1Output, layer2WeightBiasOutput, layer2Output = forward_propagation(layer1Weights, layer1Biases,
                                                                                                            layer2Weights, layer2Biases,
                                                                                                            X)
        # Go back through layers & figure out error
        l1WErr, l1bErr, l2WErr, l2bErr = backward_propagation(layer1WeightBiasOutput, layer1Output, 
                                                                layer2WeightBiasOutput, layer2Output,
                                                                layer2Weights, X, Y)

        # Get new weights & biases based on error
        layer1Weights, layer1Biases, layer2Weights, layer2Biases = update_neural_net_parameters(layer1Weights, layer1Biases, 
                                                                                                layer2Weights, layer2Biases, 
                                                                                                l1WErr, l1bErr, l2WErr, l2bErr, 
                                                                                                alpha)
        
        if i % 10 == 0:
            print("Iteration:", i)
            predictions = get_predictions(layer2WeightBiasOutput)
            print(get_accuracy(predictions, Y))
    
    return layer1Weights, layer1Biases, layer2Weights, layer2Biases    

In [18]:
l1Weights, l1Biases, l2Weights, l2Biases = gradient_descent(X, Y, iterations=50, alpha=0.10)



Iteration: 0
0.0
Iteration: 10
0.09838095238095237
Iteration: 20
0.09838095238095237
Iteration: 30
0.09838095238095237
Iteration: 40
0.09838095238095237
