In [46]:
#import functions
import numpy as np
import pandas as pd

In [47]:
data = pd.read_csv('train.csv')
data = np.array(data)
m, n = data.shape
np.random.shuffle(data) # shuffle before splitting into devv and training sets

data_dev = data[0:1000].T
Y_dev = data_dev[0] #split into labels and images
X_dev = data_dev[1:n] 
X_dev = X_dev / 255.

data_train = data[1000:m].T #transpose so its a 1x748 array and not a 748x1
Y_train = data_train[0] #split into labels and images
X_train = data_train[1:n] 
X_train = X_train / 255.
_,m_train = X_train.shape


In [43]:
def init_params(): #generates random paramaters for the weights and Biases to start
    W1 = np.random.rand(10, 784) - 0.5
    b1 = np.random.rand(10, 1) - 0.5
    W2 = np.random.rand(10, 10) - 0.5
    b2 = np.random.rand(10, 1) - 0.5
    W3 = np.random.rand(10, 10) - 0.5
    b3 = np.random.rand(10, 1) - 0.5
    return W1, b1, W2, b2, W3, b3

def ReLU(Z): #returns Z if its greater then zero or just returns zero
    return np.maximum(Z, 0)

def softmax(Z): #changes the numbers given by the neural networks into there probabilities
    A = np.exp(Z) / sum(np.exp(Z))
    return A
    
def forward_prop(W1, b1, W2, b2, W3, b3, X): #use the weights and biases and input of the image [1x728] array to find the output in terms of probability
    Z1 = W1.dot(X) + b1
    A1 = ReLU(Z1)
    Z2 = W2.dot(A1) + b2
    A2 = ReLU(Z2)
    Z3 = W3.dot(A2) + b3
    A3 = softmax(Z3)
    return Z1, A1, Z2, A2, Z3, A3

def ReLU_deriv(Z): #the derivitive of the ReLU function
    return Z > 0

def one_hot(Y): #changes the number of the answer into an array that can be used for finding the gradients
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

def backward_prop(Z1, A1, Z2, A2, Z3, A3, W1, W2, W3, X, Y): #finds the gradients and loss for each weight bias or input using the formulas
    one_hot_Y = one_hot(Y)
    dZ3 = A3 - one_hot_Y
    dW3 = 1 / m * dZ3.dot(A2.T) 
    db3 = 1 / m * np.sum(dZ3)
    dZ2 = W3.T.dot(dZ3) * ReLU_deriv(Z2)
    dW2 = 1 / m * dZ2.dot(A1.T)
    db2 = 1 / m * np.sum(dZ2)
    dZ1 = W2.T.dot(dZ2) * ReLU_deriv(Z1)
    dW1 = 1 / m * dZ1.dot(X.T)
    db1 = 1 / m * np.sum(dZ1)
    return dW1, db1, dW2, db2, dW3, db3

def update_params(W1, b1, W2, b2, W3, b3, dW1, db1, dW2, db2, dW3, db3, lr): #updates the weights and biases using the learning rate and gradients
    W1 = W1 - lr * dW1
    b1 = b1 - lr * db1    
    W2 = W2 - lr * dW2  
    b2 = b2 - lr * db2    
    W3 = W3 - lr * dW3
    b3 = b3 - lr * db3
    return W1, b1, W2, b2, W3, b3

In [44]:
def get_predictions(A2):
    return np.argmax(A2, 0) #returns the predection but scaled to 1 or 0

def get_accuracy(predictions, Y):
    print(predictions, Y) #gets the accuracy from the neural network output
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, Y, lr, epochs): #trains the model
    W1, b1, W2, b2, W3, b3 = init_params() #initializes paramaters
    for i in range(epochs): #runs for the amount of epochs
        Z1, A1, Z2, A2, Z3, A3 = forward_prop(W1, b1, W2, b2, W3, b3, X)  #does the forward propagation and gets back the values of each level before and after activation function
        dW1, db1, dW2, db2, dW3, db3 = backward_prop(Z1, A1, Z2, A2, Z3, A3, W1, W2, W3, X, Y) # uses the values of each level to find out the gradient and derivitive of the weights and biases
        W1, b1, W2, b2, W3, b3 = update_params(W1, b1, W2, b2, W3, b3, dW1, db1, dW2, db2, dW3, db3, lr) # changes the weights and biases using the derivitives
        if i % 10 == 0:
            print("Epoch: ", i) #prints the current epoch and its predictions/accuracy
            predictions = get_predictions(A3)
            print(get_accuracy(predictions, Y))
    print("Epoch: ", epochs)
    predictions = get_predictions(A3)
    print(get_accuracy(predictions, Y))
    return W1, b1, W2, b2, W3, b3

In [None]:
W1, b1, W2, b2, W3, b3 = gradient_descent(X_train, Y_train, 0.1, 100) #train the model for 100 epochs at a learning rate of 0.1

Epoch:  0
[8 9 6 ... 6 6 6] [0 3 9 ... 1 5 5]
0.06275609756097561
Epoch:  10
[8 9 6 ... 9 1 6] [0 3 9 ... 1 5 5]
0.13902439024390245
Epoch:  20
[8 1 9 ... 1 1 3] [0 3 9 ... 1 5 5]
0.232609756097561
Epoch:  30
[0 1 9 ... 1 1 3] [0 3 9 ... 1 5 5]
0.2889512195121951
Epoch:  40
[0 1 9 ... 1 1 6] [0 3 9 ... 1 5 5]
0.336609756097561
Epoch:  50
[0 1 9 ... 1 1 6] [0 3 9 ... 1 5 5]
0.38590243902439025
Epoch:  60
[0 1 9 ... 1 1 3] [0 3 9 ... 1 5 5]
0.4250731707317073
Epoch:  70
[0 1 9 ... 1 1 3] [0 3 9 ... 1 5 5]
0.47056097560975607
Epoch:  80
[0 1 9 ... 1 1 3] [0 3 9 ... 1 5 5]
0.511
Epoch:  90
[0 1 9 ... 1 1 3] [0 3 9 ... 1 5 5]
0.5473170731707317
Epoch:  100
[0 1 9 ... 1 1 3] [0 3 9 ... 1 5 5]
0.5773170731707317
Epoch:  110
[0 1 9 ... 1 1 3] [0 3 9 ... 1 5 5]
0.603170731707317
Epoch:  120
[0 1 9 ... 1 1 3] [0 3 9 ... 1 5 5]
0.6267560975609756
Epoch:  130
[0 1 9 ... 1 1 3] [0 3 9 ... 1 5 5]
0.6471463414634147
Epoch:  140
[0 1 9 ... 1 1 3] [0 3 9 ... 1 5 5]
0.6640487804878049
Epoch:  150
[0 1 9