In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Data Import and Prep ##

In [2]:
data = pd.read_csv('train.csv')

In [227]:
data = np.array(data)
np.random.shuffle(data)

# Split the data to train and test
train_data = data[:33600, 1:].transpose() / 255
train_labels = data[:33600 ,0]

test_data = data[33601:, 1:].transpose()
test_labels = data[33601:, 0]

## Manual Testing ##

In [167]:
w1 = np.random.randn(10,784) * 0.01
b1 = np.random.randn(10, 1) * 0.01

In [168]:
z1 = np.matmul(w1, train_data) + b1

In [123]:
z1.shape

(10, 33600)

In [124]:
def Relu(matrix):
    for i in range(0, matrix.shape[1]):
        for j in range(0, 10):
            if matrix[j,i] < 0:
                matrix[j,i] = 0
            
    return matrix

In [125]:
a1 = Relu(z1)

In [130]:
a1

array([[   0.        ,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [1664.97589461,    0.        , 1023.51196007, ...,    0.        ,
         905.42105208,  349.23207221],
       [1764.16562412, 3270.88019555,    0.        , ..., 3533.6729674 ,
        1832.45166582,    0.        ],
       ...,
       [1186.29238091, 3514.72689112,    0.        , ...,    0.        ,
        1535.31065586, 3331.28267939],
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [1294.27118489,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ]])

In [165]:
w2 = np.random.randn(10,10) * 0.01
b2 = np.random.randn(10, 1) * 0.01

In [132]:
z2 = np.matmul(w2, a1) + b2

In [133]:
def Softmax(logits):
    exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))  # Numerical stability improvement
    return exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

In [134]:
a2 = Softmax(z2)

In [135]:
a2.shape

(10, 33600)

In [144]:
train_data.shape

(784, 33600)

## Code Rewritten in Class format (Locked In)

In [None]:
def Relu(matrix):
    return np.maximum(matrix, 0)

def Deriv_Relu(matrix):
    return matrix > 0

def Softmax(logits):
    exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))  # Numerical stability improvement
    return exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

class Neural_Network:
    def __init__(self, a0, learning_rate=0.001):
        # Weights and Biases
        self.w1 = np.random.rand(10, 784) - 0.5
        self.b1 = np.random.rand(10, 1)   - 0.5
        self.w2 = np.random.rand(10, 10)  - 0.5
        self.b2 = np.random.rand(10, 1)   - 0.5
        
        # Others
        self.a0 = a0
        self.learning_rate = learning_rate
        
    def forward_pass(self):
        # Hidden Layer
        self.z1 = np.matmul(self.w1, self.a0) + self.b1
        self.a1 = Relu(self.z1)
        
        # Output Layer
        self.z2 = np.matmul(self.w2, self.a1) + self.b2
        self.a2 = Softmax(self.z2)
        
        return self.a2
    
    def cost_function(self, labels):
        # one-hot encode labels
        self.y = np.zeros((labels.max() + 1, labels.size))
        self.y[labels, np.arange(labels.size)] = 1
        
        # Cost Function (Mean Square Error)
        self.cost = np.sum(np.power((self.a2 - self.y), 2) / 2) / 784
        
        return self.cost
    
    def backward_pass_not_working(self):
        dA2dZ2 = self.a2 - (1 - self.a2)
        dCdA2 = np.sum(self.a2 - self.y) / 784
        dZ2dW2 = self.a1.transpose()
        
        self.dW2 = dCdA2 * np.matmul(dA2dZ2, dZ2dW2)
        
        # dCdA2
        # dA2dZ2
        dZ2dA1 = self.w2.transpose()
        dA1dZ1 = Deriv_Relu(self.z1)
        dZ1dW1 = self.a0.transpose()
        
        dZ2 = dCdA2 * dA2dZ2
        
        # We element wise multiply the derivative of Relu as Relu is an element-wise activation function
        self.dW1 = np.matmul(np.matmul(dZ2dA1, dZ2)*dA1dZ1,  dZ1dW1)
        
        return self.dW1, self.dW2 
    
    def backward_pass(self):
        m = 784
        self.dZ2 = self.a2 - self.y
        self.dW2 = 1 / m * self.dZ2.dot(self.a1.T)
        self.db2 = 1 / m * np.sum(self.dZ2)
        self.dZ1 = self.w2.T.dot(self.dZ2) * Deriv_Relu(self.z1)
        self.dW1 = 1 / m * self.dZ1.dot(self.a0.T)
        self.db1 = 1 / m * np.sum(self.dZ1)
        
        return self.dW1, self.dW2
    
    def grad_descent(self):
        self.w1 -= self.learning_rate * self.dW1
        self.w2 -= self.learning_rate * self.dW2
        self.b1 -= self.learning_rate * self.db1
        self.b2 -= self.learning_rate * self.db2

In [162]:
def get_accuracy():
    correct = 0
    wrong = 0

    for i in range(0, 33599):
        if x.a2[:, i].argmax() == train_labels[i]:
            correct += 1
        else: 
            wrong +=1
            
    accuracy = (correct / (correct + wrong))*100

    return accuracy

In [226]:
x.w1

array([[ 0.2701488 , -0.06333839, -0.41303122, ..., -0.47049951,
        -0.44952697,  0.25424486],
       [-0.44181561, -0.16244422, -0.43954549, ..., -0.12471968,
         0.17562699, -0.2807511 ],
       [ 0.35673168,  0.38327482,  0.16177345, ...,  0.21417211,
         0.33143282,  0.34229856],
       ...,
       [-0.11740341, -0.17045203,  0.1993784 , ...,  0.42613486,
        -0.14474687,  0.06434644],
       [ 0.38887865, -0.48480583, -0.3653902 , ..., -0.18851569,
         0.11801506,  0.22301514],
       [ 0.41471847,  0.07892538, -0.41540931, ...,  0.2259472 ,
        -0.26256731, -0.34797288]])

In [229]:
x = Neural_Network(train_data)

for i in range(50):
    x.forward_pass()
    cost = x.cost_function(train_labels)
    x.backward_pass()
    x.grad_descent()
    
    # np.savetxt(f"w2-{i}.csv", x.w2, delimiter=",")
    
    print(f"{i} cost: {cost}")
    print(f"{i} accuracy: {get_accuracy()}")
    print()
    

0 cost: 21.426621928586865
0 accuracy: 14.220661329206226

1 cost: 21.426571772250043
1 accuracy: 14.652221792315249

2 cost: 21.42652031055428
2 accuracy: 15.101639929759813

3 cost: 21.42646766479353
3 accuracy: 15.670109229441353

4 cost: 21.42641439814233
4 accuracy: 16.018333878984496

5 cost: 21.426361646162807
5 accuracy: 16.387392481919104

6 cost: 21.42631082400291
6 accuracy: 16.756451084853715

7 cost: 21.42626408057248
7 accuracy: 17.006458525551356

8 cost: 21.426224552503406
8 accuracy: 17.205869222298283

9 cost: 21.426195682871246
9 accuracy: 17.28027619869639

10 cost: 21.42618122255148
10 accuracy: 17.17908271079496

11 cost: 21.42618541086467
11 accuracy: 17.1344385249561

12 cost: 21.426211620892552
12 accuracy: 17.19098782701866

13 cost: 21.426260076486745
13 accuracy: 17.1225334087324

14 cost: 21.426331052427532
14 accuracy: 17.012411083663203

15 cost: 21.42642670123846
15 accuracy: 16.98264829310396

16 cost: 21.426570608526113
16 accuracy: 17.074912943837614
