# First attempt at MNIST classifier
This notebook implements a simple neural network from scratch using NumPy: 2 hidden layers of 16 units each with ReLU activation, and 1 output layer of 10 units with softmax activation. It does not yet use further techniques for better performance such as regularization, dropout, or mini-batch gradient descent; those will be added in a later version.

The data is taken from [Kaggle](https://www.kaggle.com/competitions/digit-recognizer/data), which provides every image flattened into one row of a CSV file. As no convolution is involved, this is fine for the current purpose.


In [1]:
import json
import time

import numpy as np
import pandas as pd

### Loading data

The `dataset` directory contains the data exactly as downloaded from Kaggle, without any modifications. So after loading it, we have to convert it from a pandas DataFrame to a `numpy.ndarray`, split it into training and cross-validation sets, and scale the grayscale values to the range 0 to 1.

In [2]:
train_data = np.array(pd.read_csv('./dataset/train.csv'))
test_data = np.array(pd.read_csv('./dataset/test.csv'))

# Seed NumPy's global RNG so the shuffle below gives the same split on every run.
np.random.seed(42)
np.random.shuffle(train_data)  # reorders the rows in place

# Hold out the first 3000 shuffled rows for cross validation and train on the rest.
# Transposing puts one example per column: row 0 holds the labels, rows 1.. the pixels.
cv, train = train_data[:3000].T, train_data[3000:].T

m = train.shape[1]  # number of training examples

# Scale the gray-scale pixel values into [0, 1]; keep the labels as (1, n) row vectors.
X_train, Y_train = train[1:] / 255, train[0].reshape((1, -1))
X_cv, Y_cv = cv[1:] / 255, cv[0].reshape((1, -1))

Y_train

array([[1, 9, 1, ..., 2, 6, 0]])

In [3]:
def relu(x):
    """Rectified linear unit, applied element-wise.

    Parameters
    ----------
    x : array_like
        Pre-activation values.

    Returns
    -------
    The element-wise maximum of 0 and ``x``.
    """
    floor = 0
    return np.maximum(floor, x)


def softmax(x):
    """Softmax taken along axis 0, so every column of the result sums to 1.

    Parameters
    ----------
    x : array_like
        Scores with one example per column.

    Returns
    -------
    Array of the same shape as ``x`` holding the normalised exponentials.
    """
    # Shifting each column by its maximum does not change the result but keeps
    # np.exp from overflowing on large scores.
    column_peak = np.max(x, axis=0, keepdims=True)
    unnormalised = np.exp(x - column_peak)
    column_total = np.sum(unnormalised, axis=0, keepdims=True)
    return unnormalised / column_total
    
def sparse_categorical_cross_entropy_loss(predictions, y):
    """Mean cross-entropy between predicted class probabilities and the targets.

    Parameters
    ----------
    predictions : array_like, shape (num_classes, m)
        Predicted probabilities with one example per column (the layout
        produced by a softmax taken along axis 0).
    y : array_like
        Either integer class labels with ``m`` entries (any shape, e.g.
        ``(1, m)``), or one-hot targets with the same shape as ``predictions``.

    Returns
    -------
    float
        The cross-entropy averaged over the ``m`` examples.

    Raises
    ------
    ValueError
        If integer labels are given but their count differs from ``m``.
    """
    predictions = np.asarray(predictions, dtype=float)
    y = np.asarray(y)
    num_samples = predictions.shape[1]

    # A probability of exactly 0 would give log(0) = -inf, and 0 * -inf = nan
    # for the one-hot case, so floor the probabilities at a tiny positive value.
    log_probs = np.log(np.maximum(predictions, 1e-15))

    if y.shape == predictions.shape:
        # One-hot targets: only the true-class entry of each column survives.
        total = np.sum(y * log_probs)
    else:
        labels = y.reshape(-1).astype(int)
        if labels.size != num_samples:
            raise ValueError(
                f"expected {num_samples} labels, got {labels.size}"
            )
        # Pick the log-probability of the true class in every column.
        total = np.sum(log_probs[labels, np.arange(num_samples)])

    # Average over examples (not over every entry of a one-hot matrix).
    return -total / num_samples


def get_prediction(A3):
    """Return the index of the largest entry in each column of ``A3``.

    The leading axis is kept with length 1, so a ``(num_classes, m)`` input
    gives a ``(1, m)`` array of predicted class indices.
    """
    winners = np.argmax(A3, axis=0)
    # Restore the reduced axis so the result lines up with (1, m) label arrays.
    return np.expand_dims(winners, axis=0)

def accuracy(prediction, Y):
    """Fraction of entries of ``prediction`` that equal the matching entry of ``Y``.

    The count of matches is divided by ``Y.size``.
    """
    hits = prediction == Y
    return np.sum(hits) / Y.size

def one_hot_encode(Y, num_classes=10):
    """One-hot encode integer class labels, one example per column.

    Parameters
    ----------
    Y : array_like
        Class labels in ``[0, num_classes)``. Any shape is accepted (``(m,)``,
        ``(1, m)``, ``(m, 1)``); the labels are read in flattened order.
    num_classes : int, optional
        Number of rows of the encoding. Defaults to 10 (the digits 0-9).

    Returns
    -------
    numpy.ndarray, shape (num_classes, m)
        Zeros everywhere except a 1 at ``[label_j, j]`` for every example ``j``.

    Raises
    ------
    ValueError
        If any label lies outside ``[0, num_classes)``.
    """
    # Flatten first: indexing with a (m, 1) label array would broadcast against
    # the column indices and set a whole (m, m) grid of entries instead.
    labels = np.asarray(Y).reshape(-1).astype(int)

    # Reject out-of-range labels explicitly; negative values would otherwise
    # wrap around and mark the wrong class without any error.
    if labels.size and (labels.min() < 0 or labels.max() >= num_classes):
        raise ValueError(f"labels must lie in [0, {num_classes})")

    encoded_Y = np.zeros((num_classes, labels.size))
    encoded_Y[labels, np.arange(labels.size)] = 1
    return encoded_Y
    


In [18]:
def initialize_weights():
    """Create random starting parameters for the 784 -> 16 -> 16 -> 10 network.

    Every entry is drawn with ``np.random.random_sample`` (uniform on [0, 1))
    and shifted by 0.5, so the values are spread around zero.

    Returns
    -------
    tuple
        ``(W1, b1, W2, b2, W3, b3)`` with shapes (16, 784), (16, 1), (16, 16),
        (16, 1), (10, 16) and (10, 1).
    """
    # (units, inputs) of each layer, first hidden layer first.
    layer_shapes = [(16, 784), (16, 16), (10, 16)]

    params = []
    for n_units, n_inputs in layer_shapes:
        # Draw each weight matrix before its bias so the global RNG is consumed
        # in the order W1, b1, W2, b2, W3, b3.
        params.append(np.random.random_sample((n_units, n_inputs)) - 0.5)
        params.append(np.random.random_sample((n_units, 1)) - 0.5)

    W1, b1, W2, b2, W3, b3 = params
    return W1, b1, W2, b2, W3, b3

def forward_prop(W1, b1, W2, b2, W3, b3, X):
    """Run the three-layer network forward on ``X``.

    Layers 1 and 2 apply ``relu`` to their affine output; layer 3 applies
    ``softmax``.

    Parameters
    ----------
    W1, b1, W2, b2, W3, b3 : numpy.ndarray
        Weight matrix and bias of each layer.
    X : numpy.ndarray
        Inputs with one example per column.

    Returns
    -------
    tuple
        ``(Z1, A1, Z2, A2, Z3, A3)``: the pre-activation ``Z`` and activation
        ``A`` of every layer, in layer order.
    """
    # Hidden layer 1
    Z1 = W1 @ X + b1
    A1 = relu(Z1)

    # Hidden layer 2
    Z2 = W2 @ A1 + b2
    A2 = relu(Z2)

    # Output layer
    Z3 = W3 @ A2 + b3
    A3 = softmax(Z3)

    return Z1, A1, Z2, A2, Z3, A3

def back_prop(Z1, A1, Z2, A2, Z3, A3, W2, W3, X, y):
    """Compute the gradients of all weights and biases for one batch.

    Parameters
    ----------
    Z1, A1, Z2, A2, Z3, A3 : numpy.ndarray
        Pre-activations and activations of the three layers. ``Z3`` is
        accepted but not used.
    W2, W3 : numpy.ndarray
        Weights of layers 2 and 3, needed to push the error backwards.
    X : numpy.ndarray
        Network inputs, one example per column.
    y : numpy.ndarray
        Targets with the same shape as ``A3`` (assumed one-hot, shape (10, m)).

    Returns
    -------
    tuple
        ``(dW1, db1, dW2, db2, dW3, db3)``, each averaged over the ``m``
        columns of ``y``.
    """
    n_examples = y.shape[1]

    def layer_grads(delta, layer_input):
        """Return (dW, db) of a layer from its dZ and the activations fed into it."""
        weight_grad = (delta @ layer_input.T) / n_examples
        bias_grad = np.sum(delta, axis=1, keepdims=True) / n_examples
        return weight_grad, bias_grad

    # Output layer: the error term is taken directly as A3 - y.
    delta3 = A3 - y
    dW3, db3 = layer_grads(delta3, A2)

    # Hidden layer 2: (Z2 > 0) is the ReLU derivative, zeroing inactive units.
    delta2 = (W3.T @ delta3) * (Z2 > 0)
    dW2, db2 = layer_grads(delta2, A1)

    # Hidden layer 1: same pattern, one layer further back.
    delta1 = (W2.T @ delta2) * (Z1 > 0)
    dW1, db1 = layer_grads(delta1, X)

    return dW1, db1, dW2, db2, dW3, db3

def update_params(W1, b1, W2, b2, W3, b3, dW1, db1, dW2, db2, dW3, db3, alpha):
    """Take one gradient-descent step of size ``alpha`` on every parameter.

    Each parameter array is updated in place (``param -= alpha * grad``), so
    the arrays passed in by the caller are modified.

    Returns
    -------
    tuple
        The same six arrays ``(W1, b1, W2, b2, W3, b3)`` after the update.
    """
    parameters = (W1, b1, W2, b2, W3, b3)
    gradients = (dW1, db1, dW2, db2, dW3, db3)

    for param, grad in zip(parameters, gradients):
        # Augmented assignment on an ndarray writes into the existing buffer.
        param -= alpha * grad

    return W1, b1, W2, b2, W3, b3

def gradient_descent(X, Y, iterations, alpha, W1=None, b1=None, W2=None, b2=None, W3=None, b3=None):
    """Train the network with full-batch gradient descent.

    Parameters
    ----------
    X : numpy.ndarray
        Training inputs, one example per column.
    Y : numpy.ndarray
        Integer class labels; they are one-hot encoded with ``one_hot_encode``.
    iterations : int
        Number of gradient steps to take.
    alpha : float
        Learning rate.
    W1, b1, W2, b2, W3, b3 : numpy.ndarray, optional
        Starting parameters, e.g. to continue an earlier run. Pass all six or
        none; when none are given they come from ``initialize_weights()``.
        ``update_params`` works in place, so arrays passed here are modified.

    Returns
    -------
    tuple
        The trained ``(W1, b1, W2, b2, W3, b3)``.

    Raises
    ------
    ValueError
        If only some of the six parameters are supplied.
    """
    new_y = one_hot_encode(Y)

    # Initialise only when the caller gave no weights; weights that were passed
    # in must be kept so training can resume from them.
    supplied = [p is not None for p in (W1, b1, W2, b2, W3, b3)]
    if not any(supplied):
        W1, b1, W2, b2, W3, b3 = initialize_weights()
    elif not all(supplied):
        raise ValueError("pass either all of W1, b1, W2, b2, W3, b3 or none of them")

    start = time.time()

    for i in range(iterations):

        Z1, A1, Z2, A2, Z3, A3 = forward_prop(W1, b1, W2, b2, W3, b3, X)
        dW1, db1, dW2, db2, dW3, db3 = back_prop(Z1, A1, Z2, A2, Z3, A3, W2, W3, X, new_y)
        W1, b1, W2, b2, W3, b3 = update_params(W1, b1, W2, b2, W3, b3, dW1, db1, dW2, db2, dW3, db3, alpha)

        if i%50==0:
            # A3 comes from the forward pass before this step's update, so the
            # accuracy shown is that of the weights the iteration started with.
            prediction = get_prediction(A3)
            print(f"Iteration: {i}, took {time.time()-start}s- Accuracy={accuracy(prediction, Y)}")
            # Restart the timer so each report covers only the last 50 iterations.
            start = time.time()

    return W1, b1, W2, b2, W3, b3

        
def make_prediction(W1, b1, W2, b2, W3, b3, X):
    """Predict a class index for every column of ``X``.

    Runs ``forward_prop`` with the given parameters and passes the output-layer
    activations to ``get_prediction``.
    """
    # Only the last element (A3) of the forward pass is needed here.
    *_, A3 = forward_prop(W1, b1, W2, b2, W3, b3, X)
    return get_prediction(A3)

In [19]:
# Create the starting weights here: no earlier cell defines W1..b3, so passing
# them straight to gradient_descent fails with NameError on a fresh kernel.
W1, b1, W2, b2, W3, b3 = initialize_weights()

# 10001 full-batch steps with learning rate 0.1; progress is printed every 50 steps.
W1, b1, W2, b2, W3, b3 = gradient_descent(
    X_train, Y_train, 10001, 0.1,
    W1=W1, b1=b1, W2=W2, b2=b2, W3=W3, b3=b3,
)

Iteration: 0, took 0.11981797218322754s- Accuracy=0.12007692307692308
Iteration: 50, took 4.489748477935791s- Accuracy=0.5018717948717949
Iteration: 100, took 4.544604063034058s- Accuracy=0.6758205128205128
Iteration: 150, took 4.505732774734497s- Accuracy=0.7459743589743589
Iteration: 200, took 4.580596685409546s- Accuracy=0.7837692307692308
Iteration: 250, took 4.880459785461426s- Accuracy=0.8096923076923077
Iteration: 300, took 4.668389797210693s- Accuracy=0.8284615384615385
Iteration: 350, took 4.560608863830566s- Accuracy=0.8403333333333334
Iteration: 400, took 4.733078479766846s- Accuracy=0.8465897435897436
Iteration: 450, took 5.974888563156128s- Accuracy=0.8585128205128205
Iteration: 500, took 2.0204713344573975s- Accuracy=0.8641538461538462
Iteration: 550, took 1.673706293106079s- Accuracy=0.8694358974358974
Iteration: 600, took 1.6700444221496582s- Accuracy=0.8743076923076923
Iteration: 650, took 1.6652534008026123s- Accuracy=0.8788205128205128
Iteration: 700, took 1.63634967

In [24]:
# Score the trained weights on the held-out cross-validation split.
cv_predictions = make_prediction(W1, b1, W2, b2, W3, b3, X_cv)
cv_accuracy = accuracy(cv_predictions, Y_cv)
cv_accuracy

np.float64(0.941)

In [21]:
# Persist the trained parameters as plain nested lists so they can be reloaded
# without NumPy-specific serialisation.
weights = {
    name: array.tolist()
    for name, array in (
        ('W1', W1), ('b1', b1),
        ('W2', W2), ('b2', b2),
        ('W3', W3), ('b3', b3),
    )
}

# Plain write mode is enough: the file is only written, never read back here.
with open('./weights_v1.json', 'w') as f:
    json.dump(weights, f)

In [32]:

# Reload the test set, put one image per column and scale the pixels to [0, 1],
# matching the preprocessing applied to the training data.
test_data = np.array(pd.read_csv('./dataset/test.csv')).T / 255

# Flatten the (1, n) prediction row into a single 'label' column.
predictions = pd.DataFrame({
    'label': make_prediction(W1, b1, W2, b2, W3, b3, test_data).reshape((-1)),
})
predictions.head()

Unnamed: 0,label
0,2
1,0
2,9
3,4
4,3


In [34]:
# Kaggle's Digit Recognizer expects exactly two columns, ImageId (1-based row
# number of the test image) and Label, with no extra index column.
submission = pd.DataFrame({
    "ImageId": np.arange(1, len(predictions) + 1),
    "Label": predictions["label"].to_numpy(),
})
submission.to_csv("kaggle_submission_v1.csv", index=False)