# 2nd attempt at MNIST classifier
This code implements a simple neural network with 2 hidden layers of 16 units each (ReLU activation) and 1 output layer of 10 units (softmax activation), implemented from scratch using NumPy. It also uses L2 regularization and mini-batch gradient descent for better optimization.

The data is taken from [kaggle](https://www.kaggle.com/competitions/digit-recognizer/data), which provides all images flattened and stored in a CSV file. As convolution is not involved, this is fine for the current purpose. 


### Results
After applying just regularization, with lambda = 0.001 the accuracy drops to about 92%; with 0.01 it is about 90%, and the same with 0.1. 

With mini-batch gradient descent (batch_size=64, lambda=0.05, alpha=0.1), the model reached an accuracy of 92.4% at just epoch 4, which took version 1 2350 iterations using batch gradient descent, showing mini-batch gradient descent to be far more efficient. After about 210 epochs, it converged to ~98% training accuracy. Its accuracy on the cross-validation set was 95.4%, much lower than the training-set accuracy, but still higher than version 1, which didn't use regularization (v1 had CV accuracy = 94.1%).

-------------------
### GitHub Instructions
After downloading the files in the parent directory, `/kaggle/mnist_classifier/version2`, extract `dataset.zip` into the folder `dataset`. If you want to load pretrained weights, use `weights_v2.1.json`. It contains a dict with keys (W1, b1, ...) mapped to their matrices. Load them and convert each one to `np.array` and you should be good to go (although I haven't tested it, _yet_). It also contains a 'details' key with the hyperparameters used during training. 


In [3]:
import numpy as np
import time
import pandas as pd

### Loading data

The dataset directory contains the data downloaded directly from Kaggle as is, without any modifications. So after loading it, we have to split it, scale the grayscale values to between 0 and 1, and convert it from a pandas DataFrame to a numpy.ndarray.

In [4]:
# Read the Kaggle CSV files and turn the DataFrames into plain NumPy arrays.
train_data = np.array(pd.read_csv('./dataset/train.csv'))
test_data = np.array(pd.read_csv('./dataset/test.csv'))

# Seed NumPy's global generator so the in-place row shuffle below (and hence
# the train / cross-validation split) is reproducible.
np.random.seed(42)
np.random.shuffle(train_data)

# The first 3000 shuffled rows become the cross-validation set, the rest the
# training set. Transposing puts one example per column instead of per row.
cv = train_data[:3000].T
train = train_data[3000:].T

# Row 0 is used as the labels; every remaining row is pixel data, divided by
# 255 to scale it.
X_train = train[1:] / 255
Y_train = train[0].reshape((1, -1))
m = X_train.shape[1]  # number of training examples (columns)

X_cv = cv[1:] / 255
Y_cv = cv[0].reshape((1, -1))

# Trailing expression: the notebook displays the training labels.
Y_train

array([[1, 9, 1, ..., 2, 6, 0]])

In [5]:
def relu(x):
    """Apply the rectified linear unit element-wise.

    Args:
        x: Array (or scalar) of pre-activation values.

    Returns:
        ``x`` with every value below zero replaced by zero.
    """
    floor = 0
    return np.maximum(floor, x)


def softmax(x):
    """Compute the softmax over axis 0, i.e. independently for each column.

    The per-column maximum is subtracted before exponentiating so that large
    scores do not overflow; the shift cancels out in the final ratio.

    Args:
        x: Array of scores whose first axis indexes the classes.

    Returns:
        Array of the same shape whose entries along axis 0 sum to 1.
    """
    shifted = x - np.max(x, axis=0, keepdims=True)
    exponentials = np.exp(shifted)
    column_totals = np.sum(exponentials, axis=0, keepdims=True)
    return exponentials / column_totals
    
def sparse_categorical_cross_entropy_loss(predictions, y):
    """Return the mean cross-entropy between predictions and one-hot targets.

    Despite the name, ``y`` has to be one-hot encoded with the same shape as
    ``predictions``, because the targets are multiplied element-wise with the
    log-probabilities.

    Args:
        predictions: Predicted class probabilities. Expected layout is
            (classes, m) with one example per column, matching the
            convention documented in ``back_prop``.
        y: One-hot targets with the same shape as ``predictions``.

    Returns:
        The cross-entropy summed over all entries and divided by the number
        of examples (columns of ``y``); a 1-D ``y`` counts as one example.
    """
    predictions = np.asarray(predictions, dtype=float)
    y = np.asarray(y)

    # Floor the probabilities so log() never yields -inf: an exact zero in a
    # non-target position would otherwise give 0 * -inf = NaN for the whole sum.
    eps = 1e-12
    log_probs = np.log(np.maximum(predictions, eps))

    # Average over examples, not over every matrix entry: y.size also counts
    # the class axis and would shrink the loss by the number of classes.
    num_examples = y.shape[1] if y.ndim > 1 else 1
    return -np.sum(y * log_probs) / num_examples


def get_prediction(A3):
    """Pick the index of the largest entry along axis 0 of ``A3``.

    Args:
        A3: Array of scores or probabilities whose first axis indexes the
            classes.

    Returns:
        Integer array of winning indices with the reduced axis kept as
        length 1 (a 2-D input gives a single row).
    """
    winners = np.argmax(A3, axis=0)
    # Re-insert the reduced axis so the result keeps a leading length-1 axis.
    return np.expand_dims(winners, axis=0)

def accuracy(prediction, Y):
    """Return the fraction of entries where ``prediction`` equals ``Y``.

    Args:
        prediction: Array of predicted labels.
        Y: Array of true labels; its ``size`` is used as the denominator.

    Returns:
        Number of element-wise matches divided by ``Y.size``.
    """
    num_matches = np.sum(prediction == Y)
    return num_matches / Y.size

def one_hot_encode(Y, num_classes=10):
    """Convert integer class labels into a one-hot matrix.

    Args:
        Y: Integer labels in any shape (e.g. ``(1, m)`` or ``(m,)``); they
            are flattened before encoding.
        num_classes: Number of rows in the result. Defaults to 10, the
            value that was previously hard-coded.

    Returns:
        Float array of shape ``(num_classes, Y.size)`` where column ``i``
        has a 1 in row ``Y[i]`` and zeros elsewhere.

    Raises:
        ValueError: If any label is negative. Negative indices would
            otherwise wrap around silently and mark the wrong class.
        IndexError: Raised by NumPy if a label is ``>= num_classes``.
    """
    labels = np.asarray(Y).ravel()
    if np.any(labels < 0):
        raise ValueError("labels must be non-negative integers")

    encoded_Y = np.zeros((num_classes, labels.size))
    # Fancy indexing: set entry (label_i, i) to 1 for every example i.
    encoded_Y[labels, np.arange(labels.size)] = 1
    return encoded_Y
    


In [25]:
def initialize_weights():
    """Create random starting parameters for the 784-16-16-10 network.

    Each array is drawn with ``np.random.random_sample`` (uniform between
    0 and 1) and shifted down by 0.5.

    Returns:
        Tuple ``(W1, b1, W2, b2, W3, b3)`` with shapes (16, 784), (16, 1),
        (16, 16), (16, 1), (10, 16) and (10, 1).
    """
    def centered_sample(shape):
        return np.random.random_sample(shape) - 0.5

    # Listed in draw order; the order determines which random numbers each
    # array receives under a fixed seed.
    shapes = [(16, 784), (16, 1), (16, 16), (16, 1), (10, 16), (10, 1)]
    W1, b1, W2, b2, W3, b3 = (centered_sample(shape) for shape in shapes)

    return W1, b1, W2, b2, W3, b3

def forward_prop(W1, b1, W2, b2, W3, b3, X):
    """Run one forward pass through the three-layer network.

    Layers 1 and 2 apply ``relu`` to their affine output; layer 3 applies
    ``softmax``.

    Args:
        W1, b1, W2, b2, W3, b3: Weight matrix and bias of each layer.
        X: Input data; it is left-multiplied by ``W1``.

    Returns:
        Tuple ``(Z1, A1, Z2, A2, Z3, A3)`` holding each layer's affine
        output ``Z`` and activation ``A``.
    """
    def affine(weights, inputs, bias):
        return np.matmul(weights, inputs) + bias

    Z1 = affine(W1, X, b1)
    A1 = relu(Z1)

    Z2 = affine(W2, A1, b2)
    A2 = relu(Z2)

    Z3 = affine(W3, A2, b3)
    A3 = softmax(Z3)

    return Z1, A1, Z2, A2, Z3, A3

def back_prop(Z1, A1, Z2, A2, Z3, A3, W1, W2, W3, X, y, lambda_):
    """Compute the parameter gradients for one batch.

    Assumes ``y`` has shape (10, m), i.e. one-hot targets with one example
    per column.

    Args:
        Z1, A1, Z2, A2, Z3, A3: Values returned by the forward pass.
            ``Z3`` is accepted but not used.
        W1, W2, W3: Current weight matrices.
        X: Input batch that produced the forward-pass values.
        y: One-hot targets for the batch.
        lambda_: Regularization strength; ``lambda_ / m * W`` is added to
            each weight gradient. Biases are not regularized.

    Returns:
        Tuple ``(dW1, db1, dW2, db2, dW3, db3)``.
    """
    m = y.shape[1]
    reg_scale = lambda_ / m

    def layer_grads(dZ, layer_input, W):
        # Weight gradient averaged over the batch plus the regularization
        # term; bias gradient is the batch average of dZ.
        dW = np.matmul(dZ, layer_input.T) / m + reg_scale * W
        db = np.sum(dZ, axis=1, keepdims=True) / m
        return dW, db

    # Layer 3: the output error is taken directly as A3 - y.
    dZ3 = A3 - y
    dW3, db3 = layer_grads(dZ3, A2, W3)

    # Layer 2: push the error back through W3, then mask where Z2 <= 0.
    dZ2 = np.matmul(W3.T, dZ3) * (Z2 > 0)
    dW2, db2 = layer_grads(dZ2, A1, W2)

    # Layer 1: same step, through W2 and masked where Z1 <= 0.
    dZ1 = np.matmul(W2.T, dZ2) * (Z1 > 0)
    dW1, db1 = layer_grads(dZ1, X, W1)

    return dW1, db1, dW2, db2, dW3, db3

def update_params(W1, b1, W2, b2, W3, b3, dW1, db1, dW2, db2, dW3, db3, alpha):
    """Take one gradient-descent step on every parameter.

    Each parameter has ``alpha`` times its gradient subtracted using ``-=``,
    so NumPy arrays passed in are modified in place.

    Args:
        W1, b1, W2, b2, W3, b3: Current parameters.
        dW1, db1, dW2, db2, dW3, db3: Matching gradients.
        alpha: Learning rate.

    Returns:
        Tuple ``(W1, b1, W2, b2, W3, b3)`` of the updated parameters.
    """
    parameters = (W1, b1, W2, b2, W3, b3)
    gradients = (dW1, db1, dW2, db2, dW3, db3)

    updated = []
    for param, grad in zip(parameters, gradients):
        param -= alpha * grad
        updated.append(param)

    return tuple(updated)

def gradient_descent(X, Y, epoch, alpha, lambda_=0.01, batch_size=64, W1=None, b1=None, W2=None, b2=None, W3=None, b3=None):
    """Train the network with mini-batch gradient descent.

    Every epoch walks over the columns of ``X`` in their given order in
    slices of ``batch_size`` (the last slice may be shorter) and performs a
    forward pass, a backward pass and a parameter update per slice. After
    each epoch it prints the elapsed time and the running accuracy, which is
    measured on each batch before that batch's update is applied.

    Args:
        X: 2-D input data with one example per column.
        Y: Integer class labels; converted with ``one_hot_encode``.
        epoch: Number of passes over the data.
        alpha: Learning rate handed to ``update_params``.
        lambda_: Regularization strength handed to ``back_prop``.
        batch_size: Number of columns per mini-batch.
        W1, b1, W2, b2, W3, b3: Optional starting parameters. When ``W1`` is
            None, all six are created with ``initialize_weights`` and any
            other supplied values are ignored. Supplied arrays are updated
            in place, because ``update_params`` uses ``-=``.

    Returns:
        Tuple ``(W1, b1, W2, b2, W3, b3)`` of trained parameters.

    Raises:
        ValueError: If ``W1`` is given but one of the other parameters is
            missing.
    """
    _, m = X.shape
    y_copy = one_hot_encode(Y)
    print(y_copy.shape)

    if W1 is None:
        W1, b1, W2, b2, W3, b3 = initialize_weights()
    elif any(param is None for param in (b1, W2, b2, W3, b3)):
        # Fail early with a clear message instead of an obscure error deep
        # inside the first matrix multiplication.
        raise ValueError(
            "when W1 is provided, b1, W2, b2, W3 and b3 must be provided too"
        )

    # The true class indices never change, so compute them once instead of
    # taking an argmax of every one-hot batch in every epoch.
    true_labels = np.argmax(y_copy, axis=0, keepdims=True)

    start = time.time()

    for i in range(epoch):

        # Number of correct predictions made during this epoch.
        y_correct = 0

        for j in range(0, m, batch_size):
            batch = slice(j, j + batch_size)
            # X is only read, so slicing it directly avoids copying the
            # whole dataset up front.
            x_batch = X[:, batch]
            y_batch = y_copy[:, batch]

            Z1, A1, Z2, A2, Z3, A3 = forward_prop(W1, b1, W2, b2, W3, b3, x_batch)
            dW1, db1, dW2, db2, dW3, db3 = back_prop(Z1, A1, Z2, A2, Z3, A3, W1, W2, W3, x_batch, y_batch, lambda_)
            W1, b1, W2, b2, W3, b3 = update_params(W1, b1, W2, b2, W3, b3, dW1, db1, dW2, db2, dW3, db3, alpha)

            prediction = get_prediction(A3)
            y_correct += np.sum(prediction == true_labels[:, batch])

        print(f"Iteration: {i}, took {time.time()-start}s- Accuracy={y_correct/m}")
        start = time.time()

    return W1, b1, W2, b2, W3, b3

        
def make_prediction(W1, b1, W2, b2, W3, b3, X):
    """Predict a class index for every example in ``X``.

    Runs ``forward_prop`` with the given parameters and passes the final
    activation to ``get_prediction``.

    Args:
        W1, b1, W2, b2, W3, b3: Network parameters.
        X: Input data in the layout expected by ``forward_prop``.

    Returns:
        Whatever ``get_prediction`` returns for the output activation.
    """
    # Only the last value of the forward pass (A3) is needed here.
    *_, A3 = forward_prop(W1, b1, W2, b2, W3, b3, X)
    return get_prediction(A3)
    

In [26]:
# Train from freshly initialized weights: 1001 epochs, learning rate 0.1,
# regularization strength 0.05, default batch size.
W1, b1, W2, b2, W3, b3 = gradient_descent(
    X_train, Y_train, epoch=1001, alpha=0.1, lambda_=0.05
)

(10, 39000)
Iteration: 0, took 0.47753167152404785s- Accuracy=0.7311794871794872
Iteration: 1, took 0.5024430751800537s- Accuracy=0.8822051282051282
Iteration: 2, took 0.4727592468261719s- Accuracy=0.9052307692307693
Iteration: 3, took 0.4579184055328369s- Accuracy=0.9166153846153846
Iteration: 4, took 0.5449190139770508s- Accuracy=0.924923076923077
Iteration: 5, took 0.5584194660186768s- Accuracy=0.9308205128205128
Iteration: 6, took 0.5713837146759033s- Accuracy=0.9368717948717948
Iteration: 7, took 0.53670334815979s- Accuracy=0.940923076923077
Iteration: 8, took 0.49346113204956055s- Accuracy=0.9443076923076923
Iteration: 9, took 0.47324705123901367s- Accuracy=0.9471538461538461
Iteration: 10, took 0.5330188274383545s- Accuracy=0.949974358974359
Iteration: 11, took 0.5202507972717285s- Accuracy=0.9516666666666667
Iteration: 12, took 0.47409749031066895s- Accuracy=0.9538974358974359
Iteration: 13, took 0.49549150466918945s- Accuracy=0.9558461538461539
Iteration: 14, took 0.4687535762

In [27]:
# Predict labels for the cross-validation set with the trained parameters.
cv_predictions = make_prediction(W1, b1, W2, b2, W3, b3, X_cv)

# Trailing expression: the notebook displays the cross-validation accuracy.
accuracy(cv_predictions, Y_cv)

np.float64(0.9543333333333334)

In [29]:
import json

# Bundle the training hyperparameters with the learned parameters. Each array
# is converted to nested lists so that it can be serialized as JSON.
weights = {
    'details': {
        'learning_rate': 0.1,
        'lambda': 0.05,
        'batch_size': 64,
        'epoch': 1001,
    },
    **{
        name: param.tolist()
        for name, param in zip(
            ('W1', 'b1', 'W2', 'b2', 'W3', 'b3'),
            (W1, b1, W2, b2, W3, b3),
        )
    },
}

with open('./weights_v2.1.json', 'w+') as f:
    json.dump(weights, f)

In [32]:

# Load the Kaggle test set, arrange it with one example per column and apply
# the same division by 255 that was used for the training data.
test_data = np.array(pd.read_csv('./dataset/test.csv')).T / 255

# Predict a class for every test example and flatten the result into a single
# 'label' column.
predictions = make_prediction(W1, b1, W2, b2, W3, b3, test_data)
predictions = pd.DataFrame({'label': predictions.reshape(-1)})

# Trailing expression: the notebook displays the first few predictions.
predictions.head()

Unnamed: 0,label
0,2
1,0
2,9
3,4
4,3


In [34]:
# Kaggle's Digit Recognizer expects a CSV with the header "ImageId,Label" and
# image ids starting at 1. Writing the frame as-is would produce an unnamed,
# 0-based index column and a lowercase "label" header, so rename the column
# and shift the index before using it as the id column. `rename` returns a new
# frame, so `predictions` itself is left untouched.
submission = predictions.rename(columns={'label': 'Label'})
submission.index = submission.index + 1
submission.to_csv("kaggle_submission_v1.csv", index_label="ImageId")