# 2.2 MNIST classifier
This code implements a simple neural network of 2 hidden layers of 16 units each with ReLU activation, and 1 output layer of 10 units with softmax activation, implemented from scratch using numpy. It also makes use of regularization, mini-batch gradient descent and also momentum gradient descent and dropout regularization for more optimization. While version 2.1 didn't make use of dropout, this does

the data is taking from [kaggle](https://www.kaggle.com/competitions/digit-recognizer/data), which contains all images flattened and present in a csv file. As convolution is not involved, this is ok for current purpose. 


### Results
Applying dropout regularization only made the model worse. The first 2 results are without dropout, remaining with different tries of dropout, with different batch sizes. Increasing batch size can increase the CV accuracy, but also increase the epoch to get better result 


| Alpha | Beta | Lambda | Epoch |1-Dropout| Batch Size  |  Test Accuracy | Cross Validation Accuracy | 
|:-----:|:----:|:------:|:-----:|   :-:   |:-----------:|:--------------:|:-------------------------:|
|    0.1| 0.9  | 0.0    |  500  |   1.0   |     64      |   99.2%        |          93.6%            |
|    0.1| 0.9  | 0.01   |  100  |   1.0   |    128      |   97.2%        |          94.4%            |
|    0.1| 0.9  | 0.0    |  500  |   0.8   |      64     |    96.3%       |          92.4%            |
|    0.1| 0.9  | 0.0    |  500  |   0.5   |      64     |   91.5%        |          89.16%           |
|    0.1| 0.9  | 0.0    |  500  |    0.8  |     128     |   96.77%       |          92.9%            |
|   0.05| 0.9  | 0.0    |  500  |   0.7   |     128     |   93.9%        |          92.76%           |
|   0.05| 0.9  | 0.0    |  1000 |   0.7   |     128     |   94.7%        |          92.26%           |
|    0.1| 0.9  | 0.0    |  500  |   0.8   |     256     |   96.02%       |          93.16%           |
|    0.1| 0.9  | 0.0    |  500  |   0.8   |     512     |   94.5%        |          92.3%            |
|    0.1| 0.9  | 0.0    |  1000 |   0.8   |     512     |   95.2%        |          93.6%            |

   

-------------------
### Github Instruction
after downloading the files in parent directory, `/kaggle/mnist_classifier/version2`, extract `dataset.zip` into folder `dataset`. If you want to load in pretrained weights, use `weights_v2.1.json`. it contains a dict, with keys (W1, b1, ...) mapped with their matrices. Load them in and convert to np.array and you should be good to go (Although I haven't tested it, _yet_). It also contains a 'details' key with the hypterparameters during training. 


In [3]:
import numpy as np
import time
import pandas as pd

### Loading data

the dataset directory contains the data downloaded directly as is from kaggle, without any modifications. So after loading it, we have to split it, scale the gray scale values b/w 0 to 1, and convert from pandas DF to numpy.NDarray

In [4]:
train_data = pd.read_csv('./dataset/train.csv')
train_data = np.array(train_data)
test_data = pd.read_csv('./dataset/test.csv')
test_data = np.array(test_data)


np.random.seed(42)


np.random.shuffle(train_data)

#separate data into training and cross validation set, and transpose so that data is column-wise, not row-wise
cv = train_data[:3000].T
train = train_data[3000:].T


X_train = train[1:]
X_train = X_train / 255 
Y_train = train[0].reshape((1, -1))
m = X_train.shape[1]

X_cv = cv[1:]
X_cv = X_cv / 255
Y_cv = cv[0].reshape((1, -1))

Y_train

array([[1, 9, 1, ..., 2, 6, 0]])

In [5]:
def relu(x):
    return np.maximum(0, x)


def softmax(x):
    exp_x = np.exp(x-np.max(x, axis=0, keepdims=True))
    
    return exp_x/np.sum(exp_x, axis=0, keepdims=True)
    
def sparse_categorical_cross_entropy_loss(predictions, y):
    return -np.sum(y * np.log(predictions)) / y.size


def get_prediction(A3):
    return np.argmax(A3, axis=0, keepdims=True)

def accuracy(prediction, Y):
    return np.sum(prediction==Y)/Y.size

def one_hot_encode(Y):
    encoded_Y = np.zeros((10, Y.size))

    encoded_Y[Y, np.arange(Y.size)] = 1
    return encoded_Y
    


In [10]:
def initialize_weights():
    #random_sample returns a uniform distribution from 0 to 1, in atleast v2.2

    W1 = np.random.random_sample((16, 784)) - 0.5
    b1 = np.random.random_sample((16, 1)) - 0.5

    W2 = np.random.random_sample((16, 16))- 0.5
    b2 = np.random.random_sample((16, 1))- 0.5

    W3 = np.random.random_sample((10, 16))- 0.5
    b3 = np.random.random_sample((10, 1))- 0.5

    return W1, b1, W2, b2, W3, b3

def forward_prop(W1, b1, W2, b2, W3, b3, X, keep_prob=1.0):
    

    Z1 = np.matmul(W1, X) + b1
    A1 = relu(Z1)
    d1 = np.random.random_sample(A1.shape) < keep_prob
    A1 *= d1 / keep_prob

    Z2 = np.matmul(W2, A1) + b2
    A2 = relu(Z2)
    d2 = np.random.random_sample(A2.shape) < keep_prob
    A2 *= d2 / keep_prob

    Z3 = np.matmul(W3, A2) + b3
    A3 = softmax(Z3)

    return Z1, A1, d1, Z2, A2, d2, Z3, A3

def back_prop(Z1, A1, d1, Z2, A2, d2, Z3, A3, W1, W2, W3, X, y, lambda_=0.0):
    """
    Assuming y has shape (10, m)
    """
    m = y.shape[1]

    ## layer 3
    # dA3 = -y / A3
    dZ3 = A3 - y

    dW3 = np.matmul(dZ3, A2.T) / m + lambda_ / m * W3
    db3 = np.sum(dZ3, axis=1, keepdims=True) / m

    ## layer 2
    dA2 = np.matmul(W3.T, dZ3) * d2
    dZ2 = dA2 * (Z2 > 0)

    dW2 = np.matmul(dZ2, A1.T) / m + lambda_ / m * W2
    db2 = np.sum(dZ2, axis=1, keepdims=True) / m

    ## layer 1
    dA1 = np.matmul(W2.T, dZ2) * d1
    dZ1 = dA1 * (Z1 > 0)

    dW1 = np.matmul(dZ1, X.T) / m + lambda_/m * W1
    db1 = np.sum(dZ1, axis=1, keepdims=True) / m

    return dW1, db1, dW2, db2, dW3, db3

def update_params(W1, b1, W2, b2, W3, b3, dW1, db1, dW2, db2, dW3, db3, alpha):

    W1 -= alpha * dW1
    b1 -= alpha * db1

    W2 -= alpha * dW2
    b2 -= alpha * db2

    W3 -= alpha * dW3
    b3 -= alpha * db3

    return W1, b1, W2, b2, W3, b3

def gradient_descent(X, Y, epoch, alpha, lambda_=0.0, batch_size=64, beta=0.9, keep_prob=1.0, W1=None, b1=None, W2=None, b2=None, W3=None, b3=None):

    n, m = X.shape
    y_copy = one_hot_encode(Y)
    print(y_copy.shape)
    x_copy = X.copy()
    
    if W1 is None:
        W1, b1, W2, b2, W3, b3 = initialize_weights()

    v_dW1 = np.zeros_like(W1)
    v_db1 = np.zeros_like(b1)
    v_dW2 = np.zeros_like(W2)
    v_db2 = np.zeros_like(b2)
    v_dW3 = np.zeros_like(W3)
    v_db3 = np.zeros_like(b3)
    
    
    start = time.time()

    for i in range(epoch):

        #this tracks the number of correct predictions done by the algorithm
        y_correct = 0

        for j in range(0, m, batch_size):
            x_batch = x_copy[:, j: j+batch_size]
            y_batch = y_copy[:, j: j+batch_size]
            
            Z1, A1, d1, Z2, A2, d2, Z3, A3 = forward_prop(W1, b1, W2, b2, W3, b3, x_batch, keep_prob=keep_prob)            
            dW1, db1, dW2, db2, dW3, db3 = back_prop(Z1, A1, d1, Z2, A2, d2, Z3, A3, W1, W2, W3, x_batch, y_batch, lambda_)    
            

            v_dW1 = beta* v_dW1 + (1-beta)*dW1
            v_db1 = beta* v_db1 + (1-beta)*db1
            v_dW2 = beta* v_dW2 + (1-beta)*dW2
            v_db2 = beta* v_db2 + (1-beta)*db2
            v_dW3 = beta* v_dW3 + (1-beta)*dW3
            v_db3 = beta* v_db3 + (1-beta)*db3
            
            W1, b1, W2, b2, W3, b3 = update_params(W1, b1, W2, b2, W3, b3, v_dW1, v_db1, v_dW2, v_db2, v_dW3, v_db3, alpha)

            
            prediction = get_prediction(A3)
            
            y_correct += np.sum(prediction == np.argmax(y_batch, axis=0, keepdims=True)) 

        if i%30==0 or i==epoch-1:
            print(f"Iteration: {i+1}, took {time.time()-start}s- Accuracy={y_correct/m}")
            start = time.time()

    return W1, b1, W2, b2, W3, b3

        
def make_prediction(W1, b1, W2, b2, W3, b3, X):
    _, _, _, _, _, _, _, A3 = forward_prop(W1, b1, W2, b2, W3, b3, X)
    predictions = get_prediction(A3)
    return predictions
    

In [37]:
W1, b1, W2, b2, W3, b3 = gradient_descent(X_train, Y_train, epoch=750, alpha=0.1, keep_prob=0.8, lambda_=0.0, batch_size=512)# W1=W1,b1=b1,W2=W2,b2=b2,W3=W3,b3=b3)

(10, 39000)
Iteration: 1, took 0.18317389488220215s- Accuracy=0.23553846153846153
Iteration: 31, took 6.621882438659668s- Accuracy=0.773974358974359
Iteration: 61, took 6.655176639556885s- Accuracy=0.8156410256410257
Iteration: 91, took 5.816028833389282s- Accuracy=0.8337692307692308
Iteration: 121, took 6.150511980056763s- Accuracy=0.8460512820512821
Iteration: 151, took 6.219083309173584s- Accuracy=0.8528974358974359
Iteration: 181, took 6.030574321746826s- Accuracy=0.8592051282051282
Iteration: 211, took 6.018198728561401s- Accuracy=0.863051282051282
Iteration: 241, took 6.113745927810669s- Accuracy=0.8668205128205129
Iteration: 271, took 6.247676372528076s- Accuracy=0.8705384615384615
Iteration: 301, took 6.163020610809326s- Accuracy=0.8714615384615385
Iteration: 331, took 6.2688422203063965s- Accuracy=0.8761025641025642
Iteration: 361, took 6.829492807388306s- Accuracy=0.8745384615384615
Iteration: 391, took 6.800660610198975s- Accuracy=0.8764358974358974
Iteration: 421, took 6.56

In [38]:
test_predictions = make_prediction(W1, b1, W2, b2, W3, b3, X_train)
cv_predictions = make_prediction(W1, b1, W2, b2, W3, b3, X_cv)


print(f"Test accuracy: {accuracy(test_predictions, Y_train)}")
print(f"Test accuracy: {accuracy(cv_predictions, Y_cv)}")
# accuracy(cv_predictions, Y_cv)

Test accuracy: 0.9523076923076923
Test accuracy: 0.936


In [63]:
weights = {
    'details':{
        'learning_rate':0.1,
        'lambda': 0.1,
        'batch_size': 64,
        'epoch':500,
        'beta':0.9
    },
    'W1':W1.tolist(),
    'b1':b1.tolist(),
    'W2':W2.tolist(),
    'b2':b2.tolist(),
    'W3':W3.tolist(),
    'b3':b3.tolist()
}
import json
with open('./weights_v2.2.json', 'w+') as f:
    json.dump(weights, f)

In [64]:

test_data = pd.read_csv('./dataset/test.csv')
test_data = np.array(test_data)
test_data = test_data.T / 255
predictions = make_prediction(W1, b1, W2, b2, W3, b3, test_data)
predictions = pd.DataFrame({'label':predictions.reshape((-1))})
predictions.head()

Unnamed: 0,label
0,2
1,0
2,9
3,9
4,3


In [65]:
predictions.to_csv("kaggle_submission_v2.csv")