# Assignment - 8

In this assignment, we have to improve the test accuracy of "ml-2hh-mnist-nobias" notebook

Authors - Devesh Surve, Abhishek Jaiswal

## Importing the requisite libraries

In [1]:
import numpy as np
from urllib import request
import gzip
import pickle

## Downloading data

In [2]:
filename = [
    ["training_images","train-images-idx3-ubyte.gz"],
    ["test_images","t10k-images-idx3-ubyte.gz"],
    ["training_labels","train-labels-idx1-ubyte.gz"],
    ["test_labels","t10k-labels-idx1-ubyte.gz"]
]

def download_mnist():
    base_url = "http://yann.lecun.com/exdb/mnist/"
    for name in filename:
        print("Downloading "+name[1]+"...")
        request.urlretrieve(base_url+name[1], name[1])
    print("Download complete.")

def save_mnist():
    mnist = {}
    for name in filename[:2]:
        with gzip.open(name[1], 'rb') as f:
            mnist[name[0]] = np.frombuffer(f.read(), np.uint8, offset=16).reshape(-1,28*28)
    for name in filename[-2:]:
        with gzip.open(name[1], 'rb') as f:
            mnist[name[0]] = np.frombuffer(f.read(), np.uint8, offset=8)
    with open("mnist.pkl", 'wb') as f:
        pickle.dump(mnist,f)
    print("Save complete.")

In [3]:
download_mnist()
save_mnist()

Downloading train-images-idx3-ubyte.gz...
Downloading t10k-images-idx3-ubyte.gz...
Downloading train-labels-idx1-ubyte.gz...
Downloading t10k-labels-idx1-ubyte.gz...
Download complete.
Save complete.


## Loading the training and testing data

In [4]:
def load():
    with open("mnist.pkl",'rb') as f:
        mnist = pickle.load(f)
    return mnist["training_images"], mnist["training_labels"], mnist["test_images"], mnist["test_labels"]

In [5]:
X_train, y_train, X_test, y_test = load()

In [6]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((60000, 784), (60000,), (10000, 784), (10000,))

## Preparing and Normalizing data

In [7]:
def norm(X, x_min, x_max):
    nom = (X - X.min(axis=0)) * (x_max-x_min)
    denom = X.max(axis=0) - X.min(axis=0)
    denom[denom==0] = 1
    return x_min + nom/denom 

In [8]:
def prep_data(X, y):

    X_ = []
    y_ = []

    itr = int(len(y) / batch_size) + 1
    for j in range(1, itr):
        rng = j * batch_size
        X_.append(X[rng - batch_size : rng, :])
        y_.append(y[rng - batch_size : rng])

    X, y = np.array(X_), np.array(y_)
    X = norm(X, 0, 1)

    return X, y

## Best Hyperparameters For Training

In [9]:
eta=0.01
alpha=0.01
n_iter=50
batch_size=100

In [10]:
X_train, y_train = prep_data(X_train, y_train)

In [11]:
X_train.shape, y_train.shape

((600, 100, 784), (600, 100))

Let's one-hot encode our labels so that instead of many categories denoted by integers one through 10, we only have two categories: `0` and `1`, albeit in more dimensions.

## One hot encoding of data

In [12]:
def one_hot_enc(y, num_labels=10):
    one_hot = np.zeros((num_labels, y.shape[0]), dtype=np.float32)

    for i, val in enumerate(y):
        one_hot[val,i] = 1.0

    return one_hot

In [13]:
def one_hot_enc_v2(y, num_labels=10):
    one_hot = np.zeros((y.shape[0], num_labels), dtype=np.float32)

    for i, val in enumerate(y):
        one_hot[i,val] = 1.0

    return one_hot

Let's initialize our weights:

In [14]:
def init_weights(n_input, n_hidden_1, n_hidden_2, n_output, batch_size):
    #w1 = np.random.randn(n_hidden_1, n_input + 1)
    #w2 = np.random.randn(n_hidden_2, n_hidden_1 + 1)
    #w3 = np.random.randn(n_output, n_hidden_2 + 1)
    
    w1 = np.random.randn(n_input, n_hidden_1)
    w2 = np.random.randn(n_hidden_1, n_hidden_2)
    w3 = np.random.randn(n_hidden_2, n_output)
    
    return w1, w2, w3

Binary cross-entropy loss:

In [15]:
def compute_loss(prediction, label):
    term_1 = -1*label * np.log(prediction)
    term_2 = (1 - label) * (np.log(1 - prediction))

    loss = np.sum(term_1 - term_2)
    return loss

Initialize the number of neurons per layer:

In [16]:
n_hidden_1, n_hidden_2, n_output = 100, 100, 10
n_input = len(X_train[0,0,:]) #returns the flattened image size (28*28 = 784)
n_input

784

## Initializing the weights

In [17]:
w1, w2, w3 = init_weights(n_input, n_hidden_1, n_hidden_2, n_output, batch_size)

In [18]:
w1.shape, w2.shape, w3.shape

((784, 100), (100, 100), (100, 10))

Initialize Markovian weight history, losses and accuracy per epoch:

In [19]:
delta_w1_prev = np.zeros(w1.shape)
delta_w2_prev = np.zeros(w2.shape)
delta_w3_prev = np.zeros(w3.shape)

train_losses = []
train_acc = []

This is how we denotate network parameters:

`a1` is the input data, `a2` is the output of hidden layer 1, `a3` is the output of hidden layer 2, `a4` is the output of the final layer.

`z2` is the linear output of hidden layer 1 *before its activation function*, `z3` is the linear output of hidden layer 2 *before its activation function*, `z4` is the linear output of the final layer *before its activation function*.

Here is how we sample our inputs:

In [20]:
print(X_train.shape, y_train.shape)

(600, 100, 784) (600, 100)


Each batch of observations consists of 50 28x828 images sliced as a 784 vector:

In [21]:
for j, (inputs, label) in enumerate(zip(X_train, y_train)):
    print(j, inputs.shape, label.shape, one_hot_enc_v2(label).shape)
    print(j, inputs.transpose().shape, label.shape, one_hot_enc_v2(label).shape)
    print(j, inputs.transpose(1,0).shape, label.shape, one_hot_enc_v2(label).shape)
    break

0 (100, 784) (100,) (100, 10)
0 (784, 100) (100,) (100, 10)
0 (784, 100) (100,) (100, 10)


In [22]:
for j, (inputs, label) in enumerate(zip(X_train, y_train)):
    print(one_hot_enc_v2(label)[0,:])
    if 10 < j: break

[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]


Note that in our previous notebook, each observation was 25-dimensional, and we had 608 of them, so we had an input vector of shape $[608, 25]$. We had a single output neuron and an output of shape $[608, 1]$.

Here, each observation is 784 dimensional, and we have 50 of them, so we have an input vector of shape $[50, 784]$. We have 10 output neurons and an output shape of $[50, 10]$.

In [23]:
w1.shape, w2.shape, w3.shape

((784, 100), (100, 100), (100, 10))

Let's compute the feedforward pass one layer at a time:

In [24]:
for j, (inputs, label) in enumerate(zip(X_train, y_train)):
    a1 = inputs
    print("inputs:", a1.shape)
    
    # outputs of first hidden layer
    z2 = np.matmul(a1, w1)
    a2 = 1/(1 + np.exp(-z2))
    print("layer 1: weights", w1.shape, "mini-batched output:", a2.shape)
    
    # outputs of second hidden layer
    z3 = np.matmul(a2, w2)
    a3 = 1/(1 + np.exp(-z3))
    print("layer 2: weights", w2. shape, "mini-batched output:", a3.shape)
    
    # outputs of final layer
    z4 = np.matmul(a3, w3)
    a4 = 1/(1 + np.exp(-z4))
    print("layer 2: weights", w3.shape, "mini-batched output:", a4.shape)
    
    break

inputs: (100, 784)
layer 1: weights (784, 100) mini-batched output: (100, 100)
layer 2: weights (100, 100) mini-batched output: (100, 100)
layer 2: weights (100, 10) mini-batched output: (100, 10)


Here is the full code for the **feedforward pass** (we return input and all outputs):

In [25]:
def compute_forward_pass(inputs):   
    a1 = inputs

    # outputs of first hidden layer
    z2 = np.matmul(a1, w1)
    a2 = 1/(1 + np.exp(-z2))

    # outputs of second hidden layer
    z3 = np.matmul(a2, w2)
    a3 = 1/(1 + np.exp(-z3))

    # outputs of final layer
    z4 = np.matmul(a3, w3)
    a4 = 1/(1 + np.exp(-z4))
    
    return a1, z2, a2, z3, a3, z4, a4

Here's our activation and its derivative:

In [26]:
def sigmoid(x):
    try:
        x = np.vectorize(round)(x)
        return 1.0/(1+ np.exp(-x))
    except:
        print(x)
        print(np.exp(-x))
        return 1.0/(1+ np.exp(-x))

In [27]:
def sigmoid_derivative(x):
    return sigmoid(x) * (1.0 - sigmoid(x))

Here's the backpropagation pass:

In [28]:
def compute_backward_pass_fullmath(outputs, label, verbose=False):
    a1, z2, a2, z3, a3, z4, a4 = outputs

    #
    # LAST LAYER
    #
    
    # from ml-dense-pruned:
    # sigmoid_derivative_3 = sigmoid_derivative(np.dot(self.layer2, self.weights3)) # s'(W3 z) derivative of final layer output
    # d_weights3 = np.dot(self.layer2.T, 
    #                        (2*(self.y - self.output) * sigmoid_derivative_3))
    sigmoid_derivative_3 = sigmoid_derivative(np.dot(a3, w3)) # s'(W3 z) derivative of final layer output [50, 10]
    if verbose: print("sigmoid_derivative_3:", sigmoid_derivative_3.shape, "should be [50, 10]")
    d_weights3 = np.dot(a3.T, 
                           (2 * (label - a4) * sigmoid_derivative_3)) # [50, 100].T * [50, 10]
    if verbose: print("d_weights3:", d_weights3.shape, "should be [100, 10]")
    
    
    # 
    # 2ND HIDDEN LAYER
    #

    # from ml-dense-pruned:
    # sigmoid_derivative_2 = sigmoid_derivative(np.dot(self.layer1, self.weights2)) # s'(r W2) derivative of hidden layer 2 out
    # d_weights2 = np.dot(self.layer1.T,  
    #                        np.dot(2*(self.y - self.output) * sigmoid_derivative_3, self.weights3.T) * 
    #                        sigmoid_derivative_2)
    sigmoid_derivative_2 = sigmoid_derivative(np.dot(a2, w2)) # s'(r W2) derivative of hidden layer 2 out
    if verbose: print("sigmoid_derivative_2:", sigmoid_derivative_2.shape, "should be [50, 100]")
    d_weights2 = np.dot(a2.T,  
                        np.dot((2 * (label - a4) * sigmoid_derivative_3), w3.T) * 
                        sigmoid_derivative_2) # [50, 100].T * ([50, 10] * [100, 10].T)[50, 100] = [50, 100].T * [50, 100] 
                                                                                                # = [100, 100]
    if verbose: print("d_weights2:", d_weights2.shape, "should be [100, 100]")
    
    
    # 
    # 1ST HIDDEN LAYER
    #
    
    # from ml-dense-pruned:
    # sigmoid_derivative_1 = sigmoid_derivative(np.dot(self.input, self.weights1))  # s'(x W1) derivative of hidden layer 1 out
    # d_weights1 = np.dot(self.input.T,  
    #                        np.dot(
    #                            np.dot(2*(self.y - self.output) * sigmoid_derivative_3, self.weights3.T) *
    #                                sigmoid_derivative_2, self.weights2.T) *   
    #                        sigmoid_derivative_1)
    sigmoid_derivative_1 = sigmoid_derivative(np.dot(a1, w1))  # s'(x W1) derivative of hidden layer 1 out [50, 100]
    if verbose: print("sigmoid_derivative_1:", sigmoid_derivative_1.shape, "should be [50, 100]")
    d_weights1 = np.dot(a1.T,  
                           np.dot(
                               np.dot((2 * (label - a4) * sigmoid_derivative_3), w3.T) *
                                   sigmoid_derivative_2, w2.T) *   
                           sigmoid_derivative_1) # [50, 784].T * (([50, 100] * [100, 100].T) . [50, 100]) * [100, 100].T)
                                                # = [50, 784].T * [50, 100] * [100, 100].T
                                                # = [50, 784].T * [50, 100].T
                                                # = [784, 100]
    if verbose: print("d_weights1:", d_weights1.shape, "should be [784, 100]")

    return d_weights1, d_weights2, d_weights3

Here's how we predict from the output of 10 neurons, representing the digit predicted in a one-hot encoded fashion:

In [29]:
def predict(a4):
    prediction = np.argmax(a4, axis=1)
    return prediction

In [30]:
for j, (inputs, label) in enumerate(zip(X_train, y_train)):
    print(np.any(inputs))
    break

True


Let's debug *one batch* of one epoch:

In [31]:
for i in range(n_iter):
    for (input, label) in zip(X_train, y_train):
        one_hot_label = one_hot_enc_v2(label, num_labels=10)

        a1, z2, a2, z3, a3, z4, a4 = compute_forward_pass(input)
        print("feedforward outputs:", a1.shape, a2.shape, a3.shape, a4.shape)
        
        loss = compute_loss(a4, one_hot_label)
        print("loss", loss)
        
        grad1, grad2, grad3 = compute_backward_pass_fullmath([a1, z2, a2, z3, a3, z4, a4], one_hot_label, verbose=True)
        #print("backpropagation gradient updates:", grad1, grad2, grad3)
        
        break
    break

feedforward outputs: (100, 784) (100, 100) (100, 100) (100, 10)
loss 1564.6864476153412
sigmoid_derivative_3: (100, 10) should be [50, 10]
d_weights3: (100, 10) should be [100, 10]
sigmoid_derivative_2: (100, 100) should be [50, 100]
d_weights2: (100, 100) should be [100, 100]
sigmoid_derivative_1: (100, 100) should be [50, 100]
d_weights1: (784, 100) should be [784, 100]


So far so good. Here is our mini-batch training, for 50 epochs:

## Commenting the Training code since File already pickled 
Note: Feel free to review the code and re-run. It took approximately 30 minutes for both of us in our laptop to run below cell!! 

In [33]:
# from tqdm import tqdm
# #epoch loop
# for i in tqdm(range(n_iter)):
#     # batch loop
#     for (input, label) in zip(X_train, y_train):
#         one_hot_label = one_hot_enc_v2(label, num_labels=10)

#         a1, z2, a2, z3, a3, z4, a4 = compute_forward_pass(input)
#         loss = compute_loss(a4, one_hot_label)
#         grad1, grad2, grad3 = compute_backward_pass_fullmath([a1, z2, a2, z3, a3, z4, a4], one_hot_label)

#         # multiplicative learning factor
#         delta_w1, delta_w2, delta_w3 = eta * grad1, eta * grad2, eta * grad3

#         # additive learning factor
#         w1 = w1 + delta_w1 + delta_w1_prev * alpha
#         w2 = w2 + delta_w2 + delta_w2_prev * alpha
#         w3 = w3 + delta_w3 + delta_w3_prev * alpha

#         delta_w1_prev, delta_w2_prev, delta_w3_prev = delta_w1, delta_w2, delta_w3

#         train_losses.append(loss)
#         predictions = predict(a4)

#         wrong = np.where(predictions != label, np.matrix([1.]), np.matrix([0.]))
#         accuracy = 1 - (np.sum(wrong) / batch_size)
#         train_acc.append(accuracy)

#     # epoch loss and accuracy (mean of al batches)
#     print('epoch ', i, 'loss %.2f' % np.mean(np.matrix(train_losses)).item(), 
#           'training accuracy %.2f' % np.mean(np.matrix(train_acc)).item())

  2%|▏         | 1/50 [00:14<11:45, 14.39s/it]

epoch  0 loss 351.04 training accuracy 0.53


  4%|▍         | 2/50 [00:28<11:25, 14.29s/it]

epoch  1 loss 247.25 training accuracy 0.66


  6%|▌         | 3/50 [00:43<11:15, 14.36s/it]

epoch  2 loss 193.37 training accuracy 0.73


  8%|▊         | 4/50 [00:57<11:00, 14.36s/it]

epoch  3 loss 163.35 training accuracy 0.77


 10%|█         | 5/50 [01:11<10:45, 14.34s/it]

epoch  4 loss 143.85 training accuracy 0.80


 12%|█▏        | 6/50 [01:25<10:29, 14.30s/it]

epoch  5 loss 129.96 training accuracy 0.82


 14%|█▍        | 7/50 [01:39<10:09, 14.18s/it]

epoch  6 loss 119.45 training accuracy 0.83


 16%|█▌        | 8/50 [01:54<09:55, 14.18s/it]

epoch  7 loss 111.16 training accuracy 0.84


 18%|█▊        | 9/50 [02:08<09:42, 14.21s/it]

epoch  8 loss 104.42 training accuracy 0.85


 20%|██        | 10/50 [02:22<09:30, 14.26s/it]

epoch  9 loss 98.79 training accuracy 0.86


 22%|██▏       | 11/50 [02:37<09:19, 14.35s/it]

epoch  10 loss 94.00 training accuracy 0.87


 24%|██▍       | 12/50 [02:51<09:05, 14.35s/it]

epoch  11 loss 89.86 training accuracy 0.87


 26%|██▌       | 13/50 [03:06<08:53, 14.41s/it]

epoch  12 loss 86.23 training accuracy 0.88


 28%|██▊       | 14/50 [03:20<08:40, 14.46s/it]

epoch  13 loss 83.03 training accuracy 0.88


 30%|███       | 15/50 [03:35<08:28, 14.52s/it]

epoch  14 loss 80.16 training accuracy 0.89


 32%|███▏      | 16/50 [03:50<08:17, 14.64s/it]

epoch  15 loss 77.58 training accuracy 0.89


 34%|███▍      | 17/50 [04:05<08:05, 14.71s/it]

epoch  16 loss 75.25 training accuracy 0.90


 36%|███▌      | 18/50 [04:19<07:49, 14.66s/it]

epoch  17 loss 73.12 training accuracy 0.90


 38%|███▊      | 19/50 [04:33<07:29, 14.49s/it]

epoch  18 loss 71.16 training accuracy 0.90


 40%|████      | 20/50 [04:47<07:09, 14.33s/it]

epoch  19 loss 69.36 training accuracy 0.90


 42%|████▏     | 21/50 [05:01<06:53, 14.26s/it]

epoch  20 loss 67.70 training accuracy 0.91


 44%|████▍     | 22/50 [05:16<06:38, 14.23s/it]

epoch  21 loss 66.15 training accuracy 0.91


 46%|████▌     | 23/50 [05:30<06:25, 14.27s/it]

epoch  22 loss 64.71 training accuracy 0.91


 48%|████▊     | 24/50 [05:44<06:13, 14.36s/it]

epoch  23 loss 63.36 training accuracy 0.91


 50%|█████     | 25/50 [05:59<05:57, 14.32s/it]

epoch  24 loss 62.10 training accuracy 0.92


 52%|█████▏    | 26/50 [06:13<05:41, 14.21s/it]

epoch  25 loss 60.91 training accuracy 0.92


 54%|█████▍    | 27/50 [06:27<05:26, 14.21s/it]

epoch  26 loss 59.80 training accuracy 0.92


 56%|█████▌    | 28/50 [06:42<05:15, 14.36s/it]

epoch  27 loss 58.74 training accuracy 0.92


 58%|█████▊    | 29/50 [06:57<05:06, 14.61s/it]

epoch  28 loss 57.74 training accuracy 0.92


 60%|██████    | 30/50 [07:12<04:53, 14.69s/it]

epoch  29 loss 56.79 training accuracy 0.92


 62%|██████▏   | 31/50 [07:26<04:37, 14.62s/it]

epoch  30 loss 55.90 training accuracy 0.93


 64%|██████▍   | 32/50 [07:40<04:20, 14.46s/it]

epoch  31 loss 55.04 training accuracy 0.93


 66%|██████▌   | 33/50 [07:54<04:04, 14.38s/it]

epoch  32 loss 54.22 training accuracy 0.93


 68%|██████▊   | 34/50 [08:09<03:50, 14.40s/it]

epoch  33 loss 53.45 training accuracy 0.93


 70%|███████   | 35/50 [08:23<03:36, 14.41s/it]

epoch  34 loss 52.70 training accuracy 0.93


 72%|███████▏  | 36/50 [08:38<03:23, 14.51s/it]

epoch  35 loss 51.99 training accuracy 0.93


 74%|███████▍  | 37/50 [08:52<03:08, 14.51s/it]

epoch  36 loss 51.31 training accuracy 0.93


 76%|███████▌  | 38/50 [09:07<02:53, 14.45s/it]

epoch  37 loss 50.66 training accuracy 0.93


 78%|███████▊  | 39/50 [09:21<02:37, 14.29s/it]

epoch  38 loss 50.03 training accuracy 0.93


 80%|████████  | 40/50 [09:34<02:21, 14.12s/it]

epoch  39 loss 49.43 training accuracy 0.94


 82%|████████▏ | 41/50 [09:48<02:06, 14.01s/it]

epoch  40 loss 48.85 training accuracy 0.94


 84%|████████▍ | 42/50 [10:02<01:51, 13.99s/it]

epoch  41 loss 48.30 training accuracy 0.94


 86%|████████▌ | 43/50 [10:16<01:38, 14.02s/it]

epoch  42 loss 47.76 training accuracy 0.94


 88%|████████▊ | 44/50 [10:30<01:24, 14.03s/it]

epoch  43 loss 47.24 training accuracy 0.94


 90%|█████████ | 45/50 [10:44<01:10, 14.02s/it]

epoch  44 loss 46.74 training accuracy 0.94


 92%|█████████▏| 46/50 [10:58<00:55, 13.94s/it]

epoch  45 loss 46.26 training accuracy 0.94


 94%|█████████▍| 47/50 [11:12<00:41, 13.91s/it]

epoch  46 loss 45.79 training accuracy 0.94


 96%|█████████▌| 48/50 [11:26<00:27, 13.96s/it]

epoch  47 loss 45.34 training accuracy 0.94


 98%|█████████▊| 49/50 [11:40<00:13, 13.98s/it]

epoch  48 loss 44.90 training accuracy 0.94


100%|██████████| 50/50 [11:54<00:00, 14.30s/it]

epoch  49 loss 44.48 training accuracy 0.94





The loss keeps on *decreasing* gradually whereas the accuracy starts converging at 90%, so that is encouraging. We can also play with the learning rates!

However, we have achieved the desired goal of the Assignment to improve the accuracy. And we achieved the training accuracy of 94%

Verifying with the test set:

## Pickling and saving the Model file

In [34]:
import pickle

weights = {"w1": w1, "w2": w2, "w3": w3}

# pickle and store the object
# with open("model_obj.pickle", "wb") as f:
#     pickle.dump(weights, f)

# load the object from the pickle file
with open("model_obj.pickle", "rb") as f:
    loaded_obj = pickle.load(f)

# test the loaded object
print(loaded_obj)  # output: My name is Alice and I'm 25 years old.

{'w1': array([[-0.14997095,  0.31483887, -0.48954779, ..., -0.78872145,
         1.36996464,  0.67208106],
       [ 0.35400282, -1.21971218,  0.61564137, ..., -0.1918318 ,
         1.04680048,  0.63559922],
       [-0.5203689 ,  0.1056613 ,  0.00904234, ..., -1.38069351,
         0.89219492,  0.43634337],
       ...,
       [ 0.03061318, -1.83391292,  2.22401033, ..., -0.49705636,
        -0.82273292,  1.22012058],
       [-0.63198001,  0.46891891, -0.15704594, ...,  0.69150619,
         0.95007154,  0.67360055],
       [ 0.66011863,  1.05376945,  0.57085366, ...,  0.14311742,
        -1.32773597,  0.34461406]]), 'w2': array([[ 0.74450077, -1.50887922, -0.33335477, ..., -0.61732294,
         0.15822524,  0.06544368],
       [-1.49388238,  1.24457944, -1.38653688, ...,  1.05926415,
         1.21376151,  0.05485352],
       [ 0.56610785, -0.01433834, -1.8300559 , ...,  0.75113131,
         1.59148536,  0.99241089],
       ...,
       [ 0.57854666, -1.3528458 ,  0.35848809, ...,  0.141121

In [35]:
w1, w2, w3 = loaded_obj["w1"], loaded_obj["w2"], loaded_obj["w3"]

In [36]:
y_test_one_hot = np.zeros((y_test.shape[0], 10), dtype=np.float32)

for i, val in enumerate(y_test):
    y_test_one_hot[i, val] = 1.0
    
print(y_test_one_hot)

[[0. 0. 0. ... 1. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [37]:
X_test.shape, y_test.shape, y_test_one_hot.shape

((10000, 784), (10000,), (10000, 10))

To predict the test dataset, we run one feedforward pass with a batch of size 10,000!

In [38]:
a1, z2, a2, z3, a3, z4, a4 = compute_forward_pass(X_test)
loss = compute_loss(a4, y_test_one_hot)
predictions = predict(a4)
predictions

  a2 = 1/(1 + np.exp(-z2))


array([7, 2, 1, ..., 4, 5, 6], dtype=int64)

In [39]:
labels = predict(y_test_one_hot)
labels

array([7, 2, 1, ..., 4, 5, 6], dtype=int64)

In [40]:
def predict(a4):
    prediction = np.argmax(a4, axis=1)
    return prediction

In [41]:
a1, z2, a2, z3, a3, z4, a4 = compute_forward_pass(X_test[:,:])
loss = compute_loss(a4, y_test_one_hot[:, :])
predictions = predict(a4)
loss, predictions

  a2 = 1/(1 + np.exp(-z2))


(4367.251842708593, array([7, 2, 1, ..., 4, 5, 6], dtype=int64))

In [42]:
y_train.shape

(600, 100)

## Training and Testing accuracy

In [43]:
def compute_train_accuracy():
    X_train, y_train, X_test, y_test = load()
    y_train_one_hot = np.zeros((y_train.shape[0], 10), dtype=np.float32)

    for i, val in enumerate(y_train):
        y_train_one_hot[i, val] = 1.0
        
    a1, z2, a2, z3, a3, z4, a5 = compute_forward_pass(X_train)

    s1 = predict(y_train_one_hot[:, :])
    s2 = predict(a5)

    right = 0
    for i in range(len(s2)):
        if s1[i] == s2[i]:
            right+=1
    return right/len(s2)

compute_train_accuracy()

  a2 = 1/(1 + np.exp(-z2))


0.9690666666666666

In [44]:
def compute_test_accuracy():
    s1 = predict(y_test_one_hot[:, :])
    s2 = predict(a4)
    right = 0
    for i in range(len(s2)):
        if s1[i] == s2[i]:
            right+=1
    return right/len(s2)

compute_test_accuracy()

0.9428

## Conclusion

- We learned how to train and build a neural network model from scratch to classify MNIST dataset.
- We achieved a very good training accuracy of 96% and test accuracy of 94%