# Digit Recognition NN

In [1]:
from sklearn import datasets,metrics
import numpy as np
digits = datasets.load_digits()

In [2]:
from sklearn import model_selection as ms

trainx, testx, trainy, testy = ms.train_test_split(
        digits.images.reshape((len(digits.images), -1)),
        digits.target,
        train_size = 0.8,
        random_state=11)




In [3]:
trainx.shape,trainy.shape

((1437, 64), (1437,))

In [4]:
x=trainx
label=trainy
testy = testy

### 1. Digit recognition unvectorized

In [5]:
def sigmoid(z):
    return 1/(1+np.exp(-z))
def dsigmoid(z):
    return sigmoid(z) * (1 - sigmoid(z))

In [6]:
def pred_digit_recog_uv(x,weight,bias):
    input_data = x.T.copy()
    for layer_idx,layer_num in enumerate(layer[:-1]):                  
        data_in = np.matmul(weight[layer_idx],input_data)+bias[layer_idx]
        data_op = sigmoid(data_in)
        input_data = data_op.copy()    
    return data_op

In [7]:
layer=[64,30,10]
weight = []
bias = []

# initialize the weights & biases
for idx,num in enumerate(layer[:-1]):
    weight.append(np.random.normal(size=(layer[idx+1],num))/10)
    bias.append(np.random.normal(size=(layer[idx+1],1))/10)

lr = 0.055
N = len(x)
num_layers = len(layer)

for epochs in range(10):
    
    for data_idx in range(N):
        
        # loop through each input layer
        activation = [x[data_idx]]
        activation_gradient = []
        
        #forward pass
        for idx,layer_num in enumerate(layer[1:]):
            input_vec = np.zeros((layer_num,1))
            for act in range(len(activation[idx])):
                input_vec+=(activation[idx][act]*(weight[idx][:,act])).reshape(-1,1)
            input_vec+=bias[idx]
            activation.append(sigmoid(input_vec).reshape(-1,1))
            activation_gradient.append(dsigmoid(input_vec).reshape(-1,1))
        
        #loss
        y = np.zeros((layer[-1],1))
        y[label[data_idx]] = 1
        loss = (activation[-1]-y)
        
        #backward pass
        #last layer
        delta_prev = loss*activation_gradient[-1]
        
        for idx,layer_num in enumerate(layer[-2::-1]):
            
            if(idx!=len(layer)-2):
                # Do not compute the gradients for the input layer
                delta_curr = np.zeros((layer_num,1))
                for b_idx in range(len(delta_prev)):
                    delta_curr += (delta_prev[b_idx]*weight[-1-idx][b_idx,:]).reshape(-1,1)
                delta_curr*=activation_gradient[-2-idx]
            
            #SGD
            #update weights of layer + 1
            for w_idx in range(weight[-1-idx].shape[1]):
                weight[-1-idx][:,w_idx]-=(lr*activation[-2-idx][w_idx]*delta_prev).flatten()
                bias[-1-idx]-=(lr*delta_prev)
            delta_prev = delta_curr
    res = pred_digit_recog_uv(testx,weight,bias)
    pred = np.argmax(res,axis=0)
    print("Epoch:",epochs+1,"Test acc:",np.round(sum(pred==testy)/len(testy)*100,2))

Epoch: 1 Test acc: 71.94
Epoch: 2 Test acc: 90.0
Epoch: 3 Test acc: 92.5
Epoch: 4 Test acc: 91.67
Epoch: 5 Test acc: 95.0
Epoch: 6 Test acc: 95.56
Epoch: 7 Test acc: 96.11
Epoch: 8 Test acc: 95.56
Epoch: 9 Test acc: 95.83
Epoch: 10 Test acc: 95.83


### 2. Digit recognition paritially vectorized

In [8]:
layer=[64,30,10]
weight = []
bias = []

# initialize the weights & biases
for idx,num in enumerate(layer[:-1]):
    weight.append(np.random.normal(size=(layer[idx+1],num))/10)
    bias.append(np.random.normal(size=(layer[idx+1],1))/10)

lr = 0.05
N = len(x)
num_layers = len(layer)

for epochs in range(10):
    
    for data_idx in range(N):
        
        # loop through each input layer
        activation = [x[data_idx].reshape(-1,1)]
        activation_gradient = []
        
        #forward pass layer-by-layer
        for idx,layer_num in enumerate(layer[1:]):
            input_vec = np.matmul(weight[idx],activation[idx])
            activation.append(sigmoid(input_vec).reshape(-1,1))
            activation_gradient.append(dsigmoid(input_vec).reshape(-1,1))
        
        #loss
        y = np.zeros((layer[-1],1))
        y[label[data_idx]] = 1
        loss = (activation[-1]-y)
        
        #backward pass
        #last layer
        delta_prev = loss*activation_gradient[-1]
        
        for idx,layer_num in enumerate(layer[-2::-1]):
            
            if(idx!=len(layer)-2):
                # Do not compute the gradients for the input layer
                delta_curr = np.matmul(weight[-1-idx].T,delta_prev)
                delta_curr*=activation_gradient[-2-idx]
            
            #SGD
            #update weights of layer + 1 
            weight[-1-idx] -= (lr*np.matmul(delta_prev,activation[-2-idx].T))
            bias[-1-idx]-=(lr*delta_prev)
            delta_prev = delta_curr
    res = pred_digit_recog_uv(testx,weight,bias)
    pred = np.argmax(res,axis=0)
    print("Epoch:",epochs+1,"Test acc:",np.round(sum(pred==testy)/len(testy)*100,2))

Epoch: 1 Test acc: 76.39
Epoch: 2 Test acc: 88.89
Epoch: 3 Test acc: 91.94
Epoch: 4 Test acc: 93.33
Epoch: 5 Test acc: 93.61
Epoch: 6 Test acc: 93.61
Epoch: 7 Test acc: 94.17
Epoch: 8 Test acc: 95.83
Epoch: 9 Test acc: 94.17
Epoch: 10 Test acc: 94.72


### 3. Digit recognition batch wise vectorization

In [9]:
layer=[64,20,10]
weight = []
bias = []
batch_size= 64
desired_test_accuracy = 95
# initialize the weights & biases with Xavier initialization
for idx,num in enumerate(layer[:-1]):
    weight.append(np.random.normal(size=(layer[idx+1],num))*np.sqrt(6/(layer[idx]+layer[idx+1])))
    bias.append(np.zeros(shape=(layer[idx+1],1)))#/layer[0])

lr = 1
N = len(x)
num_layers = len(layer)

for epochs in range(1000):
    start = 0
    while(start<len(x)):
        activation = [x[start:start+batch_size].T]
        activation_gradient = []

        #forward pass layer-by-layer
        for idx,layer_num in enumerate(layer[1:]):
            input_vec = np.matmul(weight[idx],activation[idx])
            activation.append(sigmoid(input_vec))
            activation_gradient.append(dsigmoid(input_vec))

        #loss
        y = np.eye(10)
        y = y[trainy[start:start+batch_size]].T
        loss = (activation[-1]-y)

        #backward pass
        #last layer
        delta_prev = loss*activation_gradient[-1]
        for idx,layer_num in enumerate(layer[-2::-1]):

            if(idx!=len(layer)-2):
                # Do not compute the gradients for the input layer
                delta_curr = np.matmul(weight[-1-idx].T,delta_prev)
                delta_curr*=activation_gradient[-2-idx]

            #SGD
            #update weights of layer + 1 
            weight[-1-idx] -= (lr*np.matmul(delta_prev,activation[-2-idx].T))/batch_size
            bias[-1-idx]-=(lr*delta_prev.sum(axis=1,keepdims=True))/batch_size
            delta_prev = delta_curr        
        
        # update batch indicies
        start+=batch_size
    res = pred_digit_recog_uv(testx,weight,bias)
    pred = np.argmax(res,axis=0)
    acc = np.round(sum(pred==testy)/len(testy)*100,2)
    if epochs%100==0:print("Epoch:",epochs+1,"Test acc:",acc)
    if acc >= desired_test_accuracy:
        print("Stopping training\nDesired Test accuracy {2} \nTest accuracy attained {0} in epoch {1}".format(acc,epochs,desired_test_accuracy))
        break

Epoch: 1 Test acc: 42.5
Stopping training
Desired Test accuracy 95 
Test accuracy attained 95.0 in epoch 50
