## An implementation of NN using numpy for MNIST classification

Note: Using ReLU activation function for the first layer might cause a runtime warning when computing the sigmoid function for further layers. This happens when normal initialization is used. It is better to use sigmoid in the first layer.

Update: The previous issue is solved by using He initialization

In [1]:
import numpy as np
from keras.datasets import mnist
import time

(x_train, y_train), (x_val, y_val) = mnist.load_data()
print(x_train.shape) #(60000, 28, 28)
print(x_val.shape) #(10000, 28, 28)

#Flatten the inputs
from keras.utils.np_utils import to_categorical
y_train = to_categorical(y_train)
y_val = to_categorical(y_val)



x_train = np.reshape(x_train,(-1,28*28))
x_val = np.reshape(x_val,(-1,28*28))

#Normalize inputs
x_train = x_train/255.0
x_val = x_val/255.0

print(x_train.shape) #(60000, 784)
print(x_val.shape) #(10000, 784)

2022-06-17 09:11:12.075607: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-17 09:11:12.075638: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


(60000, 28, 28)
(10000, 28, 28)
(60000, 784)
(10000, 784)


In [2]:
class DeepNeuralNetwork():
    def __init__(self, sizes, epochs=10, l_rate=0.045):
        self.sizes = sizes
        self.epochs = epochs
        self.l_rate = l_rate

        # we save all parameters in the neural network in this dictionary
        self.params = self.initialization()

    def sigmoid(self, x, derivative=False):
        if derivative:
            return self.sigmoid(x)*(1-self.sigmoid(x))
        return 1/(1 + np.exp(-x))

    
    def relu(self, x, derivative=False):
        if derivative:
            #temp = np.zeros((x.shape))
            temp = (x > 1)*1
            return temp
        return np.maximum(x,0)
    
    
    def initialization(self):
        # He initialization is used
        input_layer=self.sizes[0]
        hidden_1=self.sizes[1]
        hidden_2=self.sizes[2]
        output_layer=self.sizes[3]

        params = {
            'W1':np.random.randn(hidden_1, input_layer) * np.sqrt(2/self.sizes[0]),
            'W2':np.random.randn(hidden_2, hidden_1) * np.sqrt(2/self.sizes[1]),
            'W3':np.random.randn(output_layer, hidden_2) * np.sqrt(2/self.sizes[2]),
            
            'b1':np.zeros((hidden_1,1)),
            'b2':np.zeros((hidden_2,1)),
            'b3':np.zeros((output_layer,1))
        }
        

        return params

    def forward_pass(self, x_train):
        params = self.params

        # input layer activations becomes sample
        params['A0'] = x_train[:,None]
        # (784,1)
        #print(params['A0'].shape)

        # input layer to hidden layer 1
        params['Z1'] = np.dot(params["W1"], params['A0']) + params['b1']
        ##()
        params['A1'] = self.relu(params['Z1'])

        # hidden layer 1 to hidden layer 2
        params['Z2'] = np.dot(params["W2"], params['A1']) + params['b2']
        params['A2'] = self.relu(params['Z2'])

        # hidden layer 2 to output layer
        params['Z3'] = np.dot(params["W3"], params['A2']) + params['b3']
        params['A3'] = self.sigmoid(params['Z3'])

        return params['A3']

    def backward_pass(self, y_train, output):
        '''
            This is the backpropagation algorithm, for calculating the updates
            of the neural network's parameters.

            Note: There is a stability issue that causes warnings. This is 
                  caused  by the dot and multiply operations on the huge arrays.
                  
                  RuntimeWarning: invalid value encountered in true_divide
                  RuntimeWarning: overflow encountered in exp
                  RuntimeWarning: overflow encountered in square
        '''
        params = self.params
        change_w = {}
        
        #y_train_back = y_train[:,None]
        y_train_back = y_train[:,None]

        d_Z3 = output - y_train_back
 
        d_W3 = np.dot(d_Z3, params['A2'].T)
        d_b3 = d_Z3
                
        change_w['W3'] = d_W3
        change_w['b3'] = d_b3
        #change_w['b3'] = d_b3

        # Calculate W2 update
        d_Z2 = np.dot(params['W3'].T, d_Z3) * self.relu(params['Z2'], derivative=True)
        d_W2 = np.dot(d_Z2, params['A1'].T)
        d_b2 = d_Z2
        
        change_w['W2'] = d_W2
        change_w['b2'] = d_b2
        
        # Calculate W1 update
        d_Z1 = np.dot(params['W2'].T, d_Z2) * self.relu(params['Z1'], derivative=True)
        
        d_W1 = np.dot(d_Z1, params['A0'].T)
        d_b1 = d_Z1
        
        change_w['b1'] = d_b1
        change_w['W1'] = d_W1

        return change_w

    def update_network_parameters(self, changes_to_w):
        '''
            Update network parameters according to update rule from
            Stochastic Gradient Descent.

            θ = θ - η * ∇J(x, y), 
                theta θ:            a network parameter (e.g. a weight w)
                eta η:              the learning rate
                gradient ∇J(x, y):  the gradient of the objective function,
                                    i.e. the change for a specific theta θ
        '''
        
        for key, value in changes_to_w.items():
            self.params[key] -= self.l_rate * value


    def train(self, x_train, y_train, x_val, y_val):
        for iteration in range(self.epochs):
            for x,y in zip(x_train, y_train):
                output = self.forward_pass(x)
                changes_to_w = self.backward_pass(y, output)
                self.update_network_parameters(changes_to_w)
            
            predictions = []
            for x,y in zip(x_val,y_val):
                output = self.forward_pass(x)
                pred = np.argmax(output)
                predictions.append(pred == np.argmax(y))
            
            accuracy = np.mean(predictions)
            print('Epoch: {0},  Accuracy: {1:.2f}%'.format(
                iteration+1, accuracy * 100
            ))

In [3]:
dnn = DeepNeuralNetwork(sizes=[784, 128, 64, 10],epochs=10,l_rate=0.001)
dnn.train(x_train, y_train, x_val, y_val)

Epoch: 1,  Accuracy: 90.06%
Epoch: 2,  Accuracy: 92.86%
Epoch: 3,  Accuracy: 93.76%
Epoch: 4,  Accuracy: 94.79%
Epoch: 5,  Accuracy: 95.28%
Epoch: 6,  Accuracy: 95.58%
Epoch: 7,  Accuracy: 95.85%
Epoch: 8,  Accuracy: 96.22%
Epoch: 9,  Accuracy: 96.28%
Epoch: 10,  Accuracy: 96.47%
