In [1]:
import numpy as np
import math
import random

In [2]:
class Network:
    """Fully connected feedforward network of sigmoid neurons.

    The network is trained with mini-batch stochastic gradient descent; the
    per-example gradients are computed by backpropagation.  Activations are
    handled as column vectors: biases are created with shape ``(y, 1)`` and
    weights with shape ``(y, x)`` so that ``np.dot(w, a) + b`` maps the
    activations of one layer onto the next.
    """

    def __init__(self, sizes):
        """Create a network with randomly initialised weights and biases.

        Args:
            sizes: Number of neurons in each layer, input layer first
                (e.g. ``[784, 30, 10]``).  The input layer has no biases.
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        # One bias column per non-input layer, drawn from a standard normal.
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        # weights[i] connects layer i (x neurons) to layer i + 1 (y neurons).
        self.weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]

    def feedforward(self, a):
        """Return the output activations of the network for input ``a``.

        Each layer computes ``sigmoid(w . a + b)`` and feeds the result to the
        next layer.
        """
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a) + b)
        return a

    def SGD(self, training_data, epochs, batch_size, learning_rate,
            test_data=None, decay_every=4, decay_factor=0.5):
        """Train the network with mini-batch stochastic gradient descent.

        Args:
            training_data: Iterable of ``(x, y)`` tuples (input, desired
                output).  It is copied into a list, so one-shot iterators
                are accepted.
            epochs: Number of passes over the training data.
            batch_size: Number of examples per mini-batch; must be positive.
            learning_rate: Step size used from the first epoch on.
            test_data: Optional iterable of ``(x, y)`` tuples.  When given and
                non-empty, the network is evaluated on it after every epoch,
                which slows training down.
            decay_every: Multiply the learning rate by ``decay_factor`` every
                this many epochs.  A falsy value (``0``/``None``) disables
                the decay.
            decay_factor: Factor applied to the learning rate at each decay
                step.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        # Materialise the inputs: they are shuffled / re-read every epoch.
        test_data = list(test_data) if test_data else []
        n_test = len(test_data)
        training_data = list(training_data)
        n = len(training_data)

        for j in range(epochs):
            random.shuffle(training_data)
            mini_batches = [
                training_data[ind:ind + batch_size] for ind in range(0, n, batch_size)
            ]
            # Step decay of the learning rate.  Epoch 0 is skipped on purpose:
            # decaying there would mean the rate requested by the caller is
            # never actually used.
            if decay_every and j > 0 and j % decay_every == 0:
                learning_rate = learning_rate * decay_factor

            for mini_batch in mini_batches:
                # Update the weights and biases from this mini-batch.
                self.updateParameters(mini_batch, learning_rate)

            if test_data:
                print(f"epoch {j} : {self.evaluate(test_data)} / {n_test}")
            else:
                print(f"Epoch {j} completed")

    def updateParameters(self, batch, eta):
        """Apply one gradient-descent step computed from a mini-batch.

        Args:
            batch: Sequence of ``(x, y)`` training examples.
            eta: Learning rate for this step.

        The gradients of all examples are summed and the parameters are moved
        by ``eta / len(batch)`` times that sum, i.e. by the average gradient.
        An empty batch leaves the parameters untouched.
        """
        if len(batch) == 0:
            # Nothing to average over; avoids a division by zero below.
            return

        # nabla_* accumulate the gradient of the cost over the batch.
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        for x, y in batch:
            # Gradient contributed by a single training example.
            delta_b, delta_w = self.backprop(x, y)
            nabla_b = [nb + db for nb, db in zip(nabla_b, delta_b)]
            nabla_w = [nw + dw for nw, dw in zip(nabla_w, delta_w)]

        step = eta / len(batch)
        self.weights = [w - step * nw for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - step * nb for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """Return ``(nabla_b, nabla_w)``, the gradient of the cost for one example.

        ``nabla_b`` and ``nabla_w`` are layer-by-layer lists of numpy arrays
        with the same shapes as ``self.biases`` and ``self.weights``.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        # Forward pass, remembering every weighted input z and activation.
        activation = x
        activations = [x]  # activations of every layer, input included
        zs = []            # weighted inputs of every non-input layer
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)

        # Backward pass: error of the output layer first.
        delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())

        # l counts layers from the end: l = 1 is the last layer, l = 2 the
        # second-last, and so on, which lets negative list indices address
        # the layers directly.
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            # Propagate the error of layer -l+1 back through its weights.
            delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l - 1].transpose())
        return (nabla_b, nabla_w)

    def evaluate(self, test_data):
        """Return the number of test inputs the network classifies correctly.

        The prediction for an input is the index of the output neuron with
        the highest activation (``np.argmax``); it is compared with ``y``
        using ``==``, so ``y`` is expected to be a class index here.
        """
        # (predicted index, expected value) for every test example.
        test_results = [(np.argmax(self.feedforward(x)), y) for (x, y) in test_data]
        return sum(int(predicted == expected) for (predicted, expected) in test_results)

    def cost_derivative(self, output_activations, y):
        """Return ``output_activations - y``.

        This is the derivative of the quadratic cost
        ``0.5 * ||output_activations - y||^2`` with respect to the output
        activations.
        """
        return output_activations - y
def sigmoid(z):
    """Logistic sigmoid ``1 / (1 + exp(-z))``, applied element-wise.

    Args:
        z: Scalar or array-like of real numbers.

    Returns:
        The sigmoid of ``z`` with the same shape as ``z`` (a numpy scalar for
        scalar input).

    The naive formula evaluates ``exp(-z)``, which overflows for large
    negative ``z``.  Here both branches are expressed through ``exp(-|z|)``,
    which always lies in (0, 1]:

        z >= 0:  1 / (1 + exp(-z))
        z <  0:  exp(z) / (1 + exp(z))
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with an empty tuple turns a 0-d result back into a numpy
    # scalar and returns n-d arrays unchanged.
    return result[()]
def sigmoid_prime(z):
    """Derivative of the sigmoid at ``z``: ``sigmoid(z) * (1 - sigmoid(z))``."""
    activation = sigmoid(z)
    return activation * (1 - activation)

In [5]:
import mnist_loader

# NOTE: the following changes have been made to the original mnist_loader code:
#   1. it uses pickle instead of cPickle, because of Python 3
#   2. encoding='latin1' was added to pickle.load to support characters
#      outside ASCII
#
# Each split is materialised into a list so that several training cells can
# reuse it: Network.SGD calls list() on its inputs, which would exhaust a
# one-shot iterator after the first run.
# (presumably load_data_wrapper may return such iterators - TODO confirm)
training_data, validation_data, test_data = (
    list(split) for split in mnist_loader.load_data_wrapper()
)

In [None]:
# Layer sizes: 784 input pixels, three hidden layers of 30 neurons each and
# 10 output neurons, one per digit.
net2 = Network([784, 30, 30, 30, 10])
# NOTE: SGD halves the learning rate every 4 epochs, so over 500 epochs the
# step size is halved more than a hundred times and becomes vanishingly small.
net2.SGD(
    training_data,
    epochs=500,
    batch_size=10,
    learning_rate=0.05,
    test_data=test_data,
)

In [17]:
# Layer sizes: 784 input pixels, two hidden layers of 50 neurons each and
# 10 output neurons, one per digit.
net = Network([784, 50, 50, 10])
net.SGD(
    training_data,
    epochs=20,
    batch_size=10,
    learning_rate=5.0,
    test_data=test_data,
)

epoch 0 : 9172 / 10000
epoch 1 : 9306 / 10000
epoch 2 : 9284 / 10000
epoch 3 : 9401 / 10000
epoch 4 : 9430 / 10000
epoch 5 : 9453 / 10000
epoch 6 : 9482 / 10000
epoch 7 : 9537 / 10000
epoch 8 : 9516 / 10000
epoch 9 : 9536 / 10000
epoch 10 : 9541 / 10000
epoch 11 : 9510 / 10000
epoch 12 : 9541 / 10000
epoch 13 : 9481 / 10000
epoch 14 : 9560 / 10000
epoch 15 : 9573 / 10000
epoch 16 : 9546 / 10000
epoch 17 : 9555 / 10000
epoch 18 : 9586 / 10000
epoch 19 : 9590 / 10000


In [None]:
# Layer sizes: 784 input pixels, a single hidden layer of 100 neurons and
# 10 output neurons, one per digit.
net = Network([784, 100, 10])
net.SGD(
    training_data,
    epochs=30,
    batch_size=10,
    learning_rate=2.0,
    test_data=test_data,
)

epoch 0 : 6139 / 10000
epoch 1 : 7343 / 10000
