In [1]:
# Authenticate the current user through google.colab's auth helper and
# print a confirmation once the call returns.
# NOTE(review): nothing else visible in this notebook uses the resulting
# credentials (the dataset is read from a local /content path) — confirm
# whether this cell is still required.
from google.colab import auth
auth.authenticate_user()

print('authenticated')

authenticated


In [21]:
import numpy as np
import random
import time
import pickle
import gzip
import json
import sys

## How to load the dataset
Download the [MNIST dataset](https://drive.google.com/file/d/1l_zwAKQTlZPib4xhVGriDpVqjaXkXZk-/view?usp=drive_link) or use a dataset of your choice *(do not forget to change the number of neurons and the other parameters accordingly)*.

Upload the downloaded gzip file to the Colab file browser so that it is available under `/content`.


> Remember: do not forget to check that the file name used in `load_data()` (in the next cell) matches the file you uploaded.





In [3]:
# load 50,000 dataset from mnist
def load_data(path='/content/mnist.pkl.gz'):
  """Load the pickled dataset splits from a gzip-compressed file.

  Args:
    path: location of the gzip-compressed pickle. Defaults to the Colab
      upload location used by this notebook; pass another path to load a
      different file.

  Returns:
    The tuple ``(training_data, validation_data, test_data)`` exactly as it
    is stored in the pickle.

  Raises:
    OSError: if the file cannot be opened or is not valid gzip data.
    ValueError: if the pickle does not unpack into exactly three items.
  """
  print('loading dataset...')
  # SECURITY NOTE: unpickling can execute arbitrary code, so only load
  # dataset files that come from a trusted source.
  # The ``with`` block closes the file even when unpickling fails.
  with gzip.open(path, 'rb') as f:
    # 'latin1' lets Python 3 read pickles that were written by Python 2.
    training_data, validation_data, test_data = pickle.load(f, encoding='latin1')
  print('loaded')
  return (training_data, validation_data, test_data)

def load_data_wrapper():
  """Load the dataset and reshape it into the form the network consumes.

  Every image becomes a (784, 1) column vector. Training labels are turned
  into 10-dimensional one-hot column vectors via ``vectorized_result``;
  validation and test labels are kept as stored in the dataset.

  Returns:
    ``(training_data, validation_data, test_data)``, each a list of
    ``(input, label)`` pairs.
  """
  raw_train, raw_valid, raw_test = load_data()
  print('reshapping...')

  def as_columns(images):
    # One (784, 1) column vector per image.
    return [np.reshape(image, (784, 1)) for image in images]

  train_inputs = as_columns(raw_train[0])
  train_targets = [vectorized_result(label) for label in raw_train[1]]
  training_data = list(zip(train_inputs, train_targets))

  validation_data = list(zip(as_columns(raw_valid[0]), raw_valid[1]))

  test_data = list(zip(as_columns(raw_test[0]), raw_test[1]))
  print('completed')
  return (training_data, validation_data, test_data)

def vectorized_result(j):
  """Return a (10, 1) column vector that is 1.0 at row ``j`` and 0.0
  everywhere else, i.e. the desired network output for the digit ``j``.
  """
  one_hot = np.zeros(shape=(10, 1))
  one_hot[j] = 1.0
  return one_hot

## Helper Functions

In [25]:
def softmax(z):
  """Column-wise softmax of ``z`` (normalised along axis 0).

  Args:
    z: array of scores; each column (axis 0) is normalised on its own.

  Returns:
    Array of the same shape as ``z`` whose entries along axis 0 sum to 1.
  """
  # Shift by the maximum of each column, not the global maximum: with a
  # global shift, a column whose scores are all far below the largest value
  # in ``z`` underflows to zeros and the division below yields nan.
  shifted = z - np.max(z, axis=0, keepdims=True)
  exp_z = np.exp(shifted)
  return exp_z / np.sum(exp_z, axis=0, keepdims=True)

def sigmoid(z):
  """Logistic function 1 / (1 + e^(-z)), applied element-wise."""
  denominator = 1.0 + np.exp(-z)
  return 1.0 / denominator

def sigmoid_prime(z):
  """Derivative of the sigmoid function, evaluated element-wise at ``z``."""
  s = sigmoid(z)
  return s * (1 - s)

def load(filename):
  """Rebuild a ``Network`` from a JSON file written by ``Network.save``.

  Args:
    filename: path of a JSON file holding the keys ``"sizes"``,
      ``"weights"``, ``"biases"`` and ``"cost"``.

  Returns:
    A ``Network`` with the stored layer sizes, cost class, weights and
    biases.

  Raises:
    OSError: if the file cannot be opened.
    AttributeError: if the stored cost name is not defined in this module.
  """
  # The ``with`` block closes the file even when the JSON is malformed.
  with open(filename, 'r') as f:
    data = json.load(f)

  # The cost is stored by class name; look the class up in this module.
  cost = getattr(sys.modules[__name__], data["cost"])
  net = Network(data["sizes"], cost=cost)
  # Replace the randomly initialised parameters with the stored ones.
  net.weights = [np.array(w) for w in data["weights"]]
  net.biases = [np.array(b) for b in data["biases"]]

  return net


## Cost Calculation Methods

**Cross-entropy** is better than the **quadratic** cost for training because it eliminates the direct multiplication by the sigmoid derivative in the output-layer error. This helps avoid vanishing gradients at the output layer when the network's predictions are very wrong, leading to faster and more stable training.

*When the network is way off, it learns more quickly — and being way off is typically the case at the very start of training.*

**NOTE**: In the cross-entropy cost, `np.nan_to_num` is used to ensure numerical stability. In particular, if both `a` and `y` have a 1.0
in the same slot, then the expression `(1-y)*np.log(1-a)`
evaluates to `0 * -inf`, which is `nan`. `np.nan_to_num` ensures that this is converted
to the correct value (0.0).

In [5]:
  class QuadraticCost(object):
    """Quadratic cost: half the squared Euclidean distance between the
    network output and the desired output."""

    @staticmethod
    def fn(a, y):
      """Return the cost for the output ``a`` and the desired output ``y``."""
      residual = a - y
      return 0.5 * np.linalg.norm(residual) ** 2

    @staticmethod
    def delta(z, a, y):
      """Return the output-layer error: the residual ``a - y`` scaled by
      the sigmoid derivative at the weighted input ``z``."""
      residual = a - y
      return residual * sigmoid_prime(z)

  class CrossEntropyCost(object):
    """Cross-entropy cost for outputs ``a`` and targets ``y``."""

    @staticmethod
    def fn(a, y):
      """Return the cross-entropy summed over all output slots.

      ``np.nan_to_num`` replaces nan terms (e.g. from ``0 * log(0)`` when
      ``a`` and ``y`` are both 1.0 in the same slot) before summing.
      """
      per_slot = -y * np.log(a) - (1 - y) * np.log(1 - a)
      return np.sum(np.nan_to_num(per_slot))

    @staticmethod
    def delta(z, a, y):
      """Return the output-layer error ``a - y``.

      ``z`` is accepted only so the signature matches the other cost class;
      no sigmoid-derivative factor is applied here, which lets the network
      learn faster the more wrong its output is.
      """
      output_error = a - y
      return output_error

## Neural Network class and its methods

In [34]:
class Network(object):
  """Fully connected feed-forward network of sigmoid neurons, trained with
  mini-batch stochastic gradient descent and backpropagation.

  Attributes:
    num_layers: number of layers, input layer included.
    sizes: list with the number of neurons in each layer.
    biases: one (y, 1) array per non-input layer.
    weights: one (y, x) array per pair of consecutive layers.
    cost: cost class exposing static ``fn(a, y)`` and ``delta(z, a, y)``.
  """

  def __init__(self, sizes, cost=CrossEntropyCost):
    """Create a network with the given layer ``sizes`` (input layer first)
    and initialise its parameters with ``default_weight_init``."""
    self.num_layers = len(sizes)
    self.sizes = sizes
    self.default_weight_init()
    self.cost = cost

  def default_weight_init(self):
    """Initialize each weight using a Gaussian distribution with mean 0
        and standard deviation 1 over the square root of the number of
        weights connecting to the same neuron. Biases are drawn from a
        Gaussian with mean 0 and standard deviation 1."""
    self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
    self.weights = [np.random.randn(y, x)/np.sqrt(x) for x, y in zip(self.sizes[:-1], self.sizes[1:])]

  def large_weight_init(self):
    """Initialize the weights and biases using a Gaussian distribution
        with mean 0 and standard deviation 1
    """
    self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
    self.weights = [np.random.randn(y, x) for x, y in zip(self.sizes[:-1], self.sizes[1:])]

  def feedforward(self, a):
    """Return the output of the network for the input column vector ``a``."""
    for w, b in zip(self.weights, self.biases):
      a = sigmoid(np.dot(w, a) + b)
    return a

  def SGD(self, training_data, epochs, mini_batch_size, eta,
          lmbda = 0.0,
          test_data=None,
          monitor_evaluation_cost=False,
          monitor_evaluation_accuracy=False,
          monitor_training_cost=False,
          monitor_training_accuracy=False):
    """Train the network with mini-batch stochastic gradient descent.

    Args:
      training_data: list of ``(x, y)`` pairs with one-hot ``y``. The list
        is shuffled IN PLACE at the start of every epoch.
      epochs: number of passes over the training data.
      mini_batch_size: number of examples per gradient step (>= 1).
      eta: learning rate.
      lmbda: L2 regularisation strength.
      test_data: list of ``(x, label)`` pairs used for the evaluation
        monitors; required when either evaluation monitor is enabled.
      monitor_*: enable the per-epoch measurement and printout.

    Returns:
      ``(evaluation_cost, evaluation_accuracy, training_cost,
      training_accuracy)``: four lists with one entry per epoch for every
      enabled monitor (empty for disabled ones).

    Raises:
      ValueError: if ``mini_batch_size`` is smaller than 1, or evaluation
        monitoring is requested without ``test_data``.
    """
    # Fail before training starts instead of after the first epoch.
    if mini_batch_size < 1:
      raise ValueError("mini_batch_size must be at least 1")
    if (monitor_evaluation_cost or monitor_evaluation_accuracy) and not test_data:
      raise ValueError("test_data is required when evaluation monitoring is enabled")

    print("training...")
    n = len(training_data)
    n_test = len(test_data) if test_data else 0
    evaluation_cost, evaluation_accuracy = [], []
    training_cost, training_accuracy = [], []

    for j in range(epochs):
      random.shuffle(training_data)
      mini_batches = [training_data[k:k+mini_batch_size] for k in range(0, n, mini_batch_size)]

      for mini_batch in mini_batches:
        self.update_mini_batch(mini_batch, eta, lmbda, n)

      print("Epoch %s training complete" % j)

      if monitor_training_cost:
        cost = self.total_cost(training_data, lmbda)
        training_cost.append(cost)
        print("Cost on training data: {}".format(cost))
      if monitor_training_accuracy:
        # Training labels are one-hot vectors, hence convert=True.
        accuracy = self.accuracy(training_data, convert=True)
        training_accuracy.append(accuracy)
        print("Accuracy on training data: {} / {} = {}".format(accuracy, n, accuracy/n))
      if monitor_evaluation_cost:
        # Evaluation labels are plain digits, hence convert=True.
        cost = self.total_cost(test_data, lmbda, convert=True)
        evaluation_cost.append(cost)
        print("Cost on evaluation data: {}".format(cost))
      if monitor_evaluation_accuracy:
        # Evaluate once and reuse the count for both the list and the print.
        accuracy = self.accuracy(test_data)
        evaluation_accuracy.append(accuracy)
        print("Accuracy on evaluation data: {} / {} = {}".format(accuracy, n_test, accuracy/n_test))

    return evaluation_cost, evaluation_accuracy, training_cost, training_accuracy

  def update_mini_batch(self, mini_batch, eta, lmbda, n):
    """Update the network's weights and biases by applying gradient
        descent using backpropagation to a single mini batch.

        ``eta`` is the learning rate, ``lmbda`` the L2 regularisation
        strength and ``n`` the total size of the training set."""
    nabla_b = [np.zeros(b.shape) for b in self.biases]
    nabla_w = [np.zeros(w.shape) for w in self.weights]

    # Accumulate the per-example gradients over the mini batch.
    for x, y in mini_batch:
      delta_nabla_b, delta_nabla_w = self.backprop(x, y)

      nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
      nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]

    # Weights get L2 weight decay (the (1 - eta*lmbda/n) factor) on top of
    # the averaged gradient step; biases are not regularised.
    self.weights = [(1-eta*(lmbda/n))*w-(eta/len(mini_batch))*nw for w, nw in zip(self.weights, nabla_w)]
    self.biases = [b-(eta/len(mini_batch))*nb for b, nb in zip(self.biases, nabla_b)]

  def backprop(self, x, y):
    """Return a tuple ``(nabla_b, nabla_w)`` representing the
        gradient for the cost function C_x. Both are layer-by-layer
        lists shaped like ``self.biases`` and ``self.weights``."""
    nabla_b = [np.zeros(b.shape) for b in self.biases]
    nabla_w = [np.zeros(w.shape) for w in self.weights]

    # Forward pass: remember every weighted input z and every activation.
    activation = x
    activations = [x]
    zs = []
    for b, w in zip(self.biases, self.weights):
      z = np.dot(w, activation) + b
      activation = sigmoid(z)
      activations.append(activation)
      zs.append(z)

    # Backward pass: the output-layer error comes from the cost class.
    delta = (self.cost).delta(zs[-1], activations[-1], y)
    nabla_b[-1] = delta
    nabla_w[-1] = np.dot(delta, activations[-2].transpose())

    # l counts layers from the end: l = 2 is the second-to-last layer.
    for l in range(2, self.num_layers):
      delta = np.dot(self.weights[-l+1].transpose(), delta)*sigmoid_prime(zs[-l])
      nabla_b[-l] = delta
      nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
    return (nabla_b, nabla_w)

  def evaluate(self, test_data):
    """Return the number of ``(x, label)`` pairs in ``test_data`` that the
        network classifies correctly (same as ``accuracy(test_data)``)."""
    return self.accuracy(test_data)

  def accuracy(self, data, convert=False):
    """Return the number of inputs in ``data`` for which the neural
        network outputs the correct result. The predicted class is the
        index of the largest output activation. Set ``convert`` to True
        when the labels are one-hot vectors (they are reduced with argmax),
        and leave it False when the labels are plain class indices."""
    if convert:
      results = [(np.argmax(self.feedforward(x)), np.argmax(y)) for (x, y) in data]
    else:
      results = [(np.argmax(self.feedforward(x)), y) for (x, y) in data]
    return sum(int(x == y) for (x, y) in results)

  def total_cost(self, data, lmbda, convert=False):
    """Return the average cost over ``data`` plus the L2 penalty
        ``0.5*(lmbda/len(data))*sum(||w||^2)``. The flag
        ``convert`` should be set to False if the data set is the
        training data (the usual case), and to True if the data set is
        the validation or test data. Returns 0.0 for empty ``data``.
        """
    n = len(data)
    if n == 0:
      return 0.0
    cost = 0.0
    for x, y in data:
      a = self.feedforward(x)
      if convert: y = vectorized_result(y)
      cost += self.cost.fn(a, y)/n
    # The regularisation term is added once, after the loop; adding it per
    # example would count the penalty len(data) times.
    cost += 0.5*(lmbda/n)*sum(np.linalg.norm(w)**2 for w in self.weights)
    return cost

  def save(self, filename):
    """Write the layer sizes, weights, biases and cost class name to
        ``filename`` as JSON."""
    data = {"sizes": self.sizes,
            "weights": [w.tolist() for w in self.weights],
            "biases": [b.tolist() for b in self.biases],
            "cost": str(self.cost.__name__)}
    # The ``with`` block closes the file even if serialisation fails.
    with open(filename, "w") as f:
      json.dump(data, f)





## Main function
You can use the input boxes and sliders below to change the parameters.

In [43]:
# Entry point of the notebook: load the data, build a network from the form
# values below, train it with SGD and optionally save it as JSON.
# The trailing `@param` comments are Colab form annotations; keep each one on
# the same line as its assignment.
if(__name__ == "__main__"):
  training_data, validation_data, test_data = load_data_wrapper()

  # Data set that is handed to SGD as `test_data` for the evaluation monitors.
  testing_data = test_data # @param ["test_data", "validation_data"] {type: "raw"}

  Cost = CrossEntropyCost #@param ["QuadraticCost", "CrossEntropyCost"] {type:"raw"}
  # Despite the name, this string lists EVERY layer size (input and output
  # included); it is split on whitespace and passed to Network as `sizes`.
  Hidden_Layer = "784 155 10" # @param {"type":"string"}
  hidden_layer = [int(x) for x in Hidden_Layer.split()]

  net = Network(hidden_layer, cost=Cost)
  # Network.__init__ already calls default_weight_init(); this second call
  # simply draws a fresh set of initial weights and biases.
  net.default_weight_init()

  Epochs = 14 # @param {type: "number"}
  # NOTE(review): the slider's minimum of 0 is not a usable value — SGD
  # raises ValueError for a mini-batch size of 0.
  Mini_Batch_Size = 10 # @param {"type":"slider","min":0,"max":100,"step":1}
  Learning_Rate = 0.5 # @param {"type":"slider","min":0,"max":1,"step":0.01}
  # Passed to SGD as `lmbda` (L2 regularisation strength).
  Regularization = 0.75 # @param {"type":"slider","min":0,"max":1,"step":0.05}

  Monitor_Evaluation_Accuracy = True # @param {type:"boolean"}
  Monitor_Evaluation_Cost = False # @param {type:"boolean"}
  Monitor_Training_Accuracy = False # @param {type:"boolean"}
  Monitor_Training_Cost = False # @param {type:"boolean"}


  # The four per-epoch lists returned by SGD are not kept here; progress is
  # only visible through the printed output.
  net.SGD(training_data=training_data,
          epochs=Epochs,
          mini_batch_size=Mini_Batch_Size,
          eta=Learning_Rate,
          lmbda = Regularization,
          test_data=testing_data,
          monitor_evaluation_accuracy=Monitor_Evaluation_Accuracy,
          monitor_evaluation_cost=Monitor_Evaluation_Cost,
          monitor_training_accuracy=Monitor_Training_Accuracy,
          monitor_training_cost=Monitor_Training_Cost
        )

  Save = True # @param {type:"boolean"}
  if Save:
    # Written relative to the current working directory.
    net.save("net.json")




loading dataset...
loaded
reshapping...
completed
training...
Epoch 0 training complete
Accuracy on evaluation data: 9570 / 10000 = 0.957
Epoch 1 training complete
Accuracy on evaluation data: 9649 / 10000 = 0.9649
Epoch 2 training complete
Accuracy on evaluation data: 9725 / 10000 = 0.9725
Epoch 3 training complete
Accuracy on evaluation data: 9741 / 10000 = 0.9741
Epoch 4 training complete
Accuracy on evaluation data: 9757 / 10000 = 0.9757
Epoch 5 training complete
Accuracy on evaluation data: 9785 / 10000 = 0.9785
Epoch 6 training complete
Accuracy on evaluation data: 9796 / 10000 = 0.9796
Epoch 7 training complete
Accuracy on evaluation data: 9780 / 10000 = 0.978
Epoch 8 training complete
Accuracy on evaluation data: 9789 / 10000 = 0.9789
Epoch 9 training complete
Accuracy on evaluation data: 9788 / 10000 = 0.9788
Epoch 10 training complete
Accuracy on evaluation data: 9797 / 10000 = 0.9797
Epoch 11 training complete
Accuracy on evaluation data: 9799 / 10000 = 0.9799
Epoch 12 train