In [1]:
import json
import random
import sys
import pickle
import gzip
import numpy as np

In [2]:
def load_data(path='/Users/Aarushi/Desktop/neural-networks-and-deep-learning/data/mnist.pkl.gz'):
    """Load the gzipped pickle at ``path`` and return its three datasets.

    Parameters:
        path -- location of the gzip-compressed pickle file. Defaults to the
                absolute path this notebook was originally written against;
                pass a different path to run it on another machine.

    Returns:
        A tuple ``(training_data, validation_data, test_data)`` exactly as
        unpickled from the file.

    Raises:
        Whatever ``gzip.open`` / ``pickle.load`` raise for a missing or
        malformed file, and ``ValueError`` if the pickle does not unpack
        into exactly three items.
    """
    # Python 3's pickle.load needs encoding='latin1' to decode byte strings in
    # a pickle that was written by Python 2 (harmless otherwise); Python 2's
    # pickle.load does not accept that keyword at all.
    load_kwargs = {'encoding': 'latin1'} if sys.version_info[0] >= 3 else {}
    # SECURITY: unpickling can execute arbitrary code -- only point this at a
    # file obtained from a trusted source.
    # The with-block closes the file even if unpickling raises.
    with gzip.open(path, 'rb') as f:
        training_data, validation_data, test_data = pickle.load(f, **load_kwargs)
    return (training_data, validation_data, test_data)

def load_data_wrapper():
    """Return ``(training_data, validation_data, test_data)`` as lists of
    ``(x, y)`` pairs built from the output of ``load_data()``.

    Every input ``x`` is reshaped to a ``(784, 1)`` array. Training labels
    are converted with ``vectorized_result``; validation and test labels are
    passed through unchanged from the loaded data.
    """
    tr_d, va_d, te_d = load_data()
    training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]]
    training_results = [vectorized_result(y) for y in tr_d[1]]
    # list(...) is a no-op copy on Python 2 (zip already returns a list) but
    # is required on Python 3, where zip is a one-shot iterator that supports
    # neither len(), random.shuffle, nor the slicing done by later cells.
    training_data = list(zip(training_inputs, training_results))
    validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]]
    validation_data = list(zip(validation_inputs, va_d[1]))
    test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]]
    test_data = list(zip(test_inputs, te_d[1]))
    return (training_data, validation_data, test_data)

def vectorized_result(j):
    """Return a ``(10, 1)`` array of zeros with a 1.0 at index ``j``."""
    one_hot = np.zeros((10, 1))
    one_hot[j] = 1.0
    return one_hot

In [3]:
class QuadraticCost(object):
    """Quadratic cost, exposed as static ``fn`` / ``delta`` helpers."""

    @staticmethod
    def fn(a, y):
        """Return half the squared norm of ``a - y``."""
        residual = a - y
        return 0.5 * np.linalg.norm(residual) ** 2

    @staticmethod
    def delta(z, a, y):
        """Return ``(a - y)`` multiplied elementwise by ``sigmoid_prime(z)``."""
        residual = a - y
        return residual * sigmoid_prime(z)

In [4]:
class CrossEntropyCost(object):
    """Cross-entropy cost, exposed as static ``fn`` / ``delta`` helpers."""

    @staticmethod
    def fn(a, y):
        """Return the summed cross-entropy between output ``a`` and target ``y``.

        ``np.nan_to_num`` replaces the nan produced by terms such as
        ``0 * log(0)`` before the sum is taken.
        """
        target_term = -y * np.log(a)
        complement_term = (1 - y) * np.log(1 - a)
        return np.sum(np.nan_to_num(target_term - complement_term))

    @staticmethod
    def delta(z, a, y):
        """Return ``a - y``; ``z`` is accepted but not used."""
        output_error = a - y
        return output_error

In [5]:
class Network(object):
    """Feedforward network of sigmoid layers trained with mini-batch
    stochastic gradient descent and L2 weight decay.

    Instance state:
        num_layers -- ``len(sizes)``.
        sizes      -- the list of layer sizes given to ``__init__``.
        biases     -- one ``(y, 1)`` array per layer after the first.
        weights    -- one ``(y, x)`` array per pair of adjacent layers.
        cost       -- object providing ``fn(a, y)`` and ``delta(z, a, y)``.
    """

    def __init__(self, sizes, cost=CrossEntropyCost):
        """Create a network whose layer widths are given by ``sizes``.

        Weights and biases are drawn by ``default_weight_initializer``.
        ``cost`` must provide ``fn(a, y)`` and ``delta(z, a, y)``.
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.default_weight_initializer()
        self.cost = cost

    def default_weight_initializer(self):
        """Draw biases from a standard normal and weights from a standard
        normal divided by the square root of the number of inputs ``x`` to
        the layer.
        """
        self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
        self.weights = [np.random.randn(y, x)/np.sqrt(x)
                        for x, y in zip(self.sizes[:-1], self.sizes[1:])]

    def large_weight_initializer(self):
        """Draw both biases and weights from an unscaled standard normal."""
        self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(self.sizes[:-1], self.sizes[1:])]

    def feedforward(self, a):
        """Return the network output for input ``a``, applying
        ``sigmoid(w.a + b)`` once per layer.
        """
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a)+b)
        return a

    def SGD(self, training_data, epochs, mini_batch_size, eta,
            lmbda = 0.0,
            evaluation_data=None,
            monitor_evaluation_cost=False,
            monitor_evaluation_accuracy=False,
            monitor_training_cost=False,
            monitor_training_accuracy=False):
        """Train the network with mini-batch stochastic gradient descent.

        Parameters:
            training_data   -- sequence of ``(x, y)`` pairs. A list is
                               shuffled in place every epoch; any other
                               iterable is first copied into a list.
            epochs          -- number of passes over ``training_data``.
            mini_batch_size -- number of examples per gradient step.
            eta             -- learning rate.
            lmbda           -- L2 regularization parameter.
            evaluation_data -- optional sequence of ``(x, y)`` pairs used by
                               the two ``monitor_evaluation_*`` flags.
            monitor_*       -- when true, the corresponding quantity is
                               computed, printed and recorded after every
                               epoch.

        Returns:
            ``(evaluation_cost, evaluation_accuracy, training_cost,
            training_accuracy)`` -- four lists holding one entry per epoch
            for each enabled monitor, and empty for disabled ones.
        """
        # Accept one-shot iterables (e.g. Python 3's zip) as well as lists;
        # a list is left as-is so it is still shuffled in place as before.
        if not isinstance(training_data, list):
            training_data = list(training_data)
        if evaluation_data is not None and not isinstance(evaluation_data, list):
            evaluation_data = list(evaluation_data)
        # Always bind n_data so the reporting code below can never hit an
        # unbound local.
        n_data = len(evaluation_data) if evaluation_data else 0
        n = len(training_data)
        evaluation_cost, evaluation_accuracy = [], []
        training_cost, training_accuracy = [], []
        for j in range(epochs):
            random.shuffle(training_data)
            mini_batches = [
                training_data[k:k+mini_batch_size]
                for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta, lmbda, n)
            # Single-argument print(...) behaves identically as a Python 2
            # print statement and as a Python 3 function call.
            print("Epoch %s training complete" % j)
            if monitor_training_cost:
                cost = self.total_cost(training_data, lmbda)
                training_cost.append(cost)
                print("Cost on training data: {}".format(cost))
            if monitor_training_accuracy:
                accuracy = self.accuracy(training_data, convert=True)
                training_accuracy.append(accuracy)
                print("Accuracy on training data: {} / {}".format(
                    accuracy, n))
            if monitor_evaluation_cost:
                cost = self.total_cost(evaluation_data, lmbda, convert=True)
                evaluation_cost.append(cost)
                print("Cost on evaluation data: {}".format(cost))
            if monitor_evaluation_accuracy:
                accuracy = self.accuracy(evaluation_data)
                evaluation_accuracy.append(accuracy)
                # Reuse the value just computed rather than running a second
                # full pass over the evaluation data.
                print("Accuracy on evaluation data: {} / {}".format(
                    accuracy, n_data))
            # print("") emits a blank line on both Python 2 and 3
            # (print() would show "()" on Python 2).
            print("")
        return evaluation_cost, evaluation_accuracy, \
            training_cost, training_accuracy

    def update_mini_batch(self, mini_batch, eta, lmbda, n):
        """Apply one gradient-descent step computed from ``mini_batch``.

        Parameters:
            mini_batch -- non-empty list of ``(x, y)`` pairs.
            eta        -- learning rate.
            lmbda      -- L2 regularization parameter.
            n          -- total size of the training set, used to scale the
                          weight decay.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        # float(...) forces true division: on Python 2 an integer eta or
        # lmbda would otherwise be floor-divided, silently turning the step
        # size or the regularization into 0.
        step = float(eta) / len(mini_batch)
        decay = 1 - eta * (float(lmbda) / n)
        self.weights = [decay*w - step*nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - step*nb
                       for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """Return ``(nabla_b, nabla_w)``, the gradient of the cost for the
        single example ``(x, y)``.

        Both are lists of arrays with the same shapes as ``self.biases`` and
        ``self.weights`` respectively.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # feedforward
        activation = x
        activations = [x] # list to store all the activations, layer by layer
        zs = [] # list to store all the z vectors, layer by layer
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation)+b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # backward pass: start from the output-layer error given by the cost
        delta = (self.cost).delta(zs[-1], activations[-1], y)
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())

        # l counts layers from the end: l = 2 is the second-to-last layer.
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
        return (nabla_b, nabla_w)

    def accuracy(self, data, convert=False):
        """Return how many ``(x, y)`` pairs in ``data`` the network gets right.

        The prediction is the argmax of the network output. With
        ``convert=True`` the label ``y`` is also reduced with ``np.argmax``
        (for vector labels); otherwise ``y`` is compared to the predicted
        index directly.
        """
        if convert:
            results = [(np.argmax(self.feedforward(x)), np.argmax(y))
                       for (x, y) in data]
        else:
            results = [(np.argmax(self.feedforward(x)), y)
                        for (x, y) in data]
        return sum(int(x == y) for (x, y) in results)

    def total_cost(self, data, lmbda, convert=False):
        """Return the mean cost over ``data`` plus the L2 penalty
        ``0.5 * (lmbda / len(data)) * sum(||w||^2)``.

        With ``convert=True`` every label is first passed through
        ``vectorized_result``.
        """
        n_items = len(data)
        cost = 0.0
        for x, y in data:
            a = self.feedforward(x)
            if convert:
                y = vectorized_result(y)
            cost += self.cost.fn(a, y)/n_items
        # float(lmbda) keeps an integer lmbda from being floor-divided to 0
        # on Python 2.
        cost += 0.5*(float(lmbda)/n_items)*sum(
            np.linalg.norm(w)**2 for w in self.weights)
        return cost

    def save(self, filename):
        """Write sizes, weights, biases and the cost class name to
        ``filename`` as JSON.
        """
        data = {"sizes": self.sizes,
                "weights": [w.tolist() for w in self.weights],
                "biases": [b.tolist() for b in self.biases],
                "cost": str(self.cost.__name__)}
        # The with-block closes the file even if json.dump raises.
        with open(filename, "w") as f:
            json.dump(data, f)

def load(filename):
    """Rebuild a ``Network`` from the JSON file at ``filename``.

    The file must contain the keys ``"sizes"``, ``"weights"``, ``"biases"``
    and ``"cost"``. The cost class is looked up by name as an attribute of
    the current module; weights and biases are converted to numpy arrays.

    Returns:
        The reconstructed ``Network`` instance.
    """
    # The with-block closes the file even if json.load raises.
    with open(filename, "r") as f:
        data = json.load(f)
    cost = getattr(sys.modules[__name__], data["cost"])
    net = Network(data["sizes"], cost=cost)
    net.weights = [np.array(w) for w in data["weights"]]
    net.biases = [np.array(b) for b in data["biases"]]
    return net

def vectorized_result(j):
    """Return a ``(10, 1)`` zero array whose ``j``-th entry is set to 1.0."""
    # NOTE(review): this repeats the vectorized_result defined in the
    # data-loading cell; running this cell rebinds the name to an equivalent
    # function.
    column = np.zeros((10, 1))
    column[j] = 1.0
    return column

def sigmoid(z):
    """Return ``1 / (1 + exp(-z))``, applied elementwise to arrays."""
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator

def sigmoid_prime(z):
    """Return the derivative of the sigmoid, ``sigmoid(z) * (1 - sigmoid(z))``."""
    # Evaluate the sigmoid once and reuse it instead of calling it twice;
    # this is called for every layer of every training example.
    s = sigmoid(z)
    return s*(1-s)

In [6]:
# Load the three datasets once; every training cell below reuses them.
training_data, validation_data, test_data = load_data_wrapper()

In [7]:
# 784 inputs, one hidden layer of 30 neurons, 10 outputs; the cost defaults
# to CrossEntropyCost.
net = Network([784, 30, 10])

In [8]:
# 30 epochs, mini-batches of 10, eta = 10.0, lmbda = 5.0. Every keyword after
# lmbda just restates the SGD default, so nothing is monitored and the four
# returned lists are empty.
net.SGD(training_data , 30, 10, 10.0,
            lmbda = 5.0,
            evaluation_data=None,
            monitor_evaluation_cost=False,
            monitor_evaluation_accuracy=False,
            monitor_training_cost=False,
            monitor_training_accuracy=False)

Epoch 0 training complete

Epoch 1 training complete

Epoch 2 training complete

Epoch 3 training complete

Epoch 4 training complete

Epoch 5 training complete

Epoch 6 training complete

Epoch 7 training complete

Epoch 8 training complete

Epoch 9 training complete

Epoch 10 training complete

Epoch 11 training complete

Epoch 12 training complete

Epoch 13 training complete

Epoch 14 training complete

Epoch 15 training complete

Epoch 16 training complete

Epoch 17 training complete

Epoch 18 training complete

Epoch 19 training complete

Epoch 20 training complete

Epoch 21 training complete

Epoch 22 training complete

Epoch 23 training complete

Epoch 24 training complete

Epoch 25 training complete

Epoch 26 training complete

Epoch 27 training complete

Epoch 28 training complete

Epoch 29 training complete



([], [], [], [])

In [10]:
# NOTE(review): `net` is not re-created between the previous SGD cell and this
# one, so this continues training the already-trained network.
# With eta = 10.0, lmbda = 1000.0 and the 50000 training examples reported in
# the output, update_mini_batch scales the weights by
# 1 - eta*lmbda/n = 0.8 on every mini-batch.
net.SGD(training_data , 30, 10, 10.0,
            lmbda = 1000.0,
            evaluation_data= validation_data,
            monitor_evaluation_cost=True,
            monitor_evaluation_accuracy=True,
            monitor_training_cost=True,
            monitor_training_accuracy=True)

Epoch 0 training complete
Cost on training data: 3.88846025502
Accuracy on training data: 5175 / 50000
Cost on evaluation data: 3.88966849256
Accuracy on evaluation data: 1090 / 10000

Epoch 1 training complete
Cost on training data: 3.52679766282
Accuracy on training data: 4506 / 50000
Cost on evaluation data: 3.53748347866
Accuracy on evaluation data: 915 / 10000

Epoch 2 training complete
Cost on training data: 3.74027457473
Accuracy on training data: 4968 / 50000
Cost on evaluation data: 3.7450234795
Accuracy on evaluation data: 990 / 10000

Epoch 3 training complete
Cost on training data: 3.61647424889
Accuracy on training data: 4932 / 50000
Cost on evaluation data: 3.60506032205
Accuracy on evaluation data: 991 / 10000

Epoch 4 training complete
Cost on training data: 3.74775931362
Accuracy on training data: 5175 / 50000
Cost on evaluation data: 3.72689401406
Accuracy on evaluation data: 1090 / 10000

Epoch 5 training complete
Cost on training data: 3.65756283957
Accuracy on trai

([3.8896684925584322,
  3.5374834786639178,
  3.7450234795020045,
  3.6050603220500022,
  3.7268940140618461,
  3.6595718586596653,
  3.463030317397906,
  3.8222742996198376,
  3.5691788928371158,
  3.6129623500522432,
  3.4737627727097942,
  3.5038878470677117,
  3.591549907467531,
  3.8028074201089317,
  3.8208827304672832,
  3.933506943871631,
  3.8891651499625288,
  3.5130795733394251,
  3.4674057303750105,
  3.8738822607328602,
  3.5729073927641961,
  4.4638706716008754,
  3.9555624481188127,
  4.1748371589532898,
  3.8233024923625494,
  4.0238131438285443,
  3.8688640580136417,
  3.6915206935087741,
  3.5688892637639777,
  3.5480754279417295],
 [1090,
  915,
  990,
  991,
  1090,
  990,
  1090,
  983,
  1090,
  990,
  1090,
  1064,
  967,
  1030,
  991,
  983,
  991,
  967,
  1030,
  991,
  990,
  990,
  1009,
  983,
  915,
  967,
  961,
  1030,
  915,
  1064],
 [3.8884602550232508,
  3.5267976628178115,
  3.7402745747343058,
  3.6164742488877581,
  3.7477593136173266,
  3.657562

In [11]:
# No hidden layer: 784 inputs feed the 10 outputs directly.
net = Network([784, 10])

In [12]:
# Same eta and lmbda as the previous run, but only the first 1000 training
# examples, so the weight-decay factor in update_mini_batch becomes
# 1 - eta*lmbda/n = 1 - 10*1000/1000 = -9. Every mini-batch multiplies the
# weights by -9, which is consistent with the inf/nan costs in the output.
net.SGD(training_data[:1000] , 30, 10, 10.0,
            lmbda = 1000.0,
            evaluation_data= validation_data[:100],
            monitor_evaluation_cost=True,
            monitor_evaluation_accuracy=True,
            monitor_training_cost=True,
            monitor_training_accuracy=True)

Epoch 0 training complete


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


Cost on training data: inf
Accuracy on training data: 96 / 1000
Cost on evaluation data: inf
Accuracy on evaluation data: 10 / 100

Epoch 1 training complete
Cost on training data: inf
Accuracy on training data: 96 / 1000
Cost on evaluation data: inf
Accuracy on evaluation data: 10 / 100



  return sqrt(add.reduce((x.conj() * x).real, axis=None))


Epoch 2 training complete
Cost on training data: inf
Accuracy on training data: 96 / 1000
Cost on evaluation data: inf
Accuracy on evaluation data: 10 / 100

Epoch 3 training complete




Cost on training data: nan
Accuracy on training data: 96 / 1000
Cost on evaluation data: nan
Accuracy on evaluation data: 10 / 100

Epoch 4 training complete
Cost on training data: nan
Accuracy on training data: 96 / 1000
Cost on evaluation data: nan
Accuracy on evaluation data: 10 / 100

Epoch 5 training complete
Cost on training data: nan
Accuracy on training data: 96 / 1000
Cost on evaluation data: nan
Accuracy on evaluation data: 10 / 100

Epoch 6 training complete
Cost on training data: nan
Accuracy on training data: 96 / 1000
Cost on evaluation data: nan
Accuracy on evaluation data: 10 / 100

Epoch 7 training complete
Cost on training data: nan
Accuracy on training data: 96 / 1000
Cost on evaluation data: nan
Accuracy on evaluation data: 10 / 100

Epoch 8 training complete
Cost on training data: nan
Accuracy on training data: 96 / 1000
Cost on evaluation data: nan
Accuracy on evaluation data: 10 / 100

Epoch 9 training complete
Cost on training data: nan
Accuracy on training data

([inf,
  inf,
  inf,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan],
 [10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10],
 [inf,
  inf,
  inf,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan],
 [96,
  96,
  96,
  96,
  96,
  96,
  96,
  96,
  96,
  96,
  96,
  96,
  96,
  96,
  96,
  96,
  96,
  96,
  96,
  96,
  96,
  96,
  96,
  96,
  96,
  96,
  96,
  96,
  96,
  96])

In [13]:
# Four hidden layers of 30 neurons each between the 784 inputs and 10 outputs.
net=Network([784, 30, 30, 30, 30, 10])

In [15]:
# eta = 0.1 and lmbda = 5.0 on the full training set, with validation_data as
# the evaluation set and all four monitors enabled.
net.SGD(training_data, 30, 10, 0.1, lmbda=5.0,
        evaluation_data= validation_data,
            monitor_evaluation_cost=True,
            monitor_evaluation_accuracy=True,
            monitor_training_cost=True,
            monitor_training_accuracy=True)

Epoch 0 training complete
Cost on training data: 0.301551607628
Accuracy on training data: 48752 / 50000
Cost on evaluation data: 0.973229972296
Accuracy on evaluation data: 9577 / 10000

Epoch 1 training complete
Cost on training data: 0.254345538727
Accuracy on training data: 49224 / 50000
Cost on evaluation data: 0.946371162498
Accuracy on evaluation data: 9641 / 10000

Epoch 2 training complete
Cost on training data: 0.253444456024
Accuracy on training data: 49244 / 50000
Cost on evaluation data: 0.941521932504
Accuracy on evaluation data: 9653 / 10000

Epoch 3 training complete
Cost on training data: 0.272073358529
Accuracy on training data: 49057 / 50000
Cost on evaluation data: 0.963686794202
Accuracy on evaluation data: 9632 / 10000

Epoch 4 training complete
Cost on training data: 0.255725351011
Accuracy on training data: 49207 / 50000
Cost on evaluation data: 0.956148476365
Accuracy on evaluation data: 9635 / 10000

Epoch 5 training complete
Cost on training data: 0.275632071

([0.97322997229553898,
  0.94637116249759223,
  0.94152193250439375,
  0.96368679420239356,
  0.95614847636467626,
  0.97094795882808516,
  0.94619030861605125,
  0.94956411177918421,
  0.93702097285393049,
  1.0113217010800684,
  0.95222660367318979,
  0.95863178000519977,
  0.94439063102061505,
  0.94199391512718711,
  1.0417872639688703,
  0.94613550171533589,
  0.97041200340234357,
  0.98043935812173866,
  0.95119081379660553,
  0.95973609028994478,
  0.96434662190403264,
  0.94363526699780342,
  0.97270517307292592,
  0.96251046620763026,
  0.95412785637059616,
  0.94231768230809876,
  0.94780074793993074,
  0.97494503483470329,
  0.95758849632654186,
  0.94598521892111687],
 [9577,
  9641,
  9653,
  9632,
  9635,
  9630,
  9653,
  9644,
  9678,
  9570,
  9652,
  9650,
  9670,
  9661,
  9525,
  9674,
  9637,
  9608,
  9665,
  9656,
  9627,
  9681,
  9645,
  9661,
  9663,
  9688,
  9685,
  9641,
  9670,
  9687],
 [0.30155160762800332,
  0.25434553872693322,
  0.25344445602363541,
 