In [13]:
# Simple MNIST character recognizer using Michael Nielsen's "network.py" implementation
# Paste this into Colab and run!

# 1) Imports
import numpy as np
import random
from tensorflow.keras.datasets import mnist

# 2) Define the Network class (from Nielsen’s network.py)
class Network:
    """Feedforward network of sigmoid layers trained with mini-batch SGD.

    Instance attributes (all set in ``__init__``):
      num_layers -- number of layers, counting the input layer
      sizes      -- the list of layer sizes given to the constructor
      biases     -- one (y, 1) array per non-input layer
      weights    -- one (y, x) array per pair of adjacent layers
    """

    def __init__(self, sizes):
        """Create a randomly initialised network.

        sizes: list of layer node counts, e.g. [784, 30, 10].
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        # Biases are standard-normal draws; weights are standard-normal draws
        # divided by sqrt(number of inputs feeding the layer).
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x) / np.sqrt(x)
                        for x, y in zip(sizes[:-1], sizes[1:])]

    def feedforward(self, a):
        """Return the output of the network for input column vector ``a``."""
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a) + b)
        return a

    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        """Train the network using mini-batch stochastic gradient descent.

        training_data:   iterable of (x, y) pairs; y is the target vector.
        epochs:          number of passes over the training data.
        mini_batch_size: number of examples per gradient step.
        eta:             learning rate.
        test_data:       optional iterable of (x, label) pairs. When it
                         contains at least one pair, the number of correct
                         predictions is printed after every epoch; otherwise
                         only a completion message is printed.

        The caller's ``training_data`` is never reordered: shuffling is done
        on a private shallow copy.
        """
        # Copy into lists so that (a) shuffling does not mutate the caller's
        # data and (b) one-shot iterables such as zip objects are accepted.
        training_data = list(training_data)
        # Normalising to a list also avoids the ambiguous truth value of a
        # NumPy array in the checks below.
        test_data = [] if test_data is None else list(test_data)
        n_test = len(test_data)
        n = len(training_data)
        for j in range(epochs):
            random.shuffle(training_data)
            # Partition the shuffled data into consecutive mini-batches; the
            # last one may be shorter than mini_batch_size.
            mini_batches = [
                training_data[k:k + mini_batch_size]
                for k in range(0, n, mini_batch_size)
            ]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
            if n_test:
                acc = self.evaluate(test_data)
                print(f"Epoch {j+1}: {acc} / {n_test}")
            else:
                print(f"Epoch {j+1} complete")

    def update_mini_batch(self, mini_batch, eta):
        """Apply one gradient descent step computed from ``mini_batch``.

        mini_batch: sequence of (x, y) training pairs.
        eta:        learning rate; the summed gradient is scaled by
                    eta / len(mini_batch).

        An empty mini-batch leaves the parameters untouched.
        """
        m = len(mini_batch)
        if m == 0:
            # No examples means no gradient, and eta / m would divide by zero.
            return
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # Sum the per-example gradients over the mini-batch.
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        self.weights = [w - (eta / m) * nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - (eta / m) * nb
                       for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """Return (nabla_b, nabla_w) for the single example (x, y).

        Both are lists with one array per layer, shaped like ``self.biases``
        and ``self.weights``, holding the gradient of the cost with respect
        to each bias and weight.

        The output-layer error is taken as (activation - y) with no
        sigmoid_prime factor, i.e. the cross-entropy form of the output error.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # Forward pass, recording every weighted input z and activation.
        activation = x
        activations = [x]
        zs = []
        for b, w in zip(self.biases, self.weights):
            z = w.dot(activation) + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # Backward pass: start from the output-layer error.
        delta = activations[-1] - y
        nabla_b[-1] = delta
        nabla_w[-1] = delta.dot(activations[-2].T)
        # Walk towards the input; ``layer`` counts from the end, so layer=2
        # is the last hidden layer.
        for layer in range(2, self.num_layers):
            sp = sigmoid_prime(zs[-layer])
            delta = self.weights[-layer + 1].T.dot(delta) * sp
            nabla_b[-layer] = delta
            nabla_w[-layer] = delta.dot(activations[-layer - 1].T)
        return (nabla_b, nabla_w)

    def evaluate(self, test_data):
        """Return the number of examples in ``test_data`` classified correctly.

        test_data: iterable of (x, label) pairs. The prediction for x is the
        index of the largest output activation, compared with ``label``.
        """
        return sum(
            int(np.argmax(self.feedforward(x)) == y)
            for x, y in test_data
        )

    def cost_derivative(self, output_activations, y):
        """Return (output_activations - y), the dC/da of the quadratic cost.

        Not called by ``backprop`` in this class, which computes the output
        error directly; kept so existing callers continue to work.
        """
        return (output_activations - y)

# 3) Activation functions
def sigmoid(z):
    """Return the logistic sigmoid 1 / (1 + exp(-z)), element-wise.

    z may be a scalar or an array-like; an array input gives an array of the
    same shape, a scalar input gives a scalar.

    Only exp(-|z|) is ever evaluated, so large-magnitude inputs cannot
    overflow: for z >= 0 the result is 1 / (1 + e), for z < 0 it is the
    algebraically equal e / (1 + e), where e = exp(-|z|).
    """
    z = np.asarray(z)
    e = np.exp(-np.abs(z))  # always in [0, 1]
    out = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-dimensional results unchanged.
    return out[()]

def sigmoid_prime(z):
    """Return the derivative of the sigmoid at z: sigmoid(z) * (1 - sigmoid(z))."""
    activation = sigmoid(z)
    complement = 1 - activation
    return activation * complement

# 4) Helper to one-hot encode labels
def vectorized_result(j, num_classes=10):
    """Return a (num_classes, 1) one-hot column vector with 1.0 at index j.

    j:           integer class label used as the row index.
    num_classes: length of the vector; defaults to 10, matching the 10-way
                 output layer used in this file.

    Raises IndexError (from NumPy) when j is not a valid row index.
    """
    e = np.zeros((num_classes, 1))
    e[j] = 1.0
    return e

# 5) Load & preprocess MNIST
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Reshape every image into a (784, 1) column vector and divide the pixel
# values by 255.0.
training_inputs = [image.reshape(784, 1) / 255.0 for image in x_train]
test_inputs = [image.reshape(784, 1) / 255.0 for image in x_test]

# Training targets are one-hot column vectors; test targets stay as the raw
# labels because Network.evaluate compares them with an argmax index.
training_data = [
    (column, vectorized_result(label))
    for column, label in zip(training_inputs, y_train)
]
test_data = [(column, label) for column, label in zip(test_inputs, y_test)]

# 6) Create network and train
net = Network([784, 30, 10])
# This run uses 10 epochs, mini-batches of 10 examples and a learning rate of
# 0.5; the test-set score is printed after every epoch.
net.SGD(
    training_data,
    epochs=10,
    mini_batch_size=10,
    eta=0.5,
    test_data=test_data,
)

# Final report: correct predictions over total test examples, as a percentage.
n_test = len(test_data)
num_correct = net.evaluate(test_data)
accuracy = num_correct / n_test
print(f"Test set accuracy: {accuracy:.2%}")

Epoch 1: 9358 / 10000
Epoch 2: 9500 / 10000
Epoch 3: 9512 / 10000
Epoch 4: 9551 / 10000
Epoch 5: 9565 / 10000
Epoch 6: 9558 / 10000
Epoch 7: 9579 / 10000
Epoch 8: 9597 / 10000
Epoch 9: 9563 / 10000
Epoch 10: 9585 / 10000
Test set accuracy: 95.85%
