# In [None]:
import numpy as np
import random
from keras.datasets import mnist

class Network(object):
    """Fully connected feed-forward network with sigmoid activations, trained by SGD.

    Inputs, activations and biases are handled as column vectors (shape
    ``(size, 1)``), so a layer computes ``sigmoid(np.dot(weight, a) + bias)``.
    """

    def __init__(self, layers):
        """Create a network with randomly initialized weights and biases.

        Args:
            layers: Sequence of layer sizes, input layer first. For example
                ``[784, 50, 10]`` has 784 inputs, one hidden layer of 50 units
                and 10 outputs.
        """
        self.num_layers = len(layers)
        self.layers = layers
        # weights[i] maps layer i to layer i + 1, hence shape (layers[i + 1], layers[i]).
        self.weights = [np.random.randn(y, x) for x, y in zip(layers[:-1], layers[1:])]
        # The input layer has no bias, so there is one bias vector per later layer.
        self.biases = [np.random.randn(y, 1) for y in layers[1:]]

    def SGD(self, train_list, iterations, mini_batch_size, learning_rate, test_list=None):
        """Train the network with mini-batch stochastic gradient descent.

        Args:
            train_list: List of ``(x, y)`` training pairs. It is shuffled in
                place at the start of every iteration.
            iterations: Number of full passes over ``train_list``.
            mini_batch_size: Number of samples per mini-batch; it must divide
                ``len(train_list)`` exactly.
            learning_rate: Step size. Each update scales the summed mini-batch
                gradient by ``learning_rate / len(mini_batch)``.
            test_list: Optional list of ``(x, label)`` pairs. When given (and
                non-empty), the number of correct predictions on it is printed
                after every iteration.

        Raises:
            SystemExit: If ``len(train_list)`` is not a multiple of
                ``mini_batch_size`` (an explanatory message is printed first).
        """
        n = len(train_list)
        if n % mini_batch_size != 0:
            print(f"Error: Training set size ({n}) is not a multiple of the "
                  f"mini-batch size ({mini_batch_size}).")
            raise SystemExit
        if test_list:
            n_test = len(test_list)
        for j in range(iterations):
            random.shuffle(train_list)  # New sample order for every pass.
            mini_batches = [train_list[k:k + mini_batch_size]
                            for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, learning_rate)
            # Both progress messages number iterations from 1.
            if test_list:
                accuracy = self.evaluate(test_list)
                print(f"Iteration {j + 1}: {accuracy}/{n_test} correct.")
            else:
                print(f"Iteration {j + 1} completed.")

    def update_mini_batch(self, mini_batch, learning_rate):
        """Apply one gradient-descent step computed from a single mini-batch.

        The per-sample gradients returned by ``backpropagation`` are summed,
        then ``self.weights`` and ``self.biases`` are replaced by the stepped
        values.

        Args:
            mini_batch: Non-empty sequence of ``(x, y)`` training pairs.
            learning_rate: Step size before division by the mini-batch length.
        """
        # Running sums of the gradients over the mini-batch.
        bias_grad = [np.zeros(bias.shape) for bias in self.biases]
        weight_grad = [np.zeros(weight.shape) for weight in self.weights]
        for x, y in mini_batch:
            bias_delta_grad, weight_delta_grad = self.backpropagation(x, y)
            bias_grad = [bias + bias_derivative for bias, bias_derivative
                         in zip(bias_grad, bias_delta_grad)]
            weight_grad = [weight + weight_derivative for weight, weight_derivative
                           in zip(weight_grad, weight_delta_grad)]

        # Dividing by the batch length turns the summed gradient into a mean.
        step = learning_rate / len(mini_batch)
        self.weights = [weight - step * weight_gradient
                        for weight, weight_gradient in zip(self.weights, weight_grad)]
        self.biases = [bias - step * bias_gradient
                       for bias, bias_gradient in zip(self.biases, bias_grad)]

    def forward_propagation(self, A):
        """Return the output-layer activations for the input column vector ``A``."""
        for bias, weight in zip(self.biases, self.weights):
            A = sigmoid(np.dot(weight, A) + bias)
        return A

    def backpropagation(self, x, y):
        """Compute the cost gradient for a single training sample.

        The output error is ``(activation - y) * sigmoid'(z)``, i.e. the
        gradient of the quadratic cost ``0.5 * ||activation - y||**2``.

        Args:
            x: Input column vector.
            y: Target output column vector.

        Returns:
            A tuple ``(bias_grad, weight_grad)`` of lists whose entries have
            the same shapes as ``self.biases`` and ``self.weights``.
        """
        bias_grad = [np.zeros(bias.shape) for bias in self.biases]
        weight_grad = [np.zeros(weight.shape) for weight in self.weights]

        # Forward pass, keeping every weighted input z and every activation.
        activation = x
        saved_activations = [x]
        saved_z = []
        for bias, weight in zip(self.biases, self.weights):
            z = np.dot(weight, activation) + bias
            saved_z.append(z)
            activation = sigmoid(z)
            saved_activations.append(activation)

        # Backward pass: start with the error of the output layer.
        delta = (saved_activations[-1] - y) * sigmoid_derivative(saved_z[-1])
        bias_grad[-1] = delta
        weight_grad[-1] = np.dot(delta, saved_activations[-2].transpose())

        # Negative indices walk from the second-to-last layer back to the first:
        # l = 2 is the layer just before the output layer.
        for l in range(2, self.num_layers):
            z = saved_z[-l]
            delta = np.dot(self.weights[-l + 1].transpose(), delta) * sigmoid_derivative(z)
            bias_grad[-l] = delta
            weight_grad[-l] = np.dot(delta, saved_activations[-l - 1].transpose())

        return bias_grad, weight_grad

    def evaluate(self, test_list):
        """Count the correctly classified samples in ``test_list``.

        Args:
            test_list: Iterable of ``(x, label)`` pairs, where ``label`` is the
                integer class of ``x``.

        Returns:
            The number of samples for which the index of the largest output
            activation equals the label.
        """
        test_results = [(np.argmax(self.forward_propagation(x)), y)
                        for (x, y) in test_list]
        return sum(int(predicted == actual) for predicted, actual in test_results)

# Define some useful functions outside the network.
def sigmoid(z):
    """Return the logistic function 1 / (1 + e^(-z)), applied element-wise to ``z``."""
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator

def sigmoid_derivative(z):
    """Return the derivative of the sigmoid function evaluated at ``z``.

    Uses the identity sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z)).
    """
    # Evaluate the sigmoid once and reuse it instead of computing it twice.
    s = sigmoid(z)
    return s * (1 - s)

def prepare_training_list(train_x, train_y, num_classes=10):
    """Convert raw images and labels into ``(input, one-hot target)`` training pairs.

    Args:
        train_x: Array of shape ``(n_samples, height, width)`` holding pixel
            values in the 0-255 range (e.g. the ``(60000, 28, 28)`` MNIST
            training images).
        train_y: Sequence of ``n_samples`` integer class labels.
        num_classes: Length of the one-hot target vectors.

    Returns:
        A list of ``(x, y)`` tuples. ``x`` is a float column vector of shape
        ``(height * width, 1)`` with the pixels divided by 255 and flattened
        row by row (index ``k + width * j`` holds pixel ``[j, k]``). ``y`` is a
        ``(num_classes, 1)`` column vector with a 1 at the label's index.

    Raises:
        ValueError: If ``train_x`` and ``train_y`` have different lengths.
    """
    images = np.asarray(train_x) / 255  # Normalize.
    if len(images) != len(train_y):
        raise ValueError("train_x and train_y must contain the same number of samples.")

    # A single C-order reshape flattens every image row by row, which is the
    # same layout as copying pixel [j, k] to index k + width * j one at a time.
    # Each resulting column vector is a view into this one array.
    pixels_per_image = int(np.prod(images.shape[1:]))
    columns = images.reshape(len(images), pixels_per_image, 1)

    train_list = []
    for x, digit in zip(columns, train_y):
        y = np.zeros((num_classes, 1))
        y[digit] = 1
        train_list.append((x, y))

    return train_list

def prepare_test_list(test_x, test_y):
    """Convert raw images and labels into ``(input, label)`` evaluation pairs.

    Unlike the training pairs, the second element of each tuple is the label
    itself rather than a one-hot vector.

    Args:
        test_x: Array of shape ``(n_samples, height, width)`` holding pixel
            values in the 0-255 range (e.g. the ``(10000, 28, 28)`` MNIST test
            images).
        test_y: Sequence of ``n_samples`` integer class labels.

    Returns:
        A list of ``(x, digit)`` tuples. ``x`` is a float column vector of
        shape ``(height * width, 1)`` with the pixels divided by 255 and
        flattened row by row; ``digit`` is the corresponding entry of
        ``test_y``.

    Raises:
        ValueError: If ``test_x`` and ``test_y`` have different lengths.
    """
    images = np.asarray(test_x) / 255  # Normalize.
    if len(images) != len(test_y):
        raise ValueError("test_x and test_y must contain the same number of samples.")

    # A single C-order reshape flattens every image row by row, which is the
    # same layout as copying pixel [j, k] to index k + width * j one at a time.
    # Each resulting column vector is a view into this one array.
    pixels_per_image = int(np.prod(images.shape[1:]))
    columns = images.reshape(len(images), pixels_per_image, 1)

    return list(zip(columns, test_y))

# Fetch the MNIST images and labels (training split and test split).
(train_x, train_y), (test_x, test_y) = mnist.load_data()

# Convert both splits into the (input, target) lists the network consumes.
train_list = prepare_training_list(train_x, train_y)
test_list = prepare_test_list(test_x, test_y)

# Build a network with 784 inputs, one hidden layer of 50 units and 10 outputs.
network = Network([784, 50, 10])

# Train for 30 passes with mini-batches of 10 samples and a learning rate of 3;
# the number of correct test predictions is printed after every pass.
network.SGD(
    train_list,
    iterations=30,
    mini_batch_size=10,
    learning_rate=3,
    test_list=test_list,
)
