In [None]:
import random
import numpy as np
from torchvision import datasets, transforms

In [None]:
!wget -O mnist.npz https://s3.amazonaws.com/img-datasets/mnist.npz

--2021-10-29 23:44:42--  https://s3.amazonaws.com/img-datasets/mnist.npz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.137.80
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.137.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11490434 (11M) [application/octet-stream]
Saving to: ‘mnist.npz’


2021-10-29 23:44:43 (17.2 MB/s) - ‘mnist.npz’ saved [11490434/11490434]



In [None]:
# Let's read the mnist dataset

def load_mnist(path='mnist.npz'):
    """Read the MNIST ``.npz`` archive and return model-ready arrays.

    The archive must contain the keys ``x_train``, ``y_train``, ``x_test``
    and ``y_test``.

    Returns:
        ``((x_train, y_train), (x_test, y_test))`` where every ``x`` is a
        float array of shape (n_samples, 784) obtained by flattening the
        images and dividing by 255, and every ``y`` is a one-hot float array
        of shape (n_samples, 10).
    """
    def flatten_and_scale(images):
        # One row per image, pixel values divided by 255.
        return images.reshape(-1, 28 * 28) / 255.

    def one_hot(labels):
        # Row i gets a single 1 in the column given by labels[i].
        count = labels.shape[0]
        encoded = np.zeros((count, 10))
        encoded[np.arange(count), labels] = 1
        return encoded

    # Pull everything out of the archive before the file handle is closed.
    with np.load(path) as archive:
        raw = {key: archive[key]
               for key in ('x_train', 'y_train', 'x_test', 'y_test')}

    train_split = (flatten_and_scale(raw['x_train']), one_hot(raw['y_train']))
    test_split = (flatten_and_scale(raw['x_test']), one_hot(raw['y_test']))
    return train_split, test_split

# Load both splits from the default 'mnist.npz' path; the cells below read
# these four module-level arrays.
(x_train, y_train), (x_test, y_test) = load_mnist()

In [None]:
## WRONG: each pixel does not define a new (separate) feature.

# Standardization.
# Per-pixel mean and standard deviation are computed on the training set only
# and kept as (1, n_pixels) rows so they broadcast over the sample axis.
mean = x_train.mean(axis=0, keepdims=True)
std = x_train.std(axis=0, keepdims=True)

# Despite its name, this mask is True for the pixels whose std is non-zero,
# i.e. the ones that can safely be divided by std; constant pixels are left
# untouched.
wrong_features = std[0] != 0

# Both splits are rescaled in place using the training statistics.
x_train[:, wrong_features] = (
    (x_train[:, wrong_features] - mean[:, wrong_features]) / std[:, wrong_features]
)
x_test[:, wrong_features] = (
    (x_test[:, wrong_features] - mean[:, wrong_features]) / std[:, wrong_features]
)

## Exercise 1

In this exercise your task is to fill in the gaps in this code by implementing the backpropagation algorithm.
Once this is done, you can run the network on the MNIST example and see how it performs. Feel free to play with the parameters. Your model should achieve 90%+ accuracy after a few epochs.


In [None]:
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow.

    Works element-wise on scalars and numpy arrays and returns values in
    [0, 1] with the same shape as ``z``.
    """
    # The naive form computes exp(-z), which overflows for large negative z.
    # Use the identity 1 / (1 + e^-z) == exp(-log(1 + e^-z)); np.logaddexp
    # evaluates log(e^0 + e^-z) stably for any magnitude of z.
    return np.exp(-np.logaddexp(0.0, -z))

def sigmoid_prime(z):
    """Derivative of the sigmoid with respect to its input ``z``.

    ``z`` is the pre-activation value: the function applies ``sigmoid``
    itself and returns ``s * (1 - s)`` where ``s = sigmoid(z)``.
    """
    activation = sigmoid(z)
    return activation * (1 - activation)

class Network(object):
    """Fully connected feed-forward network with sigmoid activations.

    Training uses mini-batch gradient descent on the quadratic cost
    ``0.5 * ||a_out - y||^2``; gradients are computed one example at a time
    with backpropagation and summed over the mini-batch.

    Attributes:
        num_layers: number of layers including the input layer.
        sizes: list with the number of units in every layer.
        biases: one (n_out, 1) column per non-input layer.
        weights: one (n_out, n_in) matrix per non-input layer
            (indexed by target node first).
    """

    def __init__(self, sizes):
        """Create a network with standard-normal random biases and weights.

        Args:
            sizes: units per layer, e.g. ``[784, 30, 10]``.
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        # e.g. for [784, 30, 10]: biases are (30, 1) and (10, 1) ...
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        # ... and weights are (30, 784) and (10, 30).
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]

    def feedforward(self, a):
        """Return the network output for input ``a``.

        ``a`` is a column of shape (sizes[0], 1); a matrix with one example
        per column also works because the bias columns broadcast.
        """
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a) + b)
        return a

    def update_mini_batch(self, x_mini_batch, y_mini_batch, eta):
        """Apply one gradient-descent step computed from a mini-batch.

        Args:
            x_mini_batch: inputs, one example per row.
            y_mini_batch: targets, one example per row.
            eta: learning rate.
        """
        batch_size = len(x_mini_batch)
        if batch_size == 0:
            # Nothing to learn from; also avoids dividing by zero below.
            return
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        for x, y in zip(x_mini_batch, y_mini_batch):
            # Each example becomes a column vector; sizes are taken from the
            # data instead of being hard-coded to 784 / 10.
            delta_nabla_b, delta_nabla_w = self.backprop(x.reshape(-1, 1),
                                                         y.reshape(-1, 1))
            nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        step = eta / batch_size
        self.weights = [w - step * nw for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - step * nb for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """Gradient of the quadratic cost for a single example.

        Args:
            x: input column of shape (sizes[0], 1).
            y: target column of shape (sizes[-1], 1).

        Returns:
            ``(delta_nabla_b, delta_nabla_w)``: two lists with one array per
            layer, shaped like ``self.biases`` and ``self.weights``.
        """
        # Forward pass: activations[0] is the input and activations[l + 1]
        # is the output of weight layer l.
        activations = [x]
        for w, b in zip(self.weights, self.biases):
            activations.append(sigmoid(np.dot(w, activations[-1]) + b))

        delta_nabla_b = [None] * len(self.biases)
        delta_nabla_w = [None] * len(self.weights)

        # The sigmoid derivative expressed through the activation itself is
        # a * (1 - a). Calling sigmoid_prime on an activation would apply the
        # sigmoid a second time and yield a wrong gradient.
        output = activations[-1]
        delta = self.cost_derivative(output, y) * output * (1 - output)

        # Backward pass over every weight layer, last to first.
        for layer in range(len(self.weights) - 1, -1, -1):
            delta_nabla_b[layer] = delta
            delta_nabla_w[layer] = delta @ activations[layer].T
            if layer > 0:
                below = activations[layer]
                delta = (self.weights[layer].T @ delta) * below * (1 - below)

        return delta_nabla_b, delta_nabla_w

    def evaluate(self, x_test_data, y_test_data):
        """Return the fraction of examples whose arg-max prediction is right.

        Args:
            x_test_data: inputs, one example per row.
            y_test_data: one-hot targets, one example per row.
        """
        count = len(x_test_data)
        # One example per column so the whole set goes through in one pass.
        inputs = np.asarray(x_test_data).reshape(count, self.sizes[0]).T
        predicted = np.argmax(self.feedforward(inputs), axis=0)
        expected = np.argmax(
            np.asarray(y_test_data).reshape(count, self.sizes[-1]), axis=1)
        return np.mean(predicted == expected)

    def cost_derivative(self, output_activations, y):
        """Derivative of the quadratic cost w.r.t. the output activations."""
        return (output_activations - y)

    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        """Train with mini-batch gradient descent.

        Mini-batches are consecutive slices of the training data in their
        given order (no shuffling). A trailing partial batch is used as well
        instead of being dropped.

        Args:
            training_data: ``(x_train, y_train)``, one example per row.
            epochs: number of passes over the training data.
            mini_batch_size: examples per gradient step.
            eta: learning rate.
            test_data: optional ``(x_test, y_test)``; when given, accuracy on
                it is printed after every epoch.
        """
        x_train, y_train = training_data
        if test_data:
            x_test, y_test = test_data
        n_samples = x_train.shape[0]
        for j in range(epochs):
            for start in range(0, n_samples, mini_batch_size):
                stop = start + mini_batch_size
                self.update_mini_batch(x_train[start:stop], y_train[start:stop], eta)
            if test_data:
                print("Epoch: {0}, Accuracy: {1}".format(j, self.evaluate(x_test, y_test)))
            else:
                print("Epoch: {0}".format(j))


# Train the per-example implementation: 784 inputs, one hidden layer of 30
# units, 10 outputs. Because test_data is supplied, SGD prints the test-set
# accuracy after every epoch.
network = Network([784,30,10])
network.SGD((x_train, y_train), epochs=50, mini_batch_size=100, eta=0.5, test_data=(x_test, y_test))



SyntaxError: ignored

## Exercise 2 (Optional)

Implement a "fully vectorized" version, i.e. one using matrix operations instead of going over examples one by one within a minibatch.

In [None]:
# with regularization l2

def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow.

    Works element-wise on scalars and numpy arrays and returns values in
    [0, 1] with the same shape as ``z``.
    """
    # exp(-z) in the textbook form overflows (RuntimeWarning) once z is a
    # large negative number. 1 / (1 + e^-z) equals exp(-log(e^0 + e^-z)),
    # and np.logaddexp computes that logarithm stably.
    return np.exp(-np.logaddexp(0.0, -z))

def sigmoid_prime(z):
    """Slope of the sigmoid at pre-activation ``z``: ``s * (1 - s)``
    with ``s = sigmoid(z)``."""
    s = sigmoid(z)
    return s * (1 - s)

class Network2(object):
    """Vectorized sigmoid network with L2 weight regularization.

    A whole mini-batch is pushed through the network as a matrix with one
    example per column, so backpropagation uses matrix products instead of a
    Python loop over examples. The cost is the quadratic cost summed over the
    batch plus an L2 penalty on the weights (biases are not penalized).

    Attributes:
        num_layers: number of layers including the input layer.
        sizes: list with the number of units in every layer.
        biases: one (n_out, 1) column per non-input layer.
        weights: one (n_out, n_in) matrix per non-input layer.
        l2: L2 coefficient; each update shrinks weights by ``eta * l2 * w``.
    """

    def __init__(self, sizes, l2=0.0001):
        """Create a network with standard-normal random biases and weights.

        Args:
            sizes: units per layer, e.g. ``[784, 30, 10]``.
            l2: L2 regularization coefficient (0 disables the penalty).
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.l2 = l2
        # e.g. for [784, 30, 10]: biases are (30, 1) and (10, 1) ...
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        # ... and weights are (30, 784) and (10, 30).
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]

    def feedforward(self, a):
        """Return the network output for ``a``.

        ``a`` holds one example per column: shape (sizes[0], n_examples);
        a single (sizes[0], 1) column is the one-example case.
        """
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a) + b)
        return a

    def update_mini_batch(self, x_batch, y_batch, eta):
        """Apply one gradient-descent step computed from a mini-batch.

        Args:
            x_batch: inputs, shape (batch, sizes[0]), one example per row.
            y_batch: targets, shape (batch, sizes[-1]), one example per row.
            eta: learning rate.
        """
        batch_size = len(x_batch)
        if batch_size == 0:
            # Nothing to learn from; also avoids dividing by zero below.
            return
        # backprop expects one example per column, hence the transposes.
        nabla_b, nabla_w = self.backprop(x_batch.T, y_batch.T)
        step = eta / batch_size
        self.weights = [w - step * nw for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - step * nb for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x_batch, y_batch):
        """Gradient of the regularized cost summed over a whole mini-batch.

        Args:
            x_batch: inputs, shape (sizes[0], batch), one example per column.
            y_batch: targets, shape (sizes[-1], batch).

        Returns:
            ``(nabla_b, nabla_w)``: two lists with one array per layer,
            shaped like ``self.biases`` and ``self.weights``. The values are
            sums over the batch (not means); the weight gradients also
            include ``l2 * batch * w`` so that the division by the batch
            size in ``update_mini_batch`` leaves a decay of ``l2 * w``.
        """
        batch_size = x_batch.shape[1]

        # Forward pass: activations[0] is the input and activations[l + 1]
        # is the output of weight layer l.
        activations = [x_batch]
        for w, b in zip(self.weights, self.biases):
            activations.append(sigmoid(np.dot(w, activations[-1]) + b))

        nabla_b = [None] * len(self.biases)
        nabla_w = [None] * len(self.weights)

        # Sigmoid derivative written through the activation: a * (1 - a).
        output = activations[-1]
        delta = self.cost_derivative(output, y_batch) * output * (1 - output)

        # Backward pass over every weight layer, last to first.
        for layer in range(len(self.weights) - 1, -1, -1):
            # Summing over columns adds up the per-example bias gradients.
            nabla_b[layer] = delta.sum(axis=1, keepdims=True)
            nabla_w[layer] = (delta @ activations[layer].T
                              + self.l2 * batch_size * self.weights[layer])
            if layer > 0:
                below = activations[layer]
                delta = (self.weights[layer].T @ delta) * below * (1 - below)

        return nabla_b, nabla_w

    def evaluate(self, x_test_data, y_test_data):
        """Return the fraction of examples whose arg-max prediction is right.

        Args:
            x_test_data: inputs, one example per row.
            y_test_data: one-hot targets, one example per row.
        """
        count = len(x_test_data)
        # One example per column so the whole set goes through in one pass.
        inputs = np.asarray(x_test_data).reshape(count, self.sizes[0]).T
        predicted = np.argmax(self.feedforward(inputs), axis=0)
        expected = np.argmax(
            np.asarray(y_test_data).reshape(count, self.sizes[-1]), axis=1)
        return np.mean(predicted == expected)

    def cost_derivative(self, output_activations, y):
        """Derivative of the quadratic cost w.r.t. the output activations."""
        return (output_activations - y)

    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        """Train with mini-batch gradient descent.

        Mini-batches are consecutive slices of the training data in their
        given order (no shuffling). A trailing partial batch is used as well
        instead of being dropped.

        Args:
            training_data: ``(x_train, y_train)``, one example per row.
            epochs: number of passes over the training data.
            mini_batch_size: examples per gradient step.
            eta: learning rate.
            test_data: optional ``(x_test, y_test)``; when given, accuracy on
                it is printed after every epoch.
        """
        x_train, y_train = training_data
        if test_data:
            x_test, y_test = test_data
        n_samples = x_train.shape[0]
        for j in range(epochs):
            for start in range(0, n_samples, mini_batch_size):
                stop = start + mini_batch_size
                self.update_mini_batch(x_train[start:stop], y_train[start:stop], eta)
            if test_data:
                print("Epoch: {0}, Accuracy: {1}".format(j, self.evaluate(x_test, y_test)))
            else:
                print("Epoch: {0}".format(j))


# Train the vectorized, L2-regularized implementation with the same
# 784-30-10 layout. This rebinds the module-level name `network`. Because
# test_data is supplied, SGD prints the test-set accuracy after every epoch.
network = Network2([784,30,10])
network.SGD((x_train, y_train), epochs=100, mini_batch_size=100, eta=0.2, test_data=(x_test, y_test))



  after removing the cwd from sys.path.


Epoch: 0, Accuracy: 0.7756
Epoch: 1, Accuracy: 0.8308
Epoch: 2, Accuracy: 0.8479
Epoch: 3, Accuracy: 0.8555
Epoch: 4, Accuracy: 0.8614
Epoch: 5, Accuracy: 0.868
Epoch: 6, Accuracy: 0.871
Epoch: 7, Accuracy: 0.8715
Epoch: 8, Accuracy: 0.8739
Epoch: 9, Accuracy: 0.8768
Epoch: 10, Accuracy: 0.8784
Epoch: 11, Accuracy: 0.8806
Epoch: 12, Accuracy: 0.8814
Epoch: 13, Accuracy: 0.8824
Epoch: 14, Accuracy: 0.8839
Epoch: 15, Accuracy: 0.8849
Epoch: 16, Accuracy: 0.8856
Epoch: 17, Accuracy: 0.8859
Epoch: 18, Accuracy: 0.8863
Epoch: 19, Accuracy: 0.887
Epoch: 20, Accuracy: 0.8876
Epoch: 21, Accuracy: 0.8887
Epoch: 22, Accuracy: 0.8891
Epoch: 23, Accuracy: 0.8892
Epoch: 24, Accuracy: 0.8893
Epoch: 25, Accuracy: 0.8897
Epoch: 26, Accuracy: 0.8903
Epoch: 27, Accuracy: 0.8898
Epoch: 28, Accuracy: 0.8889
Epoch: 29, Accuracy: 0.8882
Epoch: 30, Accuracy: 0.8883
Epoch: 31, Accuracy: 0.8892
Epoch: 32, Accuracy: 0.8894
Epoch: 33, Accuracy: 0.89
Epoch: 34, Accuracy: 0.8896
Epoch: 35, Accuracy: 0.89
Epoch: 36