In [1]:
import numpy as np
from sklearn import datasets

## Exercise 3: Programming a Neural Network

In [2]:
class ReLULayer:
    """Element-wise rectified linear unit (ReLU) activation.

    The layer has no trainable parameters. It caches the input and output of
    the most recent forward pass so that `backward` can be evaluated later.
    """

    def forward(self, input):
        """Return max(0, input) element-wise, caching input and output."""
        self.input = input

        activated = np.maximum(0.0, self.input)
        self.output = activated
        return activated

    def backward(self, upstream_gradient):
        """Scale `upstream_gradient` by the ReLU derivative of the last forward pass."""
        # the cached output is max(0, input), so its sign is 1 where the unit
        # was active and 0 where it was clamped -- i.e. the ReLU derivative
        local_derivative = np.sign(self.output)
        return local_derivative * upstream_gradient

    def update(self, learning_rate):
        """Do nothing: ReLU has no parameters to learn."""
        return None

In [3]:
class OutputLayer:
    """Softmax output layer paired with a cross-entropy loss.

    `forward` turns each row of preactivations into posterior probabilities;
    `backward` returns the derivative of the cross-entropy loss with respect
    to those preactivations. The layer has no trainable parameters.
    """

    def __init__(self, n_classes):
        """Store the number of classes (kept for reference only)."""
        self.n_classes = n_classes

    def forward(self, input):
        """Return the row-wise softmax of `input`.

        input: array of preactivations, one row per instance and one column
               per class.
        Returns an array of the same shape whose rows sum to 1.
        """
        # remember the input for later backpropagation
        self.input = input

        # softmax (normalized exponentials); subtracting the row maximum
        # leaves the result unchanged mathematically but keeps np.exp from
        # overflowing to inf (and the quotient from becoming NaN) for large
        # preactivations
        shifted = self.input - np.max(self.input, axis=-1, keepdims=True)
        exponentials = np.exp(shifted)
        return exponentials / np.sum(exponentials, axis=-1, keepdims=True)

    def backward(self, predicted_posteriors, true_labels):
        """Return the loss derivative with respect to the stored inputs.

        predicted_posteriors: softmax outputs, one row per instance.
        true_labels: integer class index of each instance.

        For cross-entropy loss on top of softmax the derivative is
        z_{Lk} - 1 for the correct label and z_{Lk} otherwise. The result is
        a new array; `predicted_posteriors` is not modified.
        """
        downstream_gradient = predicted_posteriors.copy()

        # subtract 1 at (row i, column true_labels[i]) for every instance i
        rows = np.arange(len(true_labels))
        downstream_gradient[rows, true_labels] -= 1

        return downstream_gradient

    def update(self, learning_rate):
        """Do nothing: softmax is parameter-free."""
        pass

In [4]:
class LinearLayer:
    """Fully connected (affine) layer computing input @ B + b.

    B has shape (n_inputs, n_outputs) and b has shape (n_outputs,); both are
    drawn from a standard normal distribution on construction.
    """

    def __init__(self, n_inputs, n_outputs):
        """Record the layer dimensions and draw random initial parameters."""
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs

        # weights first, then intercepts, both from N(0, 1)
        weight_shape = (n_inputs, n_outputs)
        self.B = np.random.normal(loc=0, scale=1, size=weight_shape)
        self.b = np.random.normal(loc=0, scale=1, size=n_outputs)

    def forward(self, input):
        """Return the preactivations input @ B + b and cache `input`."""
        # the input is needed again in backward() to form the weight gradient
        self.input = input

        preactivations = np.matmul(input, self.B)
        return preactivations + self.b

    def backward(self, upstream_gradient):
        """Store the parameter gradients and return the gradient w.r.t. the input.

        The gradients are summed (not averaged) over the rows of the batch and
        kept in `grad_B` / `grad_b` until `update` is called.
        """
        # gradient to hand on to the preceding layer
        downstream_gradient = np.matmul(upstream_gradient, self.B.T)

        # gradients of the layer's own parameters
        self.grad_b = np.sum(upstream_gradient, axis=0)
        self.grad_B = np.matmul(np.transpose(self.input), upstream_gradient)

        return downstream_gradient

    def update(self, learning_rate):
        """Take one gradient-descent step using the gradients stored by `backward`."""
        weight_step = learning_rate * self.grad_B
        intercept_step = learning_rate * self.grad_b
        self.B = self.B - weight_step
        self.b = self.b - intercept_step

In [5]:
class MLP:
    """Multi-layer perceptron with ReLU hidden layers and a softmax output.

    The network predicts the posterior class probabilities of a
    classification problem and is trained by mini-batch gradient descent.
    """

    def __init__(self, n_features, layer_sizes):
        """Build the layer stack.

        n_features: number of inputs
        len(layer_sizes): number of layers
        layer_sizes[k]: number of neurons in layer k
        (specifically: layer_sizes[-1] is the number of classes)
        """
        self.n_layers = len(layer_sizes)
        self.layers   = []

        # create interior layers (linear + ReLU)
        n_in = n_features
        for n_out in layer_sizes[:-1]:
            self.layers.append(LinearLayer(n_in, n_out))
            self.layers.append(ReLULayer())
            n_in = n_out

        # create last linear layer + output layer
        n_out = layer_sizes[-1]
        self.layers.append(LinearLayer(n_in, n_out))
        self.layers.append(OutputLayer(n_out))

    def forward(self, X):
        """Return the predicted posteriors for the mini-batch `X`.

        X is an array whose first axis indexes the instances; all remaining
        axes are flattened into one feature axis.
        """
        # X is a mini-batch of instances
        batch_size = X.shape[0]

        # flatten the other dimensions of X (in case instances are images)
        X = X.reshape(batch_size, -1)

        # compute the forward pass
        # (implicitly stores internal activations for later backpropagation)
        result = X
        for layer in self.layers:
            result = layer.forward(result)

        return result

    def backward(self, predicted_posteriors, true_classes):
        """Backpropagate the loss gradient for the latest mini-batch.

        Each layer stores its own parameter gradients as a side effect;
        nothing is returned.
        """
        # the output layer turns posteriors and labels into the initial gradient
        downstream_gradient = self.layers[-1].backward(predicted_posteriors, true_classes)

        # walk the remaining layers from last to first
        for layer in reversed(self.layers[:-1]):
            downstream_gradient = layer.backward(downstream_gradient)

    def update(self, X, Y, learning_rate):
        """Run one forward/backward pass on (X, Y) and update every layer."""
        posteriors = self.forward(X)
        self.backward(posteriors, Y)
        for layer in self.layers:
            layer.update(learning_rate)

    def train(self, x, y, n_epochs, batch_size, learning_rate):
        """Train the network by mini-batch gradient descent.

        x, y: training instances and their integer class labels
        n_epochs: number of passes over the training data
        batch_size: number of instances per mini-batch (must be >= 1)
        learning_rate: step size passed to every layer's update

        Every instance is used exactly once per epoch; when len(x) is not a
        multiple of batch_size the last mini-batch of an epoch is smaller.

        Raises ValueError if batch_size is not positive.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        N = len(x)
        for _ in range(n_epochs):
            # reorder data for every epoch
            # (i.e. sample mini-batches without replacement)
            permutation = np.random.permutation(N)

            # stepping over start offsets also covers the trailing partial
            # batch, so no instances are dropped and batch_size > N still trains
            for start in range(0, N, batch_size):
                # create mini-batch
                batch_indices = permutation[start:start + batch_size]
                x_batch = x[batch_indices]
                y_batch = y[batch_indices]

                # perform one forward and backward pass and update network parameters
                self.update(x_batch, y_batch, learning_rate)

In [6]:
# set training/test set size
N = 2000

# create training and test data
# (fixed seeds make the data identical after every kernel restart; the two
#  seeds differ so that the test set is not a copy of the training set)
X_train, Y_train = datasets.make_moons(N, noise=0.05, random_state=0)
X_test,  Y_test  = datasets.make_moons(N, noise=0.05, random_state=1)
n_features = 2
n_classes  = 2

# rescale features to [-1, 1] using the minimum and range of the training set
# (the test set reuses the training offset/scaling, so individual test points
#  may fall slightly outside [-1, 1])
offset  = X_train.min(axis=0)
scaling = X_train.max(axis=0) - offset
X_train = ((X_train - offset) / scaling - 0.5) * 2.0
X_test  = ((X_test  - offset) / scaling - 0.5) * 2.0

In [7]:
# set hyperparameters (play with these!)
# layer_sizes lists the hidden-layer widths to compare: each network below
# has two hidden layers of that width followed by the n_classes output layer
layer_sizes = [2, 3, 5, 30]
n_epochs = 10
batch_size = 10
learning_rate = 0.01

# seed NumPy's global generator so that weight initialization and mini-batch
# shuffling (both drawn from np.random) give the same error rates on every run
np.random.seed(0)

for size in layer_sizes:
    # create network
    network = MLP(n_features, [size, size, n_classes])
    
    # train
    network.train(X_train, Y_train, n_epochs, batch_size, learning_rate)
    
    # test
    predicted_posteriors = network.forward(X_test)
    
    # determine class predictions from posteriors by winner-takes-all rule
    predicted_classes = np.argmax(predicted_posteriors, axis=1)
    
    # compute and output the error rate of predicted_classes
    # (fraction of the N test instances that were misclassified)
    error_rate = np.sum(Y_test != predicted_classes) / N
    print(f"Network size {size} error rate:", error_rate)

Network size 2 error rate: 0.1115
Network size 3 error rate: 0.121
Network size 5 error rate: 0.072
Network size 30 error rate: 0.0
