In [12]:
import numpy as np
from sklearn import datasets

## Exercise 3: Programming a Neural Network

In [13]:
# code copied from provided file 'network.py'

In [14]:
class ReLULayer(object):
    def forward(self, input):
        # remember the input for later backpropagation
        self.input = input
        # return the ReLU of the input
        # relu_fct = lambda x: x if x > 0 else 0
        # relu = np.vectorize(relu_fct)(input)
        # return relu 
        return np.maximum(0.0, input )

    def backward(self, upstream_gradient):
        # compute the derivative of ReLU from upstream_gradient and the stored input

        # TODO: vectorize
        # batch_size = upstream_gradient.shape[0]   
        # downstream_gradient = np.array([upstream_gradient[i] @ np.diag(np.where(self.input[i] >= 0, 1, 0)) for i in range(batch_size)])
        # return downstream_gradient 
        # diagonal = np.where(self.input >= 0, 1, 0)
        # downstream_gradient = upstream_gradient @ np.diag(diagonal)
        # print(f"ReLU Shapes: {upstream_gradient.shape}, {diagonal.shape}, {np.diag(diagonal).shape}")
        
        return self.input * (self.input>0) * (upstream_gradient > 0)

    def update(self, learning_rate):
        pass # ReLU is parameter-free

In [15]:
class OutputLayer(object):
    def __init__(self, n_classes):
        self.n_classes = n_classes

    def forward(self, input):
        # remember the input for later backpropagation
        self.input = input
        # return the softmax of the input

        # TODO: vectorize
        softmax_prenormalized = np.exp(self.input)
        softmax = np.array([row / np.sum(row) for row in softmax_prenormalized])
        return softmax

    def backward(self, predicted_posteriors, true_labels):
        # return the loss derivative with respect to the stored inputs
        # (use cross-entropy loss and the chain rule for softmax,
        #  as derived in the lecture)
        # TODO: vectorize
        downstream_gradient = predicted_posteriors.copy()
        for i, k in enumerate(true_labels):
            # print(downstream_gradient[i, k])
            downstream_gradient[i, k] -= 1
            # print(downstream_gradient[i, k])



        return downstream_gradient
        # predicted_posteriors[:,-1] = (-1/ predicted_posteriors[:,0]) * (predicted_posteriors[:,-1] == np.float32(true_labels))
        # return predicted_posteriors

    def update(self, learning_rate):
        pass # softmax is parameter-free

In [16]:
class LinearLayer(object):
    def __init__(self, n_inputs, n_outputs):
        self.n_inputs  = n_inputs
        self.n_outputs = n_outputs
        # randomly initialize weights and intercepts
        self.B = np.random.normal(0, 1, (n_inputs, n_outputs))
        self.b = np.random.normal(0, 1, n_outputs)

    def forward(self, input):
        # remember the input for later backpropagation
        self.input = input
        # compute the scalar product of input and weights
        # (these are the preactivations for the subsequent non-linear layer)
        preactivations = input@self.B + self.b
        return preactivations

    def backward(self, upstream_gradient):
        # compute the derivative of the weights from
        # upstream_gradient and the stored input
        self.grad_b = np.sum(upstream_gradient, axis=0)
        self.grad_B = self.input.T @ upstream_gradient
        # print(f"Gradient shapes: {self.grad_B.shape}, {self.grad_b.shape}")
        # compute the downstream gradient to be passed to the preceding layer
        # print(f"Lin  Shapes: {upstream_gradient.shape}, {self.B.T.shape}")
        downstream_gradient = upstream_gradient @ self.B.T
        return downstream_gradient

    def update(self, learning_rate):
        # update the weights by batch gradient descent
        self.B = self.B - learning_rate * self.grad_B
        self.b = self.b - learning_rate * self.grad_b

In [17]:
class MLP(object):
    def __init__(self, n_features, layer_sizes):
        # construct a multi-layer perceptron
        # with ReLU activation in the hidden layers and softmax output
        # (i.e. it predicts the posterior probability of a classification problem)
        #
        # n_features: number of inputs
        # len(layer_sizes): number of layers
        # layer_sizes[k]: number of neurons in layer k
        # (specifically: layer_sizes[-1] is the number of classes)
        self.n_layers = len(layer_sizes)
        self.layers   = []

        # create interior layers (linear + ReLU)
        n_in = n_features
        for n_out in layer_sizes[:-1]:
            self.layers.append(LinearLayer(n_in, n_out))
            self.layers.append(ReLULayer())
            n_in = n_out

        # create last linear layer + output layer
        n_out = layer_sizes[-1]
        self.layers.append(LinearLayer(n_in, n_out))
        self.layers.append(OutputLayer(n_out))

    def forward(self, X):
        # X is a mini-batch of instances
        batch_size = X.shape[0]
        # flatten the other dimensions of X (in case instances are images)
        X = X.reshape(batch_size, -1)

        # compute the forward pass
        # (implicitly stores internal activations for later backpropagation)
        result = X
        for layer in self.layers:
            result = layer.forward(result)
        return result

    def backward(self, predicted_posteriors, true_classes):
        # perform backpropagation w.r.t. the prediction for the latest mini-batch X
        downstream_gradient = self.layers[-1].backward(predicted_posteriors, true_classes)

        for i, layer in enumerate(reversed(self.layers[:-1])):
            # print(f"Step {i}")
            downstream_gradient = layer.backward(downstream_gradient)


    def update(self, X, Y, learning_rate):
        posteriors = self.forward(X)
        self.backward(posteriors, Y)
        for layer in self.layers:
            layer.update(learning_rate)

    def train(self, x, y, n_epochs, batch_size, learning_rate):
        N = len(x)
        n_batches = N // batch_size
        for i in range(n_epochs):
            # print("Epoch", i)
            # reorder data for every epoch
            # (i.e. sample mini-batches without replacement)
            permutation = np.random.permutation(N)

            for batch in range(n_batches):
                # create mini-batch
                start = batch * batch_size
                x_batch = x[permutation[start:start+batch_size]]
                y_batch = y[permutation[start:start+batch_size]]

                # perform one forward and backward pass and update network parameters
                self.update(x_batch, y_batch, learning_rate)

In [18]:
# set training/test set size
N = 2000

# create training and test data
X_train, Y_train = datasets.make_moons(N, noise=0.05)
X_test,  Y_test  = datasets.make_moons(N, noise=0.05)
n_features = 2
n_classes  = 2

# standardize features to be in [-1, 1]
offset  = X_train.min(axis=0)
scaling = X_train.max(axis=0) - offset
X_train = ((X_train - offset) / scaling - 0.5) * 2.0
X_test  = ((X_test  - offset) / scaling - 0.5) * 2.0

In [19]:
# set hyperparameters (play with these!)
layer_sizes = [30, 30, n_classes]
n_epochs = 30
batch_size = 20
learning_rate = 0.05

In [20]:
# create network
network = MLP(n_features, layer_sizes)

In [21]:
# train
network.train(X_train, Y_train, n_epochs, batch_size, learning_rate)

In [22]:
# test
predicted_posteriors = network.forward(X_test)
# determine class predictions from posteriors by winner-takes-all rule
predicted_classes = np.argmax(predicted_posteriors, axis=1)
# compute and output the error rate of predicted_classes
error_rate = np.sum(Y_test != predicted_classes) / N
print("error rate:", error_rate)

error rate: 0.143
