The LinearLayer class implements a linear (also called dense or fully connected) layer of a neural network. It transforms its inputs linearly by multiplying them by a weight matrix and adding a bias vector. The backward method computes the gradients with respect to the weights and biases, which backpropagation requires, as well as the downstream gradient that is passed to the preceding layer. The update method then applies a gradient descent step to the weights and biases using the computed gradients.

In [1]:
import numpy as np
from sklearn import datasets

In [None]:
class ReLULayer(object):
    """Element-wise rectified linear unit, ``max(0, x)``.

    The layer has no trainable parameters. ``forward`` caches its input so
    that ``backward`` can tell which entries were active.
    """

    def forward(self, input):
        """Apply ReLU element-wise.

        :param input: numpy array of pre-activations (any shape).
        :returns: array of the same shape with negative entries set to 0.
        """
        # Remember the input for later backpropagation.
        self.input = input
        return np.maximum(0, input)

    def backward(self, upstream_gradient):
        """Propagate the gradient through the ReLU.

        :param upstream_gradient: gradient of the loss with respect to this
            layer's output; same shape as the input given to ``forward``.
        :returns: gradient of the loss with respect to this layer's input.
        """
        # The derivative of ReLU is 1 where the cached forward input was
        # strictly positive and 0 elsewhere. It must be taken from
        # ``self.input``: inside this method the bare name ``input`` is the
        # Python builtin, not the forward-pass data.
        active = self.input > 0
        return upstream_gradient * active

    def update(self, learning_rate):
        """No-op: ReLU is parameter-free."""
        pass

####################################

class OutputLayer(object):
    """Softmax output layer paired with a cross-entropy loss.

    The layer has no trainable parameters.
    """

    def __init__(self, n_classes):
        """
        :param n_classes: number of output classes.
        """
        self.n_classes = n_classes

    def forward(self, input):
        """Return the softmax of ``input`` taken over the class (last) axis.

        :param input: array of scores, shape (batch_size, n_classes). A 1-D
            array of scores for a single instance is also accepted.
        :returns: array of the same shape whose entries along the last axis
            are non-negative and sum to 1.
        """
        # Remember the input for later backpropagation.
        self.input = input
        # Subtract each instance's own maximum before exponentiating: this
        # leaves the softmax unchanged but keeps exp() from overflowing, and
        # doing it per row stops one instance with huge scores from
        # underflowing every other instance to zero.
        shifted = input - np.max(input, axis=-1, keepdims=True)
        e_x = np.exp(shifted)
        # Normalise over the classes of each instance, not over the batch.
        return e_x / np.sum(e_x, axis=-1, keepdims=True)

    def backward(self, predicted_posteriors, true_labels):
        """Return the loss derivative with respect to the stored inputs.

        For softmax followed by cross-entropy the chain rule collapses to
        ``posteriors - labels``.

        :param predicted_posteriors: softmax outputs,
            shape (batch_size, n_classes).
        :param true_labels: one-hot encoded labels of the same shape.
        :returns: gradient array of shape (batch_size, n_classes).
        """
        return predicted_posteriors - true_labels

    def update(self, learning_rate):
        """No-op: softmax is parameter-free."""
        pass

In [None]:
class LinearLayer(object):
    """Fully connected layer computing ``input @ B + b``.

    ``B`` is the weight matrix of shape (n_inputs, n_outputs) and ``b`` the
    bias vector of shape (n_outputs,). ``backward`` stores the parameter
    gradients in ``grad_B`` / ``grad_b``; ``update`` consumes them.
    """

    def __init__(self, n_inputs, n_outputs):
        """Create the layer with standard-normal weights and biases."""
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        weight_shape = (n_inputs, n_outputs)
        bias_shape = (n_outputs,)
        # The weights are drawn before the biases; keep this order so runs
        # with a fixed NumPy seed stay reproducible.
        self.B = np.random.normal(size=weight_shape)
        self.b = np.random.normal(size=bias_shape)

    def forward(self, input):
        """Compute the pre-activations for a batch.

        :param input: 2D numpy array, one sample per row and one feature
            per column.
        :returns: 2D numpy array, one sample per row and one pre-activation
            per column.
        """
        # Cache the batch; backward() needs it for the weight gradient.
        self.input = input
        weighted = input @ self.B
        return weighted + self.b

    def backward(self, upstream_gradient):
        """Backpropagate through the affine map.

        :param upstream_gradient: array of shape (batch_size, n_outputs),
            the gradient of the loss with respect to this layer's outputs.
        :returns: array of shape (batch_size, n_inputs), the gradient of the
            loss with respect to this layer's inputs.
        """
        cached_input = self.input
        # Weights: the transposed input times the upstream gradient.
        self.grad_B = cached_input.T @ upstream_gradient
        # Biases: the upstream gradient summed over the batch dimension.
        self.grad_b = np.sum(upstream_gradient, axis=0)
        # Inputs: the upstream gradient times the transposed weights.
        return upstream_gradient @ self.B.T

    def update(self, learning_rate):
        """Take one gradient descent step on the weights and biases.

        :param learning_rate: float step size for the update.
        """
        # Step against the gradient, since the loss is being minimised.
        weight_step = learning_rate * self.grad_B
        bias_step = learning_rate * self.grad_b
        self.B = self.B - weight_step
        self.b = self.b - bias_step


In [2]:
class MLP(object):
    """Multi-layer perceptron with ReLU hidden layers and a softmax output.

    The network is stored as a flat list ``self.layers`` in which every
    hidden ``LinearLayer`` is followed by a ``ReLULayer`` and the final
    ``LinearLayer`` is followed by an ``OutputLayer``.
    """

    def __init__(self, n_features, layer_sizes):
        """Build the layer stack.

        :param n_features: number of input features per (flattened) instance.
        :param layer_sizes: sequence with the number of units of each linear
            layer; the last entry is the number of classes.
        """
        self.n_layers = len(layer_sizes)
        self.layers = []
        n_in = n_features
        for n_out in layer_sizes[:-1]:
            # Hidden block: linear layer followed by a ReLU.
            self.layers.append(LinearLayer(n_in, n_out))
            self.layers.append(ReLULayer())
            n_in = n_out

        # Final block: linear layer followed by the softmax output layer.
        n_out = layer_sizes[-1]
        self.layers.append(LinearLayer(n_in, n_out))
        self.layers.append(OutputLayer(n_out))

    def forward(self, X):
        """Run a mini-batch through all layers.

        :param X: numpy array whose first axis indexes the instances of the
            mini-batch; any further axes (e.g. image dimensions) are
            flattened into a single feature axis.
        :returns: the result of the last layer's ``forward``. For a network
            built by ``__init__`` this has one row per instance and
            ``layer_sizes[-1]`` columns.
        """
        batch_size = X.shape[0]
        # Flatten the other dimensions of X (in case instances are images).
        result = X.reshape(batch_size, -1)
        for layer in self.layers:
            result = layer.forward(result)
        return result

    def backward(self, predicted_posteriors, true_classes):
        """Backpropagate the cross-entropy loss through the network.

        Nothing is returned; each layer's ``backward`` is expected to keep
        whatever gradients its own ``update`` needs.

        :param predicted_posteriors: 2D array of shape
            (batch_size, n_classes) with the predicted class probabilities.
        :param true_classes: one-hot encoded 2D array of the same shape with
            the true labels.
        """
        # Combined derivative of softmax + cross-entropy with respect to the
        # scores entering the output layer.
        downstream_gradient = predicted_posteriors - true_classes
        # That gradient already accounts for the final (output) layer, so
        # propagation starts at the layer before it.
        for layer in reversed(self.layers[:-1]):
            downstream_gradient = layer.backward(downstream_gradient)

    def update(self, X, Y, learning_rate):
        """Perform one training step: forward, backward, parameter update.

        :param X: numpy array holding a mini-batch of input data.
        :param Y: numpy array with the one-hot labels belonging to ``X``.
        :param learning_rate: step size handed to every layer's ``update``.
        """
        posteriors = self.forward(X)
        self.backward(posteriors, Y)
        for layer in self.layers:
            layer.update(learning_rate)

    def train(self, x, y, n_epochs, batch_size, learning_rate):
        """Train the MLP using mini-batch gradient descent.

        Every epoch visits each sample exactly once in a fresh random order.
        When the number of samples is not a multiple of ``batch_size`` the
        last mini-batch of the epoch is simply smaller, so no sample is
        skipped and data sets smaller than ``batch_size`` are still trained
        on.

        :param x: numpy array of shape (N, D) with the training data.
        :param y: numpy array of shape (N, C) with one-hot encoded labels.
        :param n_epochs: number of passes over the training set.
        :param batch_size: maximum number of samples per mini-batch.
        :param learning_rate: step size for the optimisation.
        :raises ValueError: if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        N = len(x)
        for _ in range(n_epochs):
            # Reorder the data for every epoch.
            permutation = np.random.permutation(N)
            for start in range(0, N, batch_size):
                batch_idx = permutation[start:start + batch_size]
                # One forward and backward pass plus parameter update.
                self.update(x[batch_idx], y[batch_idx], learning_rate)

    @staticmethod
    def to_one_hot(Y, n_classes):
        """Convert integer class labels into a one-hot encoded matrix.

        :param Y: 1-D numpy array of n_samples integer labels, each between
            0 and n_classes - 1.
        :param n_classes: number of classes (scalar integer).
        :returns: 2-D numpy array of shape (n_samples, n_classes) with a
            single 1 per row.
        """
        n_samples = len(Y)
        Y_one_hot = np.zeros((n_samples, n_classes))
        Y_one_hot[np.arange(n_samples), Y] = 1
        return Y_one_hot
