# Colab Setup

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
"""
Change directory to where this file is located
"""
#%cd 'COPY&PASTE FILE DIRECTORY HERE'

# Import Modules

In [1]:
import copy
import numpy as np
import matplotlib.pyplot as plt
from mnist.data_utils import load_data

In [2]:
%pip install python-mnist

Collecting python-mnist
  Downloading python_mnist-0.7-py2.py3-none-any.whl (9.6 kB)
Installing collected packages: python-mnist
Successfully installed python-mnist-0.7
Note: you may need to restart the kernel to use updated packages.


#Utils

In [14]:
def leaky_relu(z, alpha=0.01):
    """
    Implement the leaky ReLU activation function.
    The method takes the input z and returns the output of the function.

    Set the value of alpha for the leaky ReLU funtion to 0.01.
    Question (a)

    """
    ##### YOUR CODE HERE #####
    return np.maximum(alpha * z, z)
    ##########################

def softmax(X):
    """
    Implement the softmax function.
    The method takes the input X and returns the output of the function.

    Question (a)
    """

    ##### YOUR CODE HERE #####
    exp_X = np.exp(X - np.max(X, axis=1, keepdims=True))
    return exp_X / np.sum(exp_X, axis=1, keepdims=True)

    ##########################

def load_batch(X, Y, batch_size, shuffle=True):
    """
    Generates batches with the remainder dropped.

    Do NOT modify this function
    """
    if shuffle:
        permutation = np.random.permutation(X.shape[0])
        X = X[permutation, :]
        Y = Y[permutation, :]
    num_steps = int(X.shape[0])//batch_size
    step = 0
    while step<num_steps:
        X_batch = X[batch_size*step:batch_size*(step+1)]
        Y_batch = Y[batch_size*step:batch_size*(step+1)]
        step+=1
        yield X_batch, Y_batch

def plot_dataset(images, labels, grid_width, grid_height, figure_width=5, figure_height=5, y_hats=None):
    """
    Plots image and labels.

    Do NOT modify this method.
    """
    f, ax = plt.subplots(grid_height, grid_width)
    f.set_size_inches(figure_width, figure_height)
    img_idx = 0
    for i in range(0, grid_height):
        for j in range(0, grid_width):
            image = images[img_idx]
            label = labels[img_idx]
            y_hat = Y_hat[img_idx]
            label_idx = int(np.argmax(label))
            y_hat_idx = int(np.argmax(y_hat))
            ax[i][j].axis('off')
            ax[i][j].set_title(f'Pred: {y_hat_idx}, Real: {label_idx}', color='k')
            ax[i][j].imshow(image, aspect='auto')
            img_idx += 1
        plt.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=0, hspace=0.25)
    plt.show()

#2-Layer Neural Network

In [15]:
class TwoLayerNN:
    """ a neural network with 2 layers """

    def __init__(self, input_dim, num_hiddens, num_classes):
        """
        Do NOT modify this function.
        """
        self.input_dim = input_dim
        self.num_hiddens = num_hiddens
        self.num_classes = num_classes
        self.params = self.initialize_parameters(input_dim, num_hiddens, num_classes)

    def initialize_parameters(self, input_dim, num_hiddens, num_classes):
        """
        Question (b)

        initializes parameters with He Initialization.
        - refer to https://paperswithcode.com/method/he-initialization for He intialization

        Inputs
        - input_dim
        - num_hiddens
        - num_classes
        Returns
        - params: a dictionary with the initialized parameters.
        """

        ##### YOUR CODE HERE #####
        params = {}
        params['W1'] = np.random.randn(input_dim, num_hiddens) * np.sqrt(2 / input_dim)
        params['b1'] = np.zeros((1, num_hiddens))
        params['W2'] = np.random.randn(num_hiddens, num_classes) * np.sqrt(2 / num_hiddens)
        params['b2'] = np.zeros((1, num_classes))
        return params
        ##########################

    def forward(self, X):
        """
        Defines and performs the feed forward step of a two-layer neural network.
        Specifically, the network structue is given by

          y = softmax(leaky_relu(X W1 + b1) W2 + b2)

        where X is the input matrix of shape (N, D), y is the class distribution matrix
        of shape (N, C), N is the number of examples (either the entire dataset or
        a mini-batch), D is the feature dimensionality, and C is the number of classes.

        Question (c)
        - ff_dict will be used to run backpropagation in backward method.

        Inputs
        - X: the input matrix of shape (N, D)

        Returns
        - y: the output of the model
        - ff_dict: a dictionary with all the fully connected units and activations.
        """

        ##### YOUR CODE HERE #####
        W1, b1, W2, b2 = self.params
        Z1 = np.dot(X, W1) + b1
        A1 = np.where(Z1 > 0, Z1, 0.01 * Z1)  # Leaky ReLU activation
        Z2 = np.dot(A1, W2) + b2
        
        y = self.softmax(Z2)
        ff_dict = {'X': X, 'Z1': Z1, 'A1': A1, 'Z2': Z2, 'y': y}

        return y, ff_dict
        ##########################

    def backward(self, X, Y, ff_dict):
        """
        Performs backpropagation over the two-layer neural network, and returns
        a dictionary of gradients of all model parameters.

        Question (d)

        Inputs:
         - X: the input matrix of shape (B, D), where B is the number of examples
              in a mini-batch, D is the feature dimensionality.
         - Y: the matrix of one-hot encoded ground truth classes of shape (B, C),
              where B is the number of examples in a mini-batch, C is the number
              of classes.
         - ff_dict: the dictionary containing all the fully connected units and
              activations.

        Returns:
         - grads: a dictionary containing the gradients of corresponding weights and biases.
        """
        ##### YOUR CODE HERE #####
        grads = {}
        B = X.shape[0]  # Batch size
        W1, b1, W2, b2 = self.params
        Z1, A1, Z2, y = ff_dict

        # Compute the gradient of the loss with respect to the output layer
        dZ2 = y - Y
        grads['W2'] = np.dot(A1.T, dZ2) / B
        grads['b2'] = np.sum(dZ2, axis=0, keepdims=True) / B

        # Compute the gradient of the loss with respect to the first hidden layer
        dA1 = np.dot(dZ2, W2.T)
        dZ1 = np.where(Z1 > 0, dA1, 0.01 * dA1)
        grads['W1'] = np.dot(X.T, dZ1) / B
        grads['b1'] = np.sum(dZ1, axis=0, keepdims=True) / B

        return grads
        ##########################

    def compute_loss(self, Y, Y_hat):
        """
        Computes cross entropy loss.

        Do NOT modify this function.

        Inputs
            Y:
            Y_hat:
        Returns
            loss:
        """
        epsilon = 1e-10
        Y_hat = np.clip(Y_hat, epsilon, 1 - epsilon)
        Y = Y.astype(float)
        Y_hat = Y_hat.astype(float)
        
        loss = -(1/Y.shape[0]) * np.sum(np.multiply(Y, np.log(Y_hat)))
        return loss

    def train(self, X, Y, X_val, Y_val, lr, n_epochs, batch_size, log_interval=1):
        """
        Runs mini-batch gradient descent.

        Do NOT Modify this method.

        Inputs
        - X
        - Y
        - X_val
        - Y_Val
        - lr
        - n_epochs
        - batch_size
        - log_interval
        """
        for epoch in range(n_epochs):
            for X_batch, Y_batch in load_batch(X, Y, batch_size):
                self.train_step(X_batch, Y_batch, batch_size, lr)
            if epoch % log_interval==0:
                Y_hat, ff_dict = self.forward(X)
                train_loss = self.compute_loss(Y, Y_hat)
                train_acc = self.evaluate(Y, Y_hat)
                Y_hat, ff_dict = self.forward(X_val)
                valid_loss = self.compute_loss(Y_val, Y_hat)
                valid_acc = self.evaluate(Y_val, Y_hat)
                print('epoch {:02} - train loss/acc: {:.3f} {:.3f}, valid loss/acc: {:.3f} {:.3f}'.\
                      format(epoch, train_loss, train_acc, valid_loss, valid_acc))

    def train_step(self, X_batch, Y_batch, batch_size, lr):
        """
        Updates the parameters using gradient descent.

        Question (e)

        Inputs
        - X_batch
        - Y_batch
        - batch_size
        - lr
        """
        ##### YOUR CODE HERE #####
        y_hat, ff_dict = self.forward(X_batch)

        grads = self.backward(X_batch, Y_batch, ff_dict)

        self.params['W1'] -= lr * grads['W1']
        self.params['b1'] -= lr * grads['b1']
        self.params['W2'] -= lr * grads['W2']
        self.params['b2'] -= lr * grads['b2']
        ##########################
    def evaluate(self, Y, Y_hat):
        """
        Computes classification accuracy.

        Question (f)

        Inputs
        - Y: A numpy array of shape (N, C) containing the softmax outputs,
             where C is the number of classes.
        - Y_hat: A numpy array of shape (N, C) containing the one-hot encoded labels,
             where C is the number of classes.

        Returns
            accuracy: the classification accuracy in float
        """
        ##### YOUR CODE HERE #####
        correct_predictions = np.sum(np.argmax(Y, axis=1) == np.argmax(Y_hat, axis=1))
        total_samples = len(Y)
        accuracy = correct_predictions / total_samples
        return accuracy
        ##########################

#Load MNIST

In [16]:
X_train, Y_train, X_test, Y_test = load_data()

idxs = np.arange(len(X_train))
np.random.shuffle(idxs)
split_idx = int(np.ceil(len(idxs)*0.8))
X_valid, Y_valid = X_train[idxs[split_idx:]], Y_train[idxs[split_idx:]]
X_train, Y_train = X_train[idxs[:split_idx]], Y_train[idxs[:split_idx]]
print()
print('Set validation data aside')
print('Training data shape: ', X_train.shape)
print('Training labels shape: ', Y_train.shape)
print('Validation data shape: ', X_valid.shape)
print('Validation labels shape: ', Y_valid.shape)

MNIST data loaded:
Training data shape: (60000, 784)
Training labels shape: (60000, 10)
Test data shape: (10000, 784)
Test labels shape: (10000, 10)

Set validation data aside
Training data shape:  (48000, 784)
Training labels shape:  (48000, 10)
Validation data shape:  (12000, 784)
Validation labels shape:  (12000, 10)


#Training & Evaluation

In [17]:
###
# Question (f)
# Tune the hyperparameters with validation data,
# and print the results by running the lines below.
###

In [18]:
# model instantiation
model = TwoLayerNN(input_dim=784, num_hiddens=64, num_classes=10)

In [19]:
# train the model
lr, n_epochs, batch_size = 2.0, 20, 256
model.train(X_train, Y_train, X_valid, Y_valid, lr, n_epochs, batch_size)


UFuncTypeError: ufunc 'multiply' did not contain a loop with signature matching types (dtype('<U32'), dtype('<U2')) -> None

In [None]:
# evalute the model on test data
Y_hat, _ = model.forward(X_test)
test_loss = model.compute_loss(Y_test, Y_hat)
test_acc = model.evaluate(Y_test, Y_hat)
print("Final test loss = {:.3f}, acc = {:.3f}".format(test_loss, test_acc))

In [None]:
###
# Question (g)
# Visualize your inaccurate results and briefly guess why the model may have predicted these numbers incorrectly
X_test.shape

Y_hat, _ = model.forward(X_test)

classes_pred = np.argmax(Y_hat, axis=1)
classes_gt = np.argmax(Y_test, axis=1)

images = X_test[classes_pred != classes_gt][:16]
images = np.reshape(images, (-1, 28, 28))
labels = Y_test[classes_pred != classes_gt][:16]
Y_hat = Y_hat[classes_pred != classes_gt][:16]
plot_dataset(images, labels, grid_width=8, grid_height=2, figure_width=20, figure_height=5, y_hats = Y_hat)

**Question (g)** :  Report the best combination of hyperparameters you find along with your final test accuracy


You Answer :