In [2]:
from IPython.display import clear_output

In [None]:
# Download the required libraries (needed when running outside colab where the environment doesn't come pre-loaded with libraries)

%pip install numpy
%pip install matplotlib
%pip install torchvision

clear_output()

Note: We're using torch modules (datasets, dataloaders) to download dataset and easily make batches. The NN will be made in numpy and every step will be implemented ourselves

In [18]:
import numpy as np

from torchvision.datasets import MNIST
from torch.utils.data import DataLoader

from torchvision.transforms.functional import to_tensor

import matplotlib.pyplot as plt

# Contents:

In this notebook you are required to create:

1. Implementation of 2 layer NN from scratch using numpy which classifies MNIST dataset

About MNIST:

the dataset consists of images of 28x28 size. The image each contains a handwritten digit from 0 to 9. Our model needs to take this image and classify it to the correct digit.

In [30]:
# MNIST function fetches the MNIST dataset. Without any transform param, the returned object is a Pillow image but we want to convert it to numerical form
# that is to say, a numpy array/torch tensor

# to_tensor is used to avoid errors when creating data loader later. we'll convert them to numpy arrays when the time comes
train_data = MNIST(root='./datasets', train=True, download=True, transform=to_tensor)
X = train_data.data
y = train_data.targets

test_data  = MNIST(root='./datasets', train=True, download=True, transform=to_tensor)


clear_output()

In [20]:
print(train_data.data[4].shape)
print(train_data.classes)
print(len(train_data.data))

torch.Size([28, 28])
['0 - zero', '1 - one', '2 - two', '3 - three', '4 - four', '5 - five', '6 - six', '7 - seven', '8 - eight', '9 - nine']
60000


### Instructions

1. Convert the data into train and test splits
2. Create a Neural Network implementation using Numpy. The network needs to have atleast 2 layers
3. Train the network on the train split. Keep track of the train and validation losses
4. After the training, plot both the train and test losses on the same graph
5. Test your model's performance on the test split and show the accuracy

In [21]:
# forward pass: Folien ab seite 25. lecture 5 for math functions
# Z1 = W1@X + b1
# A1 = sigmoid(Z1)

# Z2 = W2@A1 + b2
# A2 = sigmoid(Z2) # Same as yhat (since with a two layer network this is the end)
# backward pass: folien 38 right side

# activation functoin (sigmoid, tan h, relu, leaky relu)

#hyperparameter = parameter I need to provide

#learnable parameter = model dimensions

# epoch = whole cycle throught the dataset. IT is a hyper parameter (e.g. how often we go through the whole dataset (all 60k images))
# iteration = run only once (e.g. 64 batch size of 60k images)

In [22]:
def get_Z(X, w, b):
    return w @ X + b

def sigmoid(z):
  return 1 / (1 + np.exp(-z))

def get_loss (yhat, y):
  return np.mean(-y * np.log(yhat) - (1 - y) * np.log(1 - yhat), axis = 1)

In [36]:
print(type(X))
# Turn tensor into numpy array
X = X.numpy

print(type(X))

<class 'builtin_function_or_method'>


AttributeError: 'builtin_function_or_method' object has no attribute 'numpy'

# Own Implementation

In [37]:
ni = 28*28 # input
nh = 50 # Design choice = hyper parameter
no= 10 # output
lr = 0.0001
num_itr = 100

W1 = np.random.randn(nh, ni)
b1 = np.zeros((nh, 1))

W2 = np.random.randn(no, ni)
b2 = np.zeros((no, 1))

loss_history = []

for i in range(num_itr):
  # Forward pass
  Z1 = W1 @ X + b1
  A1 = sigmoid(Z1)
  Z2 = W2 @ A1 + b2
  # A2 = sigmoid(Z2) # Same as yhat (since with a two layer network this is the end)
  yhat = sigmoid(Z2)

  # Get loss
  ls = get_loss(yhat, y)
  loss_history.append(ls)

  # Backward pass
  # dl/dZ = dZ => convention to make it easier to code
  dZ2 = yhat - y
  dW2 = dZ2 @ A1.T
  b2 = np.sum(dZ2, axis=1, keepdims=True) # better when doing broadcasting, otherwise the dimension would be (o,) instead of (o,i)

  dZ1 = W2 @ (yhat - y) * A1 * (1 - A1)
  dW1 = dZ1 @ X.T
  b1 = np.sum(dZ1, axis = 1, keepdims = True)

  # Optimization (taking a step in the negative gradient direction)
  #.   Always make sure to first get all gradients calculated and then take a step
  W1 -= lr * dW1
  b1 -= lr * db1
  W2 -= lr * dW2
  b2 -= lr * db2


plt.plot(loss_history)

ValueError: matmul: Input operand 1 does not have enough dimensions (has 0, gufunc core with signature (n?,k),(k,m?)->(n?,m?) requires 1)

# AI advanced implementation

In [24]:
print(type(X))

<class 'torch.Tensor'>


In [31]:
for x in range(60000):

torch.Size([784])