In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets
from torchvision.transforms import ToTensor

## Fully Connected Layer

### Activation and Loss

In [2]:
class ReLU:
  def f(self, x):
    return np.maximum(0, x)
  
  def fp(self, x):
    return (x > 0).astype(float)
  
# Empty activation
class EmptyActivation:
  def f(self, x):
    return x

  def fp(self, x):
    return np.ones_like(x)

class SoftmaxCrossEntropy:
    def f(self, y_true, logits):
        # logits: shape (batch_size, num_classes)
        # y_true: shape (batch_size,) with class indices

        # Compute softmax
        exps = np.exp(logits - np.max(logits, axis=1, keepdims=True))
        probs = exps / np.sum(exps, axis=1, keepdims=True)
        self.probs = probs

        # Cross-entropy loss
        batch_size = y_true.shape[0]
        correct_logprobs = -np.log(probs[range(batch_size), y_true])
        return np.mean(correct_logprobs)

    def fp(self, y_true, logits):
        # Derivative: probs - y_true_onehot
        batch_size = y_true.shape[0]
        grad = self.probs.copy()
        grad[range(batch_size), y_true] -= 1
        return grad / batch_size

### Neural Network

In [3]:
# Modified to be used with CNN
class NeuralNetworkForCNN:
  def __init__(self, input_shape: int, hidden_units: list[int], output_shape: int):
    self.l = len(hidden_units) + 1  # number of layers
    self.W = []                     # weights
    self.b = []                     # biases
    self.a = []                     # activations
    
    prev_units = input_shape
    for units in hidden_units:
      weight = np.random.randn(prev_units, units) * np.sqrt(2 / prev_units)
      bias = np.zeros((1, units))
      self.W.append(weight)
      self.b.append(bias)
      self.a.append(ReLU())
      prev_units = units

    # Output layer
    weight = np.random.randn(prev_units, output_shape) * np.sqrt(2 / prev_units)
    bias = np.zeros((1, output_shape))
    self.W.append(weight)
    self.b.append(bias)
    self.a.append(EmptyActivation())

    self.loss_fn = SoftmaxCrossEntropy()
  
  def set_activation(self, a: list):
    self.a = a

  def set_loss_fn(self, L):
    self.loss_fn = L
  
  def forward(self, X):
    x = X.copy()
    zs = []  # pre-activation
    As = []  # post-activation
    for i in range(self.l):
      z = x @ self.W[i] + self.b[i]
      zs.append(z)
      A = self.a[i].f(z)
      As.append(A)
      x = A
    return zs, As
  
  def backward(self, X, y, zs, As, lr=0.01):
    # Partial derivative of loss w.r.t. output
    dL_dO = self.loss_fn.fp(y, As[-1])

    # Gradient of layer i+1 w.r.t. layer i
    dA = dL_dO
    for i in range(self.l-1, -1, -1):
      # Partial derivative of loss w.r.t. activation
      a_deriv = self.a[i].fp(zs[i])
      dL_dz = dA * a_deriv

      # Update gradient for layer i-1
      dA = dL_dz @ self.W[i].T

      # Partial derivative of loss w.r.t. bias
      dL_db = np.sum(dL_dz, axis=0, keepdims=True)

      # Partial derivative of loss w.r.t. weight
      dz_dW = As[i-1] if i > 0 else X
      dL_dW = dz_dW.T @ dL_dz

      # Update weights and biases
      self.b[i] -= lr * dL_db
      self.W[i] -= lr * dL_dW
    return dA
  
  def get_batches(self, X, y, batch_size):
    for i in range(0, len(X), batch_size):
      yield X[i:i+batch_size], y[i:i+batch_size]


  def train(self, X, y, epochs=100, lr=0.01, batch_size=32):
    for epoch in range(epochs):
      total_loss = 0
      batches = self.get_batches(X, y, batch_size)

      for X_batch, y_batch in batches:
        zs, As = self.forward(X_batch)
        loss = self.loss_fn.f(y_batch, As[-1])
        total_loss += loss
        self.backward(X_batch, y_batch, zs, As, lr)

      avg_loss = total_loss / (len(X) // batch_size)

  def test(self, X, y):
    test_activations = self.forward(X)[-1]
    y_pred = np.argmax(test_activations[-1], axis=1)
    accuracy = np.mean(y_pred == y)
    print(f"Test Accuracy: {accuracy * 100:.2f}%")

  def train_and_test(self, X_train, y_train, X_test, y_test, epochs=100, lr=0.01, batch_size=32):
    for epoch in range(epochs):
      total_loss = 0
      batches = self.get_batches(X_train, y_train, batch_size)

      for X_batch, y_batch in batches:
        zs, As = self.forward(X_batch)
        loss = self.loss_fn.f(y_batch, As[-1])
        total_loss += loss
        self.backward(X_batch, y_batch, zs, As, lr)

      avg_loss = total_loss / (len(X_train) // batch_size)
      
      test_activations = self.forward(X_test)[-1]
      y_pred = np.argmax(test_activations[-1], axis=1)
      accuracy = np.mean(y_pred == y_test)
      
      print(f"Epoch {epoch}, Loss: {avg_loss:.4f}, Test Accuracy: {accuracy * 100:.2f}%")

### Testing

In [4]:
# Load FashionMNIST dataset from pytorch
train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=ToTensor())
test_dataset = datasets.FashionMNIST(root="./data", train=False, download=True, transform=ToTensor())
class_names = train_dataset.classes

# Extract data and labels
X_train = train_dataset.data.numpy().reshape(-1, 28*28).astype(np.float32) / 255.0 # Normalize to [0, 1]
y_train = train_dataset.targets.numpy()
X_test = test_dataset.data.numpy().reshape(-1, 28*28).astype(np.float32) / 255.0 # Normalize to [0, 1]
y_test = test_dataset.targets.numpy()

X_train.shape, X_test.shape, y_train.shape, y_test.shape, class_names

((60000, 784),
 (10000, 784),
 (60000,),
 (10000,),
 ['T-shirt/top',
  'Trouser',
  'Pullover',
  'Dress',
  'Coat',
  'Sandal',
  'Shirt',
  'Sneaker',
  'Bag',
  'Ankle boot'])

In [5]:
np.random.seed(42)

# Train and test the model
model = NeuralNetworkForCNN(input_shape=784, hidden_units=[128, 64], output_shape=10)
model.train_and_test(X_train, y_train, X_test, y_test, epochs=100, lr=0.01, batch_size=32)

Epoch 0, Loss: 0.6924, Test Accuracy: 80.89%
Epoch 1, Loss: 0.4858, Test Accuracy: 82.54%
Epoch 2, Loss: 0.4426, Test Accuracy: 83.82%
Epoch 3, Loss: 0.4160, Test Accuracy: 84.43%
Epoch 4, Loss: 0.3965, Test Accuracy: 85.07%
Epoch 5, Loss: 0.3809, Test Accuracy: 85.45%
Epoch 6, Loss: 0.3679, Test Accuracy: 85.67%
Epoch 7, Loss: 0.3565, Test Accuracy: 86.16%
Epoch 8, Loss: 0.3466, Test Accuracy: 86.25%
Epoch 9, Loss: 0.3375, Test Accuracy: 86.39%
Epoch 10, Loss: 0.3291, Test Accuracy: 86.62%
Epoch 11, Loss: 0.3215, Test Accuracy: 86.66%
Epoch 12, Loss: 0.3142, Test Accuracy: 86.84%
Epoch 13, Loss: 0.3076, Test Accuracy: 86.95%
Epoch 14, Loss: 0.3014, Test Accuracy: 87.23%
Epoch 15, Loss: 0.2957, Test Accuracy: 87.25%
Epoch 16, Loss: 0.2902, Test Accuracy: 87.34%
Epoch 17, Loss: 0.2851, Test Accuracy: 87.48%
Epoch 18, Loss: 0.2802, Test Accuracy: 87.59%
Epoch 19, Loss: 0.2754, Test Accuracy: 87.53%
Epoch 20, Loss: 0.2710, Test Accuracy: 87.68%
Epoch 21, Loss: 0.2668, Test Accuracy: 87.71

## CNN Without Padding and Pooling

In [25]:
class CNN_NoPaddingNoPooling:
  def __init__(self, input_shape, nn_hidden_units, output_shape, kernel_size, stride=1):
    self.channels, self.n, _ = input_shape # (channels, height, width) = (C, H, W), assume square image
    self.stride = stride # Use stride=1 for now
    cnn_shape = ((self.n - kernel_size) // self.stride) + 1 # Final shape after convolution
    
    self.kernel = np.random.randn(self.channels, kernel_size, kernel_size) * np.sqrt(2 / (self.channels * kernel_size * kernel_size))
    self.bias = np.zeros((1,))
    self.activation = ReLU()
    self.nn = NeuralNetworkForCNN(input_shape=cnn_shape**2, hidden_units=nn_hidden_units, output_shape=output_shape)

  # Only 2D convolution is implemented
  def conv2d(self, X_batch, kernel):
    batch_size, channels, n, _ = X_batch.shape
    k = kernel.shape[-1]
    final_shape = ((n - k) // self.stride) + 1
    output = np.zeros((batch_size, final_shape, final_shape))

    for i in range(0, n - k + 1, self.stride):
      for j in range(0, n - k + 1, self.stride):
        region = X_batch[:, :, i:i+k, j:j+k]                       # shape: (batch_size, channels, k, k)
        output[:, i, j] = np.sum(region * kernel, axis=(1, 2, 3))  # sum over (channels, k, k)

    return output

  def forward(self, X_batch):
    Z = self.conv2d(X_batch, self.kernel) + self.bias
    A = self.activation.f(Z)
    A_flat = A.reshape(A.shape[0], -1)  # flatten each sample
    nn_zs, nn_As = self.nn.forward(A_flat)
    return Z, A, A_flat, nn_zs, nn_As
  
  def backward(self, X_batch, y_batch, z, A, A_flat, nn_zs, nn_As, lr=0.1):
    dL_dA_flat = self.nn.backward(A_flat, y_batch, nn_zs, nn_As, lr)
    dL_dA = dL_dA_flat.reshape(A.shape)           # Reshape back to original shape

    dA_dz = self.activation.fp(z)                 # Gradient of activation function
    dL_dz = dL_dA * dA_dz                         # Gradient of loss w.r.t. z

    # Gradient of loss w.r.t. bias
    dL_db = np.sum(dL_dz, axis=(0, 1, 2))

    # Convolution backward: convolve input with dL_dz
    dL_dk = np.zeros_like(self.kernel)
    for b in range(X_batch.shape[0]):
      grad = self.conv2d(X_batch[b:b+1], dL_dz[b:b+1])
      dL_dk += grad

    # Average gradients over batch size
    dL_dk /= X_batch.shape[0]

    # Update kernel and bias
    self.kernel -= lr * dL_dk
    self.bias -= lr * dL_db

  def get_batches(self, X, y, batch_size):
    for i in range(0, len(X), batch_size):
      yield X[i:i+batch_size], y[i:i+batch_size]

  def train_and_test(self, X_train, y_train, X_test, y_test, epochs=100, lr=0.1, batch_size=32):
    for epoch in range(epochs):
      total_loss = 0
      batches = self.nn.get_batches(X_train, y_train, batch_size)

      for X_batch, y_batch in batches:
        Z, A, A_flat, nn_zs, nn_As = self.forward(X_batch)
        loss = self.nn.loss_fn.f(y_batch, nn_As[-1])
        total_loss += loss
        self.backward(X_batch, y_batch, Z, A, A_flat, nn_zs, nn_As, lr)

      avg_loss = total_loss / (len(X_train) // batch_size)

      # Test after each epoch
      test_activations = self.forward(X_test)[-1]
      y_pred = np.argmax(test_activations[-1], axis=1)
      accuracy = np.mean(y_pred == y_test)
      
      print(f"Epoch {epoch}, Loss: {avg_loss:.4f}, Test Accuracy: {accuracy * 100:.2f}%")

In [7]:
# Load FashionMNIST dataset from pytorch
train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=ToTensor())
test_dataset = datasets.FashionMNIST(root="./data", train=False, download=True, transform=ToTensor())
class_names = train_dataset.classes

# Extract data and labels
X_train = train_dataset.data.numpy().astype(np.float32) / 255.0 # Normalize to [0, 1]
y_train = train_dataset.targets.numpy()
X_test = test_dataset.data.numpy().astype(np.float32) / 255.0 # Normalize to [0, 1]
y_test = test_dataset.targets.numpy()

X_train = X_train.reshape(-1, 1, 28, 28)  # Reshape to (batch_size, channels, height, width)
X_test = X_test.reshape(-1, 1, 28, 28)    # Reshape to (batch_size, channels, height, width)

X_train.shape, X_test.shape, y_train.shape, y_test.shape, class_names

((60000, 1, 28, 28),
 (10000, 1, 28, 28),
 (60000,),
 (10000,),
 ['T-shirt/top',
  'Trouser',
  'Pullover',
  'Dress',
  'Coat',
  'Sandal',
  'Shirt',
  'Sneaker',
  'Bag',
  'Ankle boot'])

In [26]:
np.random.seed(42)

# Train and test the model
model = CNN_NoPaddingNoPooling(input_shape=X_train.shape[1:], nn_hidden_units=[128, 64], output_shape=10, kernel_size=3, stride=1)
model.train_and_test(X_train, y_train, X_test, y_test, epochs=5, lr=0.1, batch_size=32)

Epoch 0, Loss: 0.5789, Test Accuracy: 82.06%
Epoch 1, Loss: 0.4184, Test Accuracy: 83.89%
Epoch 2, Loss: 0.3730, Test Accuracy: 84.72%
Epoch 3, Loss: 0.3460, Test Accuracy: 85.85%
Epoch 4, Loss: 0.3248, Test Accuracy: 86.60%


## CNN No Padding, With Max Pooling

In [13]:
class CNN_NoPadding_WithPooling:
  def __init__(self, input_shape, nn_hidden_units, output_shape, kernel_size, stride=1, pool_size=2, pool_stride=2):
    self.channels, self.n, _ = input_shape # (channels, height, width) = (C, H, W), assume square image

    # Convolution layer
    self.kernel_size = kernel_size
    self.stride = stride
    self.cnn_shape = ((self.n - kernel_size) // self.stride) + 1 # Final shape after convolution
    
    self.kernel = np.random.randn(self.channels, kernel_size, kernel_size) * np.sqrt(2 / (self.channels * kernel_size * kernel_size))
    self.bias = np.zeros((1,))
    self.activation = ReLU()

    # Pooling layer
    self.pool_size = pool_size
    self.pool_stride = pool_stride

    # Fully connected layer
    self.pooled_shape = ((self.cnn_shape - pool_size) // pool_stride) + 1   # Final shape after pooling
    self.nn = NeuralNetworkForCNN(input_shape=self.pooled_shape**2, hidden_units=nn_hidden_units, output_shape=output_shape)

  # Only 2D convolution is implemented
  def conv2d(self, X_batch, kernel):
    batch_size, channels, n, _ = X_batch.shape
    k = kernel.shape[-1]
    final_shape = ((n - k) // self.stride) + 1
    output = np.zeros((batch_size, final_shape, final_shape))

    for i in range(0, n - k + 1, self.stride):
      for j in range(0, n - k + 1, self.stride):
        region = X_batch[:, :, i:i+k, j:j+k]                       # shape: (batch_size, channels, k, k)
        output[:, i, j] = np.sum(region * kernel, axis=(1, 2, 3))  # sum over (channels, k, k)

    return output

  # Only 2D pooling is implemented, use max pooling for now
  def max_pool2d(self, X):
    batch_size, n, _ = X.shape
    final_shape = ((n - self.pool_size) // self.pool_stride) + 1
    output = np.zeros((batch_size, final_shape, final_shape))

    for i in range(0, n - self.pool_size + 1, self.pool_stride):
      for j in range(0, n - self.pool_size + 1, self.pool_stride):
        region = X[:, i:i+self.pool_size, j:j+self.pool_size]  # (batch_size, pool_size, pool_size)
        output[:, i // self.pool_stride, j // self.pool_stride] = np.max(region, axis=(1, 2))

    return output

  def forward(self, X_batch):
    Z = self.conv2d(X_batch, self.kernel) + self.bias     # Convolution
    A = self.activation.f(Z)                              # Activation
    A_pool = self.max_pool2d(A)                           # Pooling
    A_flat = A_pool.reshape(A_pool.shape[0], -1)          # Flatten
    nn_zs, nn_As = self.nn.forward(A_flat)                # Fully connected
    return Z, A, A_pool, A_flat, nn_zs, nn_As
  
  # Compute backward propapagation for max pooling
  def max_pool2d_backward(self, dL_dA_pool, A):
    batch_size, n, _ = A.shape
    dL_dA = np.zeros_like(A)

    for i in range(0, n - self.pool_size + 1, self.pool_stride):
      for j in range(0, n - self.pool_size + 1, self.pool_stride):
        region = A[:, i:i+self.pool_size, j:j+self.pool_size]             # (batch_size, pool_size, pool_size)
        max_mask = np.max(region, axis=(1, 2), keepdims=True) == region   # Mask for max values
        dL_dA[:, i:i+self.pool_size, j:j+self.pool_size] += max_mask * dL_dA_pool[:, i // self.pool_stride, j // self.pool_stride][:, np.newaxis, np.newaxis]

    return dL_dA
  
  def backward(self, X_batch, y_batch, z, A, A_pool, A_flat, nn_zs, nn_As, lr=0.1):
    dL_dA_flat = self.nn.backward(A_flat, y_batch, nn_zs, nn_As, lr)
    dL_dA_pool = dL_dA_flat.reshape(A_pool.shape)           # Reshape back to original shape
    
    dL_dA = self.max_pool2d_backward(dL_dA_pool, A)    # Backprop through pooling

    dA_dz = self.activation.fp(z)                      # Gradient of activation function
    dL_dz = dL_dA * dA_dz                              # Gradient of loss w.r.t. z

    # Gradient of loss w.r.t. bias
    dL_db = np.sum(dL_dz, axis=(0, 1, 2))

    # Convolution backward: convolve input with dL_dz
    dL_dk = np.zeros_like(self.kernel)
    for b in range(X_batch.shape[0]):
      grad = self.conv2d(X_batch[b:b+1], dL_dz[b:b+1])
      dL_dk += grad

    # Average gradients over batch size
    dL_dk /= X_batch.shape[0]

    # Update kernel and bias
    self.kernel -= lr * dL_dk
    self.bias -= lr * dL_db

  def get_batches(self, X, y, batch_size):
    for i in range(0, len(X), batch_size):
      yield X[i:i+batch_size], y[i:i+batch_size]

  def train_and_test(self, X_train, y_train, X_test, y_test, epochs=100, lr=0.1, batch_size=32):
    for epoch in range(epochs):
      total_loss = 0
      batches = self.nn.get_batches(X_train, y_train, batch_size)

      for X_batch, y_batch in batches:
        Z, A, A_pool, A_flat, nn_zs, nn_As = self.forward(X_batch)
        loss = self.nn.loss_fn.f(y_batch, nn_As[-1])
        total_loss += loss
        self.backward(X_batch, y_batch, Z, A, A_pool, A_flat, nn_zs, nn_As, lr)

      avg_loss = total_loss / (len(X_train) // batch_size)

      # Test after each epoch
      test_activations = self.forward(X_test)[-1]
      y_pred = np.argmax(test_activations[-1], axis=1)
      accuracy = np.mean(y_pred == y_test)
      
      print(f"Epoch {epoch}, Loss: {avg_loss:.4f}, Test Accuracy: {accuracy * 100:.2f}%")

In [5]:
# Load FashionMNIST dataset from pytorch
train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=ToTensor())
test_dataset = datasets.FashionMNIST(root="./data", train=False, download=True, transform=ToTensor())
class_names = train_dataset.classes

# Extract data and labels
X_train = train_dataset.data.numpy().astype(np.float32) / 255.0 # Normalize to [0, 1]
y_train = train_dataset.targets.numpy()
X_test = test_dataset.data.numpy().astype(np.float32) / 255.0 # Normalize to [0, 1]
y_test = test_dataset.targets.numpy()

X_train = X_train.reshape(-1, 1, 28, 28)  # Reshape to (batch_size, channels, height, width)
X_test = X_test.reshape(-1, 1, 28, 28)    # Reshape to (batch_size, channels, height, width)

X_train.shape, X_test.shape, y_train.shape, y_test.shape, class_names

((60000, 1, 28, 28),
 (10000, 1, 28, 28),
 (60000,),
 (10000,),
 ['T-shirt/top',
  'Trouser',
  'Pullover',
  'Dress',
  'Coat',
  'Sandal',
  'Shirt',
  'Sneaker',
  'Bag',
  'Ankle boot'])

In [14]:
np.random.seed(42)

# Train and test the model
model = CNN_NoPadding_WithPooling(input_shape=X_train.shape[1:], 
                                  nn_hidden_units=[128, 64], 
                                  output_shape=10, 
                                  kernel_size=3, 
                                  stride=1, 
                                  pool_size=2, 
                                  pool_stride=2)
model.train_and_test(X_train, y_train, X_test, y_test, epochs=5, lr=0.1, batch_size=32)

Epoch 0, Loss: 0.6298, Test Accuracy: 77.75%
Epoch 1, Loss: 0.4820, Test Accuracy: 82.07%
Epoch 2, Loss: 0.4399, Test Accuracy: 82.76%
Epoch 3, Loss: 0.4128, Test Accuracy: 83.61%
Epoch 4, Loss: 0.3942, Test Accuracy: 84.22%


## CNN With Padding and Max Pooling

In [38]:
class CNN_WithPadding_WithPooling:
  def __init__(self, input_shape, nn_hidden_units, output_shape, kernel_size, stride=1, padding_valid=True, pool_size=2, pool_stride=2):
    self.channels, self.n, _ = input_shape # (channels, height, width) = (C, H, W), assume square image

    # Convolution layer
    self.kernel_size = kernel_size
    self.stride = stride
    self.padding_valid = padding_valid # true for valid padding, false for same padding

    if padding_valid:
      self.cnn_shape = ((self.n - kernel_size) // self.stride) + 1 # Final shape after convolution
    else:
      self.pad =  (kernel_size - 1) // 2
      self.cnn_shape = ((self.n + 2*self.pad - kernel_size) // self.stride) + 1
    
    self.kernel = np.random.randn(self.channels, kernel_size, kernel_size) * np.sqrt(2 / (self.channels * kernel_size * kernel_size))
    self.bias = np.zeros((1,))
    self.activation = ReLU()

    # Pooling layer
    self.pool_size = pool_size
    self.pool_stride = pool_stride

    # Fully connected layer
    self.pooled_shape = ((self.cnn_shape - pool_size) // pool_stride) + 1   # Final shape after pooling
    self.nn = NeuralNetworkForCNN(input_shape=self.pooled_shape**2, hidden_units=nn_hidden_units, output_shape=output_shape)

  # Only 2D convolution is implemented
  def conv2d(self, X_batch, kernel, padding_valid):
    batch_size, channels, n, _ = X_batch.shape
    k = kernel.shape[-1]

    if not padding_valid:
      X_batch = np.pad(X_batch, ((0, 0), (0, 0), (self.pad, self.pad), (self.pad, self.pad)), mode='constant')
      n += 2 * self.pad

    final_shape = ((n - k) // self.stride) + 1
    output = np.zeros((batch_size, final_shape, final_shape))

    for i in range(0, n - k + 1, self.stride):
      for j in range(0, n - k + 1, self.stride):
        region = X_batch[:, :, i:i+k, j:j+k]                       # shape: (batch_size, channels, k, k)
        output[:, i, j] = np.sum(region * kernel, axis=(1, 2, 3))  # sum over (channels, k, k)

    return output

  # Only 2D pooling is implemented, use max pooling for now
  def max_pool2d(self, X):
    batch_size, n, _ = X.shape
    final_shape = ((n - self.pool_size) // self.pool_stride) + 1
    output = np.zeros((batch_size, final_shape, final_shape))

    for i in range(0, n - self.pool_size + 1, self.pool_stride):
      for j in range(0, n - self.pool_size + 1, self.pool_stride):
        region = X[:, i:i+self.pool_size, j:j+self.pool_size]  # (batch_size, pool_size, pool_size)
        output[:, i // self.pool_stride, j // self.pool_stride] = np.max(region, axis=(1, 2))

    return output

  def forward(self, X_batch):
    Z = self.conv2d(X_batch, self.kernel, self.padding_valid) + self.bias     # Convolution
    A = self.activation.f(Z)                                                  # Activation
    A_pool = self.max_pool2d(A)                                               # Pooling
    A_flat = A_pool.reshape(A_pool.shape[0], -1)                              # Flatten
    nn_zs, nn_As = self.nn.forward(A_flat)                                    # Fully connected
    return Z, A, A_pool, A_flat, nn_zs, nn_As
  
  # Compute backward propapagation for max pooling
  def max_pool2d_backward(self, dL_dA_pool, A):
    batch_size, n, _ = A.shape
    dL_dA = np.zeros_like(A)

    for i in range(0, n - self.pool_size + 1, self.pool_stride):
      for j in range(0, n - self.pool_size + 1, self.pool_stride):
        region = A[:, i:i+self.pool_size, j:j+self.pool_size]             # (batch_size, pool_size, pool_size)
        max_mask = np.max(region, axis=(1, 2), keepdims=True) == region   # Mask for max values
        dL_dA[:, i:i+self.pool_size, j:j+self.pool_size] += max_mask * dL_dA_pool[:, i // self.pool_stride, j // self.pool_stride][:, np.newaxis, np.newaxis]

    return dL_dA
  
  def backward(self, X_batch, y_batch, z, A, A_pool, A_flat, nn_zs, nn_As, lr=0.1):
    dL_dA_flat = self.nn.backward(A_flat, y_batch, nn_zs, nn_As, lr)
    dL_dA_pool = dL_dA_flat.reshape(A_pool.shape)           # Reshape back to original shape
    
    dL_dA = self.max_pool2d_backward(dL_dA_pool, A)    # Backprop through pooling

    dA_dz = self.activation.fp(z)                      # Gradient of activation function
    dL_dz = dL_dA * dA_dz                              # Gradient of loss w.r.t. z

    # Gradient of loss w.r.t. bias
    dL_db = np.sum(dL_dz)

    # Convolution backward: convolve input with dL_dz
    dL_dk = np.zeros_like(self.kernel)
    for b in range(X_batch.shape[0]):
      grad = self.conv2d(X_batch[b:b+1], dL_dz[b:b+1], padding_valid=self.padding_valid)
      dL_dk += grad

    # Average gradients over batch size
    dL_dk /= X_batch.shape[0]

    # Update kernel and bias
    self.kernel -= lr * dL_dk
    self.bias -= lr * dL_db

  def get_batches(self, X, y, batch_size):
    for i in range(0, len(X), batch_size):
      yield X[i:i+batch_size], y[i:i+batch_size]

  def train_and_test(self, X_train, y_train, X_test, y_test, epochs=100, lr=0.1, batch_size=32):
    for epoch in range(epochs):
      total_loss = 0
      batches = self.nn.get_batches(X_train, y_train, batch_size)

      for X_batch, y_batch in batches:
        Z, A, A_pool, A_flat, nn_zs, nn_As = self.forward(X_batch)
        loss = self.nn.loss_fn.f(y_batch, nn_As[-1])
        total_loss += loss
        self.backward(X_batch, y_batch, Z, A, A_pool, A_flat, nn_zs, nn_As, lr)

      avg_loss = total_loss / (len(X_train) // batch_size)

      # Test after each epoch
      test_activations = self.forward(X_test)[-1]
      y_pred = np.argmax(test_activations[-1], axis=1)
      accuracy = np.mean(y_pred == y_test)
      
      print(f"Epoch {epoch}, Loss: {avg_loss:.4f}, Test Accuracy: {accuracy * 100:.2f}%")

In [15]:
# Load FashionMNIST dataset from pytorch
train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=ToTensor())
test_dataset = datasets.FashionMNIST(root="./data", train=False, download=True, transform=ToTensor())
class_names = train_dataset.classes

# Extract data and labels
X_train = train_dataset.data.numpy().astype(np.float32) / 255.0 # Normalize to [0, 1]
y_train = train_dataset.targets.numpy()
X_test = test_dataset.data.numpy().astype(np.float32) / 255.0 # Normalize to [0, 1]
y_test = test_dataset.targets.numpy()

X_train = X_train.reshape(-1, 1, 28, 28)  # Reshape to (batch_size, channels, height, width)
X_test = X_test.reshape(-1, 1, 28, 28)    # Reshape to (batch_size, channels, height, width)

X_train.shape, X_test.shape, y_train.shape, y_test.shape, class_names

((60000, 1, 28, 28),
 (10000, 1, 28, 28),
 (60000,),
 (10000,),
 ['T-shirt/top',
  'Trouser',
  'Pullover',
  'Dress',
  'Coat',
  'Sandal',
  'Shirt',
  'Sneaker',
  'Bag',
  'Ankle boot'])

In [39]:
np.random.seed(42)

# Train and test the model
model = CNN_WithPadding_WithPooling(input_shape=X_train.shape[1:], 
                                  nn_hidden_units=[128, 64], 
                                  output_shape=10, 
                                  kernel_size=3, 
                                  stride=1, 
                                  padding_valid=False,
                                  pool_size=2, 
                                  pool_stride=2)
model.train_and_test(X_train, y_train, X_test, y_test, epochs=5, lr=0.1, batch_size=32)

Epoch 0, Loss: 0.5804, Test Accuracy: 80.97%
Epoch 1, Loss: 0.4320, Test Accuracy: 83.82%
Epoch 2, Loss: 0.3895, Test Accuracy: 84.90%
Epoch 3, Loss: 0.3628, Test Accuracy: 85.62%
Epoch 4, Loss: 0.3427, Test Accuracy: 85.93%


## CNN With Multiple Filters

In [None]:
class CNN_Multiple_Filters:
  def __init__(self, input_shape, nn_hidden_units, output_shape, kernel_size, stride=1, padding_valid=True, num_kernels=1, pool_size=2, pool_stride=2):
    self.channels, self.n, _ = input_shape # (channels, height, width) = (C, H, W), assume square image

    # Convolution layer
    self.kernel_size = kernel_size
    self.stride = stride
    self.padding_valid = padding_valid # true for valid padding, false for same padding
    self.num_kernels = num_kernels

    if padding_valid:
      self.cnn_shape = ((self.n - kernel_size) // self.stride) + 1 # Final shape after convolution
    else:
      self.pad =  (kernel_size - 1) // 2
      self.cnn_shape = ((self.n + 2*self.pad - kernel_size) // self.stride) + 1
    
    self.kernel = np.random.randn(self.num_kernels, self.channels, kernel_size, kernel_size) * np.sqrt(2 / (self.channels * kernel_size * kernel_size))
    self.bias = np.zeros((self.num_kernels,))
    self.activation = ReLU()

    # Pooling layer
    self.pool_size = pool_size
    self.pool_stride = pool_stride

    # Fully connected layer
    self.pooled_shape = ((self.cnn_shape - pool_size) // pool_stride) + 1   # Final shape after pooling
    self.nn = NeuralNetworkForCNN(input_shape=num_kernels*(self.pooled_shape**2), hidden_units=nn_hidden_units, output_shape=output_shape)

  # Only 2D convolution is implemented
  def conv2d(self, X_batch, kernel, padding_valid):
    batch_size, channels, n, _ = X_batch.shape
    num_kernels, k = kernel.shape[0], kernel.shape[-1]

    if not padding_valid:
      X_batch = np.pad(X_batch, ((0, 0), (0, 0), (self.pad, self.pad), (self.pad, self.pad)), mode='constant')
      n += 2 * self.pad

    final_shape = ((n - k) // self.stride) + 1
    output = np.zeros((batch_size, num_kernels, final_shape, final_shape))

    for f in range(num_kernels):
      for i in range(0, n - k + 1, self.stride):
        for j in range(0, n - k + 1, self.stride):
          region = X_batch[:, :, i:i+k, j:j+k]                       # shape: (batch_size, channels, k, k)
          output[:, f, i, j] = np.sum(region * kernel[f], axis=(1, 2, 3))  # sum over (channels, k, k)

    return output

  # Only 2D pooling is implemented, use max pooling for now
  def max_pool2d(self, X):
    batch_size, num_kernels, n, _ = X.shape
    final_shape = ((n - self.pool_size) // self.pool_stride) + 1
    output = np.zeros((batch_size, num_kernels, final_shape, final_shape))

    for i in range(0, n - self.pool_size + 1, self.pool_stride):
      for j in range(0, n - self.pool_size + 1, self.pool_stride):
        region = X[:, :, i:i+self.pool_size, j:j+self.pool_size]  # (batch_size, num_kernels, pool_size, pool_size)
        output[:, :, i // self.pool_stride, j // self.pool_stride] = np.max(region, axis=(2, 3))

    return output

  def forward(self, X_batch):
    Z = self.conv2d(X_batch, self.kernel, self.padding_valid) + self.bias[None, :, None, None]     # Convolution
    A = self.activation.f(Z)                                                                       # Activation
    A_pool = self.max_pool2d(A)                                                                    # Pooling
    A_flat = A_pool.reshape(A_pool.shape[0], -1)                                                   # Flatten
    nn_zs, nn_As = self.nn.forward(A_flat)                                                         # Fully connected
    return Z, A, A_pool, A_flat, nn_zs, nn_As
  
  # Compute backward propapagation for max pooling
  def max_pool2d_backward(self, dL_dA_pool, A):
    batch_size, num_kernels, n, _ = A.shape
    dL_dA = np.zeros_like(A)

    for f in range(num_kernels):
      for i in range(0, n - self.pool_size + 1, self.pool_stride):
        for j in range(0, n - self.pool_size + 1, self.pool_stride):
          region = A[:, f, i:i+self.pool_size, j:j+self.pool_size]             # (batch_size, num_kernels, pool_size, pool_size)
          max_mask = np.max(region, axis=(1, 2), keepdims=True) == region      # Mask for max values
          grad = dL_dA_pool[:, f, i // self.pool_stride, j // self.pool_stride][:, np.newaxis, np.newaxis]
          dL_dA[:, f, i:i+self.pool_size, j:j+self.pool_size] += max_mask * grad

    return dL_dA
  
  def backward(self, X_batch, y_batch, z, A, A_pool, A_flat, nn_zs, nn_As, lr=0.1):
    dL_dA_flat = self.nn.backward(A_flat, y_batch, nn_zs, nn_As, lr)
    dL_dA_pool = dL_dA_flat.reshape(A_pool.shape)           # Reshape back to original shape
    
    dL_dA = self.max_pool2d_backward(dL_dA_pool, A)    # Backprop through pooling

    dA_dz = self.activation.fp(z)                      # Gradient of activation function
    dL_dz = dL_dA * dA_dz                              # Gradient of loss w.r.t. z

    # Gradient of loss w.r.t. bias
    dL_db = np.sum(dL_dz, axis=(0, 2, 3))  # Sum over batch size and spatial dimensions

    # Convolution backward: convolve input with dL_dz
    dL_dk = np.zeros_like(self.kernel)
    for b in range(X_batch.shape[0]):
      input_ = X_batch[b:b+1].transpose(1, 0, 2, 3)     # (channels, 1, n, n)
      grad_out = dL_dz[b:b+1]                           # (1, num_kernels, n, n)

      grad = self.conv2d(grad_out, input_, padding_valid=self.padding_valid)  # (1, num_kernels, k, k)

      dL_dk += grad.transpose(1, 0, 2, 3)               # Convert back to (num_kernels, channels, k, k)

    # Average gradients over batch size
    dL_dk /= X_batch.shape[0]

    # Update kernel and bias
    self.kernel -= lr * dL_dk
    self.bias -= lr * dL_db

  def get_batches(self, X, y, batch_size):
    for i in range(0, len(X), batch_size):
      yield X[i:i+batch_size], y[i:i+batch_size]

  def train_and_test(self, X_train, y_train, X_test, y_test, epochs=100, lr=0.1, batch_size=32):
    for epoch in range(epochs):
      total_loss = 0
      batches = self.nn.get_batches(X_train, y_train, batch_size)

      for X_batch, y_batch in batches:
        Z, A, A_pool, A_flat, nn_zs, nn_As = self.forward(X_batch)
        loss = self.nn.loss_fn.f(y_batch, nn_As[-1])
        total_loss += loss
        self.backward(X_batch, y_batch, Z, A, A_pool, A_flat, nn_zs, nn_As, lr)

      avg_loss = total_loss / (len(X_train) // batch_size)

      # Test after each epoch
      test_activations = self.forward(X_test)[-1]
      y_pred = np.argmax(test_activations[-1], axis=1)
      accuracy = np.mean(y_pred == y_test)
      
      print(f"Epoch {epoch}, Loss: {avg_loss:.4f}, Test Accuracy: {accuracy * 100:.2f}%")

In [27]:
# Load FashionMNIST dataset from pytorch
train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=ToTensor())
test_dataset = datasets.FashionMNIST(root="./data", train=False, download=True, transform=ToTensor())
class_names = train_dataset.classes

# Extract data and labels
X_train = train_dataset.data.numpy().astype(np.float32) / 255.0 # Normalize to [0, 1]
y_train = train_dataset.targets.numpy()
X_test = test_dataset.data.numpy().astype(np.float32) / 255.0 # Normalize to [0, 1]
y_test = test_dataset.targets.numpy()

X_train = X_train.reshape(-1, 1, 28, 28)  # Reshape to (batch_size, channels, height, width)
X_test = X_test.reshape(-1, 1, 28, 28)    # Reshape to (batch_size, channels, height, width)

X_train.shape, X_test.shape, y_train.shape, y_test.shape, class_names

((60000, 1, 28, 28),
 (10000, 1, 28, 28),
 (60000,),
 (10000,),
 ['T-shirt/top',
  'Trouser',
  'Pullover',
  'Dress',
  'Coat',
  'Sandal',
  'Shirt',
  'Sneaker',
  'Bag',
  'Ankle boot'])

In [None]:
np.random.seed(42)

# Train and test the model
model = CNN_Multiple_Filters(input_shape=X_train.shape[1:], 
                                  nn_hidden_units=[128, 64], 
                                  output_shape=10, 
                                  kernel_size=3, 
                                  stride=1, 
                                  padding_valid=False,
                                  num_kernels=4,
                                  pool_size=2, 
                                  pool_stride=2)
model.train_and_test(X_train, y_train, X_test, y_test, epochs=5, lr=0.1, batch_size=32)

Epoch 0, Loss: 0.5590, Test Accuracy: 79.90%
Epoch 1, Loss: 0.3970, Test Accuracy: 83.15%
Epoch 2, Loss: 0.3549, Test Accuracy: 83.17%
Epoch 3, Loss: 0.3279, Test Accuracy: 85.46%
Epoch 4, Loss: 0.3091, Test Accuracy: 86.13%
