In [4]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets
from torchvision.transforms import ToTensor

## Fully Connected Layer

### Activation and Loss

In [None]:
class ReLU:
  def f(self, x):
    return np.maximum(0, x)
  
  def fp(self, x):
    return (x > 0).astype(float)
  
# Empty activation
class EmptyActivation:
  def f(self, x):
    return x

  def fp(self, x):
    return np.ones_like(x)

class SoftmaxCrossEntropy:
    def f(self, y_true, logits):
        # logits: shape (batch_size, num_classes)
        # y_true: shape (batch_size,) with class indices

        # Compute softmax
        exps = np.exp(logits - np.max(logits, axis=1, keepdims=True))
        probs = exps / np.sum(exps, axis=1, keepdims=True)
        self.probs = probs

        # Cross-entropy loss
        batch_size = y_true.shape[0]
        correct_logprobs = -np.log(probs[range(batch_size), y_true])
        return np.mean(correct_logprobs)

    def fp(self, y_true, logits):
        # Derivative: probs - y_true_onehot
        batch_size = y_true.shape[0]
        grad = self.probs.copy()
        grad[range(batch_size), y_true] -= 1
        return grad / batch_size

### Neural Network

In [23]:
# Modified to be used with CNN
class NeuralNetworkForCNN:
  def __init__(self, input_shape: int, hidden_units: list[int], output_shape: int):
    self.l = len(hidden_units) + 1  # number of layers
    self.W = []                     # weights
    self.b = []                     # biases
    self.a = []                     # activations
    
    prev_units = input_shape
    for units in hidden_units:
      weight = np.random.randn(prev_units, units) * np.sqrt(2 / prev_units)
      bias = np.zeros((1, units))
      self.W.append(weight)
      self.b.append(bias)
      self.a.append(ReLU())
      prev_units = units

    # Output layer
    weight = np.random.randn(prev_units, output_shape) * np.sqrt(2 / prev_units)
    bias = np.zeros((1, output_shape))
    self.W.append(weight)
    self.b.append(bias)
    self.a.append(EmptyActivation())

    self.loss_fn = SoftmaxCrossEntropy()
  
  def set_activation(self, a: list):
    self.a = a

  def set_loss_fn(self, L):
    self.loss_fn = L
  
  def forward(self, X):
    x = X.copy()
    zs = []  # pre-activation
    As = []  # post-activation
    for i in range(self.l):
      z = x @ self.W[i] + self.b[i]
      zs.append(z)
      A = self.a[i].f(z)
      As.append(A)
      x = A
    return zs, As
  
  def backward(self, X, y, zs, As, lr=0.01):
    # Partial derivative of loss w.r.t. output
    dL_dO = self.loss_fn.fp(y, As[-1])

    # Gradient of layer i+1 w.r.t. layer i
    dA = dL_dO
    for i in range(self.l-1, -1, -1):
      # Partial derivative of loss w.r.t. activation
      a_deriv = self.a[i].fp(zs[i])
      dL_dz = dA * a_deriv

      # Update gradient for layer i-1
      dA = dL_dz @ self.W[i].T

      # Partial derivative of loss w.r.t. bias
      dL_db = np.sum(dL_dz, axis=0, keepdims=True)

      # Partial derivative of loss w.r.t. weight
      dz_dW = As[i-1] if i > 0 else X
      dL_dW = dz_dW.T @ dL_dz

      # Update weights and biases
      self.b[i] -= lr * dL_db
      self.W[i] -= lr * dL_dW
    return dA
  
  def get_batches(self, X, y, batch_size):
    for i in range(0, len(X), batch_size):
      yield X[i:i+batch_size], y[i:i+batch_size]


  def train(self, X, y, epochs=100, lr=0.01, batch_size=32):
    for epoch in range(epochs):
      total_loss = 0
      batches = self.get_batches(X, y, batch_size)

      for X_batch, y_batch in batches:
        zs, As = self.forward(X_batch)
        loss = self.loss_fn.f(y_batch, As[-1])
        total_loss += loss
        self.backward(X_batch, y_batch, zs, As, lr)

      avg_loss = total_loss / (len(X) // batch_size)

      if epoch % 1 == 0:
        print(f"Epoch {epoch}, Loss: {avg_loss:.4f}")

### Testing

In [70]:
# Load FashionMNIST dataset from pytorch
train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=ToTensor())
test_dataset = datasets.FashionMNIST(root="./data", train=False, download=True, transform=ToTensor())
class_names = train_dataset.classes

# Extract data and labels
X_train = train_dataset.data.numpy().reshape(-1, 28*28).astype(np.float32) / 255.0 # Normalize to [0, 1]
y_train = train_dataset.targets.numpy()
X_test = test_dataset.data.numpy().reshape(-1, 28*28).astype(np.float32) / 255.0 # Normalize to [0, 1]
y_test = test_dataset.targets.numpy()

X_train.shape, X_test.shape, y_train.shape, y_test.shape, class_names

((60000, 784),
 (10000, 784),
 (60000,),
 (10000,),
 ['T-shirt/top',
  'Trouser',
  'Pullover',
  'Dress',
  'Coat',
  'Sandal',
  'Shirt',
  'Sneaker',
  'Bag',
  'Ankle boot'])

In [71]:
np.random.seed(42)
model = NeuralNetworkForCNN(input_shape=784, hidden_units=[128, 64], output_shape=10)

# Train
model.train(X_train, y_train, epochs=100, lr=0.1)

# Test
_, test_activations = model.forward(X_test)
y_pred = np.argmax(test_activations[-1], axis=1)
accuracy = np.mean(y_pred == y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Epoch 0, Loss: 0.5369
Epoch 1, Loss: 0.3908
Epoch 2, Loss: 0.3513
Epoch 3, Loss: 0.3263
Epoch 4, Loss: 0.3079
Epoch 5, Loss: 0.2924
Epoch 6, Loss: 0.2792
Epoch 7, Loss: 0.2679
Epoch 8, Loss: 0.2584
Epoch 9, Loss: 0.2503
Epoch 10, Loss: 0.2417
Epoch 11, Loss: 0.2360
Epoch 12, Loss: 0.2309
Epoch 13, Loss: 0.2232
Epoch 14, Loss: 0.2182
Epoch 15, Loss: 0.2106
Epoch 16, Loss: 0.2062
Epoch 17, Loss: 0.2019
Epoch 18, Loss: 0.1966
Epoch 19, Loss: 0.1914
Epoch 20, Loss: 0.1846
Epoch 21, Loss: 0.1815
Epoch 22, Loss: 0.1783
Epoch 23, Loss: 0.1768
Epoch 24, Loss: 0.1761
Epoch 25, Loss: 0.1719
Epoch 26, Loss: 0.1665
Epoch 27, Loss: 0.1621
Epoch 28, Loss: 0.1606
Epoch 29, Loss: 0.1561
Epoch 30, Loss: 0.1539
Epoch 31, Loss: 0.1516
Epoch 32, Loss: 0.1479
Epoch 33, Loss: 0.1485
Epoch 34, Loss: 0.1420
Epoch 35, Loss: 0.1435
Epoch 36, Loss: 0.1375
Epoch 37, Loss: 0.1373
Epoch 38, Loss: 0.1347
Epoch 39, Loss: 0.1309
Epoch 40, Loss: 0.1272
Epoch 41, Loss: 0.1330
Epoch 42, Loss: 0.1240
Epoch 43, Loss: 0.121

## CNN Without Padding and Pooling

In [None]:
class CNN_NoPaddingNoPooling:
  def __init__(self, input_shape, nn_hidden_units, output_shape, kernel_size, stride=1):
    self.stride = stride # Use stride=1 for now
    cnn_shape = ((input_shape - kernel_size) // self.stride) + 1 # Final shape after convolution
    
    self.kernel = np.random.randn(kernel_size, kernel_size) * np.sqrt(2 / (kernel_size * kernel_size))
    self.bias = np.zeros((cnn_shape, cnn_shape))
    self.activation = ReLU()
    self.nn = NeuralNetworkForCNN(input_shape=cnn_shape**2, hidden_units=nn_hidden_units, output_shape=output_shape)
    pass

  # Only 2D convolution is implemented
  def conv2d(self, X_batch, kernel):
    batch_size, n, _ = X_batch.shape
    k = kernel.shape[0]
    final_shape = ((n - k) // self.stride) + 1
    output = np.zeros((batch_size, final_shape, final_shape))

    for i in range(0, final_shape, self.stride):
      for j in range(0, final_shape, self.stride):
        region = X_batch[:, i:i+k, j:j+k]                       # shape: (batch_size, k, k)
        output[:, i, j] = np.sum(region * kernel, axis=(1, 2))  # sum over k x k

    return output

  def forward(self, X_batch):
    Z = self.conv2d(X_batch, self.kernel) + self.bias
    A = self.activation.f(Z)
    A_flat = A.reshape(A.shape[0], -1)  # flatten each sample
    nn_zs, nn_As = self.nn.forward(A_flat)
    return Z, A, A_flat, nn_zs, nn_As
  
  def backward(self, X_batch, y_batch, z, A, A_flat, nn_zs, nn_As, lr=0.1):
    dL_dAF = self.nn.backward(A_flat, y_batch, nn_zs, nn_As, lr)
    dL_dA = dL_dAF.reshape(A.shape)               # Reshape back to original shape

    dA_dz = self.activation.fp(z)                 # Gradient of activation function
    dL_dz = dL_dA * dA_dz                         # Gradient of loss w.r.t. z

    # Gradient of loss w.r.t. bias
    dL_db = np.sum(dL_dz, axis=0)

    # Convolution backward: convolve input with dL_dz
    dL_dk = np.zeros_like(self.kernel)
    for b in range(X_batch.shape[0]):
      dL_dk += self.conv2d(X_batch[b:b+1], dL_dz[b]).squeeze()

    # Average gradients over batch size
    dL_dk /= X_batch.shape[0]

    # Update kernel and bias
    self.kernel -= lr * dL_dk
    self.bias -= lr * dL_db

  def get_batches(self, X, y, batch_size):
    for i in range(0, len(X), batch_size):
      yield X[i:i+batch_size], y[i:i+batch_size]

  def train(self, X, y, epochs=100, lr=0.1, batch_size=32):
    for epoch in range(epochs):
      total_loss = 0
      batches = self.nn.get_batches(X, y, batch_size)

      for X_batch, y_batch in batches:
        Z, A, A_flat, nn_zs, nn_As = self.forward(X_batch)
        loss = self.nn.loss_fn.f(y_batch, nn_As[-1])
        total_loss += loss
        self.backward(X_batch, y_batch, Z, A, A_flat, nn_zs, nn_As, lr)

      avg_loss = total_loss / (len(X) // batch_size)
      print(f"Epoch {epoch}, Loss: {avg_loss:.4f}")

In [72]:
# Load FashionMNIST dataset from pytorch
train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=ToTensor())
test_dataset = datasets.FashionMNIST(root="./data", train=False, download=True, transform=ToTensor())
class_names = train_dataset.classes

# Extract data and labels
X_train = train_dataset.data.numpy().astype(np.float32) / 255.0 # Normalize to [0, 1]
y_train = train_dataset.targets.numpy()
X_test = test_dataset.data.numpy().astype(np.float32) / 255.0 # Normalize to [0, 1]
y_test = test_dataset.targets.numpy()

X_train.shape, X_test.shape, y_train.shape, y_test.shape, class_names

((60000, 28, 28),
 (10000, 28, 28),
 (60000,),
 (10000,),
 ['T-shirt/top',
  'Trouser',
  'Pullover',
  'Dress',
  'Coat',
  'Sandal',
  'Shirt',
  'Sneaker',
  'Bag',
  'Ankle boot'])

In [None]:
np.random.seed(42)

# Train
model = CNN_NoPaddingNoPooling(input_shape=28, nn_hidden_units=[], output_shape=10, kernel_size=3, stride=1)
model.train(X_train, y_train, epochs=20, lr=0.1, batch_size=32)

# Test
_, _, _, _, test_activations = model.forward(X_test)
y_pred = np.argmax(test_activations[-1], axis=1)
accuracy = np.mean(y_pred == y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Epoch 0, Loss: 0.6251
Epoch 1, Loss: 0.4949
Epoch 2, Loss: 0.4690
Epoch 3, Loss: 0.4530
Epoch 4, Loss: 0.4410
Epoch 5, Loss: 0.4310
Epoch 6, Loss: 0.4224
Epoch 7, Loss: 0.4150
Epoch 8, Loss: 0.4085
Epoch 9, Loss: 0.4027
Epoch 10, Loss: 0.3975
Epoch 11, Loss: 0.3928
Epoch 12, Loss: 0.3887
Epoch 13, Loss: 0.3849
Epoch 14, Loss: 0.3814
Epoch 15, Loss: 0.3781
Epoch 16, Loss: 0.3751
Epoch 17, Loss: 0.3723
Epoch 18, Loss: 0.3697
Epoch 19, Loss: 0.3673
Test Accuracy: 85.44%
