In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms

In [None]:
# data loader for MNIST
# defines a transformation that converts PIL images into pytorch tensors between 0 and 1
transform = transforms.ToTensor()
# downloading MNIST training and test datasets
train_set = datasets.MNIST(root="./data", train=True, download=True, transform=transform)
test_set = datasets.MNIST(root="./data", train=False, download=True, transform=transform)
# DataLoader wraps the datasets to provide batching, shuffling and parallel loading 
train_loader = torch.utils.data.DataLoader(train_set, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=1000, shuffle=False)

100%|██████████| 9.91M/9.91M [00:07<00:00, 1.31MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 70.5kB/s]
100%|██████████| 1.65M/1.65M [00:03<00:00, 483kB/s] 
100%|██████████| 4.54k/4.54k [00:00<00:00, 13.8MB/s]


In [None]:
class MLP(nn.Module):
    # fc1 -> fully connected layer transforming 784 inputs (flattened 28x28 image) to 128 neurons.
    # relu1 -> ReLU activation introduces non-linearity
    # fc2 -> fully connected layer maps 128 neurons to 64
    # fc2 -> outputs 10 logits (0-9)
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(64, 10) # 10 output classes
    
    # x.view(-1, 28*28) reshapes the 2D image to 1D tensor (784 features1)
    # -1 lets pytorch infer the batch size
    # output is raw scores (logits) not probabilities 
    def forward(self, x):
        x = x.view(-1, 28*28) # flatten
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        return self.fc3(x)    # raw(logits)

In [None]:
# initialize network, loss, optimizer
model = MLP()
# defines teh loss function -> CE combines softmax and negative log likelihood for classification 
criterion = nn.CrossEntropyLoss()
# adam optimiser
optimizer = optim.Adam(model.parameters(), lr = 0.001)

# train the network (1 epoch for brevity)
model.train()
for epoch in range(1):
    for xb, yb in train_loader:
        # clears gradients from previous step 
        optimizer.zero_grad()
        # runs forward pass on input batch xb, outputs logits
        out = model(xb)
        # calculates the loss
        loss = criterion(out, yb)
        # computes gradients via backpropogation
        loss.backward()
        # updates weights using gradients 
        optimizer.step()
    print(f'Epoch {epoch+1}, loss: {loss.item():.4f}')

Epoch 1, loss: 0.1023


In [None]:
# evaluate accuracy 
# sets the model to evaluation mode (disables output / batch norm updates)
model.eval()
correct = 0 # counts number of correct predictions 
total = 0   # sums total samples 

# disables gradient calculations for faster inference and lower memory
with torch.no_grad():
    for xb, yb in test_loader:
        preds = model(xb).argmax(dim=1) # gets predicted class indices from logits
        correct += (preds == yb).sum().item()
        total += yb.size(0)
print(f'Test Accuracy: {correct / total:.2%}')

Test Accuracy: 95.20%


### Manually

In [18]:
import torch 
from torchvision import datasets, transforms
import numpy as np

# transform = transforms.ToTensor()
# train_set = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
# test_set = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

In [19]:
def torch_to_numpy(dataset):
    """
    Convert torch dataset to a tuple of (X,y) numpy arrays,
    flattening images to 1D vectors
    """
    X = []
    Y = []

    for img, label in dataset:
        X.append(img.numpy().flatten()) # flatten 28x28 to 784
        Y.append(label)
    return np.stack(X), np.array(Y)

X_train, y_train = torch_to_numpy(train_set)
X_test, y_test = torch_to_numpy(train_set)

In [20]:
# one-hot encode the labels for use with softmax/CE
def one_hot(y, num_classes=10):
    """
    convert vector of numeric class labels to one-hot encoded matrix.
    [1,4] => [[0 1 0 0 0 0 0 0 0 0], [0 0 0 0 1 0 0 0 0 0]]
    """
    return np.eye(num_classes)[y]
y_train_oh = one_hot(y_train)
y_test_oh = one_hot(y_test)

In [21]:
# Initialize neural network parameters
input_size = 784    # MNIST images are 28*28 pixels
hidden_size = 128   # number of neorons in hidden layer
output_size = 10    # output classes (0-9)

np.random.seed(42)
W1 = np.random.randn(input_size, hidden_size) * 0.01 # weights for input->hidden
b1 = np.zeros((1, hidden_size)) # bias for hidden
W2 = np.random.randn(hidden_size, output_size) * 0.01 # weights for hidden->output
b2 = np.zeros((1, output_size)) # bias for output

# define activation functions and loss
def relu(x):
    """Apply the ReLU (Rectified Linear Unit) function elementwise"""
    return np.maximum(0,x)

def relu_deriv(x):
    """Derivative of ReLU: 1 if x > 0, else 0"""
    return (x > 0).astype(float)

def softmax(x):
    """
    Apply softmax to each row of x for classification.
    Computes normalized probabilities for each class
    """
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True)) # stability
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

def cross_entropy(y_pred, y_true):
    """
    Compute the mean CE loss (for softmax one-hot)
    y_pred: predicted probabilities; y_true: one-hot true labels.
    """
    eps = 1e-9
    return -np.mean(np.sum(y_true * np.log(y_pred + eps), axis=1))


In [22]:
# Training hyperparameters
lr = 0.1
batch_size = 64 # small batches for stochastic gradient descent-like behaviour 
epochs = 1

N = X_train.shape[0]
for epoch in range(epochs):
    perm = np.random.permutation(N) # shuffle the dataset each epoch
    X_train, y_train_oh, = X_train[perm], y_train_oh[perm]

    for i in range(0, N, batch_size):
        # Select batch 
        xb = X_train[i:i+batch_size]
        yb = y_train_oh[i:i+batch_size]

        # Forward pass (compute scores and activations layer by layer)
        z1 = xb @ W1 + b1   # linear transformation: input to hidden layer
        a1 = relu(z1)       # non linear activation on hidden outputs
        z2 = a1 @ W2 + b2   # linear transform: hidden to output layer
        a2 = softmax(z2)    # softmax converts outputs to probabilities 

        # comput loss (optional but helps monitor training)
        if i % (batch_size * 100) == 0:
            loss = cross_entropy(a2, yb)
            print(f"Epoch {epoch+1}, batch {i//batch_size}: loss = {loss:.4f}")

        # Backpropogation (manual derivative calculations for MLP)
        dz2 = (a2 - yb) / batch_size    # derivative of loss wrt output logits
        dW2 = a1.T @ dz2                # output weight gradients
        db2 = np.sum(dz2, axis=0, keepdims=True)

        da1 = dz2 @ W2.T                # propogate gradient to hidden activations
        dz1 = da1 * relu_deriv(z1)      # chain rule in ReLU derivative 
        dW1 = xb.T @ dz1                # input weight gradients
        db1 = np.sum(dz1, axis=0, keepdims=True)

        # parameter update (gradient descent)
        W2 -= lr * dW2  # output weights
        b2 -= lr * db2  # output bias 
        W1 -= lr * dW1  # input weights
        b1 -= lr * db1  # input bias 


Epoch 1, batch 0: loss = 2.3037
Epoch 1, batch 100: loss = 0.9558
Epoch 1, batch 200: loss = 0.5121
Epoch 1, batch 300: loss = 0.2371
Epoch 1, batch 400: loss = 0.3362
Epoch 1, batch 500: loss = 0.1789
Epoch 1, batch 600: loss = 0.4208
Epoch 1, batch 700: loss = 0.2471
Epoch 1, batch 800: loss = 0.4152
Epoch 1, batch 900: loss = 0.1429


In [23]:
# Evaluate model accuracy on test set (forward pass)
z1 = X_test @ W1 + b1      # input to hidden
a1 = relu(z1)              # ReLU activation 
z2 = a1 @ W2 + b2          # Hidden output
a2 = softmax(z2)           # softmac for probabilities 

y_pred = np.argmax(a2, axis=1)  # predicted digits: class with maximum probability
accuracy = np.mean(y_pred == y_test)
print(f"Test Accuracy: {accuracy:.2%}")

Test Accuracy: 91.49%
