# Deep Learning - Exercise 2
## Emanuele Fontana

This notebook contains implementations of MLP from scratch using only NumPy and PyTorch.

## Import Libraries


In [1]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import torch


## Load Data

In [None]:
# Load dataset
digits = load_digits()
X, y = digits.data, digits.target

# One-hot encode labels
encoder = OneHotEncoder(sparse_output=False)
y_one_hot = encoder.fit_transform(y.reshape(-1, 1))

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y_one_hot, test_size=0.2, random_state=42)
#Train/eval split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Normalize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Train shape:", X_train_scaled.shape)
print("Test shape:", X_test_scaled.shape)

Train shape: (1437, 64)
Test shape: (360, 64)


## Definition of useful functions

In [3]:
def leaky_relu(x, alpha=0.01):
    return torch.where(x > 0, x, alpha * x)

def leaky_relu_derivative(x, alpha=0.01):
    return torch.where(x > 0, torch.ones_like(x), alpha * torch.ones_like(x))

def softmax(x):
    exp_x = torch.exp(x - torch.max(x, dim=1, keepdim=True).values)
    return exp_x / torch.sum(exp_x, dim=1, keepdim=True)

def cross_entropy_loss(y_true, y_pred):
    n_samples = y_true.shape[0]
    log_preds = torch.log(y_pred + 1e-15)
    loss = -torch.sum(y_true * log_preds) / n_samples
    return loss




## Define MLP from Scratch

In [None]:
class MLP:
    def __init__(self, layer_sizes, alpha=0.01, lr=0.01, early_stopping=False,batch_size=16, weight_decay=0.0):
        self.learning_rate = lr
        self.early_stopping = early_stopping
        # We initialize weights with He initialization
        self.W1 = torch.randn(layer_sizes[0], layer_sizes[1]) * np.sqrt(2. / layer_sizes[0])
        self.b1 = torch.zeros(1, layer_sizes[1])
        self.W2 = torch.randn(layer_sizes[1], layer_sizes[2]) * np.sqrt(2. / layer_sizes[1])
        self.b2 = torch.zeros(1, layer_sizes[2])
        self.best_loss = float('inf')
        self.no_improve_epochs = 0
        self.batch_size = batch_size
        self.weight_decay = weight_decay


    def forward(self, X):
        self.Z1 = X @ self.W1 + self.b1
        self.A1 = leaky_relu(self.Z1)
        self.Z2 = self.A1 @ self.W2 + self.b2
        self.A2 = softmax(self.Z2)
        return self.A2

    def backward(self, X, y_true):
        n_samples = y_true.shape[0]
        dZ2 = self.A2 - y_true
        dW2 = self.A1.T @ dZ2 / n_samples
        db2 = torch.sum(dZ2, axis=0, keepdim=True) / n_samples

        dA1 = dZ2 @ self.W2.T
        dZ1 = dA1 * leaky_relu_derivative(self.Z1)
        dW1 = X.T @ dZ1 / n_samples
        db1 = torch.sum(dZ1, axis=0, keepdim=True) / n_samples

        # Apply weight decay to gradients (L2 regularization). Use total training size N if available
        denom = getattr(self, 'train_size', n_samples)
        if self.weight_decay != 0:
            dW2 = dW2 + (self.weight_decay * self.W2) / denom
            dW1 = dW1 + (self.weight_decay * self.W1) / denom

        # Update weights and biases
        self.W2 -= self.learning_rate * dW2
        self.b2 -= self.learning_rate * db2
        self.W1 -= self.learning_rate * dW1
        self.b1 -= self.learning_rate * db1

    def train(self, X, y, epochs=10000):
        # Save total training size so regularization uses consistent N
        self.train_size = X.shape[0]
        for epoch in range(epochs):
            
            # Mini-batch approach
            n_samples = X.shape[0]
            indices = torch.randperm(n_samples)
            
            for i in range(0, n_samples, self.batch_size):
                batch_indices = indices[i:i + self.batch_size]
                X_batch = X[batch_indices]
                y_batch = y[batch_indices]
                
                y_pred = self.forward(X_batch)
                loss = cross_entropy_loss(y_batch, y_pred)
                # Add L2 regularization term to batch loss (normalized by total N)
                if self.weight_decay != 0:
                    reg_term = (self.weight_decay / (2 * self.train_size)) * (torch.sum(self.W1 ** 2) + torch.sum(self.W2 ** 2))
                    loss = loss + reg_term
                self.backward(X_batch, y_batch)
            
            # Calculate loss on full dataset for early stopping
            if self.early_stopping:
                y_pred_full = self.forward(X)
                loss = cross_entropy_loss(y, y_pred_full)
                # Add L2 regularization term to full-dataset loss used for early stopping
                if self.weight_decay != 0:
                    reg_term_full = (self.weight_decay / (2 * X.shape[0])) * (torch.sum(self.W1 ** 2) + torch.sum(self.W2 ** 2))
                    loss = loss + reg_term_full
                
                if loss.item() < self.best_loss:
                    self.best_loss = loss.item()
                    self.no_improve_epochs = 0
                else:
                    self.no_improve_epochs += 1
                    if self.no_improve_epochs >= 10:
                        print(f"Early stopping at epoch {epoch}")
                        break

            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {loss.item()}")

    def predict(self, X):
        y_pred = self.forward(X)
        return torch.argmax(y_pred, axis=1)

## Run Training and Evaluation

In [None]:
# Convert data to torch tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)
# Initialize and train MLP
mlp = MLP(layer_sizes=[64, 32, 10], lr=0.001,  early_stopping=True, batch_size=64,weight_decay=0.0001)
mlp.train(X_train_tensor, y_train_tensor, epochs=1000,X_val=X_val, y_val=y_val)
# Evaluate on test set
y_test_pred = mlp.predict(X_test_tensor)
accuracy = torch.sum(y_test_pred == torch.argmax(y_test_tensor, axis=1)).item() / y_test_tensor.shape[0]
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Epoch 0, Loss: 3.0411300659179688
Epoch 100, Loss: 1.0298587083816528
Epoch 200, Loss: 0.5782808661460876
Epoch 300, Loss: 0.390580952167511
Epoch 400, Loss: 0.2936271131038666
Epoch 500, Loss: 0.2367899864912033
Epoch 600, Loss: 0.199468195438385
Epoch 700, Loss: 0.17310523986816406
Epoch 800, Loss: 0.1531699001789093
Epoch 900, Loss: 0.1373472958803177
Test Accuracy: 96.39%


# Now let's use MNIST dataset

In [None]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Now let's use MNIST dataset
mnist_train = datasets.MNIST(root='./data', train=True, download=True)
mnist_test = datasets.MNIST(root='./data', train=False, download=True)

train_loader = DataLoader(mnist_train, batch_size=64, shuffle=True)
test_loader = DataLoader(mnist_test, batch_size=1000, shuffle=False)
# Convert data to torch tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Initialize and train MLP
mlp = MLP(layer_sizes=[64, 32, 10], lr=0.01,early_stopping=True)
mlp.train(X_train_tensor, y_train_tensor, epochs=1000)
# Evaluate on test set
y_test_pred = mlp.predict(X_test_tensor)
accuracy = torch.sum(y_test_pred == torch.argmax(y_test_tensor, axis=1)).item() / y_test_tensor.shape[0]
print(f"Test Accuracy: {accuracy * 100:.2f}%")

100.0%
100.0%
100.0%
100.0%


Epoch 0, Loss: 2.8666553497314453
Epoch 5000, Loss: 0.046251021325588226
Epoch 10000, Loss: 0.020444655790925026
Epoch 15000, Loss: 0.011924443766474724
Epoch 20000, Loss: 0.008118928410112858
Epoch 25000, Loss: 0.006047670263797045
Epoch 30000, Loss: 0.004771843086928129
Epoch 35000, Loss: 0.0039109475910663605
Epoch 40000, Loss: 0.0032968998420983553
Epoch 45000, Loss: 0.0028389401268213987
Test Accuracy: 96.67%
