# Deep Learning - Exercise 2
## Emanuele Fontana

This notebook contains implementations of MLP from scratch using only NumPy and PyTorch.

## Import Libraries


In [9]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import torch


## Load Data

In [10]:
# Load dataset
digits = load_digits()
X, y = digits.data, digits.target

# One-hot encode labels
encoder = OneHotEncoder(sparse_output=False)
y_one_hot = encoder.fit_transform(y.reshape(-1, 1))

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y_one_hot, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Normalize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)


print("Train shape:", X_train_scaled.shape)
print("Test shape:", X_test_scaled.shape)

Train shape: (1149, 64)
Test shape: (360, 64)


## Definition of useful functions

In [11]:
def leaky_relu(x, alpha=0.01):
    return torch.where(x > 0, x, alpha * x)

def leaky_relu_derivative(x, alpha=0.01):
    return torch.where(x > 0, torch.ones_like(x), alpha * torch.ones_like(x))

def softmax(x):
    exp_x = torch.exp(x - torch.max(x, dim=1, keepdim=True).values)
    return exp_x / torch.sum(exp_x, dim=1, keepdim=True)

def cross_entropy_loss(y_true, y_pred):
    n_samples = y_true.shape[0]
    log_preds = torch.log(y_pred)
    loss = -torch.sum(y_true * log_preds) / n_samples
    return loss



## Define MLP from Scratch

In [25]:
class MLP:
    def __init__(self, layer_sizes, lr=0.01,
                 early_stopping=False, weight_decay=0.0,
                 lr_schedule=None, lr_decay=0.5, step_size=10,
                 gamma=0.95, min_lr=1e-6):

        self.learning_rate = lr
        self.early_stopping = early_stopping
        self.weight_decay = weight_decay

        # learning-rate scheduler params
        self.lr_schedule = lr_schedule
        self.lr_decay = lr_decay
        self.step_size = step_size
        self.gamma = gamma
        self.min_lr = min_lr

        # He initialization
        self.W1 = torch.randn(layer_sizes[0], layer_sizes[1]) * np.sqrt(2. / layer_sizes[0])
        self.b1 = torch.zeros(1, layer_sizes[1])
        self.W2 = torch.randn(layer_sizes[1], layer_sizes[2]) * np.sqrt(2. / layer_sizes[1])
        self.b2 = torch.zeros(1, layer_sizes[2])

        self.best_loss = float('inf')
        self.no_improve_epochs = 0


    def forward(self, X):
        self.Z1 = X @ self.W1 + self.b1
        self.A1 = leaky_relu(self.Z1)
        self.Z2 = self.A1 @ self.W2 + self.b2
        self.A2 = softmax(self.Z2)
        return self.A2


    def backward(self, X, y_true):
        n_samples = y_true.shape[0]

        # Output layer (softmax + CE)
        dZ2 = self.A2 - y_true
        dW2 = (self.A1.T @ dZ2) / n_samples
        db2 = torch.sum(dZ2, axis=0, keepdim=True) / n_samples

        # Hidden
        dA1 = dZ2 @ self.W2.T
        dZ1 = dA1 * leaky_relu_derivative(self.Z1)
        dW1 = (X.T @ dZ1) / n_samples
        db1 = torch.sum(dZ1, axis=0, keepdim=True) / n_samples

        # L2 regularization
        dW2 += 2 * self.weight_decay * self.W2
        dW1 += 2 * self.weight_decay * self.W1

        # Gradient update
        self.W2 -= self.learning_rate * dW2
        self.b2 -= self.learning_rate * db2
        self.W1 -= self.learning_rate * dW1
        self.b1 -= self.learning_rate * db1


    def _update_lr(self, epoch):
        """Update LR according to chosen schedule"""

        if self.lr_schedule is None:
            return

        if self.lr_schedule == 'step':
            # decay each step_size epochs
            if (epoch + 1) % self.step_size == 0:
                self.learning_rate = max(self.learning_rate * self.lr_decay, self.min_lr)


        elif self.lr_schedule == 'exponential':
            self.learning_rate = max(self.learning_rate * self.gamma, self.min_lr)



    def train(self, X_train, y_train, X_val, Y_val,
              epochs=10000, num_patience=10, batch_size=32):

        for epoch in range(epochs):
            permutation = torch.randperm(X_train.size()[0])
            for i in range(0, X_train.size()[0], batch_size):
                idx = permutation[i:i+batch_size]
                X_batch, y_batch = X_train[idx], y_train[idx]

                self.forward(X_batch)
                self.backward(X_batch, y_batch)

            # Evaluate validation
            y_val_pred = self.forward(X_val)
            val_ce = cross_entropy_loss(Y_val, y_val_pred)
            l2_penalty = self.weight_decay * (torch.sum(self.W1**2) + torch.sum(self.W2**2))
            val_loss = (val_ce + l2_penalty).item()

            # learning-rate scheduling
            self._update_lr(epoch)

            # early stopping
            if self.early_stopping:
                if val_loss < self.best_loss:
                    self.best_loss = val_loss
                    self.no_improve_epochs = 0
                else:
                    self.no_improve_epochs += 1

                if self.no_improve_epochs >= num_patience:
                    print(f"Early stopping at epoch {epoch+1}")
                    break


    def predict(self, X):
        y_pred = self.forward(X)
        return torch.argmax(y_pred, axis=1)


## Run Training and Evaluation

In [None]:
# Convert data to torch tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)
# Initialize and train MLP with STEP learning rate schedule
mlp = MLP(layer_sizes=[64, 32, 10], lr=0.01, early_stopping=True, weight_decay=0.001, lr_schedule='step', lr_decay=0.01, step_size=100, min_lr=1e-6)
mlp.train(X_train_tensor, y_train_tensor, X_val=X_val_tensor, Y_val=y_val_tensor, epochs=1000,batch_size=16)
# Evaluate on test set
y_test_pred = mlp.predict(X_test_tensor)
accuracy = torch.sum(y_test_pred == torch.argmax(y_test_tensor, axis=1)).item() / y_test_tensor.shape[0]
print(f"Test Accuracy with STEP: {accuracy * 100:.2f}%")

# Initialize and train MLP with EXPONENTIAL learning rate schedule
mlp = MLP(layer_sizes=[64, 32, 10], lr=0.01, early_stopping=True, weight_decay=0.001, lr_schedule='exponential', gamma=0.95, min_lr=1e-6)
mlp.train(X_train_tensor, y_train_tensor, X_val=X_val_tensor, Y_val=y_val_tensor, epochs=1000,batch_size=64)
# Evaluate on test set
y_test_pred = mlp.predict(X_test_tensor)
accuracy = torch.sum(y_test_pred == torch.argmax(y_test_tensor, axis=1)).item() / y_test_tensor.shape[0]
print(f"Test Accuracy with EXPONENTIAL: {accuracy * 100:.2f}%")

# Now let's use MNIST dataset

In [24]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Now let's use MNIST dataset
mnist_train = datasets.MNIST(root='./data', train=True, download=True)
mnist_test = datasets.MNIST(root='./data', train=False, download=True)

train_loader = DataLoader(mnist_train, batch_size=64, shuffle=True)
test_loader = DataLoader(mnist_test, batch_size=1000, shuffle=False)



X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)



# Initialize and train MLP
mlp = MLP(layer_sizes=[64, 32, 10], lr=0.01, early_stopping=True,weight_decay=0.001)
mlp.train(X_train_tensor, y_train_tensor, X_val=X_val_tensor, Y_val=y_val_tensor, epochs=100,batch_size=64)
# Evaluate on test set
y_test_pred = mlp.predict(X_test_tensor)
accuracy = torch.sum(y_test_pred == torch.argmax(y_test_tensor, axis=1)).item() / y_test_tensor.shape[0]
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 95.00%
