# Multi-Layer Perceptron

The baseline test described in the paper is that using an MLP. On the dataset, it achieved a 69.7% measured in AUC.

In [25]:
import math
from typing import Callable, List

import numpy as np

import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

from tinygrad import Tensor, nn
from tinygrad.engine.lazy import LazyBuffer
from tinygrad.tensor import Function


## Creating the Dataset

The methodology in the paper prescribes a 65/15/20 train/val/test set with 5 cross validation splits. The data is found in `ticdata2000.txt` and the pair of `ticeval2000.txt` and `tictgts2000.txt` (tic=The Insurance Company, eval=Evaluation/Test, tgts=Targets).


In [26]:
def load_data(file_path: str, delimiter: str = "\t", has_target: bool = True):
    data = np.loadtxt(file_path, delimiter=delimiter)
    if has_target:
        X, y = data[:, :-1], data[:, -1]
        return X, y
    return data


def normalize_nonbinary_columns(X: np.ndarray) -> np.ndarray:
    X = X.copy()
    for i in range(X.shape[1]):
        unique_vals = np.unique(X[:, i])
        if not (len(unique_vals) == 2 and np.array_equal(unique_vals, [0, 1])):
            mean = np.mean(X[:, i])
            std = np.std(X[:, i])
            if std > 0:
                X[:, i] = (X[:, i] - mean) / std
    return X


In [27]:
# Instead of combining train and test data, use the original datasets
X_train_orig, y_train_orig = load_data("dataset/ticdata2000.txt")
X_train_orig = normalize_nonbinary_columns(X_train_orig)

X_test_orig = load_data("dataset/ticeval2000.txt", has_target=False)
X_test_orig = normalize_nonbinary_columns(X_test_orig)
y_test_orig = np.loadtxt("dataset/tictgts2000.txt", delimiter="\t")

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_orig)):
    X_train = X_train_orig[train_idx]
    y_train = y_train_orig[train_idx]
    X_val = X_train_orig[val_idx]
    y_val = y_train_orig[val_idx]
    
    X_test = X_test_orig
    y_test = y_test_orig

## Defining the Model

- 2 hidden layers:
  - $l$ is the input size
  - 1<sup>st</sup> hidden layer had $m_1l$ units where $1\le m_1\le 8$.
  - 2<sup>nd</sup> hidden layer had $m_2l$ units where $1\le m_2\le 3$.
- SELU Activation Function.
- Batch Normalization after each layer

In [28]:
# https://github.com/pytorch/pytorch/blob/96aaa311c0251d24decb9dc5da4957b7c590af6f/torch/nn/modules/activation.py#L507
class Selu(Function):
    _alpha: float = 1.6732632423543772848170429916717
    _lambda: float = 1.0507009873554804934193349852946

    def forward(self, x: LazyBuffer) -> LazyBuffer:
        alpha_buf = x.const_like(self._alpha)
        lambda_buf = x.const_like(self._lambda)
        self.ret = lambda_buf * LazyBuffer.where(
            x >= 0, x, alpha_buf * ((x * (1 / math.log(2))).exp2() - 1)
        )
        return self.ret

    def backward(self, grad_output: LazyBuffer) -> LazyBuffer:
        alpha_buf = self.ret.const_like(self._alpha)
        lambda_buf = self.ret.const_like(self._lambda)
        dx = LazyBuffer.where(
            self.ret >= 0,
            lambda_buf,
            lambda_buf * alpha_buf * (self.ret * (1 / math.log(2))).exp2(),
        )
        return dx * grad_output


In [33]:
class MLP:
    def __init__(self, l: int, m1: int = 4, m2: int = 2, init_scale: float = 1.0) -> None:
        self.layers: List[Callable[[Tensor], Tensor]] = [
            lambda x: nn.Linear(l, m1 * l)(x),
            nn.BatchNorm(m1 * l),
            Selu.apply,
            lambda x: nn.Linear(m1 * l, m2 * l)(x),
            nn.BatchNorm(m2 * l), 
            Selu.apply,
            lambda x: nn.Linear(m2 * l, 1)(x),
            Tensor.sigmoid,
        ]

    def __call__(self, x: Tensor) -> Tensor:
        return x.sequential(self.layers)

## Training the Model

- Evaluation metric was Area under the Curve (AUC).
- Cross Entropy Loss.
- AdamW optimizer.
- Constant Learning Rate (What value?).
- Trained with early stopping based on the performance of validation set.
  - Stopping patience (# of epochs) was 15.

In [34]:
# Define all hyperparameter combinations
hyperparams = []
for lr in [0.01, 0.003, 0.001]:
    for batch_size in [64, 128, 256]:
        for patience in [15, 30]:
            for m1, m2 in [(4, 2), (6, 2), (8, 2), (4, 3)]:
                hyperparams.append({
                    "lr": lr,
                    "batch_size": batch_size,
                    "patience": patience,
                    "m1": m1,
                    "m2": m2
                })

# Store results for each hyperparameter combination
all_results = []

# Create the 5-fold CV splitting once
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_splits = list(kf.split(X_train_orig))

# Loop through each hyperparameter combination
for hp_idx, hp in enumerate(hyperparams):
    print(f"\n==== Testing Hyperparameter Set {hp_idx+1}/{len(hyperparams)} ====")
    print(f"lr={hp['lr']}, batch_size={hp['batch_size']}, patience={hp['patience']}, m1={hp['m1']}, m2={hp['m2']}")
    
    # For this hyperparameter combination, run 5-fold CV
    fold_results = []
    
    for fold, (train_idx, val_idx) in enumerate(fold_splits):
        print(f"\n--- Starting Fold {fold+1}/5 ---")
        
        # Get data for this fold using the train/val split
        X_train = X_train_orig[train_idx]
        y_train = y_train_orig[train_idx]
        X_val = X_train_orig[val_idx]
        y_val = y_train_orig[val_idx]
        
        # Use original test set for evaluation
        X_test = X_test_orig
        y_test = y_test_orig
        
        # Initialize model with the current hyperparameters
        input_size = X_train.shape[1]
        model = MLP(l=input_size, m1=hp['m1'], m2=hp['m2'])
        optim = nn.optim.AdamW(nn.state.get_parameters(model), lr=hp['lr'])
        
        # Define training step using current batch size
        @Tensor.train()
        def train_step() -> Tensor:
            optim.zero_grad()
            batch_size = min(hp['batch_size'], len(X_train))
            samples = np.random.randint(0, len(X_train), batch_size)
            X_batch = Tensor(X_train[samples], dtype="float32")
            y_batch = Tensor(y_train[samples], dtype="float32")
            loss = model(X_batch).cross_entropy(y_batch).backward()
            optim.step()
            return loss
        
        @Tensor.test()
        def get_val_auc() -> float:
            y_pred = model(Tensor(X_val, dtype="float32"))
            return roc_auc_score(y_val, y_pred.numpy())
        
        @Tensor.test()
        def get_test_auc() -> float:
            y_pred = model(Tensor(X_test, dtype="float32"))
            return roc_auc_score(y_test, y_pred.numpy())
        
        # Training loop with early stopping using current patience
        best_val_auc = 0
        best_model_state = None
        n_epochs = 0
        epochs_no_improve = 0
        
        while n_epochs < 1000:
            loss = train_step()
            
            if n_epochs % 5 == 0:
                val_auc = get_val_auc()
                print(f"Epoch {n_epochs}, Validation AUC: {val_auc:.4f}")
                
                if val_auc > best_val_auc:
                    best_val_auc = val_auc
                    best_model_state = [p.numpy().copy() for p in nn.state.get_parameters(model)]
                    epochs_no_improve = 0
                else:
                    epochs_no_improve += 5
                
                if epochs_no_improve >= hp['patience']:
                    print(f"Early stopping triggered after {n_epochs} epochs")
                    break
            
            n_epochs += 1
        
        # Restore best model
        if best_model_state is not None:
            for param, best_param in zip(nn.state.get_parameters(model), best_model_state):
                param.assign(best_param)
        
        # Evaluate on test set
        test_auc = get_test_auc()
        fold_results.append(test_auc)
        print(f"Fold {fold+1} Test AUC: {100 * test_auc:.2f}%")
    
    # Calculate and store results for this hyperparameter set
    mean_auc = np.mean(fold_results)
    std_auc = np.std(fold_results)
    
    result = {
        "hyperparams": hp,
        "fold_results": fold_results,
        "mean_auc": mean_auc,
        "std_auc": std_auc
    }
    all_results.append(result)
    
    print(f"\n==== Hyperparameter Set Summary ====")
    print(f"lr={hp['lr']}, batch_size={hp['batch_size']}, patience={hp['patience']}, m1={hp['m1']}, m2={hp['m2']}")
    print(f"Mean AUC: {100 * mean_auc:.2f}% ± {100 * std_auc:.2f}%")

# Find the best hyperparameter combination
best_result = max(all_results, key=lambda x: x["mean_auc"])
best_hp = best_result["hyperparams"]

print("\n==== Best Hyperparameter Combination ====")
print(f"lr={best_hp['lr']}, batch_size={best_hp['batch_size']}, patience={best_hp['patience']}, m1={best_hp['m1']}, m2={best_hp['m2']}")
print(f"Mean AUC: {100 * best_result['mean_auc']:.2f}% ± {100 * best_result['std_auc']:.2f}%")


==== Testing Hyperparameter Set 1/72 ====
lr=0.01, batch_size=64, patience=15, m1=4, m2=2

--- Starting Fold 1/5 ---
Epoch 0, Validation AUC: 0.4738
Epoch 5, Validation AUC: 0.5014
Epoch 10, Validation AUC: 0.4809
Epoch 15, Validation AUC: 0.4388
Epoch 20, Validation AUC: 0.5294
Epoch 25, Validation AUC: 0.4374
Epoch 30, Validation AUC: 0.4887
Epoch 35, Validation AUC: 0.4740
Early stopping triggered after 35 epochs
Fold 1 Test AUC: 46.99%

--- Starting Fold 2/5 ---
Epoch 0, Validation AUC: 0.5178
Epoch 5, Validation AUC: 0.5868
Epoch 10, Validation AUC: 0.4802
Epoch 15, Validation AUC: 0.4839
Epoch 20, Validation AUC: 0.4309
Early stopping triggered after 20 epochs
Fold 2 Test AUC: 47.89%

--- Starting Fold 3/5 ---
Epoch 0, Validation AUC: 0.3906
Epoch 5, Validation AUC: 0.4864
Epoch 10, Validation AUC: 0.6809
Epoch 15, Validation AUC: 0.4109
Epoch 20, Validation AUC: 0.5123
Epoch 25, Validation AUC: 0.6074
Early stopping triggered after 25 epochs
Fold 3 Test AUC: 45.90%

--- Startin