# Multi-Layer Perceptron

The baseline test described in the paper is that using an MLP. On the dataset, it achieved a 69.7% measured in AUC.

In [6]:
import math
from typing import Callable, List

import numpy as np

from tinygrad import Tensor, nn
from tinygrad.engine.lazy import LazyBuffer
from tinygrad.tensor import Function
from tinygrad.helpers import colored, trange


AttributeError: module 'functools' has no attribute 'cached_propertyFexp'

## Creating the Dataset

The methodology in the paper prescribes a 65/15/20 train/val/test set with 5 cross validation splits. The data is found in `ticdata2000.txt` and the pair of `ticeval2000.txt` and `tictgts2000.txt` (tic=The Insurance Company, eval=Evaluation/Test, tgts=Targets).


In [3]:
def load_data(file_path: str, delimiter: str = "\t", has_target: bool = True):
    data = np.loadtxt(file_path, delimiter=delimiter)
    if has_target:
        X, y = data[:, :-1], data[:, -1]
        return X, y
    return data


def normalize_nonbinary_columns(X: np.ndarray) -> np.ndarray:
    for i in range(X.shape[1]):
        unique_vals = np.unique(X[:, i])
        if not np.array_equal(unique_vals, [0, 1]):
            X[:, i] = (X[:, i] - np.mean(X[:, i])) / np.std(X[:, i])
    return X


In [4]:
with open("dataset/names.txt", "r") as file:
    feature_names = [line.strip() for line in file.readlines()]

X_train, y_train = load_data("dataset/ticdata2000.txt")
X_train = normalize_nonbinary_columns(X_train)

X_test = load_data("dataset/ticeval2000.txt", has_target=False)
X_test = normalize_nonbinary_columns(X_test)
y_test = np.loadtxt("dataset/tictgts2000.txt", delimiter="\t")

X_combined = np.vstack((X_train, X_test))
y_combined = np.concatenate((y_train, y_test))

num_samples = X_combined.shape[0]
indices = np.random.permutation(num_samples)
X_shuffled = X_combined[indices]
y_shuffled = y_combined[indices]

train_end = int(0.65 * num_samples)
val_end = int((0.65 + 0.15) * num_samples)

X_train_new = X_shuffled[:train_end]
y_train_new = y_shuffled[:train_end]
X_val_new = X_shuffled[train_end:val_end]
y_val_new = y_shuffled[train_end:val_end]
X_test_new = X_shuffled[val_end:]
y_test_new = y_shuffled[val_end:]


## Defining the Model

- 2 hidden layers:
  - $l$ is the input size
  - 1<sup>st</sup> hidden layer had $m_1l$ units where $1\le m_1\le 8$.
  - 2<sup>nd</sup> hidden layer had $m_2l$ units where $1\le m_2\le 3$.
- SELU Activation Function.
- Batch Normalization after each layer

In [5]:
# https://github.com/pytorch/pytorch/blob/96aaa311c0251d24decb9dc5da4957b7c590af6f/torch/nn/modules/activation.py#L507
class Selu(Function):
    _alpha: float = 1.6732632423543772848170429916717
    _lambda: float = 1.0507009873554804934193349852946

    def forward(self, x: LazyBuffer) -> LazyBuffer:
        self.ret = self._lambda * LazyBuffer.where(x >= 0, x, self._alpha * (x.exp() - 1))
        return self.ret

    def backward(self, grad_output: LazyBuffer) -> LazyBuffer:
        dx = LazyBuffer.where(
            self.ret >= 0,
            self._lambda,
            self._lambda * self._alpha * self.x.exp(),
        )
        return dx * grad_output


NameError: name 'Function' is not defined

In [1]:
class MLP:
    def __init__(self, l: int, m1: int = 4, m2: int = 2) -> None:
        self.layers: List[Callable[[Tensor], Tensor]] = [
            nn.Linear(l, m1 * l),
            nn.BatchNorm(m1 * l),
            Selu.forward,
            nn.Linear(m1 * l, m2 * l),
            nn.BatchNorm(m2 * l),
            Selu.forward,
            nn.Linear(m2 * l, 1),
            Tensor.sigmoid,
        ]

    def __call__(self, x: Tensor) -> Tensor:
        return x.sequential(self.layers)


NameError: name 'Tensor' is not defined

## Training the Model

- Evaluation metric was Area under the Curve (AUC).
- Cross Entropy Loss.
- AdamW optimizer.
- Constant Learning Rate (What value?).
- Trained with early stopping based on the performance of validation set.
  - Stopping patience (# of epochs) was 15.

In [27]:
model = MLP(l=X_train_new.shape[1])
optim = nn.optim.AdamW(nn.state.get_parameters(model))

@Tensor.train()
def train_step() -> Tensor:
    optim.zero_grad()
    samples = np.random.randint(0, X_train_new.shape[0], 128)
    X_batch = Tensor(X_train_new[samples], dtype="float32")
    y_batch = Tensor(y_train_new[samples], dtype="float32")

    loss = model(X_batch).cross_entropy(y_batch).backward()
    optim.step()
    return loss

@Tensor.test()
def get_test_auc() -> Tensor:
    return (model(Tensor(X_test_new)).argmax(axis=1) == Tensor(y_test_new)).mean() * 100

test_acc = float("nan")
for i in (t := trange(15)):
    loss = train_step()
    test_acc = get_test_auc().item()
    t.set_description(f"loss: {loss.item():6.2f} test_accuracy: {test_acc:5.2f}%")

if 0.0 <= test_acc < 100:
    print(colored(f"{test_acc=} >= 0.0", "green"))
else:
    raise ValueError(colored(f"{test_acc=} < 0.0", "red"))


loss:   0.00 test_accuracy: 94.20%: 100%|███████| 15/15 [00:02<00:00,  5.63it/s]


[32mtest_acc=94.19847869873047 >= 0.0[0m
