In [1]:
from typing import Any, Dict

import numpy as np
import pandas as pd
import rtdl
import scipy.special
import sklearn
import sklearn.metrics
import sklearn.model_selection
import sklearn.preprocessing
import torch
import torch.nn as nn
import torch.nn.functional as F
import wandb
import zero

In [2]:
# Problem type; the split cell asserts it is 'binclass', 'multiclass' or 'regression'.
task_type = 'regression'

# Fix the random seed before any data splitting or model initialization happens.
# Docs: https://yura52.github.io/delu/0.0.4/reference/api/zero.improve_reproducibility.html
zero.improve_reproducibility(seed=1024)

# Device on which the data tensors and the model are placed.
device = torch.device('cpu')

In [3]:
# Read the training table; the first CSV column is used as the row index.
df = pd.read_csv('data/train.csv', index_col=0)
# Preview the first rows as a quick sanity check of the columns.
df.head()

Unnamed: 0,LD,Velocity (km/s),a (degrees),$ (degrees),DI
0,1,1.0,-60,0,0.4142
1,4,1.25,90,30,0.43447
2,4,2.15,-30,30,0.84327
3,1,1.5,30,0,0.67618
4,2,2.15,30,45,0.77519


In [4]:
assert task_type in ['binclass', 'multiclass', 'regression']

# The 'DI' column is the target: float32 for regression, int64 otherwise.
target_dtype = 'float32' if task_type == 'regression' else 'int64'
y_all = df['DI'].astype(target_dtype).to_numpy()
# Every remaining column is used as a float32 feature.
X_all = df.drop(columns='DI').astype('float32').to_numpy()

split = sklearn.model_selection.train_test_split

# Step 1: hold out 20% of all rows as the test part.
X_fit, X_test, y_fit, y_test = split(X_all, y_all, train_size=0.8)
# Step 2: hold out 20% of the remaining rows as the validation part.
X_train, X_val, y_train, y_val = split(X_fit, y_fit, train_size=0.8)

# Parts are keyed by name; the features and targets of a part share the same key.
X = {'train': X_train, 'test': X_test, 'val': X_val}
y = {'train': y_train, 'test': y_test, 'val': y_val}

In [5]:
# Compact overview of the feature arrays, one entry per part (train / test / val).
pd.Series(X)

train    [[4.0, 0.25, -45.0, 0.0], [2.0, 1.81, -60.0, 7...
test     [[1.0, 1.81, 30.0, 30.0], [2.0, 0.5, 0.0, 0.0]...
val      [[2.0, 1.0, 0.0, 30.0], [4.0, 1.25, -90.0, 30....
dtype: object

In [6]:
# Compact overview of the target arrays, one entry per part (train / test / val).
pd.Series(y)

train    [0.02226, 0.23859, 0.30764, 0.00691, 0.70507, ...
test     [0.7193, 0.17152, 0.70523, 0.43667, 0.40425, 0...
val      [0.39144, 0.43447, 0.40332, 0.41014, 0.02402, ...
dtype: object

In [7]:
# not the best way to preprocess features, but enough for the demonstration
# The scaler statistics come from the training part only and are then applied to every part.
preprocess = sklearn.preprocessing.StandardScaler().fit(X['train'])


def _as_tensor(array):
    """Wrap a numpy array into a tensor that lives on `device`."""
    return torch.tensor(array, device=device)


# NOTE: `X` and `y` are rebound from numpy arrays to tensors here.
X = {part: _as_tensor(preprocess.transform(features)) for part, features in X.items()}
y = {part: _as_tensor(targets) for part, targets in y.items()}


In [8]:
# !!! CRUCIAL for neural networks when solving regression problems !!!
# Regression targets are standardized with the mean / std of the TRAINING targets;
# `y_std` is kept so that `evaluate` can report the error in the original units.
# NOTE: this cell rebinds `y`, so re-running it on its own standardizes the targets
# a second time and overwrites `y_mean` / `y_std`.
y_mean = y_std = None
if task_type == 'regression':
    train_targets = y['train']
    y_mean = train_targets.mean().item()
    y_std = train_targets.std().item()
    y = {part: (targets - y_mean) / y_std for part, targets in y.items()}

# Every task except multiclass works with floating point targets.
if task_type != 'multiclass':
    y = {part: targets.float() for part, targets in y.items()}

In [9]:
# Optimizer hyperparameters used with the MLP baseline.
lr = 3e-4
weight_decay = 0.0

# The model produces a single value per object.
# NOTE(review): a 'multiclass' task would presumably need one output per class - confirm before switching task_type.
d_out = 1
n_features = X_all.shape[1]

model = rtdl.MLP.make_baseline(
    d_in=n_features,
    d_layers=[64] * 3,
    dropout=0.2,
    d_out=d_out,
)

# model = rtdl.ResNet.make_baseline(
#     d_in=X_all.shape[1],
#     d_main=128,
#     d_intermidiate=256,
#     dropout_first=0.2,
#     dropout_second=0.0,
#     n_blocks=2,
#     d_out=d_out,
# )
# lr = 0.001
# weight_decay = 0.0

# model = rtdl.FTTransformer.make_default(
#     n_num_features=X_all.shape[1],
#     cat_cardinalities=None,
#     last_layer_query_idx=[-1],  # it makes the model faster and does NOT affect its output
#     d_out=d_out,
# )

# === ABOUT CATEGORICAL FEATURES ===
# IF you use MLP, ResNet or any other simple feed-forward model (NOT transformer-based model)
# AND there are categorical features
# THEN you have to implement a wrapper that handles categorical features.
# The example below demonstrates how it can be achieved using rtdl.CategoricalFeatureTokenizer.
# ==================================
# 1. When you have both numerical and categorical features, you should prepare your data like this:
#    (X_num<float32>, X_cat<int64>) instead of X<float32>
#    Each column in X_cat should contain values within the range from 0 to <(the number of unique values in column) - 1>;
#    use sklearn.preprocessing.OrdinalEncoder to achieve this;
# 2. Prepare a list of so called "cardinalities":
#    cardinalities[i] = <the number of unique values of the i-th categorical feature>
# 3. See the commented example below and adapt it for your needs.
#
# class Model(nn.Module):
#     def __init__(
#         self,
#         n_num_features: int,
#         cat_tokenizer: rtdl.CategoricalFeatureTokenizer,
#         mlp_kwargs: Dict[str, Any],
#     ):
#         super().__init__()
#         self.cat_tokenizer = cat_tokenizer
#         self.model = rtdl.MLP.make_baseline(
#             d_in=n_num_features + cat_tokenizer.n_tokens * cat_tokenizer.d_token,
#             **mlp_kwargs,
#         )
#
#     def forward(self, x_num, x_cat):
#         return self.model(
#             torch.cat([x_num, self.cat_tokenizer(x_cat).flatten(1, -1)], dim=1)
#         )
#
# model = Model(
#     # `None` means "Do not transform numerical features"
#     # `d_token` is the size of embedding for ONE categorical feature
#     X_num_all.shape[1],
#     rtdl.CategoricalFeatureTokenizer(cardinalities, d_token, True, 'uniform'),
#     mlp_kwargs,
# )
# Then the model should be used as `model(x_num, x_cat)` instead of `model(x)`.

model.to(device)

# FT-Transformer builds its own optimizer; every other model uses AdamW
# with the `lr` / `weight_decay` chosen next to the model definition.
if isinstance(model, rtdl.FTTransformer):
    optimizer = model.make_default_optimizer()
else:
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

# Loss per task type; anything that is not a classification task falls back to MSE.
loss_fn = {
    'binclass': F.binary_cross_entropy_with_logits,
    'multiclass': F.cross_entropy,
}.get(task_type, F.mse_loss)

In [10]:
def apply_model(x_num, x_cat=None):
    """Run the global `model` on a batch, hiding the per-architecture call signature.

    Args:
        x_num: batch of numerical features.
        x_cat: batch of categorical features; only forwarded to FT-Transformer
            and required to be None for MLP / ResNet.

    Returns:
        The raw output of the model for the batch.

    Raises:
        NotImplementedError: if `model` is not an FTTransformer, MLP or ResNet.
    """
    if isinstance(model, rtdl.FTTransformer):
        return model(x_num, x_cat)
    if not isinstance(model, (rtdl.MLP, rtdl.ResNet)):
        raise NotImplementedError(
            f'Looks like you are using a custom model: {type(model)}.'
            ' Then you have to implement this branch first.'
        )
    # Plain feed-forward models take numerical features only.
    assert x_cat is None
    return model(x_num)


@torch.no_grad()
def evaluate(part):
    """Score the global `model` on one data part.

    Args:
        part: key into the global `X` / `y` dictionaries ('train', 'val' or 'test').

    Returns:
        Accuracy for the classification task types; for regression, the RMSE
        multiplied by `y_std`, i.e. expressed in the units of the raw targets.
    """
    model.eval()
    # Predict in chunks of 1024 rows and stitch the outputs back together.
    outputs = [apply_model(chunk) for chunk in zero.iter_batches(X[part], 1024)]
    prediction = torch.cat(outputs).squeeze(1).cpu().numpy()
    target = y[part].cpu().numpy()

    if task_type in ('binclass', 'multiclass'):
        if task_type == 'binclass':
            # Logits -> probabilities -> hard 0/1 labels.
            labels = np.round(scipy.special.expit(prediction))
        else:
            # The predicted label is the class with the largest output.
            labels = prediction.argmax(1)
        return sklearn.metrics.accuracy_score(target, labels)

    assert task_type == 'regression'
    # Targets were divided by y_std, so the RMSE is scaled back by the same factor.
    return sklearn.metrics.mean_squared_error(target, prediction) ** 0.5 * y_std


# The loader yields batches of row indices into the training part rather than the data itself.
# Docs: https://yura52.github.io/delu/reference/api/zero.data.IndexLoader.html
batch_size = 32
n_train_rows = len(X['train'])
train_loader = zero.data.IndexLoader(n_train_rows, batch_size, device=device)

# Progress tracker used by the training loop for early stopping.
# Docs: https://yura52.github.io/delu/reference/api/zero.ProgressTracker.html
progress = zero.ProgressTracker(patience=100)

# Baseline score of the untrained model.
initial_test_score = evaluate("test")
print(f'Test score before training: {initial_test_score:.4f}')

Test score before training: 0.2947


In [11]:
# wandb init
# Hyperparameters recorded for experiment tracking. The learning rate and batch size
# reference the variables that actually configure the optimizer and the index loader,
# so the logged values cannot drift away from the values in use.
config = {
    "learning_rate": lr,
    "epochs": 10,
    "batch_size": batch_size,
    "neurons": 64,
    "dropout": 0.2,
    # "kfolds": 15,
}

# wandb.init(
#     # group=f'impute&OHC&normalization',
#     # name=f'fold_{fold}',
#     project='testing',
#     config=config)

n_epochs = config['epochs']
# Print the training loss about five times per epoch. Clamp to at least 1: with fewer
# than 5 * batch_size training rows the integer division gives 0, which would make
# `iteration % report_frequency` raise ZeroDivisionError.
report_frequency = max(1, len(X['train']) // batch_size // 5)
val_scores = []
test_scores = []
for epoch in range(1, n_epochs + 1):
    # `evaluate` switches the model to eval mode at the end of every epoch,
    # so training mode is restored once per epoch before the batches are processed.
    model.train()
    for iteration, batch_idx in enumerate(train_loader):
        optimizer.zero_grad()
        x_batch = X['train'][batch_idx]
        y_batch = y['train'][batch_idx]
        # The model output has a trailing dimension of size 1; drop it to match the targets.
        loss = loss_fn(apply_model(x_batch).squeeze(1), y_batch)
        loss.backward()
        optimizer.step()
        if iteration % report_frequency == 0:
            print(f'(epoch) {epoch} (batch) {iteration} (loss) {loss.item():.4f}')

    val_score = evaluate('val')
    test_score = evaluate('test')

    # val_scores.append(val_score)
    # test_scores.append(test_score)

    # wandb loggers
    # wandb.log({
    #     # "train loss": test_score,
    #     "valid loss": val_score,
    #     "test loss": test_score,
    #     # "train rmse": train_rmse,
    #     # "valid rmse": valid_rmse,
    #     # "running seed": running_seed,
    #     "epoch": epoch,
    #     "batch_idx": batch_idx
    #     })

    print(f'Epoch {epoch:03d} | Validation score: {val_score:.4f} | Test score: {test_score:.4f}', end='')
    # The tracker treats larger values as better, so the regression error is negated.
    progress.update((-1 if task_type == 'regression' else 1) * val_score)
    if progress.success:
        print(' <<< BEST VALIDATION EPOCH', end='')
    print()
    # Early stopping: the validation score has not improved for `patience` updates.
    if progress.fail:
        break

# Close run
# wandb.finish()

(epoch) 1 (batch) 0 (loss) 1.0061
(epoch) 1 (batch) 5 (loss) 1.1991
(epoch) 1 (batch) 10 (loss) 1.1380
(epoch) 1 (batch) 15 (loss) 0.8603
(epoch) 1 (batch) 20 (loss) 0.8749
(epoch) 1 (batch) 25 (loss) 1.1549
Epoch 001 | Validation score: 0.2584 | Test score: 0.2722 <<< BEST VALIDATION EPOCH
(epoch) 2 (batch) 0 (loss) 0.8908
(epoch) 2 (batch) 5 (loss) 1.0080
(epoch) 2 (batch) 10 (loss) 0.9535
(epoch) 2 (batch) 15 (loss) 0.7217
(epoch) 2 (batch) 20 (loss) 0.7865
(epoch) 2 (batch) 25 (loss) 0.8783
Epoch 002 | Validation score: 0.2252 | Test score: 0.2361 <<< BEST VALIDATION EPOCH
(epoch) 3 (batch) 0 (loss) 0.7059
(epoch) 3 (batch) 5 (loss) 0.7493
(epoch) 3 (batch) 10 (loss) 0.6161
(epoch) 3 (batch) 15 (loss) 0.5001
(epoch) 3 (batch) 20 (loss) 0.4258
(epoch) 3 (batch) 25 (loss) 0.6713
Epoch 003 | Validation score: 0.1655 | Test score: 0.1727 <<< BEST VALIDATION EPOCH
(epoch) 4 (batch) 0 (loss) 0.4273
(epoch) 4 (batch) 5 (loss) 0.4318
(epoch) 4 (batch) 10 (loss) 0.2749
(epoch) 4 (batch) 15 

In [15]:
# Report which architecture the notebook was run with.
print(type(model).__name__)

MLP
