In [None]:
from pathlib import Path
import pandas as pd
from andi_unicorns.data import *
from andi_unicorns.utils import *
from andi_unicorns.models import *
from sklearn.metrics import f1_score
from sklearn.metrics import mean_absolute_error as mae

# Example notebook
> Example notebook showing how to load and predict with the models used in the AnDi Challenge

The models are named following the convention `name_dim{dimension}_t{task}_{id}_custom.pth`. We've only had time to train the models for dimension 1 and tasks 1 and 2. The following function loads the ensemble, assuming that the pre-trained models are in a `models/` directory; change the path at your convenience.

In [None]:
def load_task_model(task, dim=1, model_path=Path("models/")):
    """Loads the pre-trained ensemble for a given task and dimension.

    Args:
        task: Task identifier; only 1 and 2 are supported.
        dim: Dimension used to build the model file names.
        model_path: Directory handed to `load_model` as `path`.

    Returns:
        An `Ensemble` wrapping the loaded models, each moved to the GPU with
        `.cuda()` and switched to eval mode.

    Raises:
        ValueError: If `task` is neither 1 nor 2.
    """
    # Per task: number of model files in the ensemble and the `add_act` flag.
    if task == 1:
        n_mod, act = 7, False
    elif task == 2:
        n_mod, act = 10, True
    else:
        # Fail early with a clear message instead of an UnboundLocalError below.
        raise ValueError(f"Unsupported task {task!r}: only tasks 1 and 2 have pre-trained models.")
    names = [f"hydra_dim{dim}_t{task}_{i}_custom.pth" for i in range(n_mod)]
    models = [load_model(name, path=model_path).cuda() for name in names]
    for model in models:
        model.eval()
    return Ensemble(models, add_act=act)

Our models work with dataloaders that take the raw dataset in `.txt` format and transform it into a dataframe with PyTorch tensors (this may take a while). Provide the path to the directory containing the `task{task}.txt` and `ref{task}.txt` files. I am assuming you won't be trying to train a model, so the dataloader will be ready for validation, preserving the order of the data.

In [None]:
def get_dataloader(task, path, dim=1, bs=128):
    """Provides a dataloader built from the `task{task}.txt` and `ref{task}.txt` files.

    Args:
        task: Task identifier, used to pick the file names and the label format.
        path: Directory (str or Path) containing both `.txt` files.
        dim: Only trajectories whose dimension equals `dim` are kept.
        bs: Batch size passed to `DataLoader`.

    Returns:
        A `DataLoader` over `(x, y)` pairs, created with
        `before_batch=pad_trajectories` and `device=default_device()`.
    """
    # Local import: `csv` is not imported explicitly at the top of this notebook.
    import csv

    path = Path(path)
    records = []
    with open(path/f"task{task}.txt", "r") as D, open(path/f"ref{task}.txt", "r") as Y:
        trajs = csv.reader(D, delimiter=";", lineterminator="\n", quoting=csv.QUOTE_NONNUMERIC)
        labels = csv.reader(Y, delimiter=";", lineterminator="\n", quoting=csv.QUOTE_NONNUMERIC)
        for t, y in zip(trajs, labels):
            # First field of a trajectory row is its dimension; the rest are the values.
            d, x = int(t[0]), t[1:]
            # Reshape to (d, -1) and transpose so that len(x) is the trajectory length.
            x = tensor(x).view(d, -1).T
            # Task 3 keeps every label field as a tensor; other tasks keep the single value y[1].
            label = tensor(y[1:]) if task == 3 else y[1]
            records.append({'dim': d, 'y': label, 'x': x, 'len': len(x)})

    # Build the frame once: row-wise `DataFrame.append` is quadratic and no longer exists in pandas 2.x.
    df = pd.DataFrame(records, columns=['dim', 'y', 'x', 'len'], dtype=object)
    df = df[df['dim'] == dim]
    # Task 1 labels are used as read; for any other task they are cast to int.
    # NOTE(review): for task 3 the labels are tensors; the int cast is kept from the original — confirm it is intended.
    ds = L(zip(df['x'], df['y'])) if task == 1 else L(zip(df['x'], df['y'].astype(int)))
    return DataLoader(ds, bs=bs, before_batch=pad_trajectories, device=default_device())

In order to get the predictions, the next functions can be called.

In [None]:
def get_preds_truth(model, dl):
    "Returns the pair (predictions of `model` over `dl`, labels stored in `dl`)."
    predictions = get_preds(model, dl)
    truth = get_truth(dl)
    return predictions, truth

def get_preds(model, dl):
    """Returns the concatenated predictions of `model` over every batch in `dl`.

    Args:
        model: Callable applied to each input batch `xb`.
        dl: Iterable yielding `(xb, yb)` batches; the labels are ignored here.

    Returns:
        A single tensor with the detached outputs of all batches, in iteration order.
    """
    # Pure inference: disable autograd so no graph is built for each batch.
    with torch.no_grad():
        return torch.cat([to_detach(model(xb)) for xb, _ in dl])

def get_truth(dl):
    "Collects the labels of every batch in `dl` into a single tensor."
    label_batches = []
    for _, yb in dl:
        label_batches.append(to_detach(yb))
    return torch.cat(label_batches)

## Task 1 example

Here we assume that there's a directory `data/train` containing the validation data. Change the `data_path` at your convenience.

In [None]:
# Task 1: load the ensemble and the matching dataloader.
task = 1
# Directory expected to contain `task1.txt` and `ref1.txt`.
data_path = Path("data/train")
# Uses the default dimension (1) and the default `models/` directory.
model = load_task_model(task)
dl = get_dataloader(task, data_path)

The predictions are the exponents so we can compute the mean absolute error straight away. 

In [None]:
preds, true = get_preds_truth(model, dl)
# sklearn metrics take (y_true, y_pred); MAE is symmetric, so the value is unchanged.
score = mae(true, preds)
print(f"MAE: {score:.4f}")

## Task 2 example

As in the previous example, change the `data_path` at your convenience.

In [None]:
# Task 2: load the ensemble and the matching dataloader.
task = 2
# Directory expected to contain `task2.txt` and `ref2.txt`.
data_path = Path("data/train")
# Rebinds `model` and `dl`, replacing the task 1 objects.
model = load_task_model(task)
dl = get_dataloader(task, data_path)

In this case, the predictions are in the format required for the submission. Hence, if we want to get the actual labels we need to call `.argmax(1)` over the output. 

In [None]:
preds, true = get_preds_truth(model, dl)
# Take the index of the highest output along dim 1 as the predicted label.
labels = preds.argmax(1).squeeze()
# Micro-averaged F1 between the true labels and the predicted labels.
score = f1_score(true, labels, average='micro')
print(f"F1: {score:.4f}")