In [1]:
import torch 
import torch.nn as nn
import torch.nn.functional as F

import json
from tqdm.notebook import tqdm

from os.path import join as pjoin
import numpy as np

from dataloader import get_dataloader
from model import BERT_MLM

In [2]:
d_model = 128
device = 'cuda'
n_layers = 2

model = BERT_MLM(d_model, 2*d_model, tokenizer_path="data/datasets_full/tokenizer.json", dropout=0.2, device=device)
#model.load_state_dict(torch.load("saved_models/2106_1/2106_1.pt"))

In [3]:
train_dl = get_dataloader(batch_size=128, drop_last=True, mode="train", device=device, mlm_index="all", json_path="data/datasets_full", tokenizer_path="data/datasets_full/tokenizer.json")
valid_dl = get_dataloader(batch_size=128, drop_last=True, mode="valid", device=device, mlm_index="all", json_path="data/datasets_full", tokenizer_path="data/datasets_full/tokenizer.json")

In [4]:
criterion = nn.NLLLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4
    )

epochs = 1
n_prev_epochs = 0

for epoch in range(epochs):
    epoch = epoch + n_prev_epochs
    
    model.train(); torch.cuda.empty_cache()
    epoch_loss, epoch_acc = [], []

    for batch in tqdm(train_dl):
        optimizer.zero_grad()
        
        tokens = batch["tokens"]
        targets = batch["target_tokens"]
        
        predicted_tokens = model(tokens)

        was_mlm_token = tokens == 2
        mlm_predicted_tokens = predicted_tokens[was_mlm_token]

        loss = criterion(mlm_predicted_tokens, targets)
        loss.backward()
        optimizer.step()

        epoch_loss.append(loss.detach().item())
        epoch_acc.append((targets == mlm_predicted_tokens.argmax(dim=1)).float().mean().item())

    epoch_loss = np.array(epoch_loss).mean()
    epoch_acc = np.array(epoch_acc).mean()

    print(f"Epoch {1+epoch}:", "train: loss {:.2f}; accuracy {:.2f}".format(epoch_loss, epoch_acc))

    model.eval(); torch.cuda.empty_cache()
    epoch_loss, epoch_acc = [], []

    for batch in tqdm(valid_dl):
        tokens = batch["tokens"]
        targets = batch["target_tokens"]
        
        predicted_tokens = model(tokens)

        was_mlm_token = tokens == 2
        mlm_predicted_tokens = predicted_tokens[was_mlm_token]

        loss = criterion(mlm_predicted_tokens, targets)
        
        epoch_loss.append(loss.detach().item())
        epoch_acc.append((targets == mlm_predicted_tokens.argmax(dim=1)).float().mean().item())
        
        break

    epoch_loss = np.array(epoch_loss).mean()
    epoch_acc = np.array(epoch_acc).mean()

    print(f"Epoch {1+epoch}:", "valid: loss {:.2f}; accuracy {:.2f}".format(epoch_loss, epoch_acc))

0it [00:00, ?it/s]

Epoch 1: train: loss 3.86; accuracy 0.29


0it [00:00, ?it/s]

Epoch 1: valid: loss 3.12; accuracy 0.41


In [None]:
print(torch.cuda.memory_summary())

In [None]:
#torch.save(model.state_dict(), "saved_models/2006_1/2006_1.pt")

# Calcul des AUPRC AUROC pour les 3 classifications

In [None]:
test_dl = get_dataloader(batch_size=128, drop_last=True, shuffle=True, mode="test", device="cpu", mlm_index="labels", json_path="data/datasets_full", tokenizer=tok)

In [None]:
from sklearn.metrics import average_precision_score as AUPRC
from sklearn.metrics import roc_auc_score as AUROC

torch.cuda.empty_cache()

classes = ["dies", "readm", "duration"]
binary_mapper = {
    "dies": [268, 27], 
    "readm": [94, 33], 
    "duration": [110, 32]
    }

labels, binary_probas = {cls:[] for cls in classes}, {cls:[] for cls in classes}

for batch in tqdm(test_dl):
    tokens = batch["tokens"].to(device)
    targets = batch["target_tokens"].to(device)
    
    predicted_logits = model(tokens)

    was_mlm_index = tokens == 2
    mlm_predicted_logits = predicted_logits[was_mlm_index]
    predicted_tokens = mlm_predicted_logits.argmax(dim=1)

    predicted_tokens = predicted_tokens.reshape((-1, 3)).to('cpu').detach().numpy()
    targets = targets.reshape((-1, 3)).to('cpu').detach().numpy()

    for idx, inference_type in enumerate(classes):
        y_true = targets[:, idx]

        one, zero = binary_mapper[inference_type]

        slicer = torch.tensor([zero, one])
        logits_of_interest = mlm_predicted_logits[idx::len(classes), slicer]
        probas_of_interest = torch.nn.Softmax(dim=-1)(logits_of_interest).to('cpu').detach().numpy()
        probas_of_interest = probas_of_interest[:, 1]

        y_true_binary = np.where(y_true == one, np.ones_like(y_true), np.zeros_like(y_true))

        labels[inference_type].append(y_true_binary)
        binary_probas[inference_type].append(probas_of_interest)

In [None]:
model.training

In [None]:
labels = {k:np.concatenate(v) for k,v in labels.items()}
binary_probas = {k:np.concatenate(v) for k,v in binary_probas.items()}

In [None]:
AUROC(labels["duration"], binary_probas["duration"]), AUROC(labels["readm"], binary_probas["readm"]), AUROC(labels["dies"], binary_probas["dies"])

In [None]:
AUPRC(labels["duration"], binary_probas["duration"]), AUPRC(labels["readm"], binary_probas["readm"]), AUPRC(labels["dies"], binary_probas["dies"])