# Tests:

In [1]:
import os
from typing import Dict, List, Optional

import numpy as np
import torch
import torch.nn as nn
import yaml
from datasets import load_from_disk
from torch import optim
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

from PatientTrajectoryForecasting.utils.utils import (
    load_data,
    get_paths,
)

In [2]:
class Encoder(nn.Module):
    """GRU encoder that turns a batch of token-id sequences into one score vector per sequence.

    Pipeline: embedding -> GRU -> learned linear pooling over the time axis ->
    linear fusion of the direction features -> linear layer -> softmax.

    Args:
        source_vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: GRU hidden size and size of the returned score vector.
        n_layers: number of stacked GRU layers.
        batch_first: whether inputs are laid out as (batch, seq) instead of (seq, batch).
        bidirectional: whether the GRU reads the sequence in both directions.
        seq_len: fixed length of every input sequence; the time-pooling layer
            is sized to it, so inputs of another length will fail in ``forward``.
    """

    def __init__(self, source_vocab_size, embedding_dim = 714, hidden_dim = 714, n_layers = 1, batch_first = True, bidirectional = True, seq_len = 512):
        super().__init__()

        num_directions = 2 if bidirectional else 1
        # Remembered so forward() can normalise the GRU output layout.
        self.batch_first = batch_first

        self.embedding = nn.Embedding(source_vocab_size, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, n_layers, batch_first = batch_first, bidirectional = bidirectional)
        # Collapses the time axis: one learned weight per position.
        self.encode = nn.Linear(seq_len, 1)
        # The GRU emits hidden_dim features per direction; sized from hidden_dim
        # (not a literal) so non-default hidden sizes work.
        self.fuse_directions = nn.Linear(hidden_dim * num_directions, hidden_dim)
        self.hidden_layer = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, input_batch):
        """Return softmax-normalised scores of shape (batch, hidden_dim)."""
        embed = self.embedding(input_batch)
        outputs, hidden = self.rnn(embed)
        if not self.batch_first:
            # GRU returned (seq, batch, features); bring batch to the front.
            outputs = outputs.transpose(0, 1)
        # (batch, seq, features) -> (batch, features, seq) so the linear layer pools over time.
        encoded = self.encode(outputs.transpose(1, 2)).squeeze(-1)
        fused_dirs = self.fuse_directions(encoded)
        # NOTE(review): softmax makes each row sum to 1; create_model pairs this
        # output with BCELoss — confirm softmax (rather than sigmoid) is intended.
        outs = torch.nn.functional.softmax(self.hidden_layer(fused_dirs), dim = -1)

        return outs

In [3]:
class ForcastWithNotes(Dataset):
    """Dataset pairing source/target code sequences with tokenized notes.

    ``tokenized_notes`` is a path handed to ``datasets.load_from_disk``; the
    loaded object is indexed with the entry of ``hospital_ids`` for the sample.
    """

    def __init__(self, source_sequences, target_sequences, hospital_ids, tokenized_notes):
        self.source_sequences = source_sequences
        self.target_sequences = target_sequences
        self.hospital_ids = hospital_ids
        # Loaded once, up front, from the given on-disk location.
        self.tokenized_notes = load_from_disk(tokenized_notes)

    def __len__(self):
        """Number of samples, taken from the source sequences."""
        return len(self.source_sequences)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors, notes and the id count."""
        sample_ids = self.hospital_ids[idx]
        id_count = len(sample_ids)

        sample = {}
        sample['source_sequences'] = torch.tensor(self.source_sequences[idx])
        sample['target_sequences'] = torch.tensor(self.target_sequences[idx])
        sample['tokenized_notes'] = self.tokenized_notes[sample_ids]
        sample['hospital_ids_lens'] = id_count
        return sample

def custom_collate_fn(batch):
    """Collate samples into stacked sources and 714-wide multi-hot targets.

    NOTE(review): the next cell defines this function again with the same
    body; that later definition is the one in effect once both cells have run.
    """
    stacked_sources = [sample['source_sequences'] for sample in batch]

    multi_hot_rows = []
    for sample in batch:
        # One float row per sample with a 1 at every target index.
        row = torch.zeros(714)
        row[sample['target_sequences']] = 1
        multi_hot_rows.append(row)

    stacked_sources = torch.stack(stacked_sources, dim=0)
    stacked_targets = torch.stack(multi_hot_rows, dim=0)

    return dict(source_sequences=stacked_sources, target_sequences=stacked_targets)

In [4]:
def custom_collate_fn(batch, num_classes = 714):
    """Collate samples into stacked sources and multi-hot target vectors.

    This cell redefines the function from the previous cell; this later
    definition is the one in effect after both cells have run.

    Args:
        batch: list of sample dicts, each holding a ``'source_sequences'``
            tensor and a ``'target_sequences'`` tensor of integer indices.
        num_classes: width of each multi-hot target row. Defaults to 714, the
            value that was previously hard-coded. Bind another value with
            ``functools.partial`` when passing this to a ``DataLoader``.

    Returns:
        dict with ``'source_sequences'`` (sources stacked along dim 0) and
        ``'target_sequences'`` (float tensor of shape ``(len(batch), num_classes)``
        with 1 at every index listed in the sample's targets).
    """
    source_sequences = [item['source_sequences'] for item in batch]
    target_sequences = []
    for item in batch:
        # Index assignment sets every listed position to 1; repeated indices
        # simply set the same slot again.
        multi_hot = torch.zeros(num_classes)
        multi_hot[item['target_sequences']] = 1
        target_sequences.append(multi_hot)

    source_sequences = torch.stack(source_sequences, dim=0)

    target_sequences = torch.stack(target_sequences, dim=0)

    return {
        'source_sequences': source_sequences,
        'target_sequences': target_sequences,
    }

In [5]:
def custom_collate_fn_test(batch):
    """Collate samples by stacking sources and raw targets along a new batch axis.

    Unlike ``custom_collate_fn``, targets are stacked as they are (no multi-hot
    conversion), so every sample's tensors must share the same shape.
    """
    keys = ('source_sequences', 'target_sequences')
    # Gather every field first, then stack, mirroring a two-pass collate.
    gathered = {key: [sample[key] for sample in batch] for key in keys}
    return {key: torch.stack(tensors, dim=0) for key, tensors in gathered.items()}

In [6]:
# Read the project's path configuration.
with open('PatientTrajectoryForecasting/paths.yaml', 'r') as file:
    path_config = yaml.safe_load(file)

# Resolve the locations of the processed training data (with notes).
train_data_path = get_paths(
    path_config,
    'SDP',
    False,
    False,
    train = True,
    processed_data = True,
    with_notes = True,
)

# Sequences, vocabularies and hospital ids; the two throwaway slots are unused here.
(
    source_sequences,
    target_sequences,
    source_tokens_to_ids,
    target_tokens_to_ids,
    _,
    __,
    hospital_ids_source,
) = load_data(
    train_data_path['processed_data_path'],
    processed_data = True,
    reindexed = True,
)

# Load the pre-built dataset splits.
# NOTE(review): torch.load unpickles its input — only use on trusted files.
train_dataset = torch.load('final_dataset/train_dataset.pth')
val_dataset = torch.load('final_dataset/val_dataset.pth')
test_dataset = torch.load('final_dataset/test_dataset.pth')

old_to_new_ids_source file not availble, mapping is the same as the old one


In [7]:
def create_model(source_tokens_to_ids, target_tokens_to_ids, embedding_dim = 714, hidden_dim = 714, n_layers = 1):
    """Build the encoder on ``DEVICE`` together with its optimizer and loss.

    The source vocabulary size is ``len(source_tokens_to_ids)``.
    ``target_tokens_to_ids`` is accepted but not used by this function.

    Returns:
        (model, optimizer, criterion): the ``Encoder``, an ``Adadelta``
        optimizer over its parameters, and a ``BCELoss`` instance.
    """
    source_vocab_size = len(source_tokens_to_ids)

    # Instantiate the encoder and move it to the notebook-level device.
    encoder = Encoder(source_vocab_size, embedding_dim, hidden_dim, n_layers=n_layers).to(DEVICE)

    # Optimizer (library defaults) and binary cross-entropy loss.
    adadelta = optim.Adadelta(encoder.parameters())
    bce = torch.nn.BCELoss()

    return encoder, adadelta, bce

In [8]:
train_batch_size = 512
val_batch_size = train_batch_size * 2

# Worker count comes from the SLURM allocation when there is one; outside SLURM
# the variable is absent, so fall back to 0 (load in the main process) instead
# of failing with a KeyError.
num_workers = int(os.environ.get("SLURM_CPUS_PER_TASK", 0))

train_dataloader = DataLoader(train_dataset,
                              shuffle = True,
                              batch_size = train_batch_size,
                              num_workers = num_workers,
                              pin_memory = True,
                              collate_fn = custom_collate_fn)

val_dataloader = DataLoader(val_dataset,
                            shuffle = False,
                            batch_size = val_batch_size,
                            num_workers = num_workers,
                            pin_memory = True,
                            collate_fn = custom_collate_fn)

# The test loader keeps raw target sequences (no multi-hot conversion).
test_dataloader = DataLoader(test_dataset,
                             shuffle = False,
                             batch_size = val_batch_size,
                             num_workers = num_workers,
                             pin_memory = True,
                             collate_fn = custom_collate_fn_test)

In [9]:
# Use the GPU when CUDA is usable, otherwise stay on the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [23]:
def train(train_iterator, valid_iterator, source_tokens_to_ids, target_tokens_to_ids, epochs=10, patience=10):
    """Train a fresh model and report MAP@k / Recall@k on the test set.

    Args:
        train_iterator: iterable of batches with ``'source_sequences'`` and
            ``'target_sequences'`` tensors; must support ``len()``.
        valid_iterator: currently unused — no validation pass is run.
        source_tokens_to_ids: source vocabulary, forwarded to ``create_model``.
        target_tokens_to_ids: target vocabulary, forwarded to ``create_model``.
        epochs: number of passes over ``train_iterator``.
        patience: currently unused — no early stopping is implemented.

    Returns:
        (test_mapk, test_recallk): dicts keyed ``test_map@k`` / ``test_recall@k``.

    Relies on notebook-level names: ``DEVICE``, ``create_model``,
    ``get_sequences_lig_doctor``, ``test_dataloader``, ``ks``, ``mapk`` and
    ``recallTop`` must all be defined before this is called.
    """
    model, optimizer, criterion = create_model(source_tokens_to_ids, target_tokens_to_ids)

    for epoch in range(1, epochs + 1):
        running_loss = 0.0
        num_batches = 0
        # set training mode
        model.train()

        # The context manager closes the bar at the end of the epoch so it is
        # finalised before any later progress output.
        with tqdm(total=len(train_iterator), bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}', unit=' batches', ncols=200) as pbar:
            # Loop through the training batch
            for batch in train_iterator:
                # Get the source and target tokens
                src = batch['source_sequences'].to(DEVICE)
                trg = batch['target_sequences'].to(DEVICE)

                optimizer.zero_grad()

                # Forward pass
                output = model(src)

                # Calculate the loss
                loss = criterion(output, trg)

                # back propagation
                loss.backward()

                optimizer.step()

                running_loss += loss.item()
                num_batches += 1

                pbar.set_postfix(
                    epoch=f" {epoch}, train loss= {round(running_loss / num_batches, 4)}", refresh=True)
                pbar.update()

    # Score once, after the final epoch: only the last evaluation was ever
    # returned. Scoring runs on the same device the model was created on.
    pred_trgs, targets = get_sequences_lig_doctor(model, test_dataloader, max_len = 96, DEVICE = DEVICE)

    test_mapk = {f"test_map@{k}": mapk(targets, pred_trgs, k) for k in ks}
    test_recallk = {f"test_recall@{k}": recallTop(targets, pred_trgs, rank = [k])[0] for k in ks}

    return test_mapk, test_recallk

In [24]:
# Single-epoch run; the returned metric dicts are displayed as the cell output.
train(
    train_iterator = train_dataloader,
    valid_iterator = val_dataloader,
    source_tokens_to_ids = source_tokens_to_ids,
    target_tokens_to_ids = target_tokens_to_ids,
    epochs = 1,
)

 98%|█████████▊| 51/52 [00:20<00:00,  2.70 batches/s, epoch=1, train loss= 0.1376]                                                                                                                      
scoring:   0%|          | 0/8 [00:00<?, ?it/s][A
scoring:  12%|█▎        | 1/8 [00:03<00:23,  3.36s/it][A
scoring:  25%|██▌       | 2/8 [00:03<00:09,  1.59s/it][A
scoring:  38%|███▊      | 3/8 [00:04<00:05,  1.02s/it][A
scoring:  50%|█████     | 4/8 [00:04<00:03,  1.33it/s][A
scoring:  62%|██████▎   | 5/8 [00:04<00:01,  1.65it/s][A
scoring:  75%|███████▌  | 6/8 [00:05<00:01,  1.93it/s][A
scoring:  88%|████████▊ | 7/8 [00:05<00:00,  2.18it/s][A
scoring: 100%|██████████| 8/8 [00:05<00:00,  1.42it/s][A
100%|██████████| 52/52 [00:29<00:00,  1.75 batches/s, epoch=1, train loss= 0.1376]                                                                                                                      


({'test_map@20': 0.29729965586703744,
  'test_map@40': 0.268215048857014,
  'test_map@60': 0.24991464844147562},
 {'test_recall@20': 0.18689054511057224,
  'test_recall@40': 0.29336155234242206,
  'test_recall@60': 0.35899976201540224})

In [12]:
from typing import Dict

In [19]:
def get_sequences_lig_doctor(model, dataloader : torch.utils.data.dataloader.DataLoader,
                             max_len : int = 150,
                             DEVICE : str ='cuda:0'):
    """Rank the model's scores per sample and pair them with the targets.

    Args:
        model: module called as ``model(source_batch)`` returning one score
            vector per sample; it is switched to eval mode.
        dataloader: yields dicts with ``'source_sequences'`` and
            ``'target_sequences'`` tensors.
        max_len: number of top-scoring indices to keep per sample; capped at
            the width of the score vector.
        DEVICE: device the batches are moved to; should match the model's device.

    Returns:
        (pred_trgs, targets): two lists of lists, aligned by sample. Each
        prediction list holds the top-ranked indices greater than 4; each
        target list is cut to the length of its prediction list.
    """
    model.eval()
    pred_trgs = []
    targets = []
    with torch.inference_mode():
        for batch in tqdm(dataloader, desc='scoring'):
            source_input_ids = batch['source_sequences'].to(DEVICE)
            target_input_ids = batch['target_sequences'].to(DEVICE)
            output = model(source_input_ids)
            # torch.topk raises when k exceeds the size of the ranked
            # dimension, so never ask for more indices than there are scores.
            top_k = min(max_len, output.size(-1))
            preds = torch.topk(output, k = top_k).indices
            for i in range(target_input_ids.size(0)):
                # Drop indices 0-4 — presumably reserved/special tokens; TODO confirm.
                filtred_preds = preds[i][preds[i]>4]
                filtred_targets = target_input_ids[i][:len(filtred_preds)]
                pred_trgs.append(filtred_preds.tolist())
                targets.append(filtred_targets.tolist())
    return pred_trgs, targets

In [22]:
# NOTE(review): no notebook-level `model` is assigned in the visible cells
# (train() keeps its model local), so this relies on leftover kernel state.
pred_trgs, targets = get_sequences_lig_doctor(
    model = model,
    dataloader = test_dataloader,
    max_len = 96,
)

scoring: 100%|██████████| 8/8 [00:05<00:00,  1.46it/s]


In [17]:
# Change the kernel's working directory into the project folder (IPython `cd`).
# NOTE(review): relative paths used in earlier cells resolve differently after
# this point, and re-running this cell from the new directory will not find the folder.
cd PatientTrajectoryForecasting

/home/sifal.klioui/PatientTrajectoryForecasting


In [18]:
from utils.eval import mapk, recallTop


# Cut-offs at which the ranking metrics are reported.
ks = [20, 40, 60]

# One entry per cut-off, computed from the predictions produced above.
test_mapk = {
    f"test_map@{k}": mapk(targets, pred_trgs, k)
    for k in ks
}
test_recallk = {
    f"test_recall@{k}": recallTop(targets, pred_trgs, rank = [k])[0]
    for k in ks
}
print(test_mapk, test_recallk)

{'test_map@20': 0.26531395405940383, 'test_map@40': 0.2282488541575416, 'test_map@60': 0.2109436663301642} {'test_recall@20': 0.15444687756068506, 'test_recall@40': 0.27659506717594756, 'test_recall@60': 0.353855012723892}


In [45]:
# NOTE(review): scratch cell — in the visible cells `output` and `max_len` only
# exist inside functions, so this relies on leftover kernel state.
preds = torch.topk(output, k = max_len).indices

In [58]:
# Batch size the test DataLoader was constructed with.
test_dataloader.batch_size 

1024

In [55]:
# Number of batches the test DataLoader yields.
len(test_dataloader)

8

In [69]:
# Scratch check of the filtering step on the first row only (the loop breaks
# after its first iteration).
# NOTE(review): `preds` and `target_input_ids` come from earlier scratch state,
# not from a visible notebook-level assignment of `target_input_ids`.
for i in range(test_dataloader.batch_size):
    filtred_preds = preds[i][preds[i]>4]
    filtred_targets = target_input_ids[i][:len(filtred_preds)]
    break

In [70]:
# Sanity check: the truncated targets and the filtered predictions have equal length.
len(filtred_targets) == len(filtred_preds)

True

In [None]:
# NOTE(review): unfinished scratch cell — the extend() call is never closed, so
# this cell is a syntax error as saved. Complete or delete it.
off = []
off.extend(

In [66]:
# NOTE(review): the recorded output is a torch.Size, so this cell was presumably
# last run as `target_input_ids.shape`; the saved source and output are out of sync.
target_input_ids

torch.Size([1024, 96])

In [64]:
# Number of top-ranked indices left for row i after dropping indices <= 4.
len(preds[i][preds[i]>4])

91

In [None]:
# Shape of the first row of the target batch.
target_input_ids[0].shape

In [None]:
from typing import List
import numpy as np

In [26]:
from typing import List
import numpy as np

In [25]:
def get_random_stats(targets: List[List[int]], seq_len : int = 96, ks : List[int] = [20, 40, 60], num_runs_avg : int = 5, seed : Optional[int] = None):
    """
    Returns the average MAP@k and Recall@k scores for a random forecasting model.

    Each run draws, for every target sequence, ``seq_len`` distinct codes
    uniformly from the set of codes that occur anywhere in ``targets``.

    Args:
        targets (List[List[int]]): The list of target sequences.
        seq_len (int, optional): The length of the forecasted sequence. Defaults to 96.
        ks (List[int], optional): The list of k values for MAP@k and Recall@k. Defaults to [20, 40, 60].
        num_runs_avg (int, optional): The number of runs to average the results over. Defaults to 5.
        seed (int, optional): Seed for a dedicated random generator, making the
            baseline reproducible. Defaults to None, which keeps drawing from
            NumPy's global random state.
    Returns:
        Dict[str, float], Dict[str, float]: The average MAP@k and Recall@k scores.
    Raises:
        ValueError: If ``num_runs_avg`` is smaller than 1, or (raised by NumPy)
            if fewer than ``seq_len`` distinct codes occur in ``targets``.
    """
    if num_runs_avg < 1:
        raise ValueError(f"num_runs_avg must be at least 1, got {num_runs_avg}")

    # Relies on the notebook-level `mapk` and `recallTop` imported from utils.eval.
    unique_targets = list(set([item for sublist in targets for item in sublist]))

    # A seeded Generator and the global np.random module both expose .choice
    # with the arguments used below.
    rng = np.random if seed is None else np.random.default_rng(seed)

    cumulative_mapk = {f"test_map@{k}": 0.0 for k in ks}
    cumulative_recallk = {f"test_recall@{k}": 0.0 for k in ks}

    for _ in range(num_runs_avg):

        # One random forecast (sampled without replacement) per target sequence.
        forecasted = [rng.choice(unique_targets, size=seq_len, replace=False).tolist() for _target in targets]

        # Accumulate this run's scores.
        for k in ks:
            cumulative_mapk[f"test_map@{k}"] += mapk(targets, forecasted, k)
        for k in ks:
            cumulative_recallk[f"test_recall@{k}"] += recallTop(targets, forecasted, rank=[k])[0]

    # Compute average results
    average_mapk = {name: total / num_runs_avg for name, total in cumulative_mapk.items()}
    average_recallk = {name: total / num_runs_avg for name, total in cumulative_recallk.items()}

    return average_mapk, average_recallk

In [27]:
# Random-forecast baseline for comparison with the model's scores above.
get_random_stats(targets = targets)

({'test_map@20': 0.05347963114250881,
  'test_map@40': 0.0573658107319462,
  'test_map@60': 0.05544175641942543},
 {'test_recall@20': 0.029036955706191657,
  'test_recall@40': 0.05811648229377542,
  'test_recall@60': 0.08716030403904587})

In [28]:
# Checks that a random forecast has no repeated codes (length equals unique count).
# NOTE(review): in the visible cells `forecasted` only exists inside
# get_random_stats, so this relies on leftover kernel state.
len(forecasted[0]), len(set(forecasted[0]))

(96, 96)