In [1]:
import sys
sys.path.append('..')
sys.path.append('../ehrshot')
import copy
from typing import Literal
import argparse
import pandas as pd
import numpy as np
import os

import torch
from torch import nn
from torch.distributions import Distribution
from torch_uncertainty.utils.distributions import cat_dist
from torch_uncertainty.routines import ClassificationRoutine
from torch_uncertainty.utils import TUTrainer
from torch_uncertainty.models import deep_ensembles, mc_dropout
from torch_uncertainty.transforms import RepeatTarget
import torchvision.transforms as T

from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
pd.options.display.max_seq_items = 2000

In [2]:
import torch_uncertainty.metrics.classification.brier_score as brier_score

In [3]:
print(brier_score)

<module 'torch_uncertainty.metrics.classification.brier_score' from '/home/zizhang/anaconda3/envs/uq_ehr/lib/python3.10/site-packages/torch_uncertainty/metrics/classification/brier_score.py'>


In [4]:
results_dict = {}


unique_tasks_1 = ['value_los', 'value_icu']
unique_tasks_2 = ['value_hypoglycemia', 'value_hyperkalemia', 'value_hyponatremia', 'value_anemia', 'value_thrombocytopenia']
unique_tasks_3 = ['value_new_hypertension', 'value_new_hyperlipidemia', 'value_new_acutemi']


all_tasks = [unique_tasks_1, unique_tasks_2, unique_tasks_3]
all_tasks_name = ['general_operation_v1', 'lab_test', 'new_diagnose']
embed_df = pd.read_csv('embedding_matrix/embed.csv')

In [5]:
# embed_task = unique_tasks_1 + unique_tasks_2 + unique_tasks_3
# def generate_embeddings(tasks, dimensions, mean, std_dev):
#     np.random.seed(42)
#     embeddings = {}

#     for task in tasks:
#         embeddings[task] = np.random.normal(mean, std_dev, dimensions)

#     # Convert the dictionary of embeddings to a DataFrame
#     df_embeddings = pd.DataFrame(embeddings)

#     return df_embeddings

# df = generate_embeddings(embed_task, 768, 0, 0.5)
# df.to_csv('embedding_matrix/embed.csv', index = False)

In [6]:
class NN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.dropout1 = nn.Dropout(p=0.2)
        self.fc2 = nn.Linear(hidden_size, num_classes)
#         self.fc2 = nn.Linear(hidden_size, hidden_size)
#         self.dropout2 = nn.Dropout(p=0.2)
#         self.fc3 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
#         x = self.dropout2(x)
#         x = F.relu(self.fc3(x))
        return x


def optim_recipe(model, lr_mult: float = 1.0):
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01 * lr_mult, momentum=0.9)
    return {"optimizer": optimizer}

In [7]:
max_epochs = 50
batch_size = 64

for i in tqdm(range(len(all_tasks))):
# for i in [0]:
    general_task_name = all_tasks_name[i]

    folder_path = f'same_time_data/{general_task_name}'
    
    train_x_name = os.path.join(folder_path, 'x_train.csv')
    train_y_name = os.path.join(folder_path, 'y_train.csv')
    val_x_name = os.path.join(folder_path, 'x_val.csv')
    val_y_name = os.path.join(folder_path, 'y_val.csv')
    test_x_name = os.path.join(folder_path, 'x_test.csv')
    test_y_name = os.path.join(folder_path, 'y_test.csv')

    X_train_all = []
    X_val_all = []
    
    y_train_all = []
    y_val_all = []
    
    for j in tqdm(range(len(all_tasks[i]))):
    # for j in range(1):
        specific_task_name = all_tasks[i][j]
        specific_task_embed = embed_df[specific_task_name].values
        
        X_train = pd.read_csv(train_x_name).to_numpy()
        X_val = pd.read_csv(val_x_name).to_numpy()
        
        X_train = X_train + specific_task_embed
        X_val = X_val + specific_task_embed
        
        y_train = pd.read_csv(train_y_name)[specific_task_name].astype(int).to_numpy()
        y_val = pd.read_csv(val_y_name)[specific_task_name].astype(int).to_numpy()
        
        X_train_all.append(X_train)
        X_val_all.append(X_val)
        
        y_train_all.append(y_train)
        y_val_all.append(y_val)
        
    X_train_all = np.concatenate(X_train_all, axis = 0)
    X_val_all = np.concatenate(X_val_all, axis = 0)
    
    y_train_all = np.concatenate(y_train_all)
    y_val_all = np.concatenate(y_val_all)
    
    class_weights = compute_class_weight('balanced', classes=np.unique(y_train_all), y=y_train_all)
    class_weights = torch.tensor(class_weights, dtype=torch.float)
    
    X_train_all = torch.tensor(X_train_all).float()
    X_val_all = torch.tensor(X_val_all).float()
    
    y_train_all = torch.tensor(y_train_all).long()
    y_val_all = torch.tensor(y_val_all).long()
    
    train_dataset = TensorDataset(X_train_all, y_train_all)
    val_dataset = TensorDataset(X_val_all, y_val_all)
    
    train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dl = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    ###################################################################################################
    
    
    input_size = X_train_all.shape[1]
    hidden_size = 128
    num_classes = 2
    model = NN(input_size, hidden_size, num_classes)
    
    ensemble = deep_ensembles(
        model,
        num_estimators=5,
        task="classification",
        reset_model_parameters=True,
    )
    
    trainer = TUTrainer(accelerator="gpu", max_epochs=max_epochs)

    ens_routine = ClassificationRoutine(
        is_ensemble=True,
        num_classes=2,
        model=ensemble,
        loss=nn.CrossEntropyLoss(weight=class_weights),
        format_batch_fn=RepeatTarget(
            5
        ), 
        optim_recipe=optim_recipe(
            ensemble, 1
        ),
        eval_ood=False,
    )

    trainer.fit(ens_routine, train_dataloaders=train_dl, val_dataloaders=val_dl)
    
    for k in tqdm(range(len(all_tasks[i]))):
        specific_task_name_test = all_tasks[i][k]
        specific_task_embed_k = embed_df[specific_task_name_test].values
        
        X_test = pd.read_csv(test_x_name).to_numpy()
        X_test = X_test + specific_task_embed_k
        y_test = pd.read_csv(test_y_name)[specific_task_name_test].astype(int).to_numpy()
        
        X_test = torch.tensor(X_test).float()
        y_test = torch.tensor(y_test).long()
        
        test_dataset = TensorDataset(X_test, y_test)
        test_dl = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
        
        ens_perf = trainer.test(ens_routine, dataloaders=[test_dl])

        results_dict[specific_task_name_test] = ens_perf
        
    del trainer, ens_routine, model

  0%|                                                     | 0/3 [00:00<?, ?it/s]
  0%|                                                     | 0/2 [00:00<?, ?it/s][A
 50%|██████████████████████▌                      | 1/2 [00:00<00:00,  3.12it/s][A
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00,  3.09it/s][A
Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/d

Sanity Checking: |                                        | 0/? [00:00<?, ?it/s]

/home/zizhang/anaconda3/envs/uq_ehr/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


torch.Size([128, 2]) torch.Size([128])


/home/zizhang/anaconda3/envs/uq_ehr/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Training: |                                               | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


torch.Size([4104, 2]) torch.Size([4104])



  0%|                                                     | 0/2 [00:00<?, ?it/s][ALOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/zizhang/anaconda3/envs/uq_ehr/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Testing: |                                                | 0/? [00:00<?, ?it/s]

torch.Size([2037, 2]) torch.Size([2037])



 50%|██████████████████████▌                      | 1/2 [00:00<00:00,  2.42it/s][ALOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/zizhang/anaconda3/envs/uq_ehr/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Testing: |                                                | 0/? [00:00<?, ?it/s]

torch.Size([2037, 2]) torch.Size([2037])



100%|█████████████████████████████████████████████| 2/2 [00:00<00:00,  2.45it/s][A
 33%|███████████████                              | 1/3 [00:02<00:05,  2.66s/it]
  0%|                                                     | 0/5 [00:00<?, ?it/s][A
 20%|█████████                                    | 1/5 [00:06<00:27,  6.94s/it][A
 33%|███████████████                              | 1/3 [00:09<00:19,  9.60s/it]


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [8]:
# import json
# with open('results_model_uq_v2/results_multi_task_deep_ensemble.json', 'w') as f:
#     json.dump(results_dict, f)


import json
with open('results_model_uq_v3/results_multi_task_deep_ensemble.json', 'w') as f:
    json.dump(results_dict, f)


In [9]:
results_dict.keys()

dict_keys(['value_los', 'value_icu', 'value_hypoglycemia', 'value_hyperkalemia', 'value_hyponatremia', 'value_anemia', 'value_thrombocytopenia', 'value_new_hypertension', 'value_new_hyperlipidemia', 'value_new_acutemi'])

In [10]:
# max_epochs = 50
# batch_size = 64

# # for i in tqdm(range(len(all_tasks))):
# for i in [0]:
#     general_task_name = all_tasks_name[i]

#     folder_path = f'same_time_data/{general_task_name}'
    
#     train_x_name = os.path.join(folder_path, 'x_train.csv')
#     train_y_name = os.path.join(folder_path, 'y_train.csv')
#     val_x_name = os.path.join(folder_path, 'x_val.csv')
#     val_y_name = os.path.join(folder_path, 'y_val.csv')
#     test_x_name = os.path.join(folder_path, 'x_test.csv')
#     test_y_name = os.path.join(folder_path, 'y_test.csv')

#     X_train = pd.read_csv(train_x_name).to_numpy()
#     X_val = pd.read_csv(val_x_name).to_numpy()
#     X_test = pd.read_csv(test_x_name).to_numpy()

#     X_all = pd.DataFrame()
#     X_val = pd.DataFrame()
#     X_test = 
    
#     for j in tqdm(range(len(all_tasks[i]))):
#     # for j in range(1):
#         specific_task_name = all_tasks[i][j]
#         y_train = pd.read_csv(train_y_name)[specific_task_name].astype(int).to_numpy()
#         y_val = pd.read_csv(val_y_name)[specific_task_name].astype(int).to_numpy()
#         y_test = pd.read_csv(test_y_name)[specific_task_name].astype(int).to_numpy()

#         assert len(np.unique(y_train)) == 2
#         # Create class weights
#         class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
#         class_weights = torch.tensor(class_weights, dtype=torch.float)

#         X_train = torch.tensor(X_train).float()
#         X_val = torch.tensor(X_val).float()
#         X_test = torch.tensor(X_test).float()

#         y_train = torch.tensor(y_train).long()
#         y_val = torch.tensor(y_val).long()
#         y_test = torch.tensor(y_test).long()

#         # Create TensorDatasets
#         train_dataset = TensorDataset(X_train, y_train)
#         val_dataset = TensorDataset(X_val, y_val)
#         test_dataset = TensorDataset(X_test, y_test)

#         # Create DataLoaders
#         train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
#         val_dl = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
#         test_dl = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

#         input_size = X_train.shape[1]
#         hidden_size = 128
#         num_classes = 2
#         model = NN(input_size, hidden_size, num_classes)

#         trainer = TUTrainer(accelerator="gpu", max_epochs=max_epochs)

#         ens_routine = ClassificationRoutine(
#             is_ensemble=True,
#             num_classes=2,
#             model=model,
#             loss=nn.CrossEntropyLoss(weight=class_weights),
#             format_batch_fn=RepeatTarget(
#                 1
#             ), 
#             optim_recipe=optim_recipe(
#                 model, 1
#             ),
#             eval_ood=False,
#         )

#         trainer.fit(ens_routine, train_dataloaders=train_dl, val_dataloaders=val_dl)

#         ens_perf = trainer.test(ens_routine, dataloaders=[test_dl])

#         results_dict[specific_task_name] = ens_perf
        
#         del trainer, ens_routine, model

In [11]:
# X_train.numpy().std(), X_train.numpy().mean()