In [1]:
import sys
sys.path.append('..')
sys.path.append('../ehrshot')
import copy
from typing import Literal
import argparse
import pandas as pd
import numpy as np
import os
import json

import torch
from torch import nn
from torch.distributions import Distribution
from torch_uncertainty.utils.distributions import cat_dist
from torch_uncertainty.routines import ClassificationRoutine
from torch_uncertainty.utils import TUTrainer
from torch_uncertainty.models import deep_ensembles, mc_dropout
from torch_uncertainty.transforms import RepeatTarget
import torchvision.transforms as T

from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

In [2]:
unique_tasks_1 = ['guo_los', 'guo_readmission', 'guo_icu']
unique_tasks_2 = ['new_hypertension', 'new_hyperlipidemia', 'new_pancan', 'new_celiac', 'new_lupus', 'new_acutemi']
unique_tasks_3 = ['lab_thrombocytopenia', 'lab_hyperkalemia', 'lab_hyponatremia', 'lab_anemia', 'lab_hypoglycemia']

all_tasks = [unique_tasks_1, unique_tasks_2, unique_tasks_3]
all_tasks_name = ['unique_tasks_1', 'unique_tasks_2', 'unique_tasks_3']

labeling_functions=[
    "guo_los",
    "guo_readmission",
    "guo_icu",
    "new_hypertension",
    "new_hyperlipidemia",
    "new_pancan",
    "new_celiac",
    "new_lupus",
    "new_acutemi",
    "lab_thrombocytopenia",
    "lab_hyperkalemia",
    "lab_hyponatremia",
    "lab_anemia",
    "lab_hypoglycemia" # will OOM at 200G on `gpu` partition
]

In [3]:
def expand_embeddings(df, task_embeddings):
    embeddings = df['task'].map(task_embeddings)
    new_columns = [f'task_emb_{i}' for i in range(task_embedding_dim)]
    df = pd.concat([df.drop('task', axis=1), pd.DataFrame(embeddings.tolist(), columns=new_columns, index=df.index)], axis=1)
    return df

In [4]:
with open('task_embeddings.json', 'r') as file:
    task_embeddings = json.load(file)
task_embedding_dim = 128

In [1]:
# for i in range(len(labeling_functions)):

# for j in range(len(all_tasks)):

    X_train_all = []
    y_train_all = []
    X_val_all = []
    y_val_all = []

    task_type = all_tasks_name[j]
    folder_path_dir = f'multi_task_data_uq_v2/{task_type}'

    if not os.path.exists(folder_path_dir):
        os.makedirs(folder_path_dir)

    unique_tasks = all_tasks[j]

    for i in range(len(unique_tasks)):

        task = unique_tasks[i]
        folder_path_data = f'multi_task_data_uq/{task_type}/{task}'
        folder_path = f'single_task_data/{task}'

        if not os.path.exists(folder_path_data):
            os.makedirs(folder_path_data)

        train_x_name = os.path.join(folder_path, 'X_train.csv')
        train_y_name = os.path.join(folder_path, 'y_train.csv')
        val_x_name = os.path.join(folder_path, 'X_val.csv')
        val_y_name = os.path.join(folder_path, 'y_val.csv')
        test_x_name = os.path.join(folder_path, 'X_test.csv')
        test_y_name = os.path.join(folder_path, 'y_test.csv')

        X_train = pd.read_csv(train_x_name).to_numpy()
        y_train = pd.read_csv(train_y_name).to_numpy().reshape(-1)
        X_val = pd.read_csv(val_x_name).to_numpy()
        y_val = pd.read_csv(val_y_name).to_numpy().reshape(-1)
        X_test = pd.read_csv(test_x_name).to_numpy()
        y_test = pd.read_csv(test_y_name).to_numpy().reshape(-1)

        # expand the embeddings for the task
        X_train = np.concatenate([X_train, np.round(np.tile(task_embeddings[task], (X_train.shape[0], 1)), 4)], axis=1)
        X_val = np.concatenate([X_val,  np.round(np.tile(task_embeddings[task], (X_val.shape[0], 1)), 4)], axis=1)
        X_test = np.concatenate([X_test, np.round(np.tile(task_embeddings[task], (X_test.shape[0], 1)), 4)], axis=1)

        # class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
        # class_weights = torch.tensor(class_weights, dtype=torch.float)
        X_train_all.append(X_train)
        y_train_all.append(y_train)

        X_val_all.append(X_val)
        y_val_all.append(y_val)

        pd.DataFrame(X_test).to_csv(os.path.join(folder_path_data, 'X_test.csv'), index=False)
        pd.DataFrame(y_test).to_csv(os.path.join(folder_path_data, 'y_test.csv'), index=False)

    X_train_all = np.concatenate(X_train_all, axis=0)
    y_train_all = np.concatenate(y_train_all, axis=0)

    X_val_all = np.concatenate(X_val_all, axis=0)
    y_val_all = np.concatenate(y_val_all, axis=0)

    pd.DataFrame(X_train_all).to_csv(f'multi_task_data_uq/{task_type}/X_train_all.csv', index=False)
    pd.DataFrame(y_train_all).to_csv(f'multi_task_data_uq/{task_type}/y_train_all.csv', index=False)

    pd.DataFrame(X_val_all).to_csv(f'multi_task_data_uq/{task_type}/X_val_all.csv', index=False)
    pd.DataFrame(y_val_all).to_csv(f'multi_task_data_uq/{task_type}/y_val_all.csv', index=False)

SyntaxError: invalid syntax. Perhaps you forgot a comma? (2101942803.py, line 54)

'single_task_data/lab_hypoglycemia'

In [7]:
X_train_all

array([[-1.745 , -0.1219,  1.8125, ...,  0.2943,  0.6588,  0.7391],
       [-2.037 , -0.47  ,  1.274 , ...,  0.2943,  0.6588,  0.7391],
       [-1.877 , -1.647 ,  1.546 , ...,  0.2943,  0.6588,  0.7391],
       ...,
       [-1.975 , -1.2   , -0.7544, ...,  0.8698,  0.0425,  0.9779],
       [-1.325 , -1.646 , -0.6934, ...,  0.8698,  0.0425,  0.9779],
       [-1.88  , -0.7427, -0.2286, ...,  0.8698,  0.0425,  0.9779]])