In [1]:
import sys
sys.path.append('..')
sys.path.append('../ehrshot')
import copy
from typing import Literal
import argparse
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

import torch
from torch import nn
from torch.distributions import Distribution
from torch_uncertainty.utils.distributions import cat_dist


In [32]:
# Task names used to split the multi-task data into per-task datasets.
# Order matters only in that tasks are processed in the order listed.
labeling_functions = [
    # guo_* tasks
    "guo_los",
    "guo_readmission",
    "guo_icu",
    # new_* tasks
    "new_hypertension",
    "new_hyperlipidemia",
    "new_pancan",
    "new_celiac",
    "new_lupus",
    "new_acutemi",
    # lab_* tasks
    "lab_thrombocytopenia",
    "lab_hyperkalemia",
    "lab_hyponatremia",
    "lab_anemia",
    "lab_hypoglycemia",  # will OOM at 200G on `gpu` partition
]

In [2]:
# Feature frames are read first because the label frames below borrow
# their `task` column.
X_train = pd.read_csv('multi_task_data/X_train_all.csv')
X_val = pd.read_csv('multi_task_data/X_val_all.csv')
X_test = pd.read_csv('multi_task_data/X_test_all.csv')

# Each label frame gets the `task` column of the matching feature frame
# (pandas aligns the values on the row index), so that labels can be
# filtered by task in the same way as features.
y_train = pd.read_csv('multi_task_data/y_train_all.csv').assign(task=X_train['task'])
y_val = pd.read_csv('multi_task_data/y_val_all.csv').assign(task=X_val['task'])
y_test = pd.read_csv('multi_task_data/y_test_all.csv').assign(task=X_test['task'])

In [18]:
def process_data_subtask(X, y, subtask):
    """Extract the rows of one task from the multi-task feature/label frames.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. Must contain a ``task`` column.
    y : pandas.DataFrame
        Label frame. Must contain a ``task`` column.
    subtask : str
        Value of the ``task`` column to keep.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(X_subtask, y_subtask)``: the rows whose ``task`` equals
        ``subtask``, without the ``task`` column and without the
        ``Unnamed: 0`` column if one is present. The original row index is
        preserved and the input frames are not modified.

    Raises
    ------
    KeyError
        If either frame has no ``task`` column.
    """
    def _select(frame):
        # Filter first so that only the subtask's rows are copied by `drop`.
        rows = frame[frame['task'] == subtask]
        # `Unnamed: 0` is typically a leftover index column from a CSV round
        # trip; it is optional here, so a frame that was loaded without it
        # does not raise. `task` is guaranteed to exist after the filter.
        return rows.drop(columns=['Unnamed: 0', 'task'], errors='ignore')

    return _select(X), _select(y)

In [33]:
# Split the multi-task frames into one folder per task:
#   single_task_data/<task>/{X,y}_{train,val,test}.csv
for task in labeling_functions:
    X_subtask_train, y_subtask_train = process_data_subtask(X_train, y_train, task)
    X_subtask_val, y_subtask_val = process_data_subtask(X_val, y_val, task)
    X_subtask_test, y_subtask_test = process_data_subtask(X_test, y_test, task)

    folder_path = f'single_task_data/{task}'
    # exist_ok avoids the check-then-create race and makes re-runs a no-op.
    os.makedirs(folder_path, exist_ok=True)

    # File name -> frame, written in this order; existing files are overwritten.
    subtask_frames = {
        'X_train.csv': X_subtask_train,
        'y_train.csv': y_subtask_train,
        'X_val.csv': X_subtask_val,
        'y_val.csv': y_subtask_val,
        'X_test.csv': X_subtask_test,
        'y_test.csv': y_subtask_test,
    }
    for file_name, frame in subtask_frames.items():
        # index=False: the row index is not written to the CSV.
        frame.to_csv(os.path.join(folder_path, file_name), index=False)
