In [1]:
import ast
import os
import subprocess
from pathlib import Path

import numpy as np
import pandas as pd

from mimic_ecg_preprocessing import prepare_mimic_ecg

from utils.stratify import *

In [2]:
# finetune_dataset e.g. mimic_all_all_all_all_2000_5A 

# mimic_{subsettrain}_{labelsettrain}_{subsettest}_{labelsettest}_{mincnt}_{digits} where _{digits} is optional

# subsettrain: all/ed/hosp/allnonzero/ednonzero/hospnonzero/allnonzerofirst/ednonzerofirst/hospnonzerofirst/allfirst/edfirst/hospfirst default: allnonzero
# labelsettrain: {all/hosp/ed}{/af/I}
#   first part selects the label set -- all: both ed diagnoses and hosp diagnoses; hosp: just hosp diagnoses; ed: just ed diagnoses
#   second part is optional -- omit it, use af for af labels, or give a collection of uppercase letters (such as I) to select specific label sets
# similar for subsettest/labelsettest but labelsettest can only be {all/hosp/ed}
# digits: 3/4/5/3A/4A/5A or just empty corresponding to I48, I48.1 or I48.19; append an A to include all ancestors

In [10]:
# Data folder handed to prepare_mimic_ecg further down.
# NOTE: Path() with no arguments is the current working directory -- replace it
# with your own data path.
target_folder = Path() # insert your data path

In [None]:
# Placeholder: pd.read_csv() needs the path of the provided csv file as its first
# argument and raises a TypeError as written -- fill it in before running.
df = pd.read_csv() # load the provided csv file

In [None]:
# The diagnosis columns come back from the CSV as strings holding Python literals
# (presumably lists of diagnosis codes -- confirm against the file), so parse them
# back into Python objects.
# ast.literal_eval only accepts literals, so unlike eval() it cannot execute
# arbitrary code embedded in the CSV. Non-string values are passed through
# unchanged, which makes this cell safe to re-run after the columns are parsed.
for c in ["hosp_diag_hosp", "ed_diag_ed", "ed_diag_hosp", "all_diag_hosp", "all_diag_all"]:
    df[c] = df[c].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [11]:
# Benchmark settings
#
# Variable names read T{train subset}2{train labels}_E{test subset}2{test labels}.
# Each value is the corresponding dataset string in the
# mimic_{subsettrain}_{labelsettrain}_{subsettest}_{labelsettest}_{mincnt}_{digits}
# format; every setting here uses mincnt=2000 and digits=5A, and the test
# subset is always one of the "...first" variants.

# Test on allfirst with all labels
Tall2all_Eall2all = "mimic_all_all_allfirst_all_2000_5A"
Ted2all_Eall2all = "mimic_ed_all_allfirst_all_2000_5A"
Ted2ed_Eall2all = "mimic_ed_ed_allfirst_all_2000_5A"

# Test on edfirst with all labels
Tall2all_Eed2all = "mimic_all_all_edfirst_all_2000_5A"
Ted2all_Eed2all = "mimic_ed_all_edfirst_all_2000_5A"

# Test with hosp-only or ed-only labels
Tall2all_Eall2hosp = "mimic_all_all_allfirst_hosp_2000_5A"
Tall2all_Eed2hosp = "mimic_all_all_edfirst_hosp_2000_5A"
Tall2all_Eed2ed = "mimic_all_all_edfirst_ed_2000_5A"


# Scenario prepared by the rest of the notebook; swap in any setting above.
finetune_dataset = Ted2all_Eed2all

In [14]:
# Prepare the selected benchmark scenario from the loaded csv dataframe.
# Returns the scenario dataframe plus lbl_itos (presumably the index-to-label
# list -- confirm in mimic_ecg_preprocessing).
df_scenario, lbl_itos = prepare_mimic_ecg(
    finetune_dataset,
    target_folder,
    df_mapped=df,
)

Label set: 1076 labels.


In [17]:
# Split by fold id: the highest fold is held out for testing, the fold just
# below it for validation, and every lower fold is used for training.
max_fold_id = df_scenario["fold"].max()
val_fold_id = max_fold_id - 1
df_test = df_scenario.loc[df_scenario["fold"] == max_fold_id]
df_val = df_scenario.loc[df_scenario["fold"] == val_fold_id]
df_train = df_scenario.loc[df_scenario["fold"] < val_fold_id]

In [34]:
df_train['data'] # integer ECG index from MIMIC-IV-ECG

0              0
1              1
5              5
11            11
18            18
           ...  
800001    800001
800012    800012
800023    800023
800024    800024
800032    800032
Name: data, Length: 166408, dtype: int64

In [39]:
df_train['filename'] # ECG file path from MIMIC-IV-ECG (strings; shown truncated in the display)

0         mimic-iv-ecg-diagnostic-electrocardiogram-matc...
1         mimic-iv-ecg-diagnostic-electrocardiogram-matc...
5         mimic-iv-ecg-diagnostic-electrocardiogram-matc...
11        mimic-iv-ecg-diagnostic-electrocardiogram-matc...
18        mimic-iv-ecg-diagnostic-electrocardiogram-matc...
                                ...                        
800001    mimic-iv-ecg-diagnostic-electrocardiogram-matc...
800012    mimic-iv-ecg-diagnostic-electrocardiogram-matc...
800023    mimic-iv-ecg-diagnostic-electrocardiogram-matc...
800024    mimic-iv-ecg-diagnostic-electrocardiogram-matc...
800032    mimic-iv-ecg-diagnostic-electrocardiogram-matc...
Name: filename, Length: 166408, dtype: object

In [40]:
df_train['label']  # multilabel target: a list of 0.0/1.0 indicators per ECG (presumably one entry per label in lbl_itos -- confirm)

0         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
5         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
11        [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...
18        [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, ...
                                ...                        
800001    [1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...
800012    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
800023    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
800024    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
800032    [0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, ...
Name: label, Length: 166408, dtype: object