# Prerequisites
*HAIM*
https://www.physionet.org/content/haim-multimodal/1.0.1/

*MIMIC-CXR*
https://www.physionet.org/content/mimic-cxr-jpg/2.0.0/

*MIMIC-IV*
https://www.physionet.org/content/mimiciv/1.0/


# Imports

In [1]:
import pandas as pd

# Multimodal Data

In [2]:
multimodal_data = pd.read_csv("haim_mimiciv_key_ids.csv")
multimodal_data = multimodal_data.head(5000)  # I have 5000 pickled files only
print(
    f"Number of patients who have multimodal data: {multimodal_data['subject_id'].nunique()}"
)

Number of patients who have multimodal data: 2990


# EHR

In [3]:
EHR_ROOT_PATH = "/nvme/datasets/mimiciv/1.0"
CORE_DATA_PATH = EHR_ROOT_PATH + "/core"
HOSP_DATA_PATH = EHR_ROOT_PATH + "/hosp"
ICU_DATA_PATH = EHR_ROOT_PATH + "/icu"

## ICU

### ICU Stays

In [4]:
icustayes = pd.read_csv(ICU_DATA_PATH + "/icustays.csv")
icustayes

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los
0,17867402,24528534,31793211,Trauma SICU (TSICU),Trauma SICU (TSICU),2154-03-03 04:11:00,2154-03-04 18:16:56,1.587454
1,14435996,28960964,31983544,Trauma SICU (TSICU),Trauma SICU (TSICU),2150-06-19 17:57:00,2150-06-22 18:33:54,3.025625
2,17609946,27385897,33183475,Trauma SICU (TSICU),Trauma SICU (TSICU),2138-02-05 18:54:00,2138-02-15 12:42:05,9.741725
3,18966770,23483021,34131444,Trauma SICU (TSICU),Trauma SICU (TSICU),2123-10-25 10:35:00,2123-10-25 18:59:47,0.350544
4,12776735,20817525,34547665,Neuro Stepdown,Neuro Stepdown,2200-07-12 00:33:00,2200-07-13 16:44:40,1.674769
...,...,...,...,...,...,...,...,...
76535,15368898,27299174,39990887,Trauma SICU (TSICU),Trauma SICU (TSICU),2126-06-13 01:00:00,2126-06-13 20:28:35,0.811516
76536,15721773,28911582,39991872,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2177-11-08 14:09:00,2177-11-10 00:24:54,1.427708
76537,12275003,22562812,39992247,Cardiac Vascular Intensive Care Unit (CVICU),Cardiac Vascular Intensive Care Unit (CVICU),2182-08-15 09:37:33,2182-08-16 17:25:44,1.325127
76538,17577670,24221219,39993265,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2154-01-03 22:50:26,2154-01-06 12:44:43,2.579363


## HOSP

### Diagnoses

In [5]:
diagnoses = pd.read_csv(HOSP_DATA_PATH + "/diagnoses_icd.csv")
diagnoses_labels = pd.read_csv(HOSP_DATA_PATH + "/d_icd_diagnoses.csv")
diagnoses = diagnoses.merge(diagnoses_labels[["icd_code", "long_title"]], on="icd_code")
diagnoses_icu = diagnoses.merge(icustayes[["hadm_id", "stay_id"]], on="hadm_id")
diagnoses_icu

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,long_title,stay_id
0,17482187,29966553,9,2825,9,Sickle-cell trait,39062004
1,17482187,29966553,10,4019,9,Unspecified essential hypertension,39062004
2,17482187,29966553,4,2851,9,Acute posthemorrhagic anemia,39062004
3,17482187,29966553,11,V1251,9,Personal history of venous thrombosis and embo...,39062004
4,17482187,29966553,12,V5861,9,Long-term (current) use of anticoagulants,39062004
...,...,...,...,...,...,...,...
1386890,15928338,29204890,1,G7001,10,Myasthenia gravis with (acute) exacerbation,39380108
1386891,12916012,26621411,1,O873,10,Cerebral venous thrombosis in the puerperium,30659394
1386892,11909506,23843132,1,K613,10,Ischiorectal abscess,36908300
1386893,17425595,27520164,1,R259,10,Unspecified abnormal involuntary movements,37778430


# CXR

In [6]:
cxr_data = pd.read_csv(
    "/nvme/datasets/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-chexpert.csv"
)
cxr_data.head()

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,10000032,50414267,,,,,,,,,1.0,,,,,
1,10000032,53189527,,,,,,,,,1.0,,,,,
2,10000032,53911762,,,,,,,,,1.0,,,,,
3,10000032,56699142,,,,,,,,,1.0,,,,,
4,10000764,57375967,,,1.0,,,,,,,,,-1.0,,


Get patients who have CXR diagnoses

In [7]:
cxr_diagnoses = cxr_data.drop(
    ["subject_id", "study_id", "No Finding", "Support Devices"], axis=1
).columns
print(f"Number of diagnoses: {len(cxr_diagnoses)}")
print(f"Diagnoses: {cxr_diagnoses}")

Number of diagnoses: 12
Diagnoses: Index(['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema',
       'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion', 'Lung Opacity',
       'Pleural Effusion', 'Pleural Other', 'Pneumonia', 'Pneumothorax'],
      dtype='object')


In [8]:
multimodal_patients_diagnoses = diagnoses_icu[
    (diagnoses_icu["subject_id"].isin(multimodal_data["subject_id"]))
    & (diagnoses_icu["hadm_id"].isin(multimodal_data["hadm_id"]))
    & (diagnoses_icu["stay_id"].isin(multimodal_data["stay_id"]))
]
print(
    "Number of patients who have multimodal data: ",
    multimodal_patients_diagnoses["subject_id"].nunique(),
)

Number of patients who have multimodal data:  2990


In [9]:
# Match with long_title (not ICD code)
filtered_data = multimodal_patients_diagnoses[
    multimodal_patients_diagnoses["long_title"].str.contains("|".join(cxr_diagnoses))
]

filtered_data.to_csv(f"patients_cxr_diagnoses.csv", index=False)
print(f"Number of patients who have CXR diagnoses: {filtered_data.shape[0]}")

Number of patients who have CXR diagnoses: 966
