# Clinical data import

This file contain followed sections:

1. Create a folder for every patients inside the MIMIC-CXR dataset.
2. Identify stay_id for CXR-image.
3. Create stay_id folder and import CXR-meta.
4. Import otehr clinical data.
5. Save CXR spread sheet.

In [1]:
import pandas as pd
import time
import numpy as np
import os
import random
from datetime import datetime
import math
from tqdm.notebook  import tqdm
import numpy as np

pd.options.mode.chained_assignment  = None

In [5]:
disk_location  = "/Volumes/New Volume/AI-VR dataset"
# disk_location = "E:/AI-VR dataset"

# Define the path for all of the folders.
ED_FOLDER_PATH = f"{disk_location}/MIMIC-IV ED"
CLINICAL_FOLDER_PATH = f"{disk_location}/MIMIC-IV Clinical Database"
CXR_FOLDER_PATH = f"{disk_location}/MIMIC-CXR-JPG/physionet.org/files/mimic-cxr-jpg/2.0.0"
EYEGAZE_FOLDER_PATH = f"{disk_location}/eye-gaze-data-for-chest-x-rays-1.0.0"
REFLACX_FOLDER_PATH = f"{disk_location}/reflacx-reports-and-eye-tracking-data-for-localization-of-abnormalities-in-chest-x-rays-1.0.0/reflacx-reports-and-eye-tracking-data-for-localization-of-abnormalities-in-chest-x-rays-1.0.0"
# XAMI_MIMIC_PATH = "{disk_location}/XAMI-MIMIC"
XAMI_MIMIC_PATH = "./XAMI-MIMIC"

In [6]:
os.makedirs(f"{XAMI_MIMIC_PATH}", exist_ok=True)

## Create folder for patients

In [7]:
CXR_meta_df = pd.read_csv(f"{CXR_FOLDER_PATH}/mimic-cxr-2.0.0-metadata.csv.gz", compression='gzip', header=0, sep=',', quotechar='"')

In [8]:
CXR_meta_df.head(5)

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,10000032,50414267,CHEST (PA AND LAT),LATERAL,3056,2544,21800506,213014.531,CHEST (PA AND LAT),lateral,Erect
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,CHEST (PA AND LAT),postero-anterior,Erect
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,10000032,53189527,CHEST (PA AND LAT),LATERAL,3056,2544,21800626,165500.312,CHEST (PA AND LAT),lateral,Erect
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,CHEST (PORTABLE AP),AP,2705,2539,21800723,80556.875,CHEST (PORTABLE AP),antero-posterior,


In [9]:
all_patients_in_cxr = list(set(CXR_meta_df['subject_id']))
all_patients_in_cxr.sort()

In [10]:
print(f"We have total {len(all_patients_in_cxr)} patients.")

We have total 65379 patients.


In [11]:
# # foreach patient, we create a folder for it
# for patient_id in tqdm(all_patients_in_cxr):
#     os.makedirs(f"{XAMI_MIMIC_PATH}/patient_{patient_id}", exist_ok=True)

In [12]:
CXR_meta_df

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,10000032,50414267,CHEST (PA AND LAT),LATERAL,3056,2544,21800506,213014.531,CHEST (PA AND LAT),lateral,Erect
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,CHEST (PA AND LAT),postero-anterior,Erect
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,10000032,53189527,CHEST (PA AND LAT),LATERAL,3056,2544,21800626,165500.312,CHEST (PA AND LAT),lateral,Erect
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,CHEST (PORTABLE AP),AP,2705,2539,21800723,80556.875,CHEST (PORTABLE AP),antero-posterior,
...,...,...,...,...,...,...,...,...,...,...,...,...
377105,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,19999733,57132437,CHEST (PA AND LAT),PA,3056,2544,21520708,224550.171,CHEST (PA AND LAT),postero-anterior,Erect
377106,58c403aa-35ff8bd9-73e39f54-8dc9cc5d-e0ec3fa9,19999733,57132437,CHEST (PA AND LAT),LATERAL,3056,2544,21520708,224550.171,CHEST (PA AND LAT),lateral,Erect
377107,58766883-376a15ce-3b323a28-6af950a0-16b793bd,19999987,55368167,CHEST (PORTABLE AP),AP,2544,3056,21451104,51448.218,CHEST (PORTABLE AP),antero-posterior,Erect
377108,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,19999987,58621812,CHEST (PORTABLE AP),AP,3056,2544,21451102,202809.234,CHEST (PORTABLE AP),antero-posterior,Erect


## Identify stay_id for CXR-image

In [13]:
## Create studyDateTime for indentifying the stay_id.
CXR_meta_df['StudyDateTime']  = [float(str(d) + str(t))
 for d, t in zip(CXR_meta_df['StudyDate'], CXR_meta_df['StudyTime'])]


In [14]:
CXR_meta_df['StudyDateTime']

0         2.180051e+13
1         2.180051e+13
2         2.180063e+13
3         2.180063e+13
4         2.180072e+12
              ...     
377105    2.152071e+13
377106    2.152071e+13
377107    2.145110e+12
377108    2.145110e+13
377109    2.145110e+12
Name: StudyDateTime, Length: 377110, dtype: float64

In [15]:
### Use the transfer Id to indentify.
def time_string_to_float(x):
    return float('inf') if type(x) is float and math.isnan(x) else float(x.replace("-", "").replace("/", "").replace(" ", "").replace(":", ""))

In [16]:
# load the stay_id 
ED_edstays_df = pd.read_csv(f"{ED_FOLDER_PATH}/edstays.csv")

In [17]:
CXR_meta_df['stay_id'] = [None] * len(CXR_meta_df)

In [18]:
print(f"MIMIC-CXR contain total {len(CXR_meta_df)} instances.")

MIMIC-CXR contain total 377110 instances.


In [19]:
######## Identify stay_id for CXR-images ########

cxr_cannot_find_stay = []

ED_edstays_df['intime_float'] = ED_edstays_df.intime.apply(time_string_to_float)
ED_edstays_df['outtime_float'] = ED_edstays_df.outtime.apply(time_string_to_float)

with tqdm(total= len(CXR_meta_df)) as pbar:
    for idx, cxr_meta_instance in enumerate(CXR_meta_df.iloc):
        dicom_id = cxr_meta_instance['dicom_id']
        
        study_time = cxr_meta_instance["StudyDateTime"]

        ed_stay_time_match = ED_edstays_df[
            (ED_edstays_df.subject_id == cxr_meta_instance.subject_id)  
        ]

        if (len(ed_stay_time_match) > 0):
            ed_stay_time_match = ed_stay_time_match[
                (ed_stay_time_match.intime_float < study_time) &
                (ed_stay_time_match.outtime_float > study_time)
            ]


        if (len(ed_stay_time_match) == 1):
            stay_id = ed_stay_time_match.iloc[0]['stay_id']
            CXR_meta_df.at[idx,"stay_id"] = stay_id
        else:
            # print(f"Having problem with {dicom_id} CXR iamge, it has {len(ed_stay_time_match)} matches.")
            cxr_cannot_find_stay.append(cxr_meta_instance)


        pbar.update(1)

# 5m 2.7s on Mac

  0%|          | 0/377110 [00:00<?, ?it/s]

In [20]:
print(f"Total {len(cxr_cannot_find_stay)} CXR-images can't find their stay_id")

Total 284800 CXR-images can't find their stay_id


In [21]:
### Add relationship to EyeGaze and REFLACX. (Checking if the dataset contain this dicom_id)
eye_gaze_master_df = pd.read_csv(f"{EYEGAZE_FOLDER_PATH}/master_sheet.csv")
reflacx_meta_df =  pd.concat([pd.read_csv(f"{REFLACX_FOLDER_PATH}/main_data/metadata_phase_{i}.csv") for i in range(1,4)])
all_dicom_id_in_eye_gaze = list(eye_gaze_master_df['dicom_id'])
all_dicom_id_in_reflacx = list(reflacx_meta_df['dicom_id'])
CXR_meta_df['in_eye_gaze'] = CXR_meta_df["dicom_id"].isin(all_dicom_id_in_eye_gaze)
CXR_meta_df['in_reflacx'] = CXR_meta_df["dicom_id"].isin(all_dicom_id_in_reflacx)

In [22]:
CXR_meta_df_has_stayId = CXR_meta_df[[
    not (id is None) for id in CXR_meta_df['stay_id']]]

## Create stay_id folder and import CXR-data

In [23]:
# cxr_chexpert_df = pd.read_csv(f"{CXR_FOLDER_PATH}/mimic-cxr-2.0.0-chexpert.csv.gz", compression='gzip', header=0, sep=',', quotechar='"')
# cxr_negbio_df = pd.read_csv(f"{CXR_FOLDER_PATH}/mimic-cxr-2.0.0-negbio.csv.gz", compression='gzip', header=0, sep=',', quotechar='"')
# cxr_split_df = pd.read_csv(f"{CXR_FOLDER_PATH}/mimic-cxr-2.0.0-split.csv.gz", compression='gzip', header=0, sep=',', quotechar='"')

In [24]:
# for subject_id in set(CXR_meta_df_has_stayId['subject_id']):
#     subject_matches = CXR_meta_df_has_stayId[CXR_meta_df_has_stayId['subject_id'] == subject_id]
#     all_stay_ids = list(set(subject_matches['stay_id']))


#     for stay_id in all_stay_ids:
#         stay_matches = subject_matches[subject_matches['stay_id'] == stay_id]
#         all_study_ids = list(set(stay_matches['stay_id']))

#         for study_id in all_study_ids:
#             match_cxr_meta = CXR_meta_df_has_stayId[CXR_meta_df_has_stayId['study_id'] == study_id]
#             match_cxr_chexpert = cxr_chexpert_df[cxr_chexpert_df["study_id"] == study_id]
#             match_cxr_negbio = cxr_negbio_df[cxr_negbio_df["study_id"] == study_id]
#             match_cxr_split = cxr_split_df[cxr_split_df["study_id"] == study_id]

#             save_folder_path = f"{XAMI_MIMIC_PATH}/patient_{subject_id}/stay_{stay_id}/CXR-JPG/study_{study_id}"
#             os.makedirs(save_folder_path, exist_ok=True)

#             match_cxr_meta.to_csv(f"{stay_folder_path}/cxr_chexpert.csv")
#             match_cxr_chexpert.to_csv(f"{stay_folder_path}/cxr_chexpert.csv")
#             match_cxr_negbio.to_csv(f"{stay_folder_path}/cxr_negbio.csv")
#             match_cxr_split.to_csv(f"{stay_folder_path}/cxr_split.csv")        



In [25]:
# for stay_id in set(CXR_meta_df_has_stayId['stay_id']):

#     # Create the folder for this stay.
#     stay_matches = CXR_meta_df_has_stayId[CXR_meta_df_has_stayId['stay_id'] == stay_id]

#     if (len(stay_matches) < 1):
#         raise StopIteration("Should have matches in the CXR meta table")

#     # Find who is the patient
#     subject_id = stay_matches.iloc[0]['subject_id']

#     # Create a folder for this stay
#     stay_folder_path = f"{XAMI_MIMIC_PATH}/patient_{subject_id}/stay_{stay_id}"
#     os.makedirs(stay_folder_path, exist_ok=True)

#     all_study_ids = list(set(stay_matches["study_id"]))

#     for study_id in all_study_ids:
#         match_cxr_meta = CXR_meta_df_has_stayId[CXR_meta_df_has_stayId['study_id'] == study_id]
#         match_cxr_chexpert = cxr_chexpert_df[cxr_chexpert_df["study_id"] == study_id]
#         match_cxr_negbio = cxr_negbio_df[cxr_negbio_df["study_id"] == study_id]
#         match_cxr_split = cxr_split_df[cxr_split_df["study_id"] == study_id]

#         match_cxr_chexpert.to_csv(f"{stay_folder_path}/cxr_chexpert.csv")
#         match_cxr_negbio.to_csv(f"{stay_folder_path}/cxr_negbio.csv")
#         match_cxr_split.to_csv(f"{stay_folder_path}/cxr_split.csv")


## Import other clinical data to the stay folder

In [26]:
######################################
# Tables can be linked with stay_id
######################################
# core - tranfers 
# ed - diagnosis
# ed - edstays
# ed - medrecon 
# ed - pyxis 
# ed - triage 
# ed - vitalsign

######################################
# Tables can be linked with patient_id
######################################
# core - admissions
# core - patients 

In [27]:
def stay_level_import(cxr_df, import_df, save_folder_name, save_table_name, import_df_stay_key = "stay_id"):
    all_cxr_stay_ids = set(cxr_df['stay_id'])

    # get the stay_id that are valid in the import df 
    valid_stay_ids = [id for id in list(
            set(import_df[import_df_stay_key])) if id in all_cxr_stay_ids]

    valid_stay_ids.sort()

    # foreach stay_id, we import the data to its folder
    for stay_id in valid_stay_ids:

        # Create the folder for this stay.
        matches = import_df[import_df[import_df_stay_key] == stay_id] 

        if (len(matches ) < 1):
            raise StopIteration("Should have matches in the CXR meta table")

        # Find who is the subject_id
        subject_id = matches.iloc[0]['subject_id']
        
        # Create a folder for this stay 
        save_folder_path = f"{XAMI_MIMIC_PATH}/patient_{subject_id}/stay_{stay_id}/{save_folder_name}" 
        os.makedirs(save_folder_path, exist_ok=True)

        # Store the matches in the folder.
        matches.to_csv(f"{save_folder_path}/{save_table_name}.csv")

In [28]:
ED_diagnosis_df = pd.read_csv(f"{ED_FOLDER_PATH}/diagnosis.csv")
stay_level_import(CXR_meta_df_has_stayId, ED_diagnosis_df, "ED", "diagnosis")

In [29]:
ED_edstays_df = pd.read_csv(f"{ED_FOLDER_PATH}/edstays.csv")
stay_level_import(CXR_meta_df_has_stayId, ED_edstays_df, "ED", "edstays")

In [30]:
ED_medrecon_df = pd.read_csv(f"{ED_FOLDER_PATH}/medrecon.csv")
stay_level_import(CXR_meta_df_has_stayId, ED_medrecon_df, "ED", "medrecon")

In [None]:
ED_pyxis_df = pd.read_csv(f"{ED_FOLDER_PATH}/pyxis.csv")
stay_level_import(CXR_meta_df_has_stayId, ED_pyxis_df, "ED", "pyxis")

In [None]:
ED_triage_df = pd.read_csv(f"{ED_FOLDER_PATH}/triage.csv")
stay_level_import(CXR_meta_df_has_stayId, ED_triage_df, "ED", "triage")

In [None]:
Core_transfers_df = pd.read_csv(f"{CLINICAL_FOLDER_PATH}/core/transfers.csv")
stay_level_import(CXR_meta_df_has_stayId, Core_transfers_df, "Core", "transfers", "transfer_id")

In [None]:
def subject_level_import(cxr_df, import_df, save_table_name, import_df_subject_id = "subject_id"):
    all_cxr_subjet_ids = set(cxr_df['subject_id'])

    # get the stay_id that are valid in the import df 
    valid_subject_ids = [id for id in list(
            set(import_df[import_df_subject_id])) if id in all_cxr_subjet_ids]

    valid_subject_ids.sort()

    # foreach stay_id, we import the data to its folder
    for subject_id in valid_subject_ids:

        # Create the folder for this stay.
        matches = import_df[import_df[import_df_subject_id] == subject_id] 

        if (len(matches ) < 1):
            raise StopIteration("Should have matches in the CXR meta table")

        # Create a folder for this stay 
        save_folder_path = f"{XAMI_MIMIC_PATH}/patient_{subject_id}" 
        os.makedirs(save_folder_path, exist_ok=True)

        # Store the matches in the folder.
        matches.to_csv(f"{save_folder_path}/{save_table_name}.csv")

In [None]:
Core_admissions_df = pd.read_csv(f"{CLINICAL_FOLDER_PATH}/core/admissions.csv")
subject_level_import(CXR_meta_df, Core_admissions_df, "admissions")

In [None]:
Core_patients_df = pd.read_csv(f"{CLINICAL_FOLDER_PATH}/core/patients.csv")
subject_level_import(CXR_meta_df, Core_patients_df, "patients")

In [None]:
CXR_meta_df['in_ed'] = CXR_meta_df["stay_id"].isin(set(ED_edstays_df['stay_id']))
CXR_meta_df['in_core'] = CXR_meta_df["subject_id"].isin(set(Core_patients_df['subject_id']))

# Save meta data spread sheet.

In [None]:

# change the stay_id to int
# CXR_meta_df['stay_id'] = CXR_meta_df['stay_id'].apply(lambda x: None if np.isnan(x) else str(int(x)))
CXR_meta_df.to_csv("cxr_meta.csv")
cxr_df_with_stay_id_only = CXR_meta_df[CXR_meta_df['stay_id'].notnull()]
cxr_df_with_stay_id_only['stay_id'] = cxr_df_with_stay_id_only['stay_id'].apply(lambda x: int(x))
cxr_df_with_stay_id_only.to_csv("cxr_meta_with_stay_id_only.csv")

In [None]:
available_eye_gaze_data = CXR_meta_df[
    (CXR_meta_df["in_ed"]) &
    (CXR_meta_df["in_core"]) &
    (CXR_meta_df["in_eye_gaze"])
]

In [None]:
available_eye_gaze_data

In [None]:
available_reflacx_data = CXR_meta_df[
    (CXR_meta_df["in_ed"]) &
    (CXR_meta_df["in_core"]) &
    (CXR_meta_df["in_reflacx"])
]

In [None]:
available_reflacx_data