# Clinical data import

This file contain followed sections:

1. Create a folder for every patients inside the MIMIC-CXR dataset.
2. Identify stay_id for CXR-image.
3. Create stay_id folder and import CXR-meta.
4. Import otehr clinical data.
5. Save CXR spread sheet.

In [1]:
import pandas as pd
import os
import math
from tqdm.notebook import tqdm
from utils.importer import import_to_patient_sub_folder
from data_path import CXR_FOLDER_PATH, REFLACX_FOLDER_PATH, EYEGAZE_FOLDER_PATH, ED_FOLDER_PATH, CLINICAL_FOLDER_PATH, XAMI_SPREADSHEET_FOLDER_PATH

pd.options.mode.chained_assignment = None


## Load needed dataframe

In [2]:
CXR_meta_df = pd.read_csv(os.path.join(CXR_FOLDER_PATH, "mimic-cxr-2.0.0-metadata.csv.gz"),
                          compression='gzip', header=0, sep=',', quotechar='"')
CXR_meta_df.head(5)


Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,10000032,50414267,CHEST (PA AND LAT),LATERAL,3056,2544,21800506,213014.531,CHEST (PA AND LAT),lateral,Erect
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,CHEST (PA AND LAT),postero-anterior,Erect
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,10000032,53189527,CHEST (PA AND LAT),LATERAL,3056,2544,21800626,165500.312,CHEST (PA AND LAT),lateral,Erect
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,CHEST (PORTABLE AP),AP,2705,2539,21800723,80556.875,CHEST (PORTABLE AP),antero-posterior,


In [3]:
reflacx_meta_df =  pd.concat([pd.read_csv( os.path.join(REFLACX_FOLDER_PATH, "main_data", f"metadata_phase_{i}.csv")) for i in range(1,4)])
reflacx_meta_df.head(5)

Unnamed: 0,id,split,eye_tracking_data_discarded,image,dicom_id,subject_id,image_size_x,image_size_y,Airway wall thickening,Atelectasis,...,Support devices,Wide mediastinum,Abnormal mediastinal contour,Acute fracture,Enlarged hilum,Hiatal hernia,High lung volume / emphysema,Interstitial lung disease,Lung nodule or mass,Pleural abnormality
0,P102R108387,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,
1,P102R379837,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,
2,P102R558314,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,4.0,0.0,...,True,0.0,,,,,,,,
3,P102R765317,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,2.0,,,,,,,,
4,P102R915878,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,


In [4]:
eye_gaze_master_df = pd.read_csv(os.path.join(
    EYEGAZE_FOLDER_PATH, "master_sheet.csv"))
eye_gaze_master_df.head(5)


Unnamed: 0,dicom_id,path,study_id,patient_id,stay_id,gender,anchor_age,image_top_pad,image_bottom_pad,image_left_pad,...,fracture__chx,lung_lesion__chx,lung_opacity__chx,no_finding__chx,pleural_effusion__chx,pleural_other__chx,pneumonia__chx,pneumothorax__chx,support_devices__chx,cxr_exam_indication
0,24c7496c-d7635dfe-b8e0b87f-d818affc-78ff7cf4,files/p15/p15628804/s58573295/24c7496c-d7635df...,58573295,15628804,33811834,F,20 - 30,86,86,448,...,,,,,,,,,,___F with CHF and shortness of breath// ?Pulm...
1,78711a04-264d5305-d5feec9b-ebef1cec-fdc6db9c,files/p19/p19462352/s51900589/78711a04-264d530...,51900589,19462352,32954494,F,20 - 30,0,0,534,...,,,,,1.0,,1.0,,,"___F with hypertension, tachycardia"
2,a770d8d6-7b6a62ff-815ab876-c81709a8-9a654a54,files/p11/p11255143/s50941783/a770d8d6-7b6a62f...,50941783,11255143,34005408,F,20 - 30,0,0,534,...,,,,1.0,,,,,1.0,History of myocardial infarction. Shortness o...
3,8e457921-bc1af8aa-a65073c1-aaac8247-c5ceb780,files/p10/p10526322/s55981398/8e457921-bc1af8a...,55981398,10526322,36680301,F,20 - 30,0,0,534,...,,,,,,,,,,___F with SOB
4,62fe5d5a-1806ee3c-f4e742fa-f2b036ea-d390057a,files/p12/p12055181/s59722264/62fe5d5a-1806ee3...,59722264,12055181,30138691,M,20 - 30,0,0,534,...,,,1.0,,1.0,,-1.0,,,"___M with DOE, SOB in supine position // eval..."


## Remove the CXR-images not in EyeGaze or REFLACX

In [5]:
all_dicom_id_in_eye_gaze = list(eye_gaze_master_df['dicom_id'])
all_dicom_id_in_reflacx = list(reflacx_meta_df['dicom_id'])
all_dicoms_ids_in_REFLACX_EyeGaze =  set(all_dicom_id_in_eye_gaze+all_dicom_id_in_reflacx)

CXR_meta_df = CXR_meta_df[CXR_meta_df['dicom_id'].isin(all_dicoms_ids_in_REFLACX_EyeGaze)] 

CXR_meta_df['in_eye_gaze'] = CXR_meta_df["dicom_id"].isin(all_dicom_id_in_eye_gaze)
CXR_meta_df['in_reflacx'] = CXR_meta_df["dicom_id"].isin(all_dicom_id_in_reflacx)

print(f"We have total {len(CXR_meta_df)} CXR-images and {len(list(set(CXR_meta_df['subject_id'])))} patients.")

We have total 3689 CXR-images and 3192 patients.


In [6]:
CXR_meta_df.head(5)

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,in_eye_gaze,in_reflacx
177,fa771fa1-d9571d07-bff8f655-327734a7-6e10b29d,10002428,59258773,CHEST (PORTABLE AP),AP,3056,2544,21560419,92717.109,CHEST (PORTABLE AP),antero-posterior,Erect,False,True
181,4873aa08-977bfd31-fb492e64-6ef432d1-3f12cbe3,10002430,53254222,CHEST (PA AND LAT),PA,3056,2544,21250928,160736.171,CHEST (PA AND LAT),postero-anterior,Erect,True,False
266,dcdc4bd9-4301b111-2a65a814-ee8e7bc5-7f0b9a5a,10003400,56466802,CHEST (PORTABLE AP),AP,3056,2544,21361209,133738.015,CHEST (PORTABLE AP),antero-posterior,Erect,False,True
497,5bdabba9-388f6646-ac06b5f5-f68b2fd2-3630de21,10011607,55814288,CHEST (PA AND LAT),AP,3056,2544,21820111,230933.25,CHEST (PA AND LAT),antero-posterior,Erect,False,True
539,4a629500-9c3281ca-90bab490-9b6ac9c1-e5e6a580,10012261,53728467,CHEST (PORTABLE AP),AP,2544,3056,21750906,191151.453,CHEST (PORTABLE AP),antero-posterior,Erect,False,True


In [7]:
# load the edstays for indentifying the stay_id since the CXR-image dataset is a subset of ED.
ED_edstays_df = pd.read_csv(os.path.join(ED_FOLDER_PATH, "edstays.csv"))
ED_diagnosis_df = pd.read_csv(os.path.join(ED_FOLDER_PATH, "diagnosis.csv"))
ED_medrecon_df = pd.read_csv(os.path.join(ED_FOLDER_PATH, "medrecon.csv"))
ED_pyxis_df = pd.read_csv(os.path.join(ED_FOLDER_PATH, "pyxis.csv"))
ED_triage_df = pd.read_csv(os.path.join(ED_FOLDER_PATH, "triage.csv"))
Core_transfers_df = pd.read_csv(os.path.join(
    CLINICAL_FOLDER_PATH, "core", "transfers.csv"))
Core_admissions_df = pd.read_csv(os.path.join(
    CLINICAL_FOLDER_PATH, "core", "admissions.csv"))
Core_patients_df = pd.read_csv(os.path.join(
    CLINICAL_FOLDER_PATH, "core", "patients.csv"))

ED_edstays_df.head(5)


Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime
0,10000115,,38081480,2154-12-10 02:04:00,2154-12-10 02:16:00
1,10000115,,30295111,2154-12-17 16:37:00,2154-12-17 17:38:00
2,10000473,,33267868,2138-03-15 20:07:00,2138-03-15 20:26:00
3,10000764,27897940.0,35420907,2132-10-14 19:31:00,2132-10-14 23:32:59
4,10001038,,34301067,2149-08-07 08:51:00,2149-08-07 08:55:00


## Identify stay_id for CXR-image

In [8]:
######## Identify stay_id for CXR-images ########

### Use the transfer Id to indentify.
def time_string_to_float(x):
    return float('inf') if type(x) is float and math.isnan(x) else float(x.replace("-", "").replace("/", "").replace(" ", "").replace(":", ""))

## Create studyDateTime for indentifying the stay_id.
CXR_meta_df['StudyDateTime']  = [float(str(d) + str(t))
 for d, t in zip(CXR_meta_df['StudyDate'], CXR_meta_df['StudyTime'])]

# Initialise stay_id
CXR_meta_df['stay_id'] = None

cxr_cannot_find_stay = []

ED_edstays_df['intime_float'] = ED_edstays_df.intime.apply(time_string_to_float)
ED_edstays_df['outtime_float'] = ED_edstays_df.outtime.apply(time_string_to_float)

with tqdm(total= len(CXR_meta_df)) as pbar:
    for idx, cxr_meta_instance in CXR_meta_df.iterrows():
        dicom_id = cxr_meta_instance['dicom_id']
        
        study_time = cxr_meta_instance["StudyDateTime"]

        ed_stay_time_match = ED_edstays_df[
            (ED_edstays_df.subject_id == cxr_meta_instance.subject_id)  
        ]

        if (len(ed_stay_time_match) > 0):
            ed_stay_time_match = ed_stay_time_match[
                (ed_stay_time_match.intime_float < study_time) &
                (ed_stay_time_match.outtime_float > study_time)
            ]


        if (len(ed_stay_time_match) == 1):
            stay_id = ed_stay_time_match.iloc[0]['stay_id']
            CXR_meta_df.at[idx,"stay_id"] = stay_id
        else:
            # print(f"Having problem with {dicom_id} CXR iamge, it has {len(ed_stay_time_match)} matches.")
            cxr_cannot_find_stay.append(cxr_meta_instance)


        pbar.update(1)

# 5m 2.7s on Mac

  0%|          | 0/3689 [00:00<?, ?it/s]

In [9]:
# Store .csv with identified stay_id
os.makedirs(XAMI_SPREADSHEET_FOLDER_PATH, exist_ok=True)
CXR_meta_df.to_csv(os.path.join(XAMI_SPREADSHEET_FOLDER_PATH ,"cxr_meta.csv"))

cxr_df_with_stay_id_only = CXR_meta_df[CXR_meta_df['stay_id'].notnull()]

cxr_df_with_stay_id_only['stay_id'] = cxr_df_with_stay_id_only['stay_id'].apply(lambda x: int(x))
cxr_df_with_stay_id_only.to_csv(os.path.join(XAMI_SPREADSHEET_FOLDER_PATH ,"cxr_meta_with_stay_id_only.csv"))
print(f"Total {len(cxr_cannot_find_stay)} CXR-images can't find their stay_id, {len(cxr_df_with_stay_id_only)} instances found stay_id.")

Total 2461 CXR-images can't find their stay_id, 1228 instances found stay_id.


In [10]:
CXR_meta_df['in_ed'] = CXR_meta_df["stay_id"].isin(set(ED_edstays_df['stay_id']))
CXR_meta_df['in_core'] = CXR_meta_df["subject_id"].isin(set(Core_patients_df['subject_id']))

In [11]:
available_eye_gaze_data = CXR_meta_df[
    (CXR_meta_df["in_ed"]) &
    (CXR_meta_df["in_core"]) &
    (CXR_meta_df["in_eye_gaze"])
]

available_eye_gaze_data.head(5)

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,in_eye_gaze,in_reflacx,StudyDateTime,stay_id,in_ed,in_core
181,4873aa08-977bfd31-fb492e64-6ef432d1-3f12cbe3,10002430,53254222,CHEST (PA AND LAT),PA,3056,2544,21250928,160736.171,CHEST (PA AND LAT),postero-anterior,Erect,True,False,21250930000000.0,31293660,True,True
567,8106217e-c41ab813-c6002d3f-ed4ee98b-7b165bae,10012498,55812956,CHEST (PA AND LAT),PA,3056,2544,21470331,211642.296,CHEST (PA AND LAT),postero-anterior,Erect,True,False,21470330000000.0,37362927,True,True
727,5f1ac54d-47dfae00-930fa704-514131fa-ee0c138e,10015701,53321493,,PA,3056,2544,21330801,163117.156,postero-anterior,Erect,,True,False,21330800000000.0,30941688,True,True
2391,7847b651-c522f27b-8f94d954-79307a95-885317a1,10064049,54392870,CHEST (PA AND LAT),PA,2544,3056,21620811,222923.515,CHEST (PA AND LAT),postero-anterior,Erect,True,False,21620810000000.0,37042568,True,True
2797,9c01aecd-dc299044-a9fac53a-09a86bf1-58878e8a,10074605,59245019,CHEST (PA AND LAT),PA,3056,2544,21900201,151110.765,CHEST (PA AND LAT),postero-anterior,Erect,True,False,21900200000000.0,35307197,True,True


In [12]:
available_reflacx_data = CXR_meta_df[
    (CXR_meta_df["in_ed"]) &
    (CXR_meta_df["in_core"]) &
    (CXR_meta_df["in_reflacx"])
]
available_reflacx_data.head(5)

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,in_eye_gaze,in_reflacx,StudyDateTime,stay_id,in_ed,in_core
266,dcdc4bd9-4301b111-2a65a814-ee8e7bc5-7f0b9a5a,10003400,56466802,CHEST (PORTABLE AP),AP,3056,2544,21361209,133738.015,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,21361210000000.0,33678912,True,True
497,5bdabba9-388f6646-ac06b5f5-f68b2fd2-3630de21,10011607,55814288,CHEST (PA AND LAT),AP,3056,2544,21820111,230933.25,CHEST (PA AND LAT),antero-posterior,Erect,False,True,21820110000000.0,37054412,True,True
539,4a629500-9c3281ca-90bab490-9b6ac9c1-e5e6a580,10012261,53728467,CHEST (PORTABLE AP),AP,2544,3056,21750906,191151.453,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,21750910000000.0,38668412,True,True
583,38a5ffc8-93297f03-3f0a14e4-aa8ca225-7e968b5b,10013324,51191158,CHEST (PA AND LAT),PA,3056,2544,21231028,171318.828,CHEST (PA AND LAT),postero-anterior,Erect,False,True,21231030000000.0,30504536,True,True
1835,e5ba5704-ce2f09d3-e28fe2a2-8a9aca96-86f4966a,10046166,57379357,CHEST (PA AND LAT),AP,2544,3056,21330321,115719.296,CHEST (PA AND LAT),antero-posterior,Erect,False,True,21330320000000.0,37162571,True,True


## Create stay_id folder and import CXR-data

In [13]:
######################################
# Tables can be linked with both patient_id and stay_id
######################################
# core - tranfers 
# ed - diagnosis
# ed - edstays
# ed - medrecon 
# ed - pyxis 
# ed - triage 
# ed - vitalsign

######################################
# Tables can be linked with patient_id
######################################
# core - admissions
# core - patients 

In [14]:
# Import ED
import_to_patient_sub_folder(
    CXR_meta_df, ED_diagnosis_df, "ED", "diagnosis")
import_to_patient_sub_folder(
    CXR_meta_df, ED_edstays_df, "ED", "edstays")
import_to_patient_sub_folder(
    CXR_meta_df, ED_medrecon_df, "ED", "medrecon")
import_to_patient_sub_folder(
    CXR_meta_df, ED_pyxis_df, "ED", "pyxis")
import_to_patient_sub_folder(
    CXR_meta_df, ED_triage_df, "ED", "triage")

# Import Core
import_to_patient_sub_folder(
    CXR_meta_df, Core_transfers_df, "Core", "transfers")
import_to_patient_sub_folder(
    CXR_meta_df, Core_admissions_df, "Core", "admissions")
import_to_patient_sub_folder(CXR_meta_df, Core_patients_df, "Core", "patients")


  0%|          | 0/3180 [00:00<?, ?it/s]

  0%|          | 0/3184 [00:00<?, ?it/s]

  0%|          | 0/2819 [00:00<?, ?it/s]

  0%|          | 0/2943 [00:00<?, ?it/s]

  0%|          | 0/3184 [00:00<?, ?it/s]

  0%|          | 0/3185 [00:00<?, ?it/s]

  0%|          | 0/2910 [00:00<?, ?it/s]

  0%|          | 0/3185 [00:00<?, ?it/s]