# Clinical data import

This file contain followed sections:

1. Create a folder for every patients inside the MIMIC-CXR dataset.
2. Identify stay_id for CXR-image.
3. Create stay_id folder and import CXR-meta.
4. Import otehr clinical data.
5. Save CXR spread sheet.

In [1]:
import pandas as pd
import os
import math
from tqdm.notebook import tqdm
from utils.importer import import_to_patient_sub_folder
from data_path import disk_location, PHYSIONET_PATH, CXR_JPG_FOLDER_PATH, REFLACX_FOLDER_PATH, EYEGAZE_FOLDER_PATH, ED_FOLDER_PATH, HOSP_FOLDER_PATH, XAMI_SPREADSHEET_FOLDER_PATH, ICU_FOLDER_PATH

pd.options.mode.chained_assignment = None


## Load needed dataframe

In [2]:
CXR_JPG_FOLDER_PATH


'D:/physionet.org/files/mimic-cxr-jpg/2.0.0'

In [3]:
os.path.join(CXR_JPG_FOLDER_PATH, "mimic-cxr-2.0.0-metadata.csv.gz")




In [4]:
CXR_meta_df = pd.read_csv(os.path.join(CXR_JPG_FOLDER_PATH, "mimic-cxr-2.0.0-metadata.csv.gz"),
                          compression='gzip', header=0, sep=',', quotechar='"')
CXR_meta_df.head(5)


Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,10000032,50414267,CHEST (PA AND LAT),LATERAL,3056,2544,21800506,213014.531,CHEST (PA AND LAT),lateral,Erect
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,CHEST (PA AND LAT),postero-anterior,Erect
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,10000032,53189527,CHEST (PA AND LAT),LATERAL,3056,2544,21800626,165500.312,CHEST (PA AND LAT),lateral,Erect
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,CHEST (PORTABLE AP),AP,2705,2539,21800723,80556.875,CHEST (PORTABLE AP),antero-posterior,


In [5]:
reflacx_meta_df = pd.concat([pd.read_csv(os.path.join(
    REFLACX_FOLDER_PATH, "main_data", f"metadata_phase_{i}.csv")) for i in range(1, 4)])
reflacx_meta_df.head(5)


Unnamed: 0,id,split,eye_tracking_data_discarded,image,dicom_id,subject_id,image_size_x,image_size_y,Airway wall thickening,Atelectasis,...,Support devices,Wide mediastinum,Abnormal mediastinal contour,Acute fracture,Enlarged hilum,Hiatal hernia,High lung volume / emphysema,Interstitial lung disease,Lung nodule or mass,Pleural abnormality
0,P102R108387,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,
1,P102R379837,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,
2,P102R558314,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,4.0,0.0,...,True,0.0,,,,,,,,
3,P102R765317,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,2.0,,,,,,,,
4,P102R915878,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,


In [6]:
eye_gaze_master_df = pd.read_csv(os.path.join(
    EYEGAZE_FOLDER_PATH, "master_sheet.csv"))
eye_gaze_master_df.head(5)


Unnamed: 0,dicom_id,path,study_id,patient_id,stay_id,gender,anchor_age,image_top_pad,image_bottom_pad,image_left_pad,...,fracture__chx,lung_lesion__chx,lung_opacity__chx,no_finding__chx,pleural_effusion__chx,pleural_other__chx,pneumonia__chx,pneumothorax__chx,support_devices__chx,cxr_exam_indication
0,24c7496c-d7635dfe-b8e0b87f-d818affc-78ff7cf4,files/p15/p15628804/s58573295/24c7496c-d7635df...,58573295,15628804,33811834,F,20 - 30,86,86,448,...,,,,,,,,,,___F with CHF and shortness of breath// ?Pulm...
1,78711a04-264d5305-d5feec9b-ebef1cec-fdc6db9c,files/p19/p19462352/s51900589/78711a04-264d530...,51900589,19462352,32954494,F,20 - 30,0,0,534,...,,,,,1.0,,1.0,,,"___F with hypertension, tachycardia"
2,a770d8d6-7b6a62ff-815ab876-c81709a8-9a654a54,files/p11/p11255143/s50941783/a770d8d6-7b6a62f...,50941783,11255143,34005408,F,20 - 30,0,0,534,...,,,,1.0,,,,,1.0,History of myocardial infarction. Shortness o...
3,8e457921-bc1af8aa-a65073c1-aaac8247-c5ceb780,files/p10/p10526322/s55981398/8e457921-bc1af8a...,55981398,10526322,36680301,F,20 - 30,0,0,534,...,,,,,,,,,,___F with SOB
4,62fe5d5a-1806ee3c-f4e742fa-f2b036ea-d390057a,files/p12/p12055181/s59722264/62fe5d5a-1806ee3...,59722264,12055181,30138691,M,20 - 30,0,0,534,...,,,1.0,,1.0,,-1.0,,,"___M with DOE, SOB in supine position // eval..."


## Remove the CXR-images not in EyeGaze or REFLACX

In [7]:
all_dicom_id_in_eye_gaze = list(eye_gaze_master_df['dicom_id'])
all_dicom_id_in_reflacx = list(reflacx_meta_df['dicom_id'])
all_dicoms_ids_in_REFLACX_EyeGaze = set(
    all_dicom_id_in_eye_gaze+all_dicom_id_in_reflacx)

CXR_meta_df = CXR_meta_df[CXR_meta_df['dicom_id'].isin(
    all_dicoms_ids_in_REFLACX_EyeGaze)]

CXR_meta_df['in_eye_gaze'] = CXR_meta_df["dicom_id"].isin(
    all_dicom_id_in_eye_gaze)
CXR_meta_df['in_reflacx'] = CXR_meta_df["dicom_id"].isin(
    all_dicom_id_in_reflacx)

print(
    f"We have total {len(CXR_meta_df)} CXR-images and {len(list(set(CXR_meta_df['subject_id'])))} patients.")


We have total 3689 CXR-images and 3192 patients.


In [8]:
CXR_meta_df

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,in_eye_gaze,in_reflacx
177,fa771fa1-d9571d07-bff8f655-327734a7-6e10b29d,10002428,59258773,CHEST (PORTABLE AP),AP,3056,2544,21560419,92717.109,CHEST (PORTABLE AP),antero-posterior,Erect,False,True
181,4873aa08-977bfd31-fb492e64-6ef432d1-3f12cbe3,10002430,53254222,CHEST (PA AND LAT),PA,3056,2544,21250928,160736.171,CHEST (PA AND LAT),postero-anterior,Erect,True,False
266,dcdc4bd9-4301b111-2a65a814-ee8e7bc5-7f0b9a5a,10003400,56466802,CHEST (PORTABLE AP),AP,3056,2544,21361209,133738.015,CHEST (PORTABLE AP),antero-posterior,Erect,False,True
497,5bdabba9-388f6646-ac06b5f5-f68b2fd2-3630de21,10011607,55814288,CHEST (PA AND LAT),AP,3056,2544,21820111,230933.250,CHEST (PA AND LAT),antero-posterior,Erect,False,True
539,4a629500-9c3281ca-90bab490-9b6ac9c1-e5e6a580,10012261,53728467,CHEST (PORTABLE AP),AP,2544,3056,21750906,191151.453,CHEST (PORTABLE AP),antero-posterior,Erect,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376634,d9661ff6-877ac981-a20a8810-92309d46-173008ad,19991135,50498205,CHEST (PORTABLE AP),AP,3056,2544,21260910,53556.406,CHEST (PORTABLE AP),antero-posterior,,False,True
376636,d8f5555c-d8bcf97c-3a9d5596-17b9f854-6e15d081,19991135,50634986,,PA,2022,1736,21261102,151208.000,CHEST (PA AND LAT),postero-anterior,Erect,False,True
376644,d04bbcbd-5143439e-50ebe9a8-71380f67-44f8d127,19991135,54910031,CHEST (PORTABLE AP),AP,3056,2544,21260908,202437.031,CHEST (PORTABLE AP),antero-posterior,Erect,False,True
377010,7cfe7acc-65b0b548-feb77772-04c46d33-76a03230,19997367,58396642,CHEST (PORTABLE AP),AP,2712,2402,21260425,134114.328,CHEST (PORTABLE AP),antero-posterior,Erect,False,True


In [9]:
CXR_meta_df.head(5)


Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,in_eye_gaze,in_reflacx
177,fa771fa1-d9571d07-bff8f655-327734a7-6e10b29d,10002428,59258773,CHEST (PORTABLE AP),AP,3056,2544,21560419,92717.109,CHEST (PORTABLE AP),antero-posterior,Erect,False,True
181,4873aa08-977bfd31-fb492e64-6ef432d1-3f12cbe3,10002430,53254222,CHEST (PA AND LAT),PA,3056,2544,21250928,160736.171,CHEST (PA AND LAT),postero-anterior,Erect,True,False
266,dcdc4bd9-4301b111-2a65a814-ee8e7bc5-7f0b9a5a,10003400,56466802,CHEST (PORTABLE AP),AP,3056,2544,21361209,133738.015,CHEST (PORTABLE AP),antero-posterior,Erect,False,True
497,5bdabba9-388f6646-ac06b5f5-f68b2fd2-3630de21,10011607,55814288,CHEST (PA AND LAT),AP,3056,2544,21820111,230933.25,CHEST (PA AND LAT),antero-posterior,Erect,False,True
539,4a629500-9c3281ca-90bab490-9b6ac9c1-e5e6a580,10012261,53728467,CHEST (PORTABLE AP),AP,2544,3056,21750906,191151.453,CHEST (PORTABLE AP),antero-posterior,Erect,False,True


In [10]:
# load the edstays for indentifying the stay_id since the CXR-image dataset is a subset of ED.
# ED_edstays_df = pd.read_csv(os.path.join(ED_FOLDER_PATH, "edstays.csv"))
# ED_diagnosis_df = pd.read_csv(os.path.join(ED_FOLDER_PATH, "diagnosis.csv"))
# ED_medrecon_df = pd.read_csv(os.path.join(ED_FOLDER_PATH, "medrecon.csv"))
# ED_pyxis_df = pd.read_csv(os.path.join(ED_FOLDER_PATH, "pyxis.csv"))
# ED_triage_df = pd.read_csv(os.path.join(ED_FOLDER_PATH, "triage.csv"))

# necessary
hosp_patients_df = pd.read_csv(os.path.join(
    HOSP_FOLDER_PATH,  "patients.csv.gz"))
ed_triage_df = pd.read_csv(os.path.join(ED_FOLDER_PATH, "triage.csv.gz"))

ed_edstays_df = pd.read_csv(os.path.join(ED_FOLDER_PATH, "edstays.csv.gz"))

# clinical data will also be passed into spreadsheet folder.

hosp_folder_path = os.path.join(
    XAMI_SPREADSHEET_FOLDER_PATH, "Hosp")
os.makedirs(hosp_folder_path, exist_ok=True)
hosp_patients_df.to_csv(os.path.join(
    hosp_folder_path, "patients.csv"))


ed_folder_path = os.path.join(
    XAMI_SPREADSHEET_FOLDER_PATH, "ED")
os.makedirs(ed_folder_path, exist_ok=True)
ed_triage_df.to_csv(os.path.join(
    ed_folder_path, "triage.csv"))

# ed

# Core_transfers_df = pd.read_csv(os.path.join(
#     CLINICAL_FOLDER_PATH, "core", "transfers.csv"))
# Core_admissions_df = pd.read_csv(os.path.join(
#     CLINICAL_FOLDER_PATH, "core", "admissions.csv"))
# Core_patients_df = pd.read_csv(os.path.join(
#     CLINICAL_FOLDER_PATH, "core", "patients.csv"))

# hosp_microbiologyevents_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "microbiologyevents.csv.gz"))
# hosp_admissions_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "admissions.csv.gz"))
# hosp_omr_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "omr.csv.gz"))
# hosp_d_hcpcs_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "d_hcpcs.csv.gz"))
# hosp_d_icd_diagnoses_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "d_icd_diagnoses.csv.gz"))
# hosp_patients_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "patients.csv.gz"))
# hosp_d_icd_procedures_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "d_icd_procedures.csv.gz"))
# hosp_pharmacy_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "pharmacy.csv.gz"))
# hosp_d_labitems_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "d_labitems.csv.gz"))
# hosp_poe_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "poe.csv.gz"))
# hosp_diagnoses_icd_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "diagnoses_icd.csv.gz"))
# hosp_poe_detail_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "poe_detail.csv.gz"))
# hosp_drgcodes_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "drgcodes.csv.gz"))
# hosp_prescriptions_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "prescriptions.csv.gz"))
# hosp_emar_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "emar.csv.gz"))
# hosp_procedures_icd_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "procedures_icd.csv.gz"))
# hosp_emar_detail_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "emar_detail.csv.gz"))
# hosp_services_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "services.csv.gz"))
# hosp_hcpcsevents_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "hcpcsevents.csv.gz"))
# hosp_transfers_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "transfers.csv.gz"))
# hosp_labevents_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "labevents.csv.gz"))


# we have to load other module csv files here.

ed_edstays_df.head(5)


Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,gender,race,arrival_transport,disposition
0,10000032,22595853.0,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,AMBULANCE,ADMITTED
1,10000032,22841357.0,38112554,2180-06-26 15:54:00,2180-06-26 21:31:00,F,WHITE,AMBULANCE,ADMITTED
2,10000032,25742920.0,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,AMBULANCE,ADMITTED
3,10000032,29079034.0,32952584,2180-07-22 16:24:00,2180-07-23 05:54:00,F,WHITE,AMBULANCE,HOME
4,10000032,29079034.0,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,F,WHITE,AMBULANCE,ADMITTED


## Identify stay_id for CXR-image

In [11]:
######## Identify stay_id for CXR-images ########

# Use the transfer Id to indentify.
def time_string_to_float(x):
    return float('inf') if type(x) is float and math.isnan(x) else float(x.replace("-", "").replace("/", "").replace(" ", "").replace(":", ""))


# Create studyDateTime for indentifying the stay_id.
CXR_meta_df['StudyDateTime'] = [float(str(d) + str(t))
                                for d, t in zip(CXR_meta_df['StudyDate'], CXR_meta_df['StudyTime'])]

# Initialise stay_id
CXR_meta_df['stay_id'] = None

cxr_cannot_find_stay = []

ed_edstays_df['intime_float'] = ed_edstays_df.intime.apply(
    time_string_to_float)
ed_edstays_df['outtime_float'] = ed_edstays_df.outtime.apply(
    time_string_to_float)

with tqdm(total=len(CXR_meta_df)) as pbar:
    for idx, cxr_meta_instance in CXR_meta_df.iterrows():
        dicom_id = cxr_meta_instance['dicom_id']

        study_time = cxr_meta_instance["StudyDateTime"]

        ed_stay_time_match = ed_edstays_df[
            (ed_edstays_df.subject_id == cxr_meta_instance.subject_id)
        ]

        if (len(ed_stay_time_match) > 0):
            ed_stay_time_match = ed_stay_time_match[
                (ed_stay_time_match.intime_float < study_time) &
                (ed_stay_time_match.outtime_float > study_time)
            ]

        if (len(ed_stay_time_match) == 1):
            stay_id = ed_stay_time_match.iloc[0]['stay_id']
            CXR_meta_df.at[idx, "stay_id"] = stay_id
        else:
            # print(f"Having problem with {dicom_id} CXR iamge, it has {len(ed_stay_time_match)} matches.")
            cxr_cannot_find_stay.append(cxr_meta_instance)

        pbar.update(1)

# 5m 2.7s on Mac


  0%|          | 0/3689 [00:00<?, ?it/s]

In [12]:
# Store .csv with identified stay_id
os.makedirs(XAMI_SPREADSHEET_FOLDER_PATH, exist_ok=True)
CXR_meta_df.to_csv(os.path.join(XAMI_SPREADSHEET_FOLDER_PATH, "cxr_meta.csv"))

cxr_df_with_stay_id_only = CXR_meta_df[CXR_meta_df['stay_id'].notnull()]

cxr_df_with_stay_id_only['stay_id'] = cxr_df_with_stay_id_only['stay_id'].apply(
    lambda x: int(x))
cxr_df_with_stay_id_only.to_csv(os.path.join(
    XAMI_SPREADSHEET_FOLDER_PATH, "cxr_meta_with_stay_id_only.csv"))
print(f"Total {len(cxr_cannot_find_stay)} CXR-images can't find their stay_id, {len(cxr_df_with_stay_id_only)} instances found stay_id.")


Total 2006 CXR-images can't find their stay_id, 1683 instances found stay_id.


In [13]:
CXR_meta_df['in_ed'] = CXR_meta_df["stay_id"].isin(
    set(ed_edstays_df['stay_id']))
CXR_meta_df['in_core'] = CXR_meta_df["subject_id"].isin(
    set(hosp_patients_df['subject_id']))


In [14]:
available_eye_gaze_data = CXR_meta_df[
    (CXR_meta_df["in_ed"]) &
    (CXR_meta_df["in_core"]) &
    (CXR_meta_df["in_eye_gaze"])
]

available_eye_gaze_data.head(5)


Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,in_eye_gaze,in_reflacx,StudyDateTime,stay_id,in_ed,in_core
181,4873aa08-977bfd31-fb492e64-6ef432d1-3f12cbe3,10002430,53254222,CHEST (PA AND LAT),PA,3056,2544,21250928,160736.171,CHEST (PA AND LAT),postero-anterior,Erect,True,False,21250930000000.0,31293660,True,True
567,8106217e-c41ab813-c6002d3f-ed4ee98b-7b165bae,10012498,55812956,CHEST (PA AND LAT),PA,3056,2544,21470331,211642.296,CHEST (PA AND LAT),postero-anterior,Erect,True,False,21470330000000.0,37362927,True,True
727,5f1ac54d-47dfae00-930fa704-514131fa-ee0c138e,10015701,53321493,,PA,3056,2544,21330801,163117.156,postero-anterior,Erect,,True,False,21330800000000.0,30941688,True,True
900,d8bb1eda-1acb9229-4531796f-3f4dd3da-ed32b7e2,10019593,58500109,CHEST (PA AND LAT),PA,3012,2540,21820730,104318.718,CHEST (PA AND LAT),postero-anterior,Erect,True,False,21820730000000.0,39709989,True,True
1544,1a1932a9-6d40a33f-f52e88ba-b2c05012-155785dd,10037602,56937008,CHEST (PA AND LAT),PA,2544,3056,21480517,120621.031,CHEST (PA AND LAT),postero-anterior,Erect,True,False,21480520000000.0,35640513,True,True


In [15]:
available_reflacx_data = CXR_meta_df[
    (CXR_meta_df["in_ed"]) &
    (CXR_meta_df["in_core"]) &
    (CXR_meta_df["in_reflacx"])
]
available_reflacx_data.head(5)


Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,in_eye_gaze,in_reflacx,StudyDateTime,stay_id,in_ed,in_core
266,dcdc4bd9-4301b111-2a65a814-ee8e7bc5-7f0b9a5a,10003400,56466802,CHEST (PORTABLE AP),AP,3056,2544,21361209,133738.015,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,21361210000000.0,33678912,True,True
497,5bdabba9-388f6646-ac06b5f5-f68b2fd2-3630de21,10011607,55814288,CHEST (PA AND LAT),AP,3056,2544,21820111,230933.25,CHEST (PA AND LAT),antero-posterior,Erect,False,True,21820110000000.0,37054412,True,True
539,4a629500-9c3281ca-90bab490-9b6ac9c1-e5e6a580,10012261,53728467,CHEST (PORTABLE AP),AP,2544,3056,21750906,191151.453,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,21750910000000.0,38668412,True,True
583,38a5ffc8-93297f03-3f0a14e4-aa8ca225-7e968b5b,10013324,51191158,CHEST (PA AND LAT),PA,3056,2544,21231028,171318.828,CHEST (PA AND LAT),postero-anterior,Erect,False,True,21231030000000.0,30504536,True,True
1670,6c79a3c1-8d01e0d4-b8b42695-900cae5d-18a8e87a,10039913,58774423,CHEST (PA AND LAT),PA,3056,2480,21300218,182846.375,CHEST (PA AND LAT),postero-anterior,Erect,False,True,21300220000000.0,39772803,True,True


## Create stay_id folder and import CXR-data

In [16]:
######################################
# Tables can be linked with both patient_id and stay_id
######################################
# core - tranfers
# ed - diagnosis
# ed - edstays
# ed - medrecon
# ed - pyxis
# ed - triage
# ed - vitalsign

######################################
# Tables can be linked with patient_id
######################################
# core - admissions
# core - patients


In [17]:
def import_to_folders(CXR_meta_df, import_dfs, folder_path, folder_name):
    for n in import_dfs:
        df = pd.read_csv(os.path.join(folder_path,  f"{n}.csv.gz"))
        print(f"Importing {n}")
        if "subject_id" in df.columns:
            print(f"Importing {n}")
            # import to patient folder
            import_to_patient_sub_folder(CXR_meta_df, df, folder_name, n)
        else:
            # import to spreadsheet
            print(f"Importing {n} to spreadsheet")
            spreadsheet_folder_path = os.path.join(
                XAMI_SPREADSHEET_FOLDER_PATH, folder_name)
            os.makedirs(spreadsheet_folder_path, exist_ok=True)
            df.to_csv(os.path.join(
                spreadsheet_folder_path, f"{n}.csv"))
        del df


In [18]:
# Import ED
# import_to_patient_sub_folder(
#     CXR_meta_df, ED_diagnosis_df, "ED", "diagnosis")
# import_to_patient_sub_folder(
#     CXR_meta_df, ED_edstays_df, "ED", "edstays")
# import_to_patient_sub_folder(
#     CXR_meta_df, ED_medrecon_df, "ED", "medrecon")
# import_to_patient_sub_folder(
#     CXR_meta_df, ED_pyxis_df, "ED", "pyxis")
# import_to_patient_sub_folder(
#     CXR_meta_df, ED_triage_df, "ED", "triage")


# # Import Core
# import_to_patient_sub_folder(
#     CXR_meta_df, Core_transfers_df, "Core", "transfers")
# import_to_patient_sub_folder(
#     CXR_meta_df, Core_admissions_df, "Core", "admissions")
# import_to_patient_sub_folder(CXR_meta_df, Core_patients_df, "Core", "patients")

# instead of using the core module above, in the 2.0 version, only hosp module is used. the info in core is transfered into it.
# has loaded.
# hosp_patients_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "patients.csv.gz"))

import_hosp_dfs = [
    "microbiologyevents",
    "admissions",
    "omr",
    "pharmacy",
    "poe",
    "diagnoses_icd",
    "poe_detail",
    "drgcodes",
    "prescriptions",
    "emar",
    "procedures_icd",
    "emar_detail",
    "services",
    "hcpcsevents",
    "transfers",
    "labevents",
    "d_hcpcs",
    "d_icd_diagnoses",
    "d_icd_procedures",
    "d_labitems",
]

import_ed_dfs = [
    "diagnosis",
    "medrecon",
    "vitalsign",
    "edstays",
    "pyxis",
    "triage",
]

import_icu_dfs = [
    # "chartevents",	
    "icustays",		
    "inputevents",
    "d_items",				
    "outputevents",
    "datetimeevents",	
    "ingredientevents",	
    "procedureevents",
]

import_to_folders(CXR_meta_df, import_hosp_dfs, HOSP_FOLDER_PATH, "Hosp")
import_to_folders(CXR_meta_df, import_ed_dfs, ED_FOLDER_PATH, "ED")
import_to_folders(CXR_meta_df, import_icu_dfs, ICU_FOLDER_PATH, "ICU")


# cxr_chexpert_df.to_csv(os.path.join(
#     spreadsheet_folder_path, "cxr_chexpert.csv"))
# cxr_negbio_df.to_csv(os.path.join(
#     spreadsheet_folder_path, "cxr_negbio.csv"))
# cxr_split_df.to_csv(os.path.join(
#     spreadsheet_folder_path, "cxr_split.csv"))


# for n in import_ed_dfs:
#     df = pd.read_csv(os.path.join(ED_FOLDER_PATH, f"{n}.csv.gz"))
#     if "subject_id" in df.columns:
#         import_to_patient_sub_folder(CXR_meta_df, df , "ED", "diagnosis")
#     else:
#         # import to spreadsheet

#     del df


# ed_diagnosis_df = pd.read_csv(os.path.join(ED_FOLDER_PATH, "diagnosis.csv.gz"))
# ed_medrecon_df = pd.read_csv(os.path.join(ED_FOLDER_PATH, "medrecon.csv.gz"))
# ed_vitalsign_df = pd.read_csv(os.path.join(ED_FOLDER_PATH, "vitalsign.csv.gz"))
# ed_pyxis_df = pd.read_csv(os.path.join(ED_FOLDER_PATH, "pyxis.csv.gz"))
# ed_triage_df = pd.read_csv(os.path.join(ED_FOLDER_PATH, "triage.csv.gz"))

# hosp_microbiologyevents_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "microbiologyevents.csv.gz"))
# hosp_admissions_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "admissions.csv.gz"))
# hosp_omr_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "omr.csv.gz")) #
# hosp_pharmacy_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "pharmacy.csv.gz"))
# hosp_poe_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "poe.csv.gz"))
# hosp_diagnoses_icd_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "diagnoses_icd.csv.gz"))
# hosp_poe_detail_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "poe_detail.csv.gz")) #
# hosp_drgcodes_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "drgcodes.csv.gz"))
# hosp_prescriptions_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "prescriptions.csv.gz"))
# hosp_emar_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "emar.csv.gz"))
# hosp_procedures_icd_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "procedures_icd.csv.gz"))
# hosp_emar_detail_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "emar_detail.csv.gz"))
# hosp_services_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "services.csv.gz"))
# hosp_hcpcsevents_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "hcpcsevents.csv.gz"))
# hosp_transfers_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "transfers.csv.gz"))
# hosp_labevents_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "labevents.csv.gz"))

#### Code mapping tables, import these to spreadsheets ####
# hosp_d_hcpcs_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "d_hcpcs.csv.gz"))
# hosp_d_icd_diagnoses_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "d_icd_diagnoses.csv.gz"))
# hosp_d_icd_procedures_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "d_icd_procedures.csv.gz"))
# hosp_d_labitems_df = pd.read_csv(os.path.join( HOSP_FOLDER_PATH,  "d_labitems.csv.gz"))

# import_to_patient_sub_folder( CXR_meta_df, hosp_d_hcpcs_df, "Hosp", "d_hcpcs")
# import_to_patient_sub_folder( CXR_meta_df, hosp_d_icd_diagnoses_df, "Hosp", "d_icd_diagnoses")
# import_to_patient_sub_folder( CXR_meta_df, hosp_d_icd_procedures_df, "Hosp", "d_icd_procedures")
# import_to_patient_sub_folder( CXR_meta_df, hosp_d_labitems_df, "Hosp", "d_labitems")
###########################################################

# import_to_patient_sub_folder( CXR_meta_df, hosp_microbiologyevents_df, "Hosp", "microbiologyevents")
# import_to_patient_sub_folder( CXR_meta_df, hosp_admissions_df, "Hosp", "admissions")
# import_to_patient_sub_folder( CXR_meta_df, hosp_omr_df, "Hosp", "omr")
# import_to_patient_sub_folder( CXR_meta_df, hosp_patients_df, "Hosp", "patients")
# import_to_patient_sub_folder( CXR_meta_df, hosp_pharmacy_df, "Hosp", "pharmacy")
# import_to_patient_sub_folder( CXR_meta_df, hosp_poe_df, "Hosp", "poe")
# import_to_patient_sub_folder( CXR_meta_df, hosp_diagnoses_icd_df, "Hosp", "diagnoses_icd")
# import_to_patient_sub_folder( CXR_meta_df, hosp_poe_detail_df, "Hosp", "poe_detail")
# import_to_patient_sub_folder( CXR_meta_df, hosp_drgcodes_df, "Hosp", "drgcodes")
# import_to_patient_sub_folder( CXR_meta_df, hosp_prescriptions_df, "Hosp", "prescriptions")
# import_to_patient_sub_folder( CXR_meta_df, hosp_emar_df, "Hosp", "emar")
# import_to_patient_sub_folder( CXR_meta_df, hosp_procedures_icd_df, "Hosp", "procedures_icd")
# import_to_patient_sub_folder( CXR_meta_df, hosp_emar_detail_df, "Hosp", "emar_detail")
# import_to_patient_sub_folder( CXR_meta_df, hosp_services_df, "Hosp", "services")
# import_to_patient_sub_folder( CXR_meta_df, hosp_hcpcsevents_df, "Hosp", "hcpcsevents")
# import_to_patient_sub_folder( CXR_meta_df, hosp_transfers_df, "Hosp", "transfers")
# import_to_patient_sub_folder( CXR_meta_df, hosp_labevents_df, "Hosp", "labevents")


# do the same for ed module as well.

# import_to_patient_sub_folder(CXR_meta_df, ed_diagnosis_df , "ED", "diagnosis")
# import_to_patient_sub_folder(CXR_meta_df, ed_medrecon_df , "ED", "medrecon")
# import_to_patient_sub_folder(CXR_meta_df, ed_vitalsign_df , "ED", "vitalsign")
# import_to_patient_sub_folder(CXR_meta_df, ed_edstays_df , "ED", "edstays")
# import_to_patient_sub_folder(CXR_meta_df, ed_pyxis_df , "ED", "pyxis")
# import_to_patient_sub_folder(CXR_meta_df, ed_triage_df , "ED", "triage")


  df = pd.read_csv(os.path.join(folder_path,  f"{n}.csv.gz"))


Importing microbiologyevents
Importing microbiologyevents


  0%|          | 0/2847 [00:00<?, ?it/s]

Importing admissions
Importing admissions


  0%|          | 0/2908 [00:00<?, ?it/s]

Importing omr
Importing omr


  0%|          | 0/2658 [00:00<?, ?it/s]

  df = pd.read_csv(os.path.join(folder_path,  f"{n}.csv.gz"))


Importing pharmacy
Importing pharmacy


  0%|          | 0/2774 [00:00<?, ?it/s]

Importing poe
Importing poe


  0%|          | 0/2904 [00:00<?, ?it/s]

Importing diagnoses_icd
Importing diagnoses_icd


  0%|          | 0/2908 [00:00<?, ?it/s]

Importing poe_detail
Importing poe_detail


  0%|          | 0/2896 [00:00<?, ?it/s]

Importing drgcodes
Importing drgcodes


  0%|          | 0/2649 [00:00<?, ?it/s]

  df = pd.read_csv(os.path.join(folder_path,  f"{n}.csv.gz"))


Importing prescriptions
Importing prescriptions


  0%|          | 0/2774 [00:00<?, ?it/s]

Importing emar
Importing emar


  0%|          | 0/2170 [00:00<?, ?it/s]

Importing procedures_icd
Importing procedures_icd


  0%|          | 0/2386 [00:00<?, ?it/s]

  df = pd.read_csv(os.path.join(folder_path,  f"{n}.csv.gz"))


Importing emar_detail
Importing emar_detail


  0%|          | 0/2170 [00:00<?, ?it/s]

Importing services
Importing services


  0%|          | 0/2908 [00:00<?, ?it/s]

Importing hcpcsevents
Importing hcpcsevents


  0%|          | 0/1596 [00:00<?, ?it/s]

Importing transfers
Importing transfers


  0%|          | 0/3183 [00:00<?, ?it/s]

Importing labevents
Importing labevents


  0%|          | 0/3148 [00:00<?, ?it/s]

Importing d_hcpcs
Importing d_hcpcs to spreadsheet
Importing d_icd_diagnoses
Importing d_icd_diagnoses to spreadsheet
Importing d_icd_procedures
Importing d_icd_procedures to spreadsheet
Importing d_labitems
Importing d_labitems to spreadsheet
Importing diagnosis
Importing diagnosis


  0%|          | 0/3177 [00:00<?, ?it/s]

Importing medrecon
Importing medrecon


  0%|          | 0/2817 [00:00<?, ?it/s]

Importing vitalsign
Importing vitalsign


  0%|          | 0/3064 [00:00<?, ?it/s]

Importing edstays
Importing edstays


  0%|          | 0/3181 [00:00<?, ?it/s]

Importing pyxis
Importing pyxis


  0%|          | 0/2940 [00:00<?, ?it/s]

Importing triage
Importing triage


  0%|          | 0/3181 [00:00<?, ?it/s]

Importing icustays
Importing icustays


  0%|          | 0/1638 [00:00<?, ?it/s]

Importing inputevents
Importing inputevents


  0%|          | 0/1636 [00:00<?, ?it/s]

Importing d_items
Importing d_items to spreadsheet
Importing outputevents
Importing outputevents


  0%|          | 0/1622 [00:00<?, ?it/s]

Importing datetimeevents
Importing datetimeevents


  0%|          | 0/1636 [00:00<?, ?it/s]

Importing ingredientevents
Importing ingredientevents


  0%|          | 0/1633 [00:00<?, ?it/s]

Importing procedureevents
Importing procedureevents


  0%|          | 0/1637 [00:00<?, ?it/s]

In [19]:
df = pd.read_csv("./mimic-eye/spreadsheets/cxr_meta.csv")

In [20]:
df

Unnamed: 0.1,Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,in_eye_gaze,in_reflacx,StudyDateTime,stay_id
0,177,fa771fa1-d9571d07-bff8f655-327734a7-6e10b29d,10002428,59258773,CHEST (PORTABLE AP),AP,3056,2544,21560419,92717.109,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,2.156042e+12,
1,181,4873aa08-977bfd31-fb492e64-6ef432d1-3f12cbe3,10002430,53254222,CHEST (PA AND LAT),PA,3056,2544,21250928,160736.171,CHEST (PA AND LAT),postero-anterior,Erect,True,False,2.125093e+13,31293660.0
2,266,dcdc4bd9-4301b111-2a65a814-ee8e7bc5-7f0b9a5a,10003400,56466802,CHEST (PORTABLE AP),AP,3056,2544,21361209,133738.015,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,2.136121e+13,33678912.0
3,497,5bdabba9-388f6646-ac06b5f5-f68b2fd2-3630de21,10011607,55814288,CHEST (PA AND LAT),AP,3056,2544,21820111,230933.250,CHEST (PA AND LAT),antero-posterior,Erect,False,True,2.182011e+13,37054412.0
4,539,4a629500-9c3281ca-90bab490-9b6ac9c1-e5e6a580,10012261,53728467,CHEST (PORTABLE AP),AP,2544,3056,21750906,191151.453,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,2.175091e+13,38668412.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3684,376634,d9661ff6-877ac981-a20a8810-92309d46-173008ad,19991135,50498205,CHEST (PORTABLE AP),AP,3056,2544,21260910,53556.406,CHEST (PORTABLE AP),antero-posterior,,False,True,2.126091e+12,
3685,376636,d8f5555c-d8bcf97c-3a9d5596-17b9f854-6e15d081,19991135,50634986,,PA,2022,1736,21261102,151208.000,CHEST (PA AND LAT),postero-anterior,Erect,False,True,2.126110e+13,
3686,376644,d04bbcbd-5143439e-50ebe9a8-71380f67-44f8d127,19991135,54910031,CHEST (PORTABLE AP),AP,3056,2544,21260908,202437.031,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,2.126091e+13,
3687,377010,7cfe7acc-65b0b548-feb77772-04c46d33-76a03230,19997367,58396642,CHEST (PORTABLE AP),AP,2712,2402,21260425,134114.328,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,2.126043e+13,
