In [1]:
import os
import math

from datasets import paths
from tqdm.notebook import tqdm

import warnings
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
warnings.filterwarnings("ignore")

In [2]:
PHYSIONET_PATH = os.path.join(paths.PHYSIONET_PATH, "files")
CXR_JPG_FOLDER_PATH = os.path.join(PHYSIONET_PATH, "mimic-cxr-jpg", "2.0.0")
MIMICIV_FOLDER_PATH = os.path.join(PHYSIONET_PATH, "mimiciv", "2.0")
ED_FOLDER_PATH = os.path.join(PHYSIONET_PATH, "mimic-iv-ed", "2.0", "ed")
HOSP_FOLDER_PATH = os.path.join(MIMICIV_FOLDER_PATH, "hosp")

CXR_meta_df = pd.read_csv(os.path.join(CXR_JPG_FOLDER_PATH, "mimic-cxr-2.0.0-metadata.csv.gz"),
                          compression='gzip', header=0, sep=',', quotechar='"')
hosp_patients_df = pd.read_csv(os.path.join(
    HOSP_FOLDER_PATH,  "patients.csv.gz"))
ed_triage_df = pd.read_csv(os.path.join(ED_FOLDER_PATH, "triage.csv.gz"))
ed_edstays_df = pd.read_csv(os.path.join(ED_FOLDER_PATH, "edstays.csv.gz"))

In [3]:
def time_string_to_float(x):
    '''
    Use `transfer_id`` to identify.
    '''
    return float('inf') if type(x) is float and math.isnan(x) else float(x.replace("-", "").replace("/", "").replace(" ", "").replace(":", ""))

In [4]:
# Create studyDateTime for identifying stay_id.
CXR_meta_df['StudyDateTime'] = [float(str(d) + str(t))
                                for d, t in zip(CXR_meta_df['StudyDate'], CXR_meta_df['StudyTime'])]

# Initialise stay_id
CXR_meta_df['stay_id'] = None

cxr_cannot_find_stay = []

ed_edstays_df['intime_float'] = ed_edstays_df.intime.apply(
    time_string_to_float)
ed_edstays_df['outtime_float'] = ed_edstays_df.outtime.apply(
    time_string_to_float)

with tqdm(total=len(CXR_meta_df)) as pbar:
    for idx, cxr_meta_instance in CXR_meta_df.iterrows():
        dicom_id = cxr_meta_instance['dicom_id']

        study_time = cxr_meta_instance["StudyDateTime"]

        ed_stay_time_match = ed_edstays_df[
            (ed_edstays_df.subject_id == cxr_meta_instance.subject_id)
        ]

        if (len(ed_stay_time_match) > 0):
            ed_stay_time_match = ed_stay_time_match[
                (ed_stay_time_match.intime_float < study_time) &
                (ed_stay_time_match.outtime_float > study_time)
            ]

        if (len(ed_stay_time_match) == 1):
            stay_id = ed_stay_time_match.iloc[0]['stay_id']
            CXR_meta_df.at[idx, "stay_id"] = stay_id
        else:
            # print(f"Having problem with {dicom_id} CXR iamge, it has {len(ed_stay_time_match)} matches.")
            cxr_cannot_find_stay.append(cxr_meta_instance)

        pbar.update(1)

CXR_meta_df.to_csv("./spreadsheets/cxr_meta.csv")

CXR_meta_df = CXR_meta_df[CXR_meta_df['stay_id'].notnull()  & (CXR_meta_df['ViewPosition'].isin(["PA", "AP"]))]
CXR_meta_df.to_csv("./spreadsheets/cxr_meta_stayId.csv")

  0%|          | 0/377110 [00:00<?, ?it/s]

In [2]:
df = pd.read_csv("./spreadsheets/cxr_meta_stayId.csv")

In [4]:
df.columns

Index(['Unnamed: 0', 'dicom_id', 'subject_id', 'study_id',
       'PerformedProcedureStepDescription', 'ViewPosition', 'Rows', 'Columns',
       'StudyDate', 'StudyTime', 'ProcedureCodeSequence_CodeMeaning',
       'ViewCodeSequence_CodeMeaning',
       'PatientOrientationCodeSequence_CodeMeaning', 'StudyDateTime',
       'stay_id'],
      dtype='object')