In [1]:
import os, math, datetime

import pandas as pd
import numpy as np

from tqdm import tqdm

In [2]:
physio_file_path = "E:\\physionet.org\\files\\"

def time_string_to_float(x):
    return float('inf') if type(x) is float and math.isnan(x) else float(x.replace("-", "").replace("/", "").replace(" ", "").replace(":", ""))

def fill_time(time_flaot): 
    zero_in_front_times = 5 - int(np.log10(time_flaot))
    return "0"*zero_in_front_times + str(time_flaot)

def get_datetime_from_float(float_time):
    str_time = str(int(float_time))
    return datetime.datetime(
        year=int(str_time[:4]),
        month=int(str_time[4:6]),
        day=int(str_time[6:8]),
        hour=int(str_time[8:10]),
        minute=int(str_time[8:10]),
        second= int(str_time[12:14]),
    )

In [3]:
cxr_meta_df = pd.read_csv(os.path.join( physio_file_path, "mimic-cxr-jpg\\2.0.0\mimic-cxr-2.0.0-metadata.csv.gz"))
triage_df = pd.read_csv(os.path.join( physio_file_path, "mimic-iv-ed\\2.0\\ed\\triage.csv.gz"))
ed_stay_df = pd.read_csv(os.path.join( physio_file_path, "mimic-iv-ed\\2.0\\ed\\edstays.csv.gz"))
patient_df = pd.read_csv(os.path.join( physio_file_path, "mimiciv\\2.0\\hosp\\patients.csv.gz"))
reflacx_df  = pd.concat([pd.read_csv(os.path.join(
    physio_file_path, "reflacx-xray-localization\\1.0.0\\main_data", f"metadata_phase_{i}.csv")) for i in range(1, 4)])
eye_gaze_df = pd.read_csv(os.path.join(physio_file_path, "egd-cxr\\1.0.0\\master_sheet.csv"))
icu_df = pd.read_csv(os.path.join(physio_file_path, "mimiciv\\2.0\\icu\\icustays.csv.gz"))

triage_patients = triage_df['subject_id'].unique()
reflacx_patients = reflacx_df['subject_id'].unique()
cxr_patients = cxr_meta_df['subject_id'].unique()
mimic_patients = patient_df['subject_id'].unique()
eye_gaze_patients = eye_gaze_df['patient_id'].unique()
icu_patients = icu_df["subject_id"].unique()


In [4]:
all_dicom_id_in_reflacx = list(reflacx_df['dicom_id'])
cxr_meta_df['in_reflacx'] = cxr_meta_df["dicom_id"].isin(all_dicom_id_in_reflacx)

In [5]:
cxr_meta_df['StudyDateTime']  = [float(str(d) + fill_time(t))
 for d, t in zip(cxr_meta_df['StudyDate'], cxr_meta_df['StudyTime'])]


# cxr_meta_df['StudyDateTime']  = [float(str(d) + str(t))
#  for d, t in zip(cxr_meta_df['StudyDate'], cxr_meta_df['StudyTime'])]

# Initialise stay_id
cxr_meta_df['stay_id'] = None



ed_stay_df['intime_float'] = ed_stay_df.intime.apply(time_string_to_float)
ed_stay_df['outtime_float'] = ed_stay_df.outtime.apply(time_string_to_float)

In [6]:
out_of_stay_range_but_only_have_one = []
cxr_cannot_find_stay = []
has_patient_match_only = [] # f
has_multiple_match  =[]  # cases that have multiple match
in_days = [] # in the limit of day range.

with tqdm(total= len(cxr_meta_df)) as pbar:
    for idx, cxr_meta_instance in cxr_meta_df.iterrows():
        dicom_id = cxr_meta_instance['dicom_id']
        
        study_time = cxr_meta_instance["StudyDateTime"]

        patient_match = ed_stay_df[
            (ed_stay_df.subject_id == cxr_meta_instance.subject_id)  
        ]

        # if (len(patient_match) == 1 ):
        #     ed_stay_time_match = patient_match
        # else:
            # if (len(patient_match) > 0):
        ed_stay_time_match = patient_match[
            (patient_match.intime_float < study_time) &
            (patient_match.outtime_float > study_time)
        ]

        if (len(ed_stay_time_match) == 1):
            stay_id = ed_stay_time_match.iloc[0]['stay_id']
            cxr_meta_df.at[idx,"stay_id"] = stay_id
            # out_of_stay_range_but_only_have_one.append(cxr_meta_df)

        elif (len(ed_stay_time_match) > 1):
            # ed_stay_time_match
            # raise StopIteration(f"Has multiple match in {idx}")
            has_multiple_match.append(cxr_meta_instance)
            #     ## we find the closest.

        elif (len(ed_stay_time_match) == 0) and len(patient_match) > 0:
            # raise StopIteration(f"Has patient match only {idx}")

            cxr_time = get_datetime_from_float(cxr_meta_instance['StudyDateTime'])
            patient_match["intime_obj"] = patient_match.intime_float.apply(get_datetime_from_float)
            patient_match["outtime_obj"] = patient_match.outtime_float.apply(get_datetime_from_float)

            # calculate the time diff here.
            patient_match["intime_diff"] =  patient_match["intime_obj"].apply(lambda x: abs((x- cxr_time).days))
            patient_match["outtime_diff"] = patient_match['intime_obj'].apply(lambda x: abs((x- cxr_time).days))
            
            # get the closest 
            patient_match["min_diff"] = patient_match[["intime_diff", "outtime_diff"]].min(axis=1)

            patient_match.sort_values("min_diff", ascending=True, axis=0, inplace=True)

            # check 1st instance and set limitation here:
            if (patient_match.iloc[0]['min_diff'] <= 7):
                in_days.append(cxr_meta_instance)
                cxr_meta_df.at[idx,"stay_id"] = patient_match.iloc[0]['stay_id']

            has_patient_match_only.append(cxr_meta_instance)
        else:
            # print(f"Having problem with {dicom_id} CXR iamge, it has {len(ed_stay_time_match)} matches.")

            # raise StopIteration(f"Can't find stay_id in #{idx} instance")
            cxr_cannot_find_stay.append(cxr_meta_instance)

        pbar.update(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  patient_match["intime_obj"] = patient_match.intime_float.apply(get_datetime_from_float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  patient_match["outtime_obj"] = patient_match.outtime_float.apply(get_datetime_from_float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  patient_match["intime_diff"

In [7]:
print(f"With the limitation of days, {len(in_days)} more stays have their stay_id identified.")
print(f"Still have {len(cxr_meta_df[cxr_meta_df.stay_id.isnull()])} CXRs without stay_id, which is  {(len(cxr_meta_df[cxr_meta_df.stay_id.isnull()])/ len(cxr_meta_df))*100:.2f}% of the MIMIC-CXR dataset")
print(f"In terms of REFLACX dataset, it has {len(cxr_meta_df[cxr_meta_df.in_reflacx & cxr_meta_df.stay_id.isnull()])} cases without stay_id, which is {len(cxr_meta_df[cxr_meta_df.in_reflacx & cxr_meta_df.stay_id.isnull()])/ len(cxr_meta_df[cxr_meta_df.in_reflacx])*100:.2f}% of REFLACX dataset.")

With the limitation of days, 83592 more stays have their stay_id identified.
Still have 123453 CXRs without stay_id, which is  32.74% of the MIMIC-CXR dataset


AttributeError: 'DataFrame' object has no attribute 'in_reflacx'

In [9]:
cxr_meta_df.columns

Index(['dicom_id', 'subject_id', 'study_id',
       'PerformedProcedureStepDescription', 'ViewPosition', 'Rows', 'Columns',
       'StudyDate', 'StudyTime', 'ProcedureCodeSequence_CodeMeaning',
       'ViewCodeSequence_CodeMeaning',
       'PatientOrientationCodeSequence_CodeMeaning', 'StudyDateTime',
       'stay_id'],
      dtype='object')

In [8]:
cxr_meta_df.in_reflacx

AttributeError: 'DataFrame' object has no attribute 'in_reflacx'