# Creating the spreadsheet from REFLACX and MIMIC dataset.
The created spreadsheet can be used with the dataset modules in `./data`.

In [1]:
import os, json
import pandas as pd

from data.paths import TabularDataPaths
from utils import print as print_f
from utils.preprocessing import get_split_list, checkMissingValue
from utils.init import reproducibility

from data.paths import XAMI_MIMIC_PATH, SPREADSHEET_FOLDER, PHYSIO_PATH
from data.constants import REFLACX_ALL_LABEL_COLS, REFLACX_REPETITIVE_ALL_LABEL_COLS

### Determine the seed and insure the reproducibility.

In [2]:
seed = 0
reproducibility(0)

Read in the REFLACX dataframe

In [3]:
reflacx_df = pd.concat(
    [
        pd.read_csv(
            os.path.join("E:\\physionet.org\\files\\reflacx-xray-localization\\1.0.0\\main_data", f"metadata_phase_{i}.csv")
        )
        for i in [1, 2, 3]
    ],
    axis=0,
)
reflacx_df.head(5)

Unnamed: 0,id,split,eye_tracking_data_discarded,image,dicom_id,subject_id,image_size_x,image_size_y,Airway wall thickening,Atelectasis,...,Support devices,Wide mediastinum,Abnormal mediastinal contour,Acute fracture,Enlarged hilum,Hiatal hernia,High lung volume / emphysema,Interstitial lung disease,Lung nodule or mass,Pleural abnormality
0,P102R108387,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,
1,P102R379837,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,
2,P102R558314,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,4.0,0.0,...,True,0.0,,,,,,,,
3,P102R765317,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,2.0,,,,,,,,
4,P102R915878,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,


In [4]:
reflacx_df

Unnamed: 0,id,split,eye_tracking_data_discarded,image,dicom_id,subject_id,image_size_x,image_size_y,Airway wall thickening,Atelectasis,...,Support devices,Wide mediastinum,Abnormal mediastinal contour,Acute fracture,Enlarged hilum,Hiatal hernia,High lung volume / emphysema,Interstitial lung disease,Lung nodule or mass,Pleural abnormality
0,P102R108387,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,
1,P102R379837,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,
2,P102R558314,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,4.0,0.0,...,True,0.0,,,,,,,,
3,P102R765317,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,2.0,,,,,,,,
4,P102R915878,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2502,P300R939601,test,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,d9661ff6-877ac981-a20a8810-92309d46-173008ad,19991135,2544,3056,,0.0,...,False,,5.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2503,P300R331925,test,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,d8f5555c-d8bcf97c-3a9d5596-17b9f854-6e15d081,19991135,1736,2022,,0.0,...,True,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2504,P300R336111,test,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,d04bbcbd-5143439e-50ebe9a8-71380f67-44f8d127,19991135,2544,3056,,5.0,...,True,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2505,P300R179177,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,7cfe7acc-65b0b548-feb77772-04c46d33-76a03230,19997367,2402,2712,,5.0,...,True,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


In [5]:
reflacx_df.columns

Index(['id', 'split', 'eye_tracking_data_discarded', 'image', 'dicom_id',
       'subject_id', 'image_size_x', 'image_size_y', 'Airway wall thickening',
       'Atelectasis', 'Consolidation', 'Emphysema',
       'Enlarged cardiac silhouette', 'Fibrosis', 'Fracture',
       'Groundglass opacity', 'Mass', 'Nodule', 'Other', 'Pleural effusion',
       'Pleural thickening', 'Pneumothorax', 'Pulmonary edema',
       'Quality issue', 'Support devices', 'Wide mediastinum',
       'Abnormal mediastinal contour', 'Acute fracture', 'Enlarged hilum',
       'Hiatal hernia', 'High lung volume / emphysema',
       'Interstitial lung disease', 'Lung nodule or mass',
       'Pleural abnormality'],
      dtype='object')

In [6]:
print(f"The REFLACX dataset has {len(reflacx_df)} instances.")
### some features and datasets details can be explained here. ###

The REFLACX dataset has 3052 instances.


## Check what are those features containing missing in REFLACX.

In [7]:
print_f.print_title("Columns have NaN value in REFLACX metadata") 
checkMissingValue(reflacx_df)

[('Airway wall thickening', 2757), ('Emphysema', 2757), ('Fibrosis', 2757), ('Fracture', 2757), ('Mass', 2757), ('Nodule', 2757), ('Other', 2839), ('Pleural effusion', 2757), ('Pleural thickening', 2757), ('Quality issue', 2757), ('Wide mediastinum', 2757), ('Abnormal mediastinal contour', 295), ('Acute fracture', 295), ('Enlarged hilum', 295), ('Hiatal hernia', 295), ('High lung volume / emphysema', 295), ('Interstitial lung disease', 295), ('Lung nodule or mass', 295), ('Pleural abnormality', 295)]


### Filling the missing features

In [8]:
# Filling nan for numerical cols.
reflacx_df[REFLACX_REPETITIVE_ALL_LABEL_COLS] = reflacx_df[REFLACX_REPETITIVE_ALL_LABEL_COLS].fillna(0)

# Folling nan for boolean cols.
reflacx_df[["Quality issue", "Support devices"]] = reflacx_df[
    ["Quality issue", "Support devices"]
].fillna(False)

# check the missing values again.
print_f.print_title("Columns have NaN value in REFLACX metadata")
checkMissingValue(reflacx_df)

[]


## Dealing with the repetitive labels in REFLACX

In [9]:
reflacx_df["Lung nodule or mass"] = reflacx_df[
    ["Lung nodule or mass", "Mass", "Nodule"]
].max(axis=1)

reflacx_df["High lung volume / emphysema"] = reflacx_df[
    ["High lung volume / emphysema", "Emphysema"]
].max(axis=1)

reflacx_df["Pleural abnormality"] = reflacx_df[
    ["Pleural abnormality", "Pleural thickening", "Pleural effusion"]
].max(axis=1)

# Remove the repetitive labels that not in the version we want.
del reflacx_df["Mass"]
del reflacx_df["Nodule"]

del reflacx_df["Emphysema"]

del reflacx_df["Pleural thickening"]
del reflacx_df["Pleural effusion"]

# Load other datasets that have the modalities we desire.

In [10]:
# we don't actually need to merge with other datasets, we just need to import the eye-tracking data, that's all.

# the xami-cxr contains stay_id
cxr_meta_df = pd.read_csv(
    'E:\\XAMI-MIMIC 2.0\\spreadsheets\\cxr_meta.csv'
)
hosp_patients_df = pd.read_csv("E:\\physionet.org\\files\\mimiciv\\2.0\\hosp\\patients.csv.gz")
ed_triage_df = pd.read_csv("E:\\physionet.org\\files\\mimic-iv-ed\\2.0\\ed\\triage.csv.gz")

In [11]:
cxr_meta_df.head(5)

Unnamed: 0.1,Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,in_eye_gaze,in_reflacx,StudyDateTime,stay_id
0,177,fa771fa1-d9571d07-bff8f655-327734a7-6e10b29d,10002428,59258773,CHEST (PORTABLE AP),AP,3056,2544,21560419,92717.109,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,2156042000000.0,
1,181,4873aa08-977bfd31-fb492e64-6ef432d1-3f12cbe3,10002430,53254222,CHEST (PA AND LAT),PA,3056,2544,21250928,160736.171,CHEST (PA AND LAT),postero-anterior,Erect,True,False,21250930000000.0,31293660.0
2,266,dcdc4bd9-4301b111-2a65a814-ee8e7bc5-7f0b9a5a,10003400,56466802,CHEST (PORTABLE AP),AP,3056,2544,21361209,133738.015,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,21361210000000.0,33678912.0
3,497,5bdabba9-388f6646-ac06b5f5-f68b2fd2-3630de21,10011607,55814288,CHEST (PA AND LAT),AP,3056,2544,21820111,230933.25,CHEST (PA AND LAT),antero-posterior,Erect,False,True,21820110000000.0,37054412.0
4,539,4a629500-9c3281ca-90bab490-9b6ac9c1-e5e6a580,10012261,53728467,CHEST (PORTABLE AP),AP,2544,3056,21750906,191151.453,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,21750910000000.0,38668412.0


In [12]:
merged_df = (
    reflacx_df.merge(
        cxr_meta_df,
        "left",
        left_on="dicom_id",
        right_on="dicom_id",
        suffixes=("", "cxr"),
    )
    .merge(hosp_patients_df, "left", on="subject_id", suffixes=("", "_patient"))
    .merge(ed_triage_df, "left", on="stay_id", suffixes=("", "_triage"))
)

# filter out the null stay instances.
merged_df = merged_df[merged_df["eye_tracking_data_discarded"] != True]

## only get the one with `stay_id`, so we can link to triage.csv
merged_df = merged_df[~merged_df["stay_id"].isna()]
merged_df["stay_id"] = merged_df["stay_id"].astype(int)

In [13]:
class CXRImage:
    def __init__(self, patient_id, study_id, dicom_id, reflacx_id=None) -> None:
        self.patient_id = patient_id
        self.study_id = study_id
        self.dicom_id = dicom_id
        self.reflacx_id = reflacx_id

    def get_physio_image_path(self, physio_path, jpg=True, version="2.0.0"):
        file_extension = "jpg"if jpg else "dcm"

        return os.path.join(
            physio_path,
            "files",
            ("mimic-cxr-jpg" if jpg else "mimic-cxr"),
            version,
            "files",
            f"p{self.patient_id[:2]}",
            f"p{self.patient_id}",
            f"s{self.study_id}",
            f"{self.dicom_id}.{file_extension}"
        )

    def get_xami_image_path(self, xami_path):
        return os.path.join(
            xami_path,
            f"patient_{self.patient_id}",
            "CXR-JPG",
            f"s{self.study_id}",
            f"{self.dicom_id}.jpg"
        )

def create_cxr_from_path(path, reflacx_id=None):
    fs = path.split("/")
    dicom_id = fs[-1].split(".")[0]
    study_id = fs[-2][1:]
    patient_id = fs[-3][1:]

    return CXRImage(
        patient_id=patient_id,
        study_id=study_id,
        dicom_id=dicom_id,
        reflacx_id=reflacx_id,
    )


In [14]:
## in order to have a faster reading speed, we use XAMI-MIMIC
XAMI_MIMIC_PATH_str = "{XAMI_MIMIC_PATH}"

## Calculate the age according to the time when the CXR taken.
merged_df["image_path"] = merged_df['image'].apply(lambda x: create_cxr_from_path(x).get_xami_image_path(XAMI_MIMIC_PATH))
merged_df["bbox_path"] = (
    f"{XAMI_MIMIC_PATH_str}\patient_"
    + merged_df["subject_id"].astype(str)
    + "\REFLACX\\"
    + merged_df["id"].astype(str)
    + "\\anomaly_location_ellipses.csv"
)
merged_df["fixation_path"] = (
    f"{XAMI_MIMIC_PATH_str}\patient_"
    + merged_df["subject_id"].astype(str)
    + "\REFLACX\\"
    + merged_df["id"].astype(str)
    + "\\fixations.csv"
)

In [15]:
# transform to physio approach, we keep this one incase we need physio to run all in the future.

# reflacx_df["image_path"] = reflacx_df["image"].apply(
#     lambda x: create_cxr_from_path(x).get_physio_image_path(PHYSIO_PATH)
# )

# physio_reflacx_path = (
#     "E:\\physionet.org\\files\\reflacx-xray-localization\\1.0.0\\main_data\\"
# )
# reflacx_df["bbox_path"] = reflacx_df["id"].apply(
#     lambda x: os.path.join(physio_reflacx_path, x, "anomaly_location_ellipses.csv")
# )

# reflacx_df["fixation_path"] = reflacx_df["id"].apply(
#     lambda x: os.path.join(physio_reflacx_path, x, "fixations.csv")
# )

In [16]:
merged_df['split'] = get_split_list(len(merged_df), train_portion=0.7)

In [17]:
from collections import OrderedDict

count_map = dict((merged_df[REFLACX_ALL_LABEL_COLS]>0).sum(axis=0))
OrderedDict({k: v for k, v in sorted(count_map.items(), key=lambda item: item[1])})

OrderedDict([('Fracture', 0),
             ('Fibrosis', 1),
             ('Wide mediastinum', 3),
             ('Quality issue', 3),
             ('Pneumothorax', 4),
             ('Acute fracture', 4),
             ('Airway wall thickening', 7),
             ('Hiatal hernia', 9),
             ('Enlarged hilum', 14),
             ('Interstitial lung disease', 16),
             ('High lung volume / emphysema', 24),
             ('Abnormal mediastinal contour', 25),
             ('Lung nodule or mass', 52),
             ('Groundglass opacity', 77),
             ('Pulmonary edema', 97),
             ('Consolidation', 124),
             ('Pleural abnormality', 152),
             ('Atelectasis', 161),
             ('Enlarged cardiac silhouette', 183),
             ('Support devices', 215)])

In [18]:
sort_countmap = OrderedDict({k: v for k, v in sorted(count_map.items(), key=lambda item: item[1])})

In [19]:
sort_countmap.keys()

odict_keys(['Fracture', 'Fibrosis', 'Wide mediastinum', 'Quality issue', 'Pneumothorax', 'Acute fracture', 'Airway wall thickening', 'Hiatal hernia', 'Enlarged hilum', 'Interstitial lung disease', 'High lung volume / emphysema', 'Abnormal mediastinal contour', 'Lung nodule or mass', 'Groundglass opacity', 'Pulmonary edema', 'Consolidation', 'Pleural abnormality', 'Atelectasis', 'Enlarged cardiac silhouette', 'Support devices'])

In [20]:
merged_df = merged_df.reset_index()
merged_df.to_csv(os.path.join(SPREADSHEET_FOLDER, "reflacx_with_stay.csv"))

In [21]:
merged_df.columns

Index(['index', 'id', 'split', 'eye_tracking_data_discarded', 'image',
       'dicom_id', 'subject_id', 'image_size_x', 'image_size_y',
       'Airway wall thickening', 'Atelectasis', 'Consolidation',
       'Enlarged cardiac silhouette', 'Fibrosis', 'Fracture',
       'Groundglass opacity', 'Other', 'Pneumothorax', 'Pulmonary edema',
       'Quality issue', 'Support devices', 'Wide mediastinum',
       'Abnormal mediastinal contour', 'Acute fracture', 'Enlarged hilum',
       'Hiatal hernia', 'High lung volume / emphysema',
       'Interstitial lung disease', 'Lung nodule or mass',
       'Pleural abnormality', 'Unnamed: 0', 'subject_idcxr', 'study_id',
       'PerformedProcedureStepDescription', 'ViewPosition', 'Rows', 'Columns',
       'StudyDate', 'StudyTime', 'ProcedureCodeSequence_CodeMeaning',
       'ViewCodeSequence_CodeMeaning',
       'PatientOrientationCodeSequence_CodeMeaning', 'in_eye_gaze',
       'in_reflacx', 'StudyDateTime', 'stay_id', 'gender', 'anchor_age',
       '