# Creating the spreadsheet from REFLACX and MIMIC dataset.
The created spreadsheet can be used with the dataset modules in `./data`.

In [1]:
import os, json
import pandas as pd

from data.paths import TabularDataPaths
from utils import print as print_f
from utils.preprocessing import get_split_list, checkMissingValue
from utils.init import reproducibility

from data.constants import REFLACX_ALL_LABEL_COLS, REFLACX_REPETITIVE_ALL_LABEL_COLS
from data.paths import MIMIC_EYE_PATH

### Set the seed and ensure reproducibility.

In [2]:
# Define the seed once and pass that same value to the project's
# `reproducibility` helper, so changing `seed` actually changes the seeding.
seed = 0
reproducibility(seed)

Read in the REFLACX dataframe

In [3]:
# The dataset root comes from data.paths.MIMIC_EYE_PATH; the commented line below
# is an example of a machine-specific override.
# mimic_eye_path = "/Volumes/T7/mimic-eye"
mimic_eye_path = MIMIC_EYE_PATH

In [4]:
# Load the REFLACX metadata spreadsheet stored under the MIMIC-EYE root and
# preview its first five rows.
reflacx_metadata_path = os.path.join(mimic_eye_path, "spreadsheets/REFLACX/metadata.csv")
reflacx_df = pd.read_csv(reflacx_metadata_path)
reflacx_df.head()

Unnamed: 0.1,Unnamed: 0,id,split,eye_tracking_data_discarded,image,dicom_id,subject_id,image_size_x,image_size_y,Airway wall thickening,...,Support devices,Wide mediastinum,Abnormal mediastinal contour,Acute fracture,Enlarged hilum,Hiatal hernia,High lung volume / emphysema,Interstitial lung disease,Lung nodule or mass,Pleural abnormality
0,0,P102R108387,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,
1,1,P102R379837,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,
2,2,P102R558314,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,4.0,...,True,0.0,,,,,,,,
3,3,P102R765317,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,2.0,,,,,,,,
4,4,P102R915878,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,


In [5]:
# Display the frame; pandas elides the middle rows in the rendered output.
reflacx_df

Unnamed: 0.1,Unnamed: 0,id,split,eye_tracking_data_discarded,image,dicom_id,subject_id,image_size_x,image_size_y,Airway wall thickening,...,Support devices,Wide mediastinum,Abnormal mediastinal contour,Acute fracture,Enlarged hilum,Hiatal hernia,High lung volume / emphysema,Interstitial lung disease,Lung nodule or mass,Pleural abnormality
0,0,P102R108387,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,
1,1,P102R379837,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,
2,2,P102R558314,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,4.0,...,True,0.0,,,,,,,,
3,3,P102R765317,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,2.0,,,,,,,,
4,4,P102R915878,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3047,2502,P300R939601,test,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,d9661ff6-877ac981-a20a8810-92309d46-173008ad,19991135,2544,3056,,...,False,,5.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3048,2503,P300R331925,test,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,d8f5555c-d8bcf97c-3a9d5596-17b9f854-6e15d081,19991135,1736,2022,,...,True,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3049,2504,P300R336111,test,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,d04bbcbd-5143439e-50ebe9a8-71380f67-44f8d127,19991135,2544,3056,,...,True,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3050,2505,P300R179177,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,7cfe7acc-65b0b548-feb77772-04c46d33-76a03230,19997367,2402,2712,,...,True,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


In [6]:
# List the column names of the REFLACX metadata frame.
reflacx_df.columns

Index(['Unnamed: 0', 'id', 'split', 'eye_tracking_data_discarded', 'image',
       'dicom_id', 'subject_id', 'image_size_x', 'image_size_y',
       'Airway wall thickening', 'Atelectasis', 'Consolidation', 'Emphysema',
       'Enlarged cardiac silhouette', 'Fibrosis', 'Fracture',
       'Groundglass opacity', 'Mass', 'Nodule', 'Other', 'Pleural effusion',
       'Pleural thickening', 'Pneumothorax', 'Pulmonary edema',
       'Quality issue', 'Support devices', 'Wide mediastinum',
       'Abnormal mediastinal contour', 'Acute fracture', 'Enlarged hilum',
       'Hiatal hernia', 'High lung volume / emphysema',
       'Interstitial lung disease', 'Lung nodule or mass',
       'Pleural abnormality'],
      dtype='object')

In [7]:
# Report how many rows (one per REFLACX reading id) the metadata contains.
print(f"The REFLACX dataset has {len(reflacx_df)} instances.")
# TODO: describe the features and dataset details here.

The REFLACX dataset has 3052 instances.


## Check which features contain missing values in REFLACX.

In [8]:
# List the columns that contain NaN; judging by the output below,
# checkMissingValue returns (column name, NaN count) pairs.
print_f.print_title("Columns have NaN value in REFLACX metadata") 
checkMissingValue(reflacx_df)

[('Airway wall thickening', 2757), ('Emphysema', 2757), ('Fibrosis', 2757), ('Fracture', 2757), ('Mass', 2757), ('Nodule', 2757), ('Other', 2839), ('Pleural effusion', 2757), ('Pleural thickening', 2757), ('Quality issue', 2757), ('Wide mediastinum', 2757), ('Abnormal mediastinal contour', 295), ('Acute fracture', 295), ('Enlarged hilum', 295), ('Hiatal hernia', 295), ('High lung volume / emphysema', 295), ('Interstitial lung disease', 295), ('Lung nodule or mass', 295), ('Pleural abnormality', 295)]


### Filling the missing features

In [9]:
# Fill NaN in the numerical label columns with 0.
reflacx_df[REFLACX_REPETITIVE_ALL_LABEL_COLS] = reflacx_df[REFLACX_REPETITIVE_ALL_LABEL_COLS].fillna(0)

# Fill NaN in the boolean columns with False.
reflacx_df[["Quality issue", "Support devices"]] = reflacx_df[
    ["Quality issue", "Support devices"]
].fillna(False)

# Check the missing values again; an empty list means nothing is left to fill.
print_f.print_title("Columns have NaN value in REFLACX metadata")
checkMissingValue(reflacx_df)

[]


## Dealing with the repetitive labels in REFLACX

In [10]:
# Each kept label becomes the row-wise maximum of itself and the overlapping
# labels it replaces.
consolidated_labels = {
    "Lung nodule or mass": ["Mass", "Nodule"],
    "High lung volume / emphysema": ["Emphysema"],
    "Pleural abnormality": ["Pleural thickening", "Pleural effusion"],
}
for kept_label, replaced_labels in consolidated_labels.items():
    reflacx_df[kept_label] = reflacx_df[[kept_label, *replaced_labels]].max(axis=1)

# Remove the repetitive labels that are not in the label version we want.
for replaced_labels in consolidated_labels.values():
    for replaced_label in replaced_labels:
        del reflacx_df[replaced_label]

# Load other datasets that have the modalities we desire.

In [11]:
# Show the dataset root in use (the value is machine-specific).
mimic_eye_path

'D:\\mimic-eye'

In [12]:
# Original remark: we don't actually need to merge with other datasets, we just
# need to import the eye-tracking data.
# NOTE(review): the following cell does merge these three tables — confirm which
# statement is current.
spreadsheet_dir = os.path.join(mimic_eye_path, "spreadsheets")
cxr_jpg_dir = os.path.join(spreadsheet_dir, "CXR-JPG")

# CXR metadata plus the NegBio and CheXpert label tables from the MIMIC-EYE spreadsheets.
cxr_meta_df = pd.read_csv(os.path.join(spreadsheet_dir, "cxr_meta.csv"))
negbio_df = pd.read_csv(os.path.join(cxr_jpg_dir, "cxr_negbio.csv"))
chexpert_df = pd.read_csv(os.path.join(cxr_jpg_dir, "cxr_chexpert.csv"))

In [13]:
# Take only the CXR metadata columns that REFLACX does not already have, plus the
# join key, so the first merge produces no overlapping column names.
cols_to_use = cxr_meta_df.columns.difference(reflacx_df.columns)
cxr_meta_extra = cxr_meta_df[[*cols_to_use, "dicom_id"]]

# Left joins keep every REFLACX row. The NegBio and CheXpert columns are suffixed
# so they cannot collide with the REFLACX label names.
merged_df = (
    reflacx_df
    .merge(cxr_meta_extra, how="left", on="dicom_id", suffixes=("", ""))
    .merge(
        negbio_df.add_suffix("_negbio"),
        how="left",
        left_on="study_id",
        right_on="study_id_negbio",
    )
    .merge(
        chexpert_df.add_suffix("_chexpert"),
        how="left",
        left_on="study_id",
        right_on="study_id_chexpert",
    )
)



In [14]:
class CXRImage:
    """Identifiers of one chest X-ray image, with helpers that build its file paths.

    The constructor stores its arguments unchanged:
        patient_id: patient id without the leading "p" of the directory name.
        study_id: study id without the leading "s" of the directory name.
        dicom_id: image file name without its extension.
        reflacx_id: optional REFLACX id; stored but not used by the path helpers.
    """

    def __init__(self, patient_id, study_id, dicom_id, reflacx_id=None) -> None:
        self.patient_id = patient_id
        self.study_id = study_id
        self.dicom_id = dicom_id
        self.reflacx_id = reflacx_id

    def get_physio_image_path(self, physio_path, jpg=True, version="2.0.0"):
        """Return the image path inside a PhysioNet download rooted at `physio_path`.

        Layout:
        ``<physio_path>/files/<dataset>/<version>/files/p<first two characters of
        the patient id>/p<patient id>/s<study id>/<dicom id>.<ext>``,
        where dataset/ext are "mimic-cxr-jpg"/"jpg" when `jpg` is true and
        "mimic-cxr"/"dcm" otherwise.
        """
        dataset_name, file_extension = (
            ("mimic-cxr-jpg", "jpg") if jpg else ("mimic-cxr", "dcm")
        )
        # str() lets integer ids (e.g. read straight from a DataFrame column) work;
        # slicing an int directly would raise TypeError.
        patient_id = str(self.patient_id)

        return os.path.join(
            physio_path,
            "files",
            dataset_name,
            version,
            "files",
            f"p{patient_id[:2]}",
            f"p{patient_id}",
            f"s{self.study_id}",
            f"{self.dicom_id}.{file_extension}",
        )

    def get_mimic_eye_image_path(self, XAMI_MIMIC_PATH_str):
        """Return the JPG path inside a MIMIC-EYE style folder tree.

        Layout:
        ``<XAMI_MIMIC_PATH_str>/patient_<patient id>/CXR-JPG/s<study id>/<dicom id>.jpg``.
        """
        return os.path.join(
            XAMI_MIMIC_PATH_str,
            f"patient_{self.patient_id}",
            "CXR-JPG",
            f"s{self.study_id}",
            f"{self.dicom_id}.jpg",
        )

def create_cxr_from_path(path, reflacx_id=None):
    """Build a `CXRImage` from a "/"-separated image path.

    The last three path components are read as
    ``p<patient id>/s<study id>/<dicom id>.<ext>``: the first character of the two
    directory names is dropped and the file name is cut at its first dot.
    `reflacx_id` is passed through unchanged.
    """
    parts = path.split("/")
    file_name, study_dir, patient_dir = parts[-1], parts[-2], parts[-3]

    return CXRImage(
        patient_id=patient_dir[1:],
        study_id=study_dir[1:],
        dicom_id=file_name.partition(".")[0],
        reflacx_id=reflacx_id,
    )


In [15]:
# In order to have a faster reading speed, we use XAMI-MIMIC.
# NOTE(review): "{XAMI_MIMIC_PATH}" looks like a placeholder that is replaced by the
# real dataset root when the spreadsheet is consumed — confirm in the dataset modules.
XAMI_MIMIC_PATH_str = "{XAMI_MIMIC_PATH}"


def _reflacx_file_path(row, file_name):
    """Return the path of one REFLACX `main_data` file for the reading in `row`.

    `row` must provide "subject_id" and "id"; `file_name` is the file inside the
    reading's folder.
    """
    return os.path.join(
        XAMI_MIMIC_PATH_str,
        f"patient_{row['subject_id']}",
        "REFLACX",
        "main_data",
        str(row["id"]),
        file_name,
    )


# Path of the CXR JPG in the MIMIC-EYE folder layout, derived from the original
# image path stored in the "image" column.
merged_df["image_path"] = merged_df["image"].apply(
    lambda image: create_cxr_from_path(image).get_mimic_eye_image_path(XAMI_MIMIC_PATH_str)
)

# Per-reading REFLACX files: abnormality ellipses and eye-tracking fixations.
merged_df["bbox_path"] = merged_df.apply(
    lambda row: _reflacx_file_path(row, "anomaly_location_ellipses.csv"), axis=1
)
merged_df["fixation_path"] = merged_df.apply(
    lambda row: _reflacx_file_path(row, "fixations.csv"), axis=1
)


In [16]:
# Keep only the readings whose eye-tracking data was NOT discarded.
# .copy() makes the filtered frame independent of the original, so adding or
# overwriting columns on it later does not raise pandas' SettingWithCopyWarning.
merged_df = merged_df[merged_df["eye_tracking_data_discarded"] != True].copy()

## We first check which modalities exist, then decide how to separate them. The problem may come from the labels (abnormalities), so let's first see which abnormality has the most agreement among the radiologists.

In [17]:
# Overwrite the "split" column that came with the REFLACX metadata with a new
# assignment from get_split_list (train_portion=0.7).
# NOTE(review): get_split_list only receives the row count, while several rows share
# the same dicom_id / subject_id — confirm that the same image or patient cannot end
# up in different splits.
merged_df['split'] = get_split_list(len(merged_df), train_portion=0.7)

In [18]:
from collections import OrderedDict

# For every label, count the rows whose value is greater than zero, then show the
# labels from least to most frequent.
positive_mask = merged_df[REFLACX_ALL_LABEL_COLS] > 0
count_map = dict(positive_mask.sum(axis=0))
OrderedDict(sorted(count_map.items(), key=lambda label_count: label_count[1]))

OrderedDict([('Fibrosis', 5),
             ('Quality issue', 9),
             ('Wide mediastinum', 10),
             ('Fracture', 12),
             ('Airway wall thickening', 21),
             ('Hiatal hernia', 24),
             ('Acute fracture', 33),
             ('Interstitial lung disease', 34),
             ('Enlarged hilum', 56),
             ('Abnormal mediastinal contour', 84),
             ('High lung volume / emphysema', 89),
             ('Pneumothorax', 105),
             ('Lung nodule or mass', 162),
             ('Groundglass opacity', 389),
             ('Pulmonary edema', 427),
             ('Enlarged cardiac silhouette', 710),
             ('Consolidation', 814),
             ('Atelectasis', 840),
             ('Pleural abnormality', 933),
             ('Support devices', 1305)])

In [19]:
# Same counts as count_map, ordered by ascending count.
sort_countmap = OrderedDict(
    sorted(count_map.items(), key=lambda label_count: label_count[1])
)

In [20]:
# Label names ordered from least to most frequent.
sort_countmap.keys()

odict_keys(['Fibrosis', 'Quality issue', 'Wide mediastinum', 'Fracture', 'Airway wall thickening', 'Hiatal hernia', 'Acute fracture', 'Interstitial lung disease', 'Enlarged hilum', 'Abnormal mediastinal contour', 'High lung volume / emphysema', 'Pneumothorax', 'Lung nodule or mass', 'Groundglass opacity', 'Pulmonary edema', 'Enlarged cardiac silhouette', 'Consolidation', 'Atelectasis', 'Pleural abnormality', 'Support devices'])

In [21]:
# Inspect the columns after merging the CXR metadata, NegBio and CheXpert tables.
merged_df.columns

Index(['Unnamed: 0', 'id', 'split', 'eye_tracking_data_discarded', 'image',
       'dicom_id', 'subject_id', 'image_size_x', 'image_size_y',
       'Airway wall thickening', 'Atelectasis', 'Consolidation',
       'Enlarged cardiac silhouette', 'Fibrosis', 'Fracture',
       'Groundglass opacity', 'Other', 'Pneumothorax', 'Pulmonary edema',
       'Quality issue', 'Support devices', 'Wide mediastinum',
       'Abnormal mediastinal contour', 'Acute fracture', 'Enlarged hilum',
       'Hiatal hernia', 'High lung volume / emphysema',
       'Interstitial lung disease', 'Lung nodule or mass',
       'Pleural abnormality', 'Columns',
       'PatientOrientationCodeSequence_CodeMeaning',
       'PerformedProcedureStepDescription',
       'ProcedureCodeSequence_CodeMeaning', 'Rows', 'StudyDate',
       'StudyDateTime', 'StudyTime', 'ViewCodeSequence_CodeMeaning',
       'ViewPosition', 'in_eye_gaze', 'in_reflacx', 'stay_id', 'study_id',
       'Unnamed: 0_negbio', 'subject_id_negbio', 'study_id

In [22]:
# Move the current index into an "index" column (rows were filtered earlier, so
# the index has gaps) and write the eye-tracking spreadsheet.
# Note: re-running this cell resets the index again and adds another column.
merged_df = merged_df.reset_index()

output_dir = "./spreadsheets"
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
merged_df.to_csv(os.path.join(output_dir, "reflacx_eye.csv"))

In [23]:
# The reset_index() call in the previous cell adds the leading 'index' column.
merged_df.columns

Index(['index', 'Unnamed: 0', 'id', 'split', 'eye_tracking_data_discarded',
       'image', 'dicom_id', 'subject_id', 'image_size_x', 'image_size_y',
       'Airway wall thickening', 'Atelectasis', 'Consolidation',
       'Enlarged cardiac silhouette', 'Fibrosis', 'Fracture',
       'Groundglass opacity', 'Other', 'Pneumothorax', 'Pulmonary edema',
       'Quality issue', 'Support devices', 'Wide mediastinum',
       'Abnormal mediastinal contour', 'Acute fracture', 'Enlarged hilum',
       'Hiatal hernia', 'High lung volume / emphysema',
       'Interstitial lung disease', 'Lung nodule or mass',
       'Pleural abnormality', 'Columns',
       'PatientOrientationCodeSequence_CodeMeaning',
       'PerformedProcedureStepDescription',
       'ProcedureCodeSequence_CodeMeaning', 'Rows', 'StudyDate',
       'StudyDateTime', 'StudyTime', 'ViewCodeSequence_CodeMeaning',
       'ViewPosition', 'in_eye_gaze', 'in_reflacx', 'stay_id', 'study_id',
       'Unnamed: 0_negbio', 'subject_id_negbio', 

In [24]:
# DO another for clinical data
hosp_patients_df = pd.read_csv(os.path.join(mimic_eye_path, "spreadsheets","Hosp","patients.csv"),)
ed_triage_df = pd.read_csv(os.path.join(mimic_eye_path, "spreadsheets","ED","triage.csv"), index_col=False)

In [25]:
# Attach patient and triage information with left joins, then derive the age at
# the time of the study: anchor_age shifted by the gap between the study year
# (first four characters of StudyDate) and anchor_year.
merged_df = (
    merged_df
    .merge(hosp_patients_df, how="left", on="subject_id", suffixes=("", "_patient"))
    .merge(ed_triage_df, how="left", on="stay_id", suffixes=("", "_triage"))
    .assign(
        age=lambda frame: frame["anchor_age"]
        + (
            frame["StudyDate"].apply(lambda study_date: int(str(study_date)[:4]))
            - frame["anchor_year"]
        )
    )
)

In [26]:
# Drop the leftover "Unnamed:" columns (index columns picked up when the CSV
# files were read).
unnamed_columns = [name for name in merged_df.columns if name.startswith("Unnamed:")]
merged_df = merged_df.drop(columns=unnamed_columns)

In [27]:
import numpy as np

In [28]:
# Scratch calculation (512 == 2**9); nothing else in the visible notebook uses it.
np.log2(512)

9.0

In [29]:
# Inspect the columns after the clinical merge and the removal of the 'Unnamed:' columns.
merged_df.columns

Index(['index', 'id', 'split', 'eye_tracking_data_discarded', 'image',
       'dicom_id', 'subject_id', 'image_size_x', 'image_size_y',
       'Airway wall thickening', 'Atelectasis', 'Consolidation',
       'Enlarged cardiac silhouette', 'Fibrosis', 'Fracture',
       'Groundglass opacity', 'Other', 'Pneumothorax', 'Pulmonary edema',
       'Quality issue', 'Support devices', 'Wide mediastinum',
       'Abnormal mediastinal contour', 'Acute fracture', 'Enlarged hilum',
       'Hiatal hernia', 'High lung volume / emphysema',
       'Interstitial lung disease', 'Lung nodule or mass',
       'Pleural abnormality', 'Columns',
       'PatientOrientationCodeSequence_CodeMeaning',
       'PerformedProcedureStepDescription',
       'ProcedureCodeSequence_CodeMeaning', 'Rows', 'StudyDate',
       'StudyDateTime', 'StudyTime', 'ViewCodeSequence_CodeMeaning',
       'ViewPosition', 'in_eye_gaze', 'in_reflacx', 'stay_id', 'study_id',
       'subject_id_negbio', 'study_id_negbio', 'Atelectasis_neg

In [30]:
# Write only the rows that have a stay_id, i.e. the ones that could be linked to
# the triage table through the stay.
has_stay = merged_df["stay_id"].notna()
merged_df[has_stay].to_csv(os.path.join("./spreadsheets", "reflacx_clinical_eye.csv"))

In [35]:
# Columns contributed by the CheXpert label table (suffixed at merge time).
[c for c in merged_df.columns if c.endswith("_chexpert")]

['subject_id_chexpert',
 'study_id_chexpert',
 'Atelectasis_chexpert',
 'Cardiomegaly_chexpert',
 'Consolidation_chexpert',
 'Edema_chexpert',
 'Enlarged Cardiomediastinum_chexpert',
 'Fracture_chexpert',
 'Lung Lesion_chexpert',
 'Lung Opacity_chexpert',
 'No Finding_chexpert',
 'Pleural Effusion_chexpert',
 'Pleural Other_chexpert',
 'Pneumonia_chexpert',
 'Pneumothorax_chexpert',
 'Support Devices_chexpert']

In [36]:
# Columns contributed by the NegBio label table (suffixed at merge time).
[c for c in merged_df.columns if c.endswith("_negbio")]

['subject_id_negbio',
 'study_id_negbio',
 'Atelectasis_negbio',
 'Cardiomegaly_negbio',
 'Consolidation_negbio',
 'Edema_negbio',
 'Enlarged Cardiomediastinum_negbio',
 'Fracture_negbio',
 'Lung Lesion_negbio',
 'Lung Opacity_negbio',
 'No Finding_negbio',
 'Pleural Effusion_negbio',
 'Pleural Other_negbio',
 'Pneumonia_negbio',
 'Pneumothorax_negbio',
 'Support Devices_negbio']

In [33]:
# Inspect the column names of merged_df once more.
merged_df.columns

Index(['index', 'id', 'split', 'eye_tracking_data_discarded', 'image',
       'dicom_id', 'subject_id', 'image_size_x', 'image_size_y',
       'Airway wall thickening', 'Atelectasis', 'Consolidation',
       'Enlarged cardiac silhouette', 'Fibrosis', 'Fracture',
       'Groundglass opacity', 'Other', 'Pneumothorax', 'Pulmonary edema',
       'Quality issue', 'Support devices', 'Wide mediastinum',
       'Abnormal mediastinal contour', 'Acute fracture', 'Enlarged hilum',
       'Hiatal hernia', 'High lung volume / emphysema',
       'Interstitial lung disease', 'Lung nodule or mass',
       'Pleural abnormality', 'Columns',
       'PatientOrientationCodeSequence_CodeMeaning',
       'PerformedProcedureStepDescription',
       'ProcedureCodeSequence_CodeMeaning', 'Rows', 'StudyDate',
       'StudyDateTime', 'StudyTime', 'ViewCodeSequence_CodeMeaning',
       'ViewPosition', 'in_eye_gaze', 'in_reflacx', 'stay_id', 'study_id',
       'subject_id_negbio', 'study_id_negbio', 'Atelectasis_neg

In [34]:
# Display the raw NegBio label table (one row per subject_id / study_id pair in the output below).
negbio_df

Unnamed: 0.1,Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,0,10000032,50414267,,,,,,,,,1.0,,,,,
1,1,10000032,53189527,,,,,,,,,1.0,,,,,
2,2,10000032,53911762,,,,,,,,,1.0,,,,,
3,3,10000032,56699142,,,,,,,,,1.0,,,,,
4,4,10000764,57375967,,,1.0,,,,,,,,,-1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227822,227822,19999442,58708861,,,,,,,,,1.0,,,,,1.0
227823,227823,19999733,57132437,,,,,,,,,1.0,,,,,
227824,227824,19999987,55368167,1.0,-1.0,,,,,0.0,,,0.0,,,0.0,
227825,227825,19999987,58621812,1.0,,,,,,,,,,,,,1.0
