# Creating the spreadsheet from REFLACX and MIMIC dataset.
The created spreadsheet can be used with the dataset modules in `./data`.

In [1]:
import os, json
import pandas as pd

from data.paths import TabularDataPaths
from utils import print as print_f
from utils.preprocessing import get_split_list, checkMissingValue
from utils.init import reproducibility

from data.paths import XAMI_MIMIC_PATH, SPREADSHEET_FOLDER, PHYSIO_PATH
from data.constants import REFLACX_ALL_LABEL_COLS, REFLACX_REPETITIVE_ALL_LABEL_COLS

### Determine the seed and insure the reproducibility.

In [2]:
seed = 0
reproducibility(0)

Read in the REFLACX dataframe

In [3]:
reflacx_df = pd.concat(
    [
        pd.read_csv(
            os.path.join("E:\\physionet.org\\files\\reflacx-xray-localization\\1.0.0\\main_data", f"metadata_phase_{i}.csv")
        )
        for i in [1, 2, 3]
    ],
    axis=0,
)
reflacx_df.head(5)

Unnamed: 0,id,split,eye_tracking_data_discarded,image,dicom_id,subject_id,image_size_x,image_size_y,Airway wall thickening,Atelectasis,...,Support devices,Wide mediastinum,Abnormal mediastinal contour,Acute fracture,Enlarged hilum,Hiatal hernia,High lung volume / emphysema,Interstitial lung disease,Lung nodule or mass,Pleural abnormality
0,P102R108387,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,
1,P102R379837,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,
2,P102R558314,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,4.0,0.0,...,True,0.0,,,,,,,,
3,P102R765317,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,2.0,,,,,,,,
4,P102R915878,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,


In [4]:
reflacx_df

Unnamed: 0,id,split,eye_tracking_data_discarded,image,dicom_id,subject_id,image_size_x,image_size_y,Airway wall thickening,Atelectasis,...,Support devices,Wide mediastinum,Abnormal mediastinal contour,Acute fracture,Enlarged hilum,Hiatal hernia,High lung volume / emphysema,Interstitial lung disease,Lung nodule or mass,Pleural abnormality
0,P102R108387,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,
1,P102R379837,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,
2,P102R558314,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,4.0,0.0,...,True,0.0,,,,,,,,
3,P102R765317,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,2.0,,,,,,,,
4,P102R915878,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2502,P300R939601,test,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,d9661ff6-877ac981-a20a8810-92309d46-173008ad,19991135,2544,3056,,0.0,...,False,,5.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2503,P300R331925,test,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,d8f5555c-d8bcf97c-3a9d5596-17b9f854-6e15d081,19991135,1736,2022,,0.0,...,True,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2504,P300R336111,test,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,d04bbcbd-5143439e-50ebe9a8-71380f67-44f8d127,19991135,2544,3056,,5.0,...,True,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2505,P300R179177,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,7cfe7acc-65b0b548-feb77772-04c46d33-76a03230,19997367,2402,2712,,5.0,...,True,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


In [5]:
reflacx_df.columns

Index(['id', 'split', 'eye_tracking_data_discarded', 'image', 'dicom_id',
       'subject_id', 'image_size_x', 'image_size_y', 'Airway wall thickening',
       'Atelectasis', 'Consolidation', 'Emphysema',
       'Enlarged cardiac silhouette', 'Fibrosis', 'Fracture',
       'Groundglass opacity', 'Mass', 'Nodule', 'Other', 'Pleural effusion',
       'Pleural thickening', 'Pneumothorax', 'Pulmonary edema',
       'Quality issue', 'Support devices', 'Wide mediastinum',
       'Abnormal mediastinal contour', 'Acute fracture', 'Enlarged hilum',
       'Hiatal hernia', 'High lung volume / emphysema',
       'Interstitial lung disease', 'Lung nodule or mass',
       'Pleural abnormality'],
      dtype='object')

In [6]:
print(f"The REFLACX dataset has {len(reflacx_df)} instances.")
### some features and datasets details can be explained here. ###

The REFLACX dataset has 3052 instances.


## Check what are those features containing missing in REFLACX.

In [7]:
print_f.print_title("Columns have NaN value in REFLACX metadata") 
checkMissingValue(reflacx_df)

[('Airway wall thickening', 2757), ('Emphysema', 2757), ('Fibrosis', 2757), ('Fracture', 2757), ('Mass', 2757), ('Nodule', 2757), ('Other', 2839), ('Pleural effusion', 2757), ('Pleural thickening', 2757), ('Quality issue', 2757), ('Wide mediastinum', 2757), ('Abnormal mediastinal contour', 295), ('Acute fracture', 295), ('Enlarged hilum', 295), ('Hiatal hernia', 295), ('High lung volume / emphysema', 295), ('Interstitial lung disease', 295), ('Lung nodule or mass', 295), ('Pleural abnormality', 295)]


### Filling the missing features

In [8]:
# Filling nan for numerical cols.
reflacx_df[REFLACX_REPETITIVE_ALL_LABEL_COLS] = reflacx_df[REFLACX_REPETITIVE_ALL_LABEL_COLS].fillna(0)

# Folling nan for boolean cols.
reflacx_df[["Quality issue", "Support devices"]] = reflacx_df[
    ["Quality issue", "Support devices"]
].fillna(False)

# check the missing values again.
print_f.print_title("Columns have NaN value in REFLACX metadata")
checkMissingValue(reflacx_df)

[]


## Dealing with the repetitive labels in REFLACX

In [9]:
reflacx_df["Lung nodule or mass"] = reflacx_df[
    ["Lung nodule or mass", "Mass", "Nodule"]
].max(axis=1)

reflacx_df["High lung volume / emphysema"] = reflacx_df[
    ["High lung volume / emphysema", "Emphysema"]
].max(axis=1)

reflacx_df["Pleural abnormality"] = reflacx_df[
    ["Pleural abnormality", "Pleural thickening", "Pleural effusion"]
].max(axis=1)

# Remove the repetitive labels that not in the version we want.
del reflacx_df["Mass"]
del reflacx_df["Nodule"]

del reflacx_df["Emphysema"]

del reflacx_df["Pleural thickening"]
del reflacx_df["Pleural effusion"]

# Load other datasets that have the modalities we desire.

In [10]:
# we don't actually need to merge with other datasets, we just need to import the eye-tracking data, that's all.

cxr_meta_df = pd.read_csv("E:\\physionet.org\\files\\mimic-cxr-jpg\\2.0.0\\mimic-cxr-2.0.0-metadata.csv.gz")
hosp_patients_df = pd.read_csv("E:\\physionet.org\\files\\mimiciv\\2.0\\hosp\\patients.csv.gz")
ed_triage_df = pd.read_csv("E:\\physionet.org\\files\\mimic-iv-ed\\2.0\\ed\\triage.csv.gz")

In [11]:
## Merge REFLACX with other datasets (modalities)

In [12]:
cxr_meta_df.head(5)

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,10000032,50414267,CHEST (PA AND LAT),LATERAL,3056,2544,21800506,213014.531,CHEST (PA AND LAT),lateral,Erect
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,CHEST (PA AND LAT),postero-anterior,Erect
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,10000032,53189527,CHEST (PA AND LAT),LATERAL,3056,2544,21800626,165500.312,CHEST (PA AND LAT),lateral,Erect
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,CHEST (PORTABLE AP),AP,2705,2539,21800723,80556.875,CHEST (PORTABLE AP),antero-posterior,


In [13]:
class CXRImage:
    def __init__(self, patient_id, study_id, dicom_id, reflacx_id=None) -> None:
        self.patient_id = patient_id
        self.study_id = study_id
        self.dicom_id = dicom_id
        self.reflacx_id = reflacx_id

    def get_physio_image_path(self, physio_path, jpg=True, version="2.0.0"):
        file_extension = "jpg"if jpg else "dcm"

        return os.path.join(
            physio_path,
            "files",
            ("mimic-cxr-jpg" if jpg else "mimic-cxr"),
            version,
            "files",
            f"p{self.patient_id[:2]}",
            f"p{self.patient_id}",
            f"s{self.study_id}",
            f"{self.dicom_id}.{file_extension}"
        )

    def get_xami_image_path(self, xami_path):
        return os.path.join(
            xami_path,
            f"patient_{self.patient_id}",
            "CXR-JPG",
            f"s{self.study_id}",
            f"{self.dicom_id}.jpg"
        )

def create_cxr_from_path(path, reflacx_id=None):
    fs = path.split("/")
    dicom_id = fs[-1].split(".")[0]
    study_id = fs[-2][1:]
    patient_id = fs[-3][1:]

    return CXRImage(
        patient_id=patient_id,
        study_id=study_id,
        dicom_id=dicom_id,
        reflacx_id=reflacx_id,
    )


In [14]:
## in order to have a faster reading speed, we use XAMI-MIMIC
XAMI_MIMIC_PATH_str = "{XAMI_MIMIC_PATH}"

## Calculate the age according to the time when the CXR taken.
reflacx_df["image_path"] = reflacx_df['image'].apply(lambda x: create_cxr_from_path(x).get_xami_image_path(XAMI_MIMIC_PATH))
reflacx_df["bbox_path"] = (
    f"{XAMI_MIMIC_PATH_str}\patient_"
    + reflacx_df["subject_id"].astype(str)
    + "\REFLACX\\"
    + reflacx_df["id"].astype(str)
    + "\\anomaly_location_ellipses.csv"
)
reflacx_df["fixation_path"] = (
    f"{XAMI_MIMIC_PATH_str}\patient_"
    + reflacx_df["subject_id"].astype(str)
    + "\REFLACX\\"
    + reflacx_df["id"].astype(str)
    + "\\fixations.csv"
)

In [15]:
# transform to physio approach, we keep this one incase we need physio to run all in the future.

# reflacx_df["image_path"] = reflacx_df["image"].apply(
#     lambda x: create_cxr_from_path(x).get_physio_image_path(PHYSIO_PATH)
# )

# physio_reflacx_path = (
#     "E:\\physionet.org\\files\\reflacx-xray-localization\\1.0.0\\main_data\\"
# )
# reflacx_df["bbox_path"] = reflacx_df["id"].apply(
#     lambda x: os.path.join(physio_reflacx_path, x, "anomaly_location_ellipses.csv")
# )

# reflacx_df["fixation_path"] = reflacx_df["id"].apply(
#     lambda x: os.path.join(physio_reflacx_path, x, "fixations.csv")
# )

In [16]:
## only keep the one without eye-tracking data

reflacx_df = reflacx_df[reflacx_df["eye_tracking_data_discarded"] != True]

## we first test what kind of modalities exist, then check how can we separate them. The problem can be from the labels (abnormalities), so let's first see which disease has most of the agreement from different radiologists.

In [17]:
reflacx_df['split'] = get_split_list(len(reflacx_df), train_portion=0.7)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reflacx_df['split'] = get_split_list(len(reflacx_df), train_portion=0.7)


In [18]:
from collections import OrderedDict

count_map = dict((reflacx_df[REFLACX_ALL_LABEL_COLS]>0).sum(axis=0))
OrderedDict({k: v for k, v in sorted(count_map.items(), key=lambda item: item[1])})

OrderedDict([('Fibrosis', 5),
             ('Quality issue', 9),
             ('Wide mediastinum', 10),
             ('Fracture', 12),
             ('Airway wall thickening', 21),
             ('Hiatal hernia', 24),
             ('Acute fracture', 33),
             ('Interstitial lung disease', 34),
             ('Enlarged hilum', 56),
             ('Abnormal mediastinal contour', 84),
             ('High lung volume / emphysema', 89),
             ('Pneumothorax', 105),
             ('Lung nodule or mass', 162),
             ('Groundglass opacity', 389),
             ('Pulmonary edema', 427),
             ('Enlarged cardiac silhouette', 710),
             ('Consolidation', 814),
             ('Atelectasis', 840),
             ('Pleural abnormality', 933),
             ('Support devices', 1305)])

In [19]:
sort_countmap = OrderedDict({k: v for k, v in sorted(count_map.items(), key=lambda item: item[1])})

In [20]:
sort_countmap.keys()

odict_keys(['Fibrosis', 'Quality issue', 'Wide mediastinum', 'Fracture', 'Airway wall thickening', 'Hiatal hernia', 'Acute fracture', 'Interstitial lung disease', 'Enlarged hilum', 'Abnormal mediastinal contour', 'High lung volume / emphysema', 'Pneumothorax', 'Lung nodule or mass', 'Groundglass opacity', 'Pulmonary edema', 'Enlarged cardiac silhouette', 'Consolidation', 'Atelectasis', 'Pleural abnormality', 'Support devices'])

In [21]:
reflacx_df = reflacx_df.reset_index()
reflacx_df.to_csv(os.path.join(SPREADSHEET_FOLDER, "reflacx_eye.csv"))

In [22]:
reflacx_df.columns

Index(['index', 'id', 'split', 'eye_tracking_data_discarded', 'image',
       'dicom_id', 'subject_id', 'image_size_x', 'image_size_y',
       'Airway wall thickening', 'Atelectasis', 'Consolidation',
       'Enlarged cardiac silhouette', 'Fibrosis', 'Fracture',
       'Groundglass opacity', 'Other', 'Pneumothorax', 'Pulmonary edema',
       'Quality issue', 'Support devices', 'Wide mediastinum',
       'Abnormal mediastinal contour', 'Acute fracture', 'Enlarged hilum',
       'Hiatal hernia', 'High lung volume / emphysema',
       'Interstitial lung disease', 'Lung nodule or mass',
       'Pleural abnormality', 'image_path', 'bbox_path', 'fixation_path'],
      dtype='object')