In [1]:
import json
import os
from collections import OrderedDict

import pandas as pd

from datasets import paths
from datasets.reflacx import constants as r_constants
from utils.init import reproducibility
from utils.preprocessing import checkMissingValue, get_split_list

In [2]:
# Fixed seed handed to the project's `reproducibility` helper.
# NOTE(review): presumably this seeds the RNGs behind later random steps such as
# get_split_list — confirm in utils.init.
seed = 0
reproducibility(seed)

In [3]:
# Load the REFLACX metadata spreadsheet from the MIMIC-EYE folder and preview
# its first rows.
reflacx_metadata_path = os.path.join(
    paths.MIMIC_EYE_PATH, "spreadsheets/REFLACX/metadata.csv"
)
reflacx_df = pd.read_csv(reflacx_metadata_path)
reflacx_df.head(5)

Unnamed: 0.1,Unnamed: 0,id,split,eye_tracking_data_discarded,image,dicom_id,subject_id,image_size_x,image_size_y,Airway wall thickening,...,Support devices,Wide mediastinum,Abnormal mediastinal contour,Acute fracture,Enlarged hilum,Hiatal hernia,High lung volume / emphysema,Interstitial lung disease,Lung nodule or mass,Pleural abnormality
0,0,P102R108387,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,
1,1,P102R379837,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,
2,2,P102R558314,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,4.0,...,True,0.0,,,,,,,,
3,3,P102R765317,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,2.0,,,,,,,,
4,4,P102R915878,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,


In [4]:
# Load the chest X-ray metadata spreadsheet (joined onto REFLACX further down).
cxr_meta_path = os.path.join(paths.MIMIC_EYE_PATH, "spreadsheets", "cxr_meta.csv")
cxr_meta_df = pd.read_csv(cxr_meta_path)

In [5]:
# Report how many rows the REFLACX metadata spreadsheet contains.
print(f"The REFLACX dataset has {len(reflacx_df)} instances.")
# TODO: describe the dataset's features and other details here.

The REFLACX dataset has 3052 instances.


In [6]:
# Missing values in the numerical label columns are replaced with 0.
numeric_label_cols = r_constants.REPETITIVE_ALL_LABEL_COLS
reflacx_df[numeric_label_cols] = reflacx_df[numeric_label_cols].fillna(0)

# Missing values in the two boolean columns are replaced with False.
boolean_label_cols = ["Quality issue", "Support devices"]
reflacx_df[boolean_label_cols] = reflacx_df[boolean_label_cols].fillna(False)

# Re-run the missing-value check on the filled frame and display its result.
checkMissingValue(reflacx_df)

[]


In [7]:
# Fold the repetitive label columns into the label they correspond to in the
# version we want: each target becomes the row-wise maximum of itself and its
# repetitive columns, after which the repetitive columns are removed.
repetitive_label_sources = {
    "Lung nodule or mass": ["Mass", "Nodule"],
    "High lung volume / emphysema": ["Emphysema"],
    "Pleural abnormality": ["Pleural thickening", "Pleural effusion"],
}

for target_col, repetitive_cols in repetitive_label_sources.items():
    reflacx_df[target_col] = reflacx_df[[target_col, *repetitive_cols]].max(axis=1)

# Remove the repetitive labels that are not in the version we want.
for repetitive_cols in repetitive_label_sources.values():
    for repetitive_col in repetitive_cols:
        del reflacx_df[repetitive_col]

In [8]:
# cxr_meta_df was already read from spreadsheets/cxr_meta.csv in an earlier cell
# and has not been modified since, so it is not read a second time here.
# Only the two labeler spreadsheets under CXR-JPG are loaded.
cxr_jpg_dir = os.path.join(paths.MIMIC_EYE_PATH, "spreadsheets", "CXR-JPG")
negbio_df = pd.read_csv(os.path.join(cxr_jpg_dir, "cxr_negbio.csv"))
chexpert_df = pd.read_csv(os.path.join(cxr_jpg_dir, "cxr_chexpert.csv"))

In [9]:
# Take from the CXR metadata only the columns REFLACX does not already have,
# plus the `dicom_id` join key, and left-join them onto the REFLACX rows.
cols_to_use = cxr_meta_df.columns.difference(reflacx_df.columns)
cxr_meta_subset = cxr_meta_df[list(cols_to_use) + ["dicom_id"]]
merged_df = reflacx_df.merge(
    cxr_meta_subset, how="left", on="dicom_id", suffixes=("", "")
)

# Left-join each labeler table on the study id; every labeler column gets a
# suffix naming its source so the two label sets stay distinguishable.
for labeler_df, labeler_suffix in ((negbio_df, "_negbio"), (chexpert_df, "_chexpert")):
    merged_df = merged_df.merge(
        labeler_df.add_suffix(labeler_suffix),
        how="left",
        left_on="study_id",
        right_on="study_id" + labeler_suffix,
    )

# Rows without a labeler match (and any other gaps in those columns) become 0.
labeler_cols = [
    name for name in merged_df.columns if name.endswith(("_negbio", "_chexpert"))
]
for labeler_col in labeler_cols:
    merged_df[labeler_col] = merged_df[labeler_col].fillna(0)

In [10]:
# Keep every row whose `eye_tracking_data_discarded` flag is not True.
not_discarded = merged_df["eye_tracking_data_discarded"].ne(True)
merged_df = merged_df.loc[not_discarded]

In [11]:
# Overwrite the `split` column that came with the REFLACX metadata with a fresh
# split produced by get_split_list (train_portion=0.7).
# `.assign` returns a new frame, which avoids pandas' SettingWithCopyWarning:
# merged_df is the result of a boolean filter at this point, so assigning a
# column on it directly counts as chained assignment.
# NOTE(review): get_split_list only receives the row count, so it cannot group
# rows by subject_id / dicom_id; the preview of the metadata shows several rows
# sharing one dicom_id — confirm the same image cannot end up in two splits.
merged_df = merged_df.assign(
    split=get_split_list(len(merged_df), train_portion=0.7)
)

In [12]:
# For every label column, count the rows whose value is greater than 0, then
# display the counts ordered from the rarest label to the most frequent one.
# (OrderedDict is imported with the other imports at the top of the notebook.)
count_map = dict((merged_df[r_constants.ALL_LABEL_COLS] > 0).sum(axis=0))
OrderedDict(sorted(count_map.items(), key=lambda item: item[1]))

OrderedDict([('Fibrosis', 5),
             ('Quality issue', 9),
             ('Wide mediastinum', 10),
             ('Fracture', 12),
             ('Airway wall thickening', 21),
             ('Hiatal hernia', 24),
             ('Acute fracture', 33),
             ('Interstitial lung disease', 34),
             ('Enlarged hilum', 56),
             ('Abnormal mediastinal contour', 84),
             ('High lung volume / emphysema', 89),
             ('Pneumothorax', 105),
             ('Lung nodule or mass', 162),
             ('Groundglass opacity', 389),
             ('Pulmonary edema', 427),
             ('Enlarged cardiac silhouette', 710),
             ('Consolidation', 814),
             ('Atelectasis', 840),
             ('Pleural abnormality', 933),
             ('Support devices', 1305)])

In [13]:
# Label names ordered by ascending positive count.
labels_by_count = sorted(count_map.items(), key=lambda label_count: label_count[1])
sort_countmap = OrderedDict(labels_by_count)
sort_countmap.keys()

odict_keys(['Fibrosis', 'Quality issue', 'Wide mediastinum', 'Fracture', 'Airway wall thickening', 'Hiatal hernia', 'Acute fracture', 'Interstitial lung disease', 'Enlarged hilum', 'Abnormal mediastinal contour', 'High lung volume / emphysema', 'Pneumothorax', 'Lung nodule or mass', 'Groundglass opacity', 'Pulmonary edema', 'Enlarged cardiac silhouette', 'Consolidation', 'Atelectasis', 'Pleural abnormality', 'Support devices'])

In [14]:
# Columns kept for the exported REFLACX spreadsheet, grouped by what they
# describe. The concatenation order below is the column order of merged_df
# after the selection.
identifier_cols = ["id", "dicom_id", "subject_id", "stay_id", "study_id"]

# Path columns are currently not selected: "image_path", "bbox_path", "fixation_path".
bookkeeping_cols = ["split", "eye_tracking_data_discarded"]

study_time_cols = ["StudyDate", "StudyDateTime", "StudyTime"]

dataset_flag_cols = ["in_eye_gaze", "in_reflacx"]

image_cols = ["image", "image_size_x", "image_size_y", "ViewPosition"]

reflacx_label_cols = [
    "Airway wall thickening",
    "Atelectasis",
    "Consolidation",
    "Enlarged cardiac silhouette",
    "Fibrosis",
    "Fracture",
    "Groundglass opacity",
    "Other",
    "Pneumothorax",
    "Pulmonary edema",
    "Quality issue",
    "Support devices",
    "Wide mediastinum",
    "Abnormal mediastinal contour",
    "Acute fracture",
    "Enlarged hilum",
    "Hiatal hernia",
    "High lung volume / emphysema",
    "Interstitial lung disease",
    "Lung nodule or mass",
    "Pleural abnormality",
]

# The negbio and chexpert column sets use the same finding names and differ
# only in the suffix appended when the tables were merged.
labeler_findings = [
    "Atelectasis",
    "Cardiomegaly",
    "Consolidation",
    "Edema",
    "Enlarged Cardiomediastinum",
    "Fracture",
    "Lung Lesion",
    "Lung Opacity",
    "No Finding",
    "Pleural Effusion",
    "Pleural Other",
    "Pneumonia",
    "Pneumothorax",
    "Support Devices",
]

reflacx_needed_columns = (
    identifier_cols
    + bookkeeping_cols
    + study_time_cols
    + dataset_flag_cols
    + image_cols
    + reflacx_label_cols
    + [f"{finding}_negbio" for finding in labeler_findings]
    + [f"{finding}_chexpert" for finding in labeler_findings]
)

merged_df = merged_df[reflacx_needed_columns]

In [15]:
# Renumber the rows and write the image-only spreadsheet.
# reset_index() keeps the previous index as an "index" column, and to_csv()
# additionally writes the new row index as an unnamed first column; both are
# left as they were so the layout of the saved file does not change.
merged_df = merged_df.reset_index()

# Create the output folder first: to_csv() does not create missing directories
# and would fail on a fresh checkout without it.
output_dir = "./spreadsheets"
os.makedirs(output_dir, exist_ok=True)
merged_df.to_csv(os.path.join(output_dir, "reflacx.csv"))

In [16]:
# Build a second, clinical variant of the dataset: keep only the rows that have
# a stay_id (the key used to join the ED triage table below).
has_stay_id = merged_df["stay_id"].notna()
merged_df = merged_df[has_stay_id]

# Load the patient and ED triage spreadsheets.
spreadsheets_dir = os.path.join(paths.MIMIC_EYE_PATH, "spreadsheets")
hosp_patients_df = pd.read_csv(os.path.join(spreadsheets_dir, "Hosp", "patients.csv"))
ed_triage_df = pd.read_csv(
    os.path.join(spreadsheets_dir, "ED", "triage.csv"), index_col=False
)

In [17]:
# Left-join the patient table on subject_id, then the triage table on stay_id.
# Column names that clash with existing ones get a "_patient" / "_triage" suffix.
with_patients_df = merged_df.merge(
    hosp_patients_df, how="left", on="subject_id", suffixes=("", "_patient")
)
merged_df = with_patients_df.merge(
    ed_triage_df, how="left", on="stay_id", suffixes=("", "_triage")
)

# Age at the time of the study: anchor_age shifted by the number of years
# between anchor_year and the study year (the first four characters of StudyDate).
study_year = merged_df["StudyDate"].map(lambda study_date: int(str(study_date)[:4]))
years_since_anchor = study_year - merged_df["anchor_year"]
merged_df["age"] = merged_df["anchor_age"] + years_since_anchor

In [18]:
# Clinical features that must all be present for a row to be kept.
# "chiefcomplaint" is intentionally not part of this requirement.
clinical_cols = [
    "temperature",
    "heartrate",
    "resprate",
    "o2sat",
    "sbp",
    "dbp",
    "acuity",
    "age",
]

# Keep only the rows with no missing value in any of the columns above.
has_all_clinical = merged_df[clinical_cols].notna().all(axis=1)
merged_df = merged_df[has_all_clinical]

In [19]:
# Columns kept for the clinical spreadsheet. The list is the image-only
# selection (reflacx_needed_columns, defined in an earlier cell) with the
# patient columns inserted right after "eye_tracking_data_discarded" and the
# triage/clinical columns appended at the end. Deriving it from the earlier
# list keeps the two selections in sync instead of maintaining two long copies;
# the resulting order is the same as the previously hand-written list.
patient_cols = ["gender", "anchor_age", "anchor_year", "dod"]

clinical_feature_cols = [
    "temperature",
    "heartrate",
    "resprate",
    "o2sat",
    "sbp",
    "dbp",
    "pain",
    "acuity",
    "chiefcomplaint",
    "age",
]

# Path columns are currently not selected: "image_path", "bbox_path", "fixation_path".
patient_insert_at = reflacx_needed_columns.index("eye_tracking_data_discarded") + 1

reflacx_clinical_needed_columns = (
    reflacx_needed_columns[:patient_insert_at]
    + patient_cols
    + reflacx_needed_columns[patient_insert_at:]
    + clinical_feature_cols
)

merged_df = merged_df[reflacx_clinical_needed_columns]

In [20]:
# Remove rows with an unrealistic temperature reading: only values strictly
# above the threshold are kept.
# NOTE(review): the threshold presumably assumes degrees Fahrenheit — confirm
# the unit of the triage `temperature` column.
min_realistic_temperature = 60
is_realistic = merged_df["temperature"] > min_realistic_temperature
merged_df = merged_df[is_realistic]

In [21]:
# The clinical subset has fewer rows than the image-only set, so the split is
# drawn again for the remaining rows (train_portion=0.7).
# `.assign` returns a new frame, which avoids pandas' SettingWithCopyWarning:
# merged_df is the result of a boolean filter at this point.
# NOTE(review): get_split_list only receives the row count, so it cannot group
# rows by subject_id / dicom_id — confirm this cannot leak a patient across splits.
merged_df = merged_df.assign(
    split=get_split_list(len(merged_df), train_portion=0.7)
)

# Create the output folder first: to_csv() does not create missing directories.
# The row index is still written as an unnamed first column, as before.
output_dir = "./spreadsheets"
os.makedirs(output_dir, exist_ok=True)
merged_df.to_csv(os.path.join(output_dir, "reflacx_clinical.csv"))

In [22]:
# Number of rows left in the clinical dataset.
merged_df.shape[0]

799