# Creating the spreadsheet from REFLACX and MIMIC dataset.
The created spreadsheet can be used with the dataset modules in `./data`.

In [1]:
import os, json
import pandas as pd

from data.paths import TabularDataPaths
from utils import print as print_f
from utils.preprocessing import get_split_list, checkMissingValue
from utils.init import reproducibility

from data.constants import REFLACX_ALL_LABEL_COLS, REFLACX_REPETITIVE_ALL_LABEL_COLS
from data.paths import MIMIC_EYE_PATH

### Determine the seed and insure the reproducibility.

In [2]:
seed = 0
reproducibility(0)

Read in the REFLACX dataframe

In [3]:
# mimic_eye_path = "/Volumes/T7/mimic-eye"
mimic_eye_path = MIMIC_EYE_PATH

In [4]:
reflacx_df = pd.read_csv(
    os.path.join(mimic_eye_path, "spreadsheets/REFLACX/metadata.csv")
)
reflacx_df.head(5)

Unnamed: 0.1,Unnamed: 0,id,split,eye_tracking_data_discarded,image,dicom_id,subject_id,image_size_x,image_size_y,Airway wall thickening,...,Support devices,Wide mediastinum,Abnormal mediastinal contour,Acute fracture,Enlarged hilum,Hiatal hernia,High lung volume / emphysema,Interstitial lung disease,Lung nodule or mass,Pleural abnormality
0,0,P102R108387,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,
1,1,P102R379837,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,
2,2,P102R558314,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,4.0,...,True,0.0,,,,,,,,
3,3,P102R765317,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,2.0,,,,,,,,
4,4,P102R915878,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,


In [5]:
reflacx_df.columns

Index(['Unnamed: 0', 'id', 'split', 'eye_tracking_data_discarded', 'image',
       'dicom_id', 'subject_id', 'image_size_x', 'image_size_y',
       'Airway wall thickening', 'Atelectasis', 'Consolidation', 'Emphysema',
       'Enlarged cardiac silhouette', 'Fibrosis', 'Fracture',
       'Groundglass opacity', 'Mass', 'Nodule', 'Other', 'Pleural effusion',
       'Pleural thickening', 'Pneumothorax', 'Pulmonary edema',
       'Quality issue', 'Support devices', 'Wide mediastinum',
       'Abnormal mediastinal contour', 'Acute fracture', 'Enlarged hilum',
       'Hiatal hernia', 'High lung volume / emphysema',
       'Interstitial lung disease', 'Lung nodule or mass',
       'Pleural abnormality'],
      dtype='object')

In [5]:
reflacx_df

Unnamed: 0.1,Unnamed: 0,id,split,eye_tracking_data_discarded,image,dicom_id,subject_id,image_size_x,image_size_y,Airway wall thickening,...,Support devices,Wide mediastinum,Abnormal mediastinal contour,Acute fracture,Enlarged hilum,Hiatal hernia,High lung volume / emphysema,Interstitial lung disease,Lung nodule or mass,Pleural abnormality
0,0,P102R108387,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,
1,1,P102R379837,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,
2,2,P102R558314,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,4.0,...,True,0.0,,,,,,,,
3,3,P102R765317,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,2.0,,,,,,,,
4,4,P102R915878,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3047,2502,P300R939601,test,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,d9661ff6-877ac981-a20a8810-92309d46-173008ad,19991135,2544,3056,,...,False,,5.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3048,2503,P300R331925,test,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,d8f5555c-d8bcf97c-3a9d5596-17b9f854-6e15d081,19991135,1736,2022,,...,True,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3049,2504,P300R336111,test,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,d04bbcbd-5143439e-50ebe9a8-71380f67-44f8d127,19991135,2544,3056,,...,True,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3050,2505,P300R179177,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,7cfe7acc-65b0b548-feb77772-04c46d33-76a03230,19997367,2402,2712,,...,True,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


In [12]:
cxr_meta_df.columns

Index(['Unnamed: 0', 'dicom_id', 'subject_id', 'study_id',
       'PerformedProcedureStepDescription', 'ViewPosition', 'Rows', 'Columns',
       'StudyDate', 'StudyTime', 'ProcedureCodeSequence_CodeMeaning',
       'ViewCodeSequence_CodeMeaning',
       'PatientOrientationCodeSequence_CodeMeaning', 'in_eye_gaze',
       'in_reflacx', 'StudyDateTime', 'stay_id'],
      dtype='object')

In [7]:
cxr_meta_df = pd.read_csv(  os.path.join(mimic_eye_path, "spreadsheets","cxr_meta.csv"))

In [17]:
len(cxr_meta_df[cxr_meta_df.in_eye_gaze & cxr_meta_df.in_reflacx].subject_id.unique())

10

In [25]:
cxr_meta_df

Unnamed: 0.1,Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,in_eye_gaze,in_reflacx,StudyDateTime,stay_id
0,177,fa771fa1-d9571d07-bff8f655-327734a7-6e10b29d,10002428,59258773,CHEST (PORTABLE AP),AP,3056,2544,21560419,92717.109,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,2.156042e+12,
1,181,4873aa08-977bfd31-fb492e64-6ef432d1-3f12cbe3,10002430,53254222,CHEST (PA AND LAT),PA,3056,2544,21250928,160736.171,CHEST (PA AND LAT),postero-anterior,Erect,True,False,2.125093e+13,31293660.0
2,266,dcdc4bd9-4301b111-2a65a814-ee8e7bc5-7f0b9a5a,10003400,56466802,CHEST (PORTABLE AP),AP,3056,2544,21361209,133738.015,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,2.136121e+13,33678912.0
3,497,5bdabba9-388f6646-ac06b5f5-f68b2fd2-3630de21,10011607,55814288,CHEST (PA AND LAT),AP,3056,2544,21820111,230933.250,CHEST (PA AND LAT),antero-posterior,Erect,False,True,2.182011e+13,37054412.0
4,539,4a629500-9c3281ca-90bab490-9b6ac9c1-e5e6a580,10012261,53728467,CHEST (PORTABLE AP),AP,2544,3056,21750906,191151.453,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,2.175091e+13,38668412.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3684,376634,d9661ff6-877ac981-a20a8810-92309d46-173008ad,19991135,50498205,CHEST (PORTABLE AP),AP,3056,2544,21260910,53556.406,CHEST (PORTABLE AP),antero-posterior,,False,True,2.126091e+12,
3685,376636,d8f5555c-d8bcf97c-3a9d5596-17b9f854-6e15d081,19991135,50634986,,PA,2022,1736,21261102,151208.000,CHEST (PA AND LAT),postero-anterior,Erect,False,True,2.126110e+13,
3686,376644,d04bbcbd-5143439e-50ebe9a8-71380f67-44f8d127,19991135,54910031,CHEST (PORTABLE AP),AP,3056,2544,21260908,202437.031,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,2.126091e+13,
3687,377010,7cfe7acc-65b0b548-feb77772-04c46d33-76a03230,19997367,58396642,CHEST (PORTABLE AP),AP,2712,2402,21260425,134114.328,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,2.126043e+13,


In [28]:
cxr_meta_df.stay_id[0].isna()

AttributeError: 'numpy.float64' object has no attribute 'isna'

In [30]:
len(cxr_meta_df)

3689

In [29]:
cxr_meta_df[~cxr_meta_df.stay_id.isna()]

Unnamed: 0.1,Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,in_eye_gaze,in_reflacx,StudyDateTime,stay_id
1,181,4873aa08-977bfd31-fb492e64-6ef432d1-3f12cbe3,10002430,53254222,CHEST (PA AND LAT),PA,3056,2544,21250928,160736.171,CHEST (PA AND LAT),postero-anterior,Erect,True,False,2.125093e+13,31293660.0
2,266,dcdc4bd9-4301b111-2a65a814-ee8e7bc5-7f0b9a5a,10003400,56466802,CHEST (PORTABLE AP),AP,3056,2544,21361209,133738.015,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,2.136121e+13,33678912.0
3,497,5bdabba9-388f6646-ac06b5f5-f68b2fd2-3630de21,10011607,55814288,CHEST (PA AND LAT),AP,3056,2544,21820111,230933.250,CHEST (PA AND LAT),antero-posterior,Erect,False,True,2.182011e+13,37054412.0
4,539,4a629500-9c3281ca-90bab490-9b6ac9c1-e5e6a580,10012261,53728467,CHEST (PORTABLE AP),AP,2544,3056,21750906,191151.453,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,2.175091e+13,38668412.0
6,567,8106217e-c41ab813-c6002d3f-ed4ee98b-7b165bae,10012498,55812956,CHEST (PA AND LAT),PA,3056,2544,21470331,211642.296,CHEST (PA AND LAT),postero-anterior,Erect,True,False,2.147033e+13,37362927.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3672,375781,b7f7645f-afa4f5e7-4c67d87d-2cee15d7-fed5d9dc,19963862,52809931,CHEST (PA AND LAT),PA,3056,2544,21260220,230910.984,CHEST (PA AND LAT),postero-anterior,Erect,True,False,2.126022e+13,33008859.0
3674,375885,bfab2672-a24df0b6-c212efa4-c56b045e-3d632a6f,19966568,53145635,CHEST (PA AND LAT),PA,3056,2544,21410323,144309.687,CHEST (PA AND LAT),postero-anterior,Erect,True,False,2.141032e+13,35050304.0
3677,376105,9bb20972-fcc29834-4a48133d-ca0084a9-678f5f63,19973587,55588916,CHEST (PA AND LAT),PA,3056,2544,21430716,165110.906,CHEST (PA AND LAT),postero-anterior,Erect,True,False,2.143072e+13,35525824.0
3678,376109,76b55d6d-492245e2-c56a31ae-9f0d1cf4-78005e3b,19974002,52374783,CHEST (PA AND LAT),PA,3056,2544,21470415,214926.953,CHEST (PA AND LAT),postero-anterior,Erect,True,False,2.147042e+13,36616304.0


In [31]:
cxr_meta_df

Unnamed: 0.1,Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,in_eye_gaze,in_reflacx,StudyDateTime,stay_id
0,177,fa771fa1-d9571d07-bff8f655-327734a7-6e10b29d,10002428,59258773,CHEST (PORTABLE AP),AP,3056,2544,21560419,92717.109,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,2.156042e+12,
1,181,4873aa08-977bfd31-fb492e64-6ef432d1-3f12cbe3,10002430,53254222,CHEST (PA AND LAT),PA,3056,2544,21250928,160736.171,CHEST (PA AND LAT),postero-anterior,Erect,True,False,2.125093e+13,31293660.0
2,266,dcdc4bd9-4301b111-2a65a814-ee8e7bc5-7f0b9a5a,10003400,56466802,CHEST (PORTABLE AP),AP,3056,2544,21361209,133738.015,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,2.136121e+13,33678912.0
3,497,5bdabba9-388f6646-ac06b5f5-f68b2fd2-3630de21,10011607,55814288,CHEST (PA AND LAT),AP,3056,2544,21820111,230933.250,CHEST (PA AND LAT),antero-posterior,Erect,False,True,2.182011e+13,37054412.0
4,539,4a629500-9c3281ca-90bab490-9b6ac9c1-e5e6a580,10012261,53728467,CHEST (PORTABLE AP),AP,2544,3056,21750906,191151.453,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,2.175091e+13,38668412.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3684,376634,d9661ff6-877ac981-a20a8810-92309d46-173008ad,19991135,50498205,CHEST (PORTABLE AP),AP,3056,2544,21260910,53556.406,CHEST (PORTABLE AP),antero-posterior,,False,True,2.126091e+12,
3685,376636,d8f5555c-d8bcf97c-3a9d5596-17b9f854-6e15d081,19991135,50634986,,PA,2022,1736,21261102,151208.000,CHEST (PA AND LAT),postero-anterior,Erect,False,True,2.126110e+13,
3686,376644,d04bbcbd-5143439e-50ebe9a8-71380f67-44f8d127,19991135,54910031,CHEST (PORTABLE AP),AP,3056,2544,21260908,202437.031,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,2.126091e+13,
3687,377010,7cfe7acc-65b0b548-feb77772-04c46d33-76a03230,19997367,58396642,CHEST (PORTABLE AP),AP,2712,2402,21260425,134114.328,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,2.126043e+13,


In [7]:
print(f"The REFLACX dataset has {len(reflacx_df)} instances.")
### some features and datasets details can be explained here. ###

The REFLACX dataset has 3052 instances.


## Check what are those features containing missing in REFLACX.

In [8]:
print_f.print_title("Columns have NaN value in REFLACX metadata") 
checkMissingValue(reflacx_df)

[('Airway wall thickening', 2757), ('Emphysema', 2757), ('Fibrosis', 2757), ('Fracture', 2757), ('Mass', 2757), ('Nodule', 2757), ('Other', 2839), ('Pleural effusion', 2757), ('Pleural thickening', 2757), ('Quality issue', 2757), ('Wide mediastinum', 2757), ('Abnormal mediastinal contour', 295), ('Acute fracture', 295), ('Enlarged hilum', 295), ('Hiatal hernia', 295), ('High lung volume / emphysema', 295), ('Interstitial lung disease', 295), ('Lung nodule or mass', 295), ('Pleural abnormality', 295)]


### Filling the missing features

In [9]:
# Filling nan for numerical cols.
reflacx_df[REFLACX_REPETITIVE_ALL_LABEL_COLS] = reflacx_df[REFLACX_REPETITIVE_ALL_LABEL_COLS].fillna(0)

# Folling nan for boolean cols.
reflacx_df[["Quality issue", "Support devices"]] = reflacx_df[
    ["Quality issue", "Support devices"]
].fillna(False)

# check the missing values again.
print_f.print_title("Columns have NaN value in REFLACX metadata")
checkMissingValue(reflacx_df)

[]


## Dealing with the repetitive labels in REFLACX

In [10]:
reflacx_df["Lung nodule or mass"] = reflacx_df[
    ["Lung nodule or mass", "Mass", "Nodule"]
].max(axis=1)

reflacx_df["High lung volume / emphysema"] = reflacx_df[
    ["High lung volume / emphysema", "Emphysema"]
].max(axis=1)

reflacx_df["Pleural abnormality"] = reflacx_df[
    ["Pleural abnormality", "Pleural thickening", "Pleural effusion"]
].max(axis=1)

# Remove the repetitive labels that not in the version we want.
del reflacx_df["Mass"]
del reflacx_df["Nodule"]

del reflacx_df["Emphysema"]

del reflacx_df["Pleural thickening"]
del reflacx_df["Pleural effusion"]

# Load other datasets that have the modalities we desire.

In [11]:
mimic_eye_path

'D:\\mimic-eye'

In [12]:
# we don't actually need to merge with other datasets, we just need to import the eye-tracking data, that's all.

# cxr_meta_df = pd.read_csv("E:\\physionet.org\\files\\mimic-cxr-jpg\\2.0.0\\mimic-cxr-2.0.0-metadata.csv.gz")
# hosp_patients_df = pd.read_csv("E:\\physionet.org\\files\\mimiciv\\2.0\\hosp\\patients.csv.gz")
# ed_triage_df = pd.read_csv("E:\\physionet.org\\files\\mimic-iv-ed\\2.0\\ed\\triage.csv.gz")

# cxr_meta_df.head(5)
cxr_meta_df = pd.read_csv(  os.path.join(mimic_eye_path, "spreadsheets","cxr_meta.csv"))
negbio_df = pd.read_csv(  os.path.join(mimic_eye_path, "spreadsheets","CXR-JPG","cxr_negbio.csv"))
chexpert_df = pd.read_csv(  os.path.join(mimic_eye_path, "spreadsheets","CXR-JPG","cxr_chexpert.csv"))

In [13]:
cols_to_use = cxr_meta_df.columns.difference(reflacx_df.columns)
merged_df = reflacx_df.merge(cxr_meta_df[list(cols_to_use)+ ['dicom_id']], "left", on="dicom_id", suffixes=("",""))
merged_df = merged_df.merge(negbio_df.add_suffix("_negbio"), "left", left_on="study_id", right_on="study_id_negbio" 
)
merged_df = merged_df.merge(chexpert_df.add_suffix("_chexpert"), "left", left_on="study_id", right_on="study_id_chexpert",
)



In [14]:
for col in [ col for col in merged_df.columns if (col.endswith("_negbio") or col.endswith("_chexpert"))]:
    merged_df[col] = merged_df[col].fillna(0)

In [15]:
merged_df["No Finding_negbio"]

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
3047    0.0
3048    0.0
3049    0.0
3050    0.0
3051    0.0
Name: No Finding_negbio, Length: 3052, dtype: float64

In [16]:
class CXRImage:
    def __init__(self, patient_id, study_id, dicom_id, reflacx_id=None) -> None:
        self.patient_id = patient_id
        self.study_id = study_id
        self.dicom_id = dicom_id
        self.reflacx_id = reflacx_id

    def get_physio_image_path(self, physio_path, jpg=True, version="2.0.0"):
        file_extension = "jpg"if jpg else "dcm"

        return os.path.join(
            physio_path,
            "files",
            ("mimic-cxr-jpg" if jpg else "mimic-cxr"),
            version,
            "files",
            f"p{self.patient_id[:2]}",
            f"p{self.patient_id}",
            f"s{self.study_id}",
            f"{self.dicom_id}.{file_extension}"
        )

    def get_mimic_eye_image_path(self, XAMI_MIMIC_PATH_str):
        return os.path.join(
            XAMI_MIMIC_PATH_str,
            f"patient_{self.patient_id}",
            "CXR-JPG",
            f"s{self.study_id}",
            f"{self.dicom_id}.jpg"
        )

def create_cxr_from_path(path, reflacx_id=None):
    fs = path.split("/")
    dicom_id = fs[-1].split(".")[0]
    study_id = fs[-2][1:]
    patient_id = fs[-3][1:]

    return CXRImage(
        patient_id=patient_id,
        study_id=study_id,
        dicom_id=dicom_id,
        reflacx_id=reflacx_id,
    )


In [17]:
# in order to have a faster reading speed, we use XAMI-MIMIC
XAMI_MIMIC_PATH_str = "{XAMI_MIMIC_PATH}"

# Calculate the age according to the time when the CXR taken.
merged_df["image_path"] = merged_df['image'].apply(
    lambda x: create_cxr_from_path(x).get_mimic_eye_image_path(XAMI_MIMIC_PATH_str))
# merged_df["bbox_path"] = (
#     f"{XAMI_MIMIC_PATH_str}\patient_"
#     + merged_df["subject_id"].astype(str)
#     + "\REFLACX\\"
#     + merged_df["id"].astype(str)
#     + "\\anomaly_location_ellipses.csv"
# )

merged_df["bbox_path"] = merged_df.apply(lambda x: os.path.join(
    XAMI_MIMIC_PATH_str,
    f"patient_{str(x['subject_id'])}",
    "REFLACX", 
    "main_data",
    str(x["id"]),
    "anomaly_location_ellipses.csv"),
    axis=1)

merged_df["fixation_path"] = merged_df.apply(lambda x: os.path.join(
    XAMI_MIMIC_PATH_str,
    f"patient_{str(x['subject_id'])}",
    "REFLACX",
    "main_data",
    str(x["id"]),
    "fixations.csv"),
    axis=1)


# merged_df["fixation_path"] = (
#     f"{XAMI_MIMIC_PATH_str}\patient_"
#     + merged_df["subject_id"].astype(str)
#     + "\REFLACX\\"
#     + merged_df["id"].astype(str)
#     + "\\fixations.csv"
# )


In [18]:
## only keep the one without eye-tracking data

merged_df = merged_df[merged_df["eye_tracking_data_discarded"] != True]

## we first test what kind of modalities exist, then check how can we separate them. The problem can be from the labels (abnormalities), so let's first see which disease has most of the agreement from different radiologists.

In [19]:
print_f.print_title("Columns have missing value in merged_df")
for col in ["PatientOrientationCodeSequence_CodeMeaning", "PerformedProcedureStepDescription", "ViewCodeSequence_CodeMeaning"]:
    del merged_df[col]
checkMissingValue(merged_df)


[('stay_id', 2129)]


In [20]:
merged_df['split'] = get_split_list(len(merged_df), train_portion=0.7)

In [21]:
from collections import OrderedDict

count_map = dict((merged_df[REFLACX_ALL_LABEL_COLS]>0).sum(axis=0))
OrderedDict({k: v for k, v in sorted(count_map.items(), key=lambda item: item[1])})

OrderedDict([('Fibrosis', 5),
             ('Quality issue', 9),
             ('Wide mediastinum', 10),
             ('Fracture', 12),
             ('Airway wall thickening', 21),
             ('Hiatal hernia', 24),
             ('Acute fracture', 33),
             ('Interstitial lung disease', 34),
             ('Enlarged hilum', 56),
             ('Abnormal mediastinal contour', 84),
             ('High lung volume / emphysema', 89),
             ('Pneumothorax', 105),
             ('Lung nodule or mass', 162),
             ('Groundglass opacity', 389),
             ('Pulmonary edema', 427),
             ('Enlarged cardiac silhouette', 710),
             ('Consolidation', 814),
             ('Atelectasis', 840),
             ('Pleural abnormality', 933),
             ('Support devices', 1305)])

In [22]:
sort_countmap = OrderedDict({k: v for k, v in sorted(count_map.items(), key=lambda item: item[1])})

In [23]:
sort_countmap.keys()

odict_keys(['Fibrosis', 'Quality issue', 'Wide mediastinum', 'Fracture', 'Airway wall thickening', 'Hiatal hernia', 'Acute fracture', 'Interstitial lung disease', 'Enlarged hilum', 'Abnormal mediastinal contour', 'High lung volume / emphysema', 'Pneumothorax', 'Lung nodule or mass', 'Groundglass opacity', 'Pulmonary edema', 'Enlarged cardiac silhouette', 'Consolidation', 'Atelectasis', 'Pleural abnormality', 'Support devices'])

In [24]:
merged_df.columns

Index(['Unnamed: 0', 'id', 'split', 'eye_tracking_data_discarded', 'image',
       'dicom_id', 'subject_id', 'image_size_x', 'image_size_y',
       'Airway wall thickening', 'Atelectasis', 'Consolidation',
       'Enlarged cardiac silhouette', 'Fibrosis', 'Fracture',
       'Groundglass opacity', 'Other', 'Pneumothorax', 'Pulmonary edema',
       'Quality issue', 'Support devices', 'Wide mediastinum',
       'Abnormal mediastinal contour', 'Acute fracture', 'Enlarged hilum',
       'Hiatal hernia', 'High lung volume / emphysema',
       'Interstitial lung disease', 'Lung nodule or mass',
       'Pleural abnormality', 'Columns', 'ProcedureCodeSequence_CodeMeaning',
       'Rows', 'StudyDate', 'StudyDateTime', 'StudyTime', 'ViewPosition',
       'in_eye_gaze', 'in_reflacx', 'stay_id', 'study_id', 'Unnamed: 0_negbio',
       'subject_id_negbio', 'study_id_negbio', 'Atelectasis_negbio',
       'Cardiomegaly_negbio', 'Consolidation_negbio', 'Edema_negbio',
       'Enlarged Cardiomediastinum

In [25]:
needed_columns = [
    "id",
    "dicom_id",
    "subject_id",
    "stay_id",
    "study_id",

    "split",
    "eye_tracking_data_discarded",
    "image_path",
    "bbox_path",
    "fixation_path",
    # "gender",
    # "anchor_age",
    # "anchor_year",
    # "dod",

    "StudyDate",
    "StudyDateTime",
    "StudyTime",

    "in_eye_gaze",
    "in_reflacx",

    "image",
    "image_size_x",
    "image_size_y",
    "ViewPosition",


    "Airway wall thickening",
    "Atelectasis",
    "Consolidation",
    "Enlarged cardiac silhouette",
    "Fibrosis",
    "Fracture",
    "Groundglass opacity",
    "Other",
    "Pneumothorax",
    "Pulmonary edema",
    "Quality issue",
    "Support devices",
    "Wide mediastinum",
    "Abnormal mediastinal contour",
    "Acute fracture",
    "Enlarged hilum",
    "Hiatal hernia",
    "High lung volume / emphysema",
    "Interstitial lung disease",
    "Lung nodule or mass",
    "Pleural abnormality",

    # negbio
    "Atelectasis_negbio",
    "Cardiomegaly_negbio",
    "Consolidation_negbio",
    "Edema_negbio",
    "Enlarged Cardiomediastinum_negbio",
    "Fracture_negbio",
    "Lung Lesion_negbio",
    "Lung Opacity_negbio",
    "No Finding_negbio",
    "Pleural Effusion_negbio",
    "Pleural Other_negbio",
    "Pneumonia_negbio",
    "Pneumothorax_negbio",
    "Support Devices_negbio",

    # chexpert
    "Atelectasis_chexpert",
    "Cardiomegaly_chexpert",
    "Consolidation_chexpert",
    "Edema_chexpert",
    "Enlarged Cardiomediastinum_chexpert",
    "Fracture_chexpert",
    "Lung Lesion_chexpert",
    "Lung Opacity_chexpert",
    "No Finding_chexpert",
    "Pleural Effusion_chexpert",
    "Pleural Other_chexpert",
    "Pneumonia_chexpert",
    "Pneumothorax_chexpert",
    "Support Devices_chexpert",
    
    # # clinical
    # "temperature",
    # "heartrate",
    # "resprate",
    # "o2sat",
    # "sbp",
    # "dbp",
    # "pain",
    # "acuity",
    # "chiefcomplaint",
    # "age",
]


merged_df = merged_df[needed_columns]

In [26]:
merged_df = merged_df.reset_index()
merged_df.to_csv(os.path.join("./spreadsheets", "reflacx_eye.csv"))

In [27]:
merged_df = merged_df[~merged_df['stay_id'].isna()]

In [28]:
merged_df.columns

Index(['index', 'id', 'dicom_id', 'subject_id', 'stay_id', 'study_id', 'split',
       'eye_tracking_data_discarded', 'image_path', 'bbox_path',
       'fixation_path', 'StudyDate', 'StudyDateTime', 'StudyTime',
       'in_eye_gaze', 'in_reflacx', 'image', 'image_size_x', 'image_size_y',
       'ViewPosition', 'Airway wall thickening', 'Atelectasis',
       'Consolidation', 'Enlarged cardiac silhouette', 'Fibrosis', 'Fracture',
       'Groundglass opacity', 'Other', 'Pneumothorax', 'Pulmonary edema',
       'Quality issue', 'Support devices', 'Wide mediastinum',
       'Abnormal mediastinal contour', 'Acute fracture', 'Enlarged hilum',
       'Hiatal hernia', 'High lung volume / emphysema',
       'Interstitial lung disease', 'Lung nodule or mass',
       'Pleural abnormality', 'Atelectasis_negbio', 'Cardiomegaly_negbio',
       'Consolidation_negbio', 'Edema_negbio',
       'Enlarged Cardiomediastinum_negbio', 'Fracture_negbio',
       'Lung Lesion_negbio', 'Lung Opacity_negbio', 'No 

In [29]:
# DO another for clinical data
hosp_patients_df = pd.read_csv(os.path.join(mimic_eye_path, "spreadsheets","Hosp","patients.csv"),)
ed_triage_df = pd.read_csv(os.path.join(mimic_eye_path, "spreadsheets","ED","triage.csv"), index_col=False)

In [30]:
merged_df = (
    merged_df
    .merge(hosp_patients_df, "left", on="subject_id", suffixes=("", "_patient"))
    .merge(ed_triage_df, "left", on="stay_id", suffixes=("", "_triage"))
)

merged_df["age"] = merged_df["anchor_age"] + (
    merged_df["StudyDate"].apply(lambda x: int(str(x)[:4])) - merged_df["anchor_year"]
)

In [31]:
print_f.print_title("Columns have missing value in merged_df")
checkMissingValue(merged_df)


[('dod', 595), ('temperature', 84), ('heartrate', 66), ('resprate', 74), ('o2sat', 70), ('sbp', 67), ('dbp', 70), ('pain', 66), ('acuity', 26)]


In [32]:
needed_columns = [
    "id",
    "dicom_id",
    "subject_id",
    "stay_id",
    "study_id",

    "split",
    "eye_tracking_data_discarded",
    "image_path",
    "bbox_path",
    "fixation_path",
    "gender",
    "anchor_age",
    "anchor_year",
    "dod",

    "StudyDate",
    "StudyDateTime",
    "StudyTime",

    "in_eye_gaze",
    "in_reflacx",

    "image",
    "image_size_x",
    "image_size_y",
    "ViewPosition",


    "Airway wall thickening",
    "Atelectasis",
    "Consolidation",
    "Enlarged cardiac silhouette",
    "Fibrosis",
    "Fracture",
    "Groundglass opacity",
    "Other",
    "Pneumothorax",
    "Pulmonary edema",
    "Quality issue",
    "Support devices",
    "Wide mediastinum",
    "Abnormal mediastinal contour",
    "Acute fracture",
    "Enlarged hilum",
    "Hiatal hernia",
    "High lung volume / emphysema",
    "Interstitial lung disease",
    "Lung nodule or mass",
    "Pleural abnormality",

    # negbio
    "Atelectasis_negbio",
    "Cardiomegaly_negbio",
    "Consolidation_negbio",
    "Edema_negbio",
    "Enlarged Cardiomediastinum_negbio",
    "Fracture_negbio",
    "Lung Lesion_negbio",
    "Lung Opacity_negbio",
    "No Finding_negbio",
    "Pleural Effusion_negbio",
    "Pleural Other_negbio",
    "Pneumonia_negbio",
    "Pneumothorax_negbio",
    "Support Devices_negbio",

    # chexpert
    "Atelectasis_chexpert",
    "Cardiomegaly_chexpert",
    "Consolidation_chexpert",
    "Edema_chexpert",
    "Enlarged Cardiomediastinum_chexpert",
    "Fracture_chexpert",
    "Lung Lesion_chexpert",
    "Lung Opacity_chexpert",
    "No Finding_chexpert",
    "Pleural Effusion_chexpert",
    "Pleural Other_chexpert",
    "Pneumonia_chexpert",
    "Pneumothorax_chexpert",
    "Support Devices_chexpert",
    
    # clinical
    "temperature",
    "heartrate",
    "resprate",
    "o2sat",
    "sbp",
    "dbp",
    "pain",
    "acuity",
    "chiefcomplaint",
    "age",
]


merged_df = merged_df[needed_columns]

In [33]:
clinical_cols = [
    "temperature",
    "heartrate",
    "resprate",
    "o2sat",
    "sbp",
    "dbp",
    "acuity",
    # "chiefcomplaint",
    "age",
]

In [34]:
clinical_cols

['temperature',
 'heartrate',
 'resprate',
 'o2sat',
 'sbp',
 'dbp',
 'acuity',
 'age']

In [35]:
for col in clinical_cols:
    if merged_df[col].isna().any():
        filling_value = round(merged_df[~merged_df[col].isna()][col].mean(), 1)
        print(f"filling missing value for {col} by value {filling_value:.2f}")
        merged_df[col] = merged_df[col].fillna(filling_value)

filling missing value for temperature by value 98.10
filling missing value for heartrate by value 85.60
filling missing value for resprate by value 18.20
filling missing value for o2sat by value 97.80
filling missing value for sbp by value 132.90
filling missing value for dbp by value 73.50
filling missing value for acuity by value 2.30


In [36]:
checkMissingValue(merged_df)

[('dod', 595), ('pain', 66)]


In [37]:
merged_df['split'] = get_split_list(len(merged_df), train_portion=0.7)

In [38]:
merged_df.to_csv(os.path.join("./spreadsheets", "reflacx_clinical_eye.csv"))