In [1]:
import os, json
import pandas as pd

from data.paths import TabularDataPaths
from utils import print as print_f
from utils.preprocessing import get_split_list, checkMissingValue
from utils.init import reproducibility

from data.constants import REFLACX_ALL_LABEL_COLS, REFLACX_REPETITIVE_ALL_LABEL_COLS
from data.paths import MIMIC_EYE_PATH

In [2]:
# Seed handed to the project's reproducibility helper. It is defined once and passed
# by name so the variable and the value actually used cannot drift apart.
seed = 0
reproducibility(seed)

In [3]:
# Load the CXR metadata spreadsheet (it carries a stay_id column, see the preview below).
cxr_meta_csv = './spreadsheets/cxr_meta_stayId.csv'
cxr_meta_df = pd.read_csv(cxr_meta_csv)

In [4]:
# Preview the first five rows to sanity-check the loaded columns.
cxr_meta_df.head(5)

Unnamed: 0.1,Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,StudyDateTime,stay_id
0,0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect,21800510000000.0,33258284
1,2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,CHEST (PA AND LAT),postero-anterior,Erect,21800630000000.0,38112554
2,6,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,10000032,56699142,CHEST (PORTABLE AP),AP,3056,2544,21800805,234424.765,CHEST (PORTABLE AP),antero-posterior,,21800810000000.0,35968195
3,17,9b314ad7-fbcb0422-6db62dfc-732858d0-a5527d8b,10000935,51178377,CHEST (PA AND LAT),AP,3056,2544,21870823,191426.062,CHEST (PA AND LAT),antero-posterior,Erect,21870820000000.0,32845079
4,20,8e3f2822-0c1d4b71-2a265bbf-5b96e531-ccf5fa30,10000935,56164612,CHEST (PA AND LAT),AP,2544,3056,21870711,111436.562,CHEST (PA AND LAT),antero-posterior,Erect,21870710000000.0,35197384


In [5]:
# Root of the local PhysioNet mirror. The default is still the "D:\\" drive, but it can
# be overridden with the PHYSIONET_DISK environment variable so the notebook is not tied
# to one machine's drive layout.
disk_location = os.environ.get("PHYSIONET_DISK", "D:\\")
PHYSIONET_PATH = os.path.join(disk_location, "physionet.org", "files")
CXR_JPG_FOLDER_PATH = os.path.join(PHYSIONET_PATH, "mimic-cxr-jpg", "2.0.0")

# Read the NegBio and CheXpert label tables directly from the PhysioNet download.
negbio_df = pd.read_csv(os.path.join(CXR_JPG_FOLDER_PATH, "mimic-cxr-2.0.0-negbio.csv.gz"))
chexpert_df = pd.read_csv(os.path.join(CXR_JPG_FOLDER_PATH, "mimic-cxr-2.0.0-chexpert.csv.gz"))

In [6]:
# Attach the NegBio and CheXpert label columns to every metadata row via study_id.
# Left joins keep all metadata rows; the suffixes keep the two label sets apart.
negbio_labels = negbio_df.add_suffix("_negbio")
chexpert_labels = chexpert_df.add_suffix("_chexpert")

merged_df = cxr_meta_df.merge(
    negbio_labels, how="left", left_on="study_id", right_on="study_id_negbio"
).merge(
    chexpert_labels, how="left", left_on="study_id", right_on="study_id_chexpert"
)

In [7]:
# Replace missing cells with 0 in every column that came from the NegBio / CheXpert merges.
# NOTE(review): the suffix match also covers subject_id_* and study_id_*, so rows without
# a label match get 0 in those id columns as well — confirm that is intended.
label_suffixes = ("_negbio", "_chexpert")
suffixed_cols = [name for name in merged_df.columns if name.endswith(label_suffixes)]
merged_df[suffixed_cols] = merged_df[suffixed_cols].fillna(0)

In [8]:
class CXRImage:
    """Identifiers of a single chest X-ray image plus a helper to build its file path.

    Attributes:
        patient_id: Patient identifier; used for the ``p<first two chars>`` and
            ``p<patient_id>`` directory levels.
        study_id: Study identifier; used for the ``s<study_id>`` directory level.
        dicom_id: Image identifier; used as the file name stem.
        reflacx_id: Optional extra identifier supplied by the caller (``None`` by default).
    """

    def __init__(self, patient_id, study_id, dicom_id, reflacx_id=None) -> None:
        self.patient_id = patient_id
        self.study_id = study_id
        self.dicom_id = dicom_id
        # Stored so that a value passed by the caller is not silently lost.
        self.reflacx_id = reflacx_id

    def get_physio_image_path(self, physio_path, jpg=True, version="2.0.0"):
        """Return the path of this image below ``physio_path``.

        Args:
            physio_path: Root directory (or placeholder string) the path starts with.
            jpg: When true, point at the ``mimic-cxr-jpg`` tree and a ``.jpg`` file;
                otherwise at the ``mimic-cxr`` tree and a ``.dcm`` file.
            version: Dataset version directory name.

        Returns:
            ``<physio_path>/files/<dataset>/<version>/files/pXX/p<patient>/s<study>/<dicom>.<ext>``
            joined with the platform's path separator.
        """
        file_extension = "jpg" if jpg else "dcm"
        return os.path.join(
            physio_path,
            "files",
            ("mimic-cxr-jpg" if jpg else "mimic-cxr"),
            version,
            "files",
            # The top-level folder is keyed by the first two characters of the patient id.
            f"p{str(self.patient_id)[:2]}",
            f"p{self.patient_id}",
            f"s{self.study_id}",
            f"{self.dicom_id}.{file_extension}"
        )

def create_cxr_from_instance(instance):
    """Build a CXRImage from a row-like object.

    ``instance`` must support item access for the keys 'subject_id', 'study_id'
    and 'dicom_id' (for example a DataFrame row).
    """
    patient_id, study_id, dicom_id = (
        instance[key] for key in ('subject_id', 'study_id', 'dicom_id')
    )
    return CXRImage(patient_id=patient_id, study_id=study_id, dicom_id=dicom_id)


In [9]:
# Image paths are rooted at the literal placeholder "{PHYSIONET_PATH}" rather than at a
# machine-specific directory (see the sample value printed later in the notebook).
# NOTE(review): presumably consumers substitute the real root for the placeholder — confirm.
# get_physio_image_path appends its own "files" segment after this root.
PHYSIONET_PATH_str = "{PHYSIONET_PATH}"

# Build one image path per row from its subject_id / study_id / dicom_id.
merged_df["image_path"] = merged_df.apply(
    lambda row: create_cxr_from_instance(row).get_physio_image_path(PHYSIONET_PATH_str),
    axis=1,
)
# merged_df["bbox_path"] = (
#     f"{XAMI_MIMIC_PATH_str}\patient_"
#     + merged_df["subject_id"].astype(str)
#     + "\REFLACX\\"
#     + merged_df["id"].astype(str)
#     + "\\anomaly_location_ellipses.csv"
# )

# merged_df["bbox_path"] = merged_df.apply(lambda x: os.path.join(
#     PHYSIO_PATH_str,
#     f"patient_{str(x['subject_id'])}",
#     "REFLACX", 
#     "main_data",
#     str(x["id"]),
#     "anomaly_location_ellipses.csv"),
#     axis=1)

# merged_df["fixation_path"] = merged_df.apply(lambda x: os.path.join(
#     PHYSIO_PATH_str,
#     f"patient_{str(x['subject_id'])}",
#     "REFLACX",
#     "main_data",
#     str(x["id"]),
#     "fixations.csv"),
#     axis=1)


# merged_df["fixation_path"] = (
#     f"{XAMI_MIMIC_PATH_str}\patient_"
#     + merged_df["subject_id"].astype(str)
#     + "\REFLACX\\"
#     + merged_df["id"].astype(str)
#     + "\\fixations.csv"
# )


In [10]:
# Report which columns still contain missing values (the output lists column/count pairs).
checkMissingValue(merged_df)

[('PerformedProcedureStepDescription', 277), ('ViewCodeSequence_CodeMeaning', 3), ('PatientOrientationCodeSequence_CodeMeaning', 2213)]


In [11]:
# Clinical source tables: the MIMIC-IV hosp patients table and the MIMIC-IV-ED triage table.
MIMICIV_FOLDER_PATH = os.path.join(PHYSIONET_PATH, "mimiciv", "2.0")
HOSP_FOLDER_PATH = os.path.join(MIMICIV_FOLDER_PATH, "hosp")
ED_FOLDER_PATH = os.path.join(PHYSIONET_PATH, "mimic-iv-ed", "2.0", "ed")

patients_csv = os.path.join(HOSP_FOLDER_PATH, "patients.csv.gz")
triage_csv = os.path.join(ED_FOLDER_PATH, "triage.csv.gz")

hosp_patients_df = pd.read_csv(patients_csv)
# index_col=False: do not use the first column as the index.
ed_triage_df = pd.read_csv(triage_csv, index_col=False)

In [12]:
# Add the patient columns (joined on subject_id) and the triage columns (joined on stay_id).
# Left joins keep every image row; clashing column names from the right-hand tables get
# the "_patient" / "_triage" suffix.
merged_df = merged_df.merge(
    hosp_patients_df, how="left", on="subject_id", suffixes=("", "_patient")
).merge(
    ed_triage_df, how="left", on="stay_id", suffixes=("", "_triage")
)

# Age when the image was taken: anchor_age shifted by the number of years between
# anchor_year and the study year (the first four characters of StudyDate).
study_year = merged_df["StudyDate"].apply(lambda study_date: int(str(study_date)[:4]))
merged_df["age"] = merged_df["anchor_age"] + (study_year - merged_df["anchor_year"])

In [13]:
# Re-check missing values now that the patient and triage columns have been merged in.
print_f.print_title("Columns have missing value in merged_df")
checkMissingValue(merged_df)

[('PerformedProcedureStepDescription', 277), ('ViewCodeSequence_CodeMeaning', 3), ('PatientOrientationCodeSequence_CodeMeaning', 2213), ('dod', 52891), ('temperature', 5673), ('heartrate', 3977), ('resprate', 4984), ('o2sat', 4824), ('sbp', 4153), ('dbp', 4344), ('pain', 4571), ('acuity', 2041), ('chiefcomplaint', 1)]


In [14]:
# Clinical columns that must be present: the dropna cell below removes every row
# that is missing any of them. "chiefcomplaint" is deliberately left out of the list.
clinical_cols = [
    "temperature",
    "heartrate",
    "resprate",
    "o2sat",
    "sbp",
    "dbp",
    "acuity",
    # "chiefcomplaint",
    "age",
]

In [15]:
# Row count before rows with missing clinical values are dropped.
len(merged_df)

74967

In [16]:
# Keep only the rows in which every column listed in clinical_cols is present.
merged_df = merged_df.dropna(subset=clinical_cols)

In [17]:
# Row count after the dropna, to see how many rows were removed.
len(merged_df)

67263

In [18]:
# for col in clinical_cols:
#     if merged_df[col].isna().any():
#         filling_value = round(merged_df[~merged_df[col].isna()][col].mean(), 1)
#         print(f"filling missing value for {col} by value {filling_value:.2f}")
#         merged_df[col] = merged_df[col].fillna(filling_value)

In [19]:
# Confirm that none of the clinical_cols columns appear in the missing-value report any more.
checkMissingValue(merged_df)

[('PerformedProcedureStepDescription', 248), ('ViewCodeSequence_CodeMeaning', 1), ('PatientOrientationCodeSequence_CodeMeaning', 1531), ('dod', 48328), ('pain', 1217), ('chiefcomplaint', 1)]


In [20]:
# Assign each row to a split. train_portion=0.9 requests about 90% of the rows for
# training (the value_counts output further down shows roughly 90% "train").
# NOTE(review): how the remainder is divided between val and test is decided inside
# get_split_list — confirm there.
merged_df['split'] = get_split_list(len(merged_df), train_portion=0.9) # 90% training.

In [21]:
# Align the split with the REFLACX spreadsheet: every image that REFLACX lists as
# "val" or "test" is forced into the same split here, overriding the assignment above.
reflacx_clinical_df = pd.read_csv(os.path.join('spreadsheets', 'reflacx_clinical_eye.csv'))
val_dicom_ids = list(reflacx_clinical_df[reflacx_clinical_df['split'] == "val"]['dicom_id'])
test_dicom_ids = list(reflacx_clinical_df[reflacx_clinical_df['split'] == "test"]['dicom_id'])

# One vectorised membership test per split instead of a full-frame scan per dicom_id.
# "test" is applied last, so it wins if an id is listed under both splits.
for split_name, dicom_ids in (("val", val_dicom_ids), ("test", test_dicom_ids)):
    merged_df.loc[merged_df['dicom_id'].isin(dicom_ids), 'split'] = split_name

In [22]:
# Resulting number of rows per split after the REFLACX override.
merged_df['split'].value_counts()

train    60332
test      3470
val       3461
Name: split, dtype: int64

In [23]:
# Show one generated image path; it starts with the literal "{PHYSIONET_PATH}" placeholder.
merged_df['image_path'].iloc[0]

'{PHYSIONET_PATH}\\files\\mimic-cxr-jpg\\2.0.0\\files\\p10\\p10000032\\s50414267\\02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg'

In [24]:
# Remove rows with an unrealistic temperature reading (60 or below).
# NOTE(review): the bins used later (97 / 99 / 100.4) suggest Fahrenheit — confirm the unit.
# .copy() makes the result an independent frame, so the column assignments that follow
# do not write into a slice of the previous frame (avoids pandas' SettingWithCopyWarning).
merged_df = merged_df[merged_df['temperature'] > 60].copy()

In [25]:
# create discrete values here

def sbp_discrete_range(sbp):
    """Bucket an ``sbp`` value (presumably systolic blood pressure) into a range label.

    Returns ">=140" at or above 140, "(90,140)" for values above 90 and below 140,
    and "<=90" for everything else.
    """
    if sbp >= 140:
        return ">=140"
    if sbp > 90:
        return "(90,140)"
    return "<=90"
    
def dbp_discrete_range(dbp):
    """Bucket a ``dbp`` value (presumably diastolic blood pressure) into a range label.

    Returns '>=90' at or above 90, '(60,90)' for values above 60 and below 90,
    and "<=60" for everything else.
    """
    return '>=90' if dbp >= 90 else ('(60,90)' if dbp > 60 else "<=60")
    
def o2sat_discrete_range(o2sat):
    """Bucket an ``o2sat`` value (presumably oxygen saturation) into a range label.

    Returns ">=95", "[90,95)", "(80,90)" or "<=80"; the first band whose lower bound
    the value satisfies wins, and anything else falls into "<=80".
    """
    if o2sat >= 95:
        return ">=95"
    if o2sat >= 90:
        return "[90,95)"
    if o2sat > 80:
        return "(80,90)"
    return "<=80"
    
def resprate_discrete_range(resprate):
    """Bucket a respiratory rate into ">=18", "[12,18)" or "<12"."""
    return ">=18" if resprate >= 18 else ("[12,18)" if resprate >= 12 else "<12")
    
def heartrate_discrete_range(heartrate):
    """Bucket a heart rate into ">=100", "[60,100)" or "<60"."""
    # Bands are checked from the highest lower bound down; the first match wins.
    for lower_bound, label in ((100, ">=100"), (60, "[60,100)")):
        if heartrate >= lower_bound:
            return label
    return "<60"
    
def temperature_discrete_range(temperature):
    """Bucket a temperature into ">=100.4", "[99,100.4)", "[97,99)" or "<97"."""
    # Bands are checked from the highest lower bound down; the first match wins.
    bands = (
        (100.4, ">=100.4"),
        (99, "[99,100.4)"),
        (97, "[97,99)"),
    )
    for lower_bound, label in bands:
        if temperature >= lower_bound:
            return label
    return "<97"

def age_discrete_range(age):
    """Bucket an age into ">=65" or "<65"."""
    return ">=65" if age >= 65 else "<65"
    
# Derive a string-valued "*_discrete" column for each measurement by applying the
# matching range helper defined above to every value.
discretisers = (
    ('sbp', 'sbp_discrete', sbp_discrete_range),
    ('dbp', 'dbp_discrete', dbp_discrete_range),
    ('o2sat', 'o2sat_discrete', o2sat_discrete_range),
    ('resprate', 'resprate_discrete', resprate_discrete_range),
    ('heartrate', 'heartrate_discrete', heartrate_discrete_range),
    ('temperature', 'temperature_discrete', temperature_discrete_range),
    ('age', 'age_discrete', age_discrete_range),
)
for source_col, target_col, to_range in discretisers:
    merged_df[target_col] = merged_df[source_col].apply(to_range)

# Acuity is not binned; each value is only converted to its string form.
merged_df["acuity_discrete"] = merged_df['acuity'].apply(str)

merged_df['acuity_discrete'].value_counts()

2.0    32530
3.0    28148
1.0     5923
4.0      556
5.0        5
Name: acuity_discrete, dtype: int64

In [26]:
# Persist the merged table as a CSV in the spreadsheets folder.
# NOTE(review): the index is written as well (pandas default), which comes back as an
# "Unnamed: 0" column when the file is re-read — confirm consumers expect that.
output_csv_path = os.path.join("./spreadsheets", "physio_clinical_cxr_meta.csv")
merged_df.to_csv(output_csv_path)

In [27]:
# List every column of the saved table for reference.
merged_df.columns

Index(['Unnamed: 0', 'dicom_id', 'subject_id', 'study_id',
       'PerformedProcedureStepDescription', 'ViewPosition', 'Rows', 'Columns',
       'StudyDate', 'StudyTime', 'ProcedureCodeSequence_CodeMeaning',
       'ViewCodeSequence_CodeMeaning',
       'PatientOrientationCodeSequence_CodeMeaning', 'StudyDateTime',
       'stay_id', 'subject_id_negbio', 'study_id_negbio', 'Atelectasis_negbio',
       'Cardiomegaly_negbio', 'Consolidation_negbio', 'Edema_negbio',
       'Enlarged Cardiomediastinum_negbio', 'Fracture_negbio',
       'Lung Lesion_negbio', 'Lung Opacity_negbio', 'No Finding_negbio',
       'Pleural Effusion_negbio', 'Pleural Other_negbio', 'Pneumonia_negbio',
       'Pneumothorax_negbio', 'Support Devices_negbio', 'subject_id_chexpert',
       'study_id_chexpert', 'Atelectasis_chexpert', 'Cardiomegaly_chexpert',
       'Consolidation_chexpert', 'Edema_chexpert',
       'Enlarged Cardiomediastinum_chexpert', 'Fracture_chexpert',
       'Lung Lesion_chexpert', 'Lung Opacity_

In [32]:
# Spot-check the raw age and vital-sign values of the row at position 9.
merged_df[['age','sbp','dbp','heartrate', 'resprate']].iloc[9]

age           75.0
sbp          180.0
dbp           88.0
heartrate     57.0
resprate      18.0
Name: 9, dtype: float64