In [1]:
import os
import pandas as pd
from utils.preprocessing import get_split_list, checkMissingValue
from utils.init import reproducibility
from datasets import paths


In [2]:
# Single source of truth for the random seed: pass the variable rather than a
# duplicated literal so that changing `seed` actually changes the seeding.
seed = 0
reproducibility(seed)

In [3]:
# CXR metadata spreadsheet, read from the local spreadsheets folder.
cxr_meta_csv = './spreadsheets/cxr_meta_stayId.csv'
cxr_meta_df = pd.read_csv(cxr_meta_csv)

In [4]:
def _physionet_subdir(*parts):
    """Join ``parts`` onto the PhysioNet "files" root defined below."""
    return os.path.join(PHYSIONET_PATH, *parts)


# Every dataset folder hangs off "<paths.PHYSIONET_PATH>/files".
PHYSIONET_PATH = os.path.join(paths.PHYSIONET_PATH, "files")
CXR_JPG_FOLDER_PATH = _physionet_subdir("mimic-cxr-jpg", "2.0.0")
MIMICIV_FOLDER_PATH = _physionet_subdir("mimiciv", "2.0")
ED_FOLDER_PATH = _physionet_subdir("mimic-iv-ed", "2.0", "ed")
# The "hosp" folder lives inside the MIMIC-IV folder.
HOSP_FOLDER_PATH = os.path.join(MIMICIV_FOLDER_PATH, "hosp")


In [5]:
# The NegBio and CheXpert label CSVs are read straight from the MIMIC-CXR-JPG
# folder of the PhysioNet mirror rather than from a local copy.
negbio_csv = os.path.join(CXR_JPG_FOLDER_PATH, "mimic-cxr-2.0.0-negbio.csv.gz")
chexpert_csv = os.path.join(CXR_JPG_FOLDER_PATH, "mimic-cxr-2.0.0-chexpert.csv.gz")

negbio_df = pd.read_csv(negbio_csv)
chexpert_df = pd.read_csv(chexpert_csv)

In [6]:
# Left-join both label tables onto the CXR metadata by study_id. Every column
# of a label table (its id columns included) is suffixed first, so the two
# sources stay distinguishable after the merge.
negbio_labels = negbio_df.add_suffix("_negbio")
chexpert_labels = chexpert_df.add_suffix("_chexpert")

merged_df = (
    cxr_meta_df
    .merge(negbio_labels, how="left", left_on="study_id", right_on="study_id_negbio")
    .merge(chexpert_labels, how="left", left_on="study_id", right_on="study_id_chexpert")
)

In [7]:
# Replace NaN with 0 in every column that came from the NegBio or CheXpert
# tables. The match is by suffix only, so the suffixed id columns are filled
# as well as the label columns.
label_suffixes = ("_negbio", "_chexpert")
label_cols = [name for name in merged_df.columns if name.endswith(label_suffixes)]
for label_col in label_cols:
    merged_df[label_col] = merged_df[label_col].fillna(0)

In [8]:
# Patient table (hosp folder) and ED triage table (ed folder).
patients_csv = os.path.join(HOSP_FOLDER_PATH, "patients.csv.gz")
triage_csv = os.path.join(ED_FOLDER_PATH, "triage.csv.gz")

hosp_patients_df = pd.read_csv(patients_csv)
ed_triage_df = pd.read_csv(triage_csv, index_col=False)

In [9]:
# Attach patient rows by subject_id, then triage rows by stay_id. On a column
# name clash the existing column keeps its name and the incoming one gets the
# "_patient" / "_triage" suffix.
with_patients = merged_df.merge(
    hosp_patients_df, how="left", on="subject_id", suffixes=("", "_patient")
)
merged_df = with_patients.merge(
    ed_triage_df, how="left", on="stay_id", suffixes=("", "_triage")
)

# Age at the study = anchor_age + (study year - anchor_year), where the study
# year is taken from the first four characters of StudyDate.
study_year = merged_df["StudyDate"].map(lambda study_date: int(str(study_date)[:4]))
merged_df["age"] = merged_df["anchor_age"] + (study_year - merged_df["anchor_year"])

In [10]:
# Clinical features that must be present for a row to be kept: the triage
# measurements plus the derived age. "chiefcomplaint" is deliberately left out.
clinical_cols = [
    "temperature", "heartrate", "resprate", "o2sat",
    "sbp", "dbp",
    "acuity",
    # "chiefcomplaint",
    "age",
]

In [11]:
# Keep only the rows in which every clinical column has a value.
merged_df = merged_df.dropna(axis="index", how="any", subset=clinical_cols)

In [12]:
# Inspect what is still missing after the clinical columns were required above.
# NOTE(review): judging by the recorded output this yields (column, count)
# pairs for columns with missing values — confirm in utils.preprocessing.
checkMissingValue(merged_df)

[('PerformedProcedureStepDescription', 248), ('ViewCodeSequence_CodeMeaning', 1), ('PatientOrientationCodeSequence_CodeMeaning', 1531), ('dod', 48328), ('pain', 1217), ('chiefcomplaint', 1)]


In [13]:
# Remove rows with unrealistic vital signs before discretising them.
# All four conditions are combined into a single mask, and the result is an
# explicit copy so that the column assignments further down write to an
# independent frame instead of a slice (avoids SettingWithCopyWarning).
# NOTE(review): temperature is presumably in Fahrenheit given the bins used
# later; the sbp/dbp caps (1000 / 10000) look like they only catch gross
# data-entry errors — confirm whether tighter limits were intended.
plausible_vitals = (
    (merged_df['temperature'] > 60)
    & (merged_df['o2sat'] <= 100)
    & (merged_df['sbp'] <= 1000)
    & (merged_df['dbp'] <= 10000)
)
merged_df = merged_df[plausible_vitals].copy()
 
# create discrete values here

def sbp_discrete_range(sbp):
    """Map an sbp value to one of the labels ">=140", "(90,140)" or "<=90".

    A value that satisfies neither comparison (e.g. NaN) gets "<=90".
    """
    if sbp >= 140:
        return ">=140"
    if sbp > 90:
        return "(90,140)"
    return "<=90"
    
def dbp_discrete_range(dbp):
    """Map a dbp value to one of the labels ">=90", "(60,90)" or "<=60".

    A value that satisfies neither comparison (e.g. NaN) gets "<=60".
    """
    return ">=90" if dbp >= 90 else ("(60,90)" if dbp > 60 else "<=60")
    
def o2sat_discrete_range(o2sat):
    """Map an o2sat value to ">=95", "[90,95)", "(80,90)" or "<=80".

    A value that satisfies none of the comparisons (e.g. NaN) gets "<=80".
    """
    if o2sat >= 90:
        # Upper half: split at 95.
        return ">=95" if o2sat >= 95 else "[90,95)"
    # Lower half: split just above 80.
    return "(80,90)" if o2sat > 80 else "<=80"
    
def resprate_discrete_range(resprate):
    """Map a resprate value to one of the labels ">=18", "[12,18)" or "<12".

    A value that satisfies neither comparison (e.g. NaN) gets "<12".
    """
    # Checked from the highest floor down; the first floor reached wins.
    floors = ((18, ">=18"), (12, "[12,18)"))
    for floor, label in floors:
        if resprate >= floor:
            return label
    return "<12"
    
def heartrate_discrete_range(heartrate):
    """Map a heartrate value to one of the labels ">=100", "[60,100)" or "<60".

    A value that satisfies neither comparison (e.g. NaN) gets "<60".
    """
    if heartrate >= 100:
        return ">=100"
    if heartrate >= 60:
        return "[60,100)"
    return "<60"
    
def temperature_discrete_range(temperature):
    """Map a temperature to ">=100.4", "[99,100.4)", "[97,99)" or "<97".

    A value that satisfies none of the comparisons (e.g. NaN) gets "<97".
    """
    # Checked from the highest floor down; the first floor reached wins.
    floors = ((100.4, ">=100.4"), (99, "[99,100.4)"), (97, "[97,99)"))
    for floor, label in floors:
        if temperature >= floor:
            return label
    return "<97"

def age_discrete_range(age):
    """Map an age to ">=65" or "<65"; a non-comparable value (NaN) gets "<65"."""
    return ">=65" if age >= 65 else "<65"
    
# Add a "<column>_discrete" companion for each clinical column by applying its
# bucketing function element-wise. acuity has no bins; it is only stringified.
discretizers = {
    'sbp': sbp_discrete_range,
    'dbp': dbp_discrete_range,
    'o2sat': o2sat_discrete_range,
    'resprate': resprate_discrete_range,
    'heartrate': heartrate_discrete_range,
    'temperature': temperature_discrete_range,
    'age': age_discrete_range,
    'acuity': str,
}
for source_col, to_bucket in discretizers.items():
    merged_df[f"{source_col}_discrete"] = merged_df[source_col].apply(to_bucket)

In [14]:
# Report the dataset size, then give every row a split label.
total_instances = len(merged_df)
print(f"Have {total_instances} instances.")

# Optional: uncomment to work on a small sample while debugging.
# merged_df = merged_df.sample(n=1000, random_state=123) # using random state to ensure reproducibility.
# print(f"For testing purpose, we shrink it down to {len(merged_df)} instances.")

# train_portion=0.9 requests 90% of the rows for training; how the remainder
# is divided is decided inside get_split_list (utils.preprocessing).
merged_df['split'] = get_split_list(len(merged_df), train_portion=0.9)

Have 67156 instances.
For testing purpose, we shrink it down to 67156 instances.


In [16]:
# Align the splits with REFLACX: any image that REFLACX placed in its "val" or
# "test" split is forced into the same split here, overriding the label
# assigned earlier.

reflacx_clinical_df = pd.read_csv(os.path.join('spreadsheets', 'reflacx_clinical.csv'))
val_dicom_ids = list(reflacx_clinical_df[reflacx_clinical_df['split']== "val"]['dicom_id'])
test_dicom_ids = list(reflacx_clinical_df[reflacx_clinical_df['split']== "test"]['dicom_id'])

# One vectorised membership test per split replaces rescanning the whole frame
# once per dicom_id. "test" is written last, so an id that appears in both
# lists still ends up as "test", exactly as with the sequential loops.
merged_df.loc[merged_df['dicom_id'].isin(val_dicom_ids), 'split'] = 'val'
merged_df.loc[merged_df['dicom_id'].isin(test_dicom_ids), 'split'] = 'test'

In [18]:
# Display how many rows carry each split label.
merged_df['split'].value_counts()

train    60238
test      3464
val       3454
Name: split, dtype: int64

In [17]:
# Persist the merged table. The DataFrame index is written too (to_csv
# default), so it appears as the first, unnamed column of the file.
output_csv = os.path.join("./spreadsheets", "physio_clinical.csv")
merged_df.to_csv(output_csv)