In [1]:
import os, json
import pandas as pd

from data.paths import TabularDataPaths
from utils import print as print_f
from utils.preprocessing import get_split_list, checkMissingValue
from utils.init import reproducibility

from data.constants import REFLACX_ALL_LABEL_COLS, REFLACX_REPETITIVE_ALL_LABEL_COLS
from data.paths import MIMIC_EYE_PATH

In [2]:
seed = 0
reproducibility(0)

In [3]:
cxr_meta_df = pd.read_csv(os.path.join('./spreadsheets/cxr_meta_stayId.csv'))

In [4]:
cxr_meta_df.head(5)

Unnamed: 0.1,Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,StudyDateTime,stay_id
0,0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect,21800510000000.0,33258284
1,2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,CHEST (PA AND LAT),postero-anterior,Erect,21800630000000.0,38112554
2,6,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,10000032,56699142,CHEST (PORTABLE AP),AP,3056,2544,21800805,234424.765,CHEST (PORTABLE AP),antero-posterior,,21800810000000.0,35968195
3,17,9b314ad7-fbcb0422-6db62dfc-732858d0-a5527d8b,10000935,51178377,CHEST (PA AND LAT),AP,3056,2544,21870823,191426.062,CHEST (PA AND LAT),antero-posterior,Erect,21870820000000.0,32845079
4,20,8e3f2822-0c1d4b71-2a265bbf-5b96e531-ccf5fa30,10000935,56164612,CHEST (PA AND LAT),AP,2544,3056,21870711,111436.562,CHEST (PA AND LAT),antero-posterior,Erect,21870710000000.0,35197384


In [5]:
disk_location = "D:\\"
PHYSIONET_PATH = os.path.join(disk_location, "physionet.org","files")
CXR_JPG_FOLDER_PATH = os.path.join(PHYSIONET_PATH, "mimic-cxr-jpg", "2.0.0")

# load from physionet instead.
negbio_df = pd.read_csv(  os.path.join(CXR_JPG_FOLDER_PATH, "mimic-cxr-2.0.0-negbio.csv.gz"))
chexpert_df = pd.read_csv(  os.path.join(CXR_JPG_FOLDER_PATH ,"mimic-cxr-2.0.0-chexpert.csv.gz"))

In [6]:
merged_df = cxr_meta_df.merge(negbio_df.add_suffix("_negbio"), "left", left_on="study_id", right_on="study_id_negbio" 
)
merged_df = merged_df.merge(chexpert_df.add_suffix("_chexpert"), "left", left_on="study_id", right_on="study_id_chexpert",
)

In [7]:
for col in [ col for col in merged_df.columns if (col.endswith("_negbio") or col.endswith("_chexpert"))]:
    merged_df[col] = merged_df[col].fillna(0)

In [8]:
class CXRImage:
    def __init__(self, patient_id, study_id, dicom_id, reflacx_id=None) -> None:
        self.patient_id = patient_id
        self.study_id = study_id
        self.dicom_id = dicom_id

    def get_physio_image_path(self, physio_path, jpg=True, version="2.0.0"):
        file_extension = "jpg"if jpg else "dcm"
        return os.path.join(
            physio_path,
            "files",
            ("mimic-cxr-jpg" if jpg else "mimic-cxr"),
            version,
            "files",
            f"p{str(self.patient_id)[:2]}",
            f"p{self.patient_id}",
            f"s{self.study_id}",
            f"{self.dicom_id}.{file_extension}"
        )

def create_cxr_from_instance(instance):

    return CXRImage(
        patient_id=instance['subject_id'],
        study_id=instance['study_id'],
        dicom_id=instance['dicom_id'],
    )


In [9]:
# in order to have a faster reading speed, we use XAMI-MIMIC
PHYSIONET_PATH_str = "{PHYSIONET_PATH}"

# Calculate the age according to the time when the CXR taken.
merged_df["image_path"] = merged_df.apply(
    lambda x: create_cxr_from_instance(x).get_physio_image_path(PHYSIONET_PATH_str), 1)
# merged_df["bbox_path"] = (
#     f"{XAMI_MIMIC_PATH_str}\patient_"
#     + merged_df["subject_id"].astype(str)
#     + "\REFLACX\\"
#     + merged_df["id"].astype(str)
#     + "\\anomaly_location_ellipses.csv"
# )

# merged_df["bbox_path"] = merged_df.apply(lambda x: os.path.join(
#     PHYSIO_PATH_str,
#     f"patient_{str(x['subject_id'])}",
#     "REFLACX", 
#     "main_data",
#     str(x["id"]),
#     "anomaly_location_ellipses.csv"),
#     axis=1)

# merged_df["fixation_path"] = merged_df.apply(lambda x: os.path.join(
#     PHYSIO_PATH_str,
#     f"patient_{str(x['subject_id'])}",
#     "REFLACX",
#     "main_data",
#     str(x["id"]),
#     "fixations.csv"),
#     axis=1)


# merged_df["fixation_path"] = (
#     f"{XAMI_MIMIC_PATH_str}\patient_"
#     + merged_df["subject_id"].astype(str)
#     + "\REFLACX\\"
#     + merged_df["id"].astype(str)
#     + "\\fixations.csv"
# )


In [10]:
checkMissingValue(merged_df)

[('PerformedProcedureStepDescription', 277), ('ViewCodeSequence_CodeMeaning', 3), ('PatientOrientationCodeSequence_CodeMeaning', 2213)]


In [11]:
# DO another for clinical data
ED_FOLDER_PATH = os.path.join(PHYSIONET_PATH, "mimic-iv-ed", "2.0","ed")
MIMICIV_FOLDER_PATH = os.path.join(PHYSIONET_PATH, "mimiciv", "2.0")
HOSP_FOLDER_PATH = os.path.join(MIMICIV_FOLDER_PATH, "hosp")

hosp_patients_df = pd.read_csv(os.path.join(HOSP_FOLDER_PATH, "patients.csv.gz"))
ed_triage_df = pd.read_csv(os.path.join(ED_FOLDER_PATH, "triage.csv.gz"), index_col=False)

In [12]:
merged_df = (
    merged_df
    .merge(hosp_patients_df, "left", on="subject_id", suffixes=("", "_patient"))
    .merge(ed_triage_df, "left", on="stay_id", suffixes=("", "_triage"))
)

merged_df["age"] = merged_df["anchor_age"] + (
    merged_df["StudyDate"].apply(lambda x: int(str(x)[:4])) - merged_df["anchor_year"]
)

In [13]:
print_f.print_title("Columns have missing value in merged_df")
checkMissingValue(merged_df)

[('PerformedProcedureStepDescription', 277), ('ViewCodeSequence_CodeMeaning', 3), ('PatientOrientationCodeSequence_CodeMeaning', 2213), ('dod', 52891), ('temperature', 5673), ('heartrate', 3977), ('resprate', 4984), ('o2sat', 4824), ('sbp', 4153), ('dbp', 4344), ('pain', 4571), ('acuity', 2041), ('chiefcomplaint', 1)]


In [14]:
clinical_cols = [
    "temperature",
    "heartrate",
    "resprate",
    "o2sat",
    "sbp",
    "dbp",
    "acuity",
    # "chiefcomplaint",
    "age",
]

In [15]:
len(merged_df)

74967

In [16]:
merged_df = merged_df.dropna(subset=clinical_cols)

In [17]:
len(merged_df)

67263

In [18]:
# for col in clinical_cols:
#     if merged_df[col].isna().any():
#         filling_value = round(merged_df[~merged_df[col].isna()][col].mean(), 1)
#         print(f"filling missing value for {col} by value {filling_value:.2f}")
#         merged_df[col] = merged_df[col].fillna(filling_value)

In [19]:
checkMissingValue(merged_df)

[('PerformedProcedureStepDescription', 248), ('ViewCodeSequence_CodeMeaning', 1), ('PatientOrientationCodeSequence_CodeMeaning', 1531), ('dod', 48328), ('pain', 1217), ('chiefcomplaint', 1)]


In [20]:
# create discrete values.
merged_df =  merged_df[merged_df['temperature'] > 60] # remove the instance with irealistic temperature.
merged_df =  merged_df[merged_df['o2sat'] <= 100] 
merged_df =  merged_df[merged_df['sbp'] <= 1000] 
merged_df =  merged_df[merged_df['dbp'] <= 10000]
 
# create discrete values here

def sbp_discrete_range(sbp):
    if sbp >= 140:
        return ">=140"
    elif sbp > 90:
        return "(90,140)"
    else:
        return "<=90"
    
def dbp_discrete_range(dbp):
    if dbp >= 90:
        return '>=90'
    elif dbp  > 60:
        return '(60,90)'
    else:
        return "<=60"
    
def o2sat_discrete_range(o2sat):
    if o2sat>=95:
        return ">=95"
    elif o2sat>= 90:
        return "[90,95)"
    elif o2sat > 80:
        return "(80,90)"
    else:
        return "<=80"
    
def resprate_discrete_range(resprate):
    if resprate >= 18:
        return ">=18"
    elif resprate >= 12:
        return "[12,18)"
    else:
        return "<12"
    
def heartrate_discrete_range(heartrate):    
    if heartrate >= 100:
        return ">=100"
    elif heartrate >= 60:
        return "[60,100)"
    else:
        return "<60"
    
def temperature_discrete_range(temperature):
    if temperature >= 100.4:
        return ">=100.4"
    elif temperature >= 99:
        return "[99,100.4)"
    elif temperature >= 97:
        return "[97,99)"
    else:
        return "<97"

def age_discrete_range(age):
    if age >= 65:
        return ">=65"
    else:
        return "<65"
    
## replace all of them to be 1.
merged_df['sbp_discrete'] = merged_df['sbp'].apply(lambda x: sbp_discrete_range(x)) 
merged_df['dbp_discrete'] = merged_df['dbp'].apply(lambda x: dbp_discrete_range(x)) 
merged_df['o2sat_discrete'] = merged_df['o2sat'].apply(lambda x: o2sat_discrete_range(x)) 
merged_df['resprate_discrete'] = merged_df['resprate'].apply(lambda x: resprate_discrete_range(x)) 
merged_df['heartrate_discrete'] = merged_df['heartrate'].apply(lambda x: heartrate_discrete_range(x)) 
merged_df['temperature_discrete'] = merged_df['temperature'].apply(lambda x: temperature_discrete_range(x)) 
merged_df['age_discrete'] = merged_df['age'].apply(lambda x: age_discrete_range(x)) 
merged_df["acuity_discrete"] = merged_df['acuity'].apply(lambda x: str(x))

merged_df['acuity_discrete'].value_counts()

2.0    32529
3.0    28145
1.0     5921
4.0      556
5.0        5
Name: acuity_discrete, dtype: int64

In [35]:
import pandas as pd
import os

In [36]:
sampled = pd.read_csv(os.path.join("./spreadsheets", "physio_clinical_cxr_meta.csv"))

In [21]:
# # before the split we keep only small amount.
print(f"Have 67162 {len(merged_df)} instances.") 

####### Second 10k sampling #######
# we first load the 10k instances to check what we have used before.
# sampled = pd.read_csv(os.path.join("./spreadsheets", "physio_clinical_cxr_meta.csv"))
# sampled_dicom_ids = list(sampled['dicom_id'])
# # then remove them in the merged_df
# merged_df = merged_df[~merged_df['dicom_id'].isin(sampled_dicom_ids)]

# Let's shrink it down to 5000?
merged_df = merged_df.sample(n=10000, random_state=123) # using random state to ensure reproducibility.

print(f"For testing purpose, we shrink it down to {len(merged_df)} instances.")

Have 67162 67156 instances.
For testing purpose, we shrink it down to 10000 instances.


In [22]:
merged_df['split'] = get_split_list(len(merged_df), train_portion=0.7) # 70% training.

In [23]:
# get the reflacx df 

reflacx_clinical_df = pd.read_csv(os.path.join('spreadsheets', 'reflacx_clinical_eye.csv'))
val_dicom_ids = list(reflacx_clinical_df[reflacx_clinical_df['split']== "val"]['dicom_id'])
test_dicom_ids = list(reflacx_clinical_df[reflacx_clinical_df['split']== "test"]['dicom_id'])

for val_id in val_dicom_ids:
    idxs = merged_df[merged_df['dicom_id'] == val_id].index
    for idx in idxs:
        assert(merged_df.at[idx, 'dicom_id'] == val_id), "assume to be the same dicom_id"
        merged_df.at[idx,'split'] = 'val'

for val_id in test_dicom_ids:
    idxs = merged_df[merged_df['dicom_id'] == val_id].index
    for idx in idxs:
        assert(merged_df.at[idx, 'dicom_id'] == val_id), "assume to be the same dicom_id"
        merged_df.at[idx,'split'] = 'test'

In [24]:
merged_df['split'].value_counts()

train    6977
test     1515
val      1508
Name: split, dtype: int64

In [25]:
merged_df.to_csv(os.path.join("./spreadsheets", "physio_clinical_cxr_meta.csv"))

In [26]:
merged_df.columns

Index(['Unnamed: 0', 'dicom_id', 'subject_id', 'study_id',
       'PerformedProcedureStepDescription', 'ViewPosition', 'Rows', 'Columns',
       'StudyDate', 'StudyTime', 'ProcedureCodeSequence_CodeMeaning',
       'ViewCodeSequence_CodeMeaning',
       'PatientOrientationCodeSequence_CodeMeaning', 'StudyDateTime',
       'stay_id', 'subject_id_negbio', 'study_id_negbio', 'Atelectasis_negbio',
       'Cardiomegaly_negbio', 'Consolidation_negbio', 'Edema_negbio',
       'Enlarged Cardiomediastinum_negbio', 'Fracture_negbio',
       'Lung Lesion_negbio', 'Lung Opacity_negbio', 'No Finding_negbio',
       'Pleural Effusion_negbio', 'Pleural Other_negbio', 'Pneumonia_negbio',
       'Pneumothorax_negbio', 'Support Devices_negbio', 'subject_id_chexpert',
       'study_id_chexpert', 'Atelectasis_chexpert', 'Cardiomegaly_chexpert',
       'Consolidation_chexpert', 'Edema_chexpert',
       'Enlarged Cardiomediastinum_chexpert', 'Fracture_chexpert',
       'Lung Lesion_chexpert', 'Lung Opacity_

In [27]:
import pandas as pd
import os

In [28]:
df = pd.read_csv(os.path.join("./spreadsheets", "physio_clinical_cxr_meta.csv"))

In [29]:
df[df['split'] =='train']

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,...,age,sbp_discrete,dbp_discrete,o2sat_discrete,resprate_discrete,heartrate_discrete,temperature_discrete,age_discrete,acuity_discrete,split
0,275,1375,a3576366-31c9a763-cb4205e6-217a0456-ecbe3051,10030487,55326723,CHEST (PA AND LAT),AP,3056,2544,21250713,...,91,"(90,140)",<=60,>=95,>=18,"[60,100)","[97,99)",>=65,3.0,train
1,53144,267602,7b48d82d-e42ba582-bc1987d1-b525a2ac-b3afe83d,17097339,54317085,CHEST (PA AND LAT),PA,3056,2544,21330622,...,78,"(90,140)","(60,90)",>=95,>=18,>=100,"[97,99)",>=65,2.0,train
2,56892,286809,67eb117b-fd61f9a2-8117034c-839150f9-2656e623,17600438,59762607,CHEST (PA AND LAT),AP,2544,3056,21771109,...,75,"(90,140)",<=60,>=95,>=18,"[60,100)","[97,99)",>=65,3.0,train
3,26050,131610,9f28d6c9-b25c9639-f44113b0-3b2a090a-fc93c9da,13503272,55433908,CHEST (PA AND LAT),PA,3056,2524,21960313,...,41,>=140,>=90,>=95,>=18,"[60,100)",<97,<65,3.0,train
4,50365,253984,4eb79ebf-ccb58da0-e7952406-85d76594-20176790,16743747,56587072,CHEST (PA AND LAT),PA,3056,2544,21470925,...,77,>=140,"(60,90)",>=95,>=18,>=100,"[97,99)",>=65,2.0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9993,27905,141053,e4ee156b-af4f725b-381db2ff-bf1eb795-bc68d31c,13750116,57391055,CHEST (PA AND LAT),PA,2544,3056,21641127,...,41,>=140,"(60,90)",>=95,>=18,>=100,"[99,100.4)",<65,3.0,train
9994,33451,169155,437f3329-ca6e1bf1-fd2c21f5-c7795468-e9a6119e,14508231,58054297,CHEST (PORTABLE AP),AP,2544,3056,21420502,...,56,>=140,"(60,90)",>=95,>=18,"[60,100)","[97,99)",<65,3.0,train
9995,10870,55037,c43c6d87-d2dbcbb3-2e5500a7-10e31a36-3f9509d7,11489816,56219859,CHEST (PA AND LAT),AP,2544,3056,21260626,...,59,"(90,140)","(60,90)",>=95,>=18,"[60,100)","[97,99)",<65,3.0,train
9997,43418,219774,9c7ba4c1-5b4fdec0-bdbab2ec-8cfecdc7-6d881efd,15813980,56854314,CHEST (PA AND LAT),PA,2544,3056,21380405,...,73,"(90,140)",<=60,>=95,>=18,"[60,100)","[97,99)",>=65,2.0,train


In [30]:
for col in [col for col in df.columns if col.endswith("_chexpert")]:
    print(df[col].value_counts())

19133405.0    11
14508231.0     9
11648387.0     8
16662316.0     8
11021643.0     8
              ..
13361901.0     1
12742898.0     1
13602379.0     1
15416852.0     1
19620258.0     1
Name: subject_id_chexpert, Length: 8673, dtype: int64
53997822.0    2
51553589.0    2
54377357.0    2
51003958.0    2
53032484.0    2
             ..
57970178.0    1
58318386.0    1
57811088.0    1
58227498.0    1
51782907.0    1
Name: study_id_chexpert, Length: 9844, dtype: int64
 0.0    8490
 1.0    1056
-1.0     454
Name: Atelectasis_chexpert, dtype: int64
 0.0    9171
 1.0     771
-1.0      58
Name: Cardiomegaly_chexpert, dtype: int64
 0.0    9681
 1.0     168
-1.0     151
Name: Consolidation_chexpert, dtype: int64
 0.0    8949
 1.0     562
-1.0     489
Name: Edema_chexpert, dtype: int64
 0.0    9853
 1.0      79
-1.0      68
Name: Enlarged Cardiomediastinum_chexpert, dtype: int64
 0.0    9774
 1.0     184
-1.0      42
Name: Fracture_chexpert, dtype: int64
 0.0    9691
 1.0     261
-1.0      48
Nam

In [31]:
discrete_features = [
        # "sbp_discrete",
        # "dbp_discrete",
        # "o2sat_discrete",
        # "resprate_discrete",
        # "heartrate_discrete",
        "temperature_discrete",
        "age_discrete",
        # "acuity_discrete",
        "gender",
    ]

# can remove some discrete columns here.
# also need to adjust the one in setup.

def get_discrete_string(x):
    discrete_str = ""
    for col in discrete_features:
        discrete_str += f"{col}:{str(x[col])},"
    return discrete_str
    


df['discrete_string']  = df.apply(lambda x: get_discrete_string(x), axis=1)

In [32]:
v_dict = df['discrete_string'].value_counts()

In [33]:
for k, v in v_dict.items():
    print(v)

2023
1992
1848
1585
404
404
276
274
185
170
169
159
153
141
132
85


In [34]:
for k, v in v_dict.items():
    print(f"{k}: {v}")


temperature_discrete:[97,99),age_discrete:<65,gender:F,: 2023
temperature_discrete:[97,99),age_discrete:<65,gender:M,: 1992
temperature_discrete:[97,99),age_discrete:>=65,gender:F,: 1848
temperature_discrete:[97,99),age_discrete:>=65,gender:M,: 1585
temperature_discrete:[99,100.4),age_discrete:<65,gender:M,: 404
temperature_discrete:[99,100.4),age_discrete:<65,gender:F,: 404
temperature_discrete:[99,100.4),age_discrete:>=65,gender:M,: 276
temperature_discrete:[99,100.4),age_discrete:>=65,gender:F,: 274
temperature_discrete:<97,age_discrete:<65,gender:M,: 185
temperature_discrete:>=100.4,age_discrete:<65,gender:M,: 170
temperature_discrete:<97,age_discrete:>=65,gender:F,: 169
temperature_discrete:<97,age_discrete:<65,gender:F,: 159
temperature_discrete:<97,age_discrete:>=65,gender:M,: 153
temperature_discrete:>=100.4,age_discrete:>=65,gender:M,: 141
temperature_discrete:>=100.4,age_discrete:<65,gender:F,: 132
temperature_discrete:>=100.4,age_discrete:>=65,gender:F,: 85
