In [1]:
import pandas as pd
from data.paths import TabularDataPaths

In [2]:
## Define your MIMIC path here.
# Raw string: the Windows backslash is taken literally. The non-raw form relied
# on "\X" being an invalid escape sequence, which newer Python versions warn about.
XAMI_MIMIC_PATH = r"D:\XAMI-MIMIC"

In [3]:
# Load the REFLACX metadata spreadsheet; its location is resolved from the
# XAMI-MIMIC root by the project's TabularDataPaths helper.
reflacx_meta_df = pd.read_csv(
    TabularDataPaths.SpreadSheet.get_sreadsheet(
        XAMI_MIMIC_PATH,
        TabularDataPaths.SpreadSheet.REFLACX.metadata,
    )
)

In [4]:
# Display every column name of the REFLACX metadata frame.
reflacx_meta_df.columns

Index(['Unnamed: 0', 'id', 'split', 'eye_tracking_data_discarded', 'image',
       'dicom_id', 'subject_id', 'image_size_x', 'image_size_y',
       'Airway wall thickening', 'Atelectasis', 'Consolidation', 'Emphysema',
       'Enlarged cardiac silhouette', 'Fibrosis', 'Fracture',
       'Groundglass opacity', 'Mass', 'Nodule', 'Other', 'Pleural effusion',
       'Pleural thickening', 'Pneumothorax', 'Pulmonary edema',
       'Quality issue', 'Support devices', 'Wide mediastinum',
       'Abnormal mediastinal contour', 'Acute fracture', 'Enlarged hilum',
       'Hiatal hernia', 'High lung volume / emphysema',
       'Interstitial lung disease', 'Lung nodule or mass',
       'Pleural abnormality'],
      dtype='object')

In [5]:
# Label columns of interest, one per line.
labels_cols = [
    "Airway wall thickening",
    "Atelectasis",
    "Consolidation",
    "Enlarged cardiac silhouette",
    "Fibrosis",
    "Fracture",
    "Groundglass opacity",
    "Pneumothorax",
    "Pulmonary edema",
    "Wide mediastinum",
    "Abnormal mediastinal contour",
    "Acute fracture",
    "Enlarged hilum",
    "Quality issue",
    "Support devices",
    "Hiatal hernia",
    "High lung volume / emphysema",
    "Interstitial lung disease",
    "Lung nodule or mass",
    "Pleural abnormality",
]

In [6]:
# Every disease column present in the raw REFLACX metadata, including the
# fine-grained ones (e.g. 'Mass', 'Nodule') that are later folded into others.
all_disease_cols = [
    "Airway wall thickening",
    "Atelectasis",
    "Consolidation",
    "Emphysema",
    "Enlarged cardiac silhouette",
    "Fibrosis",
    "Fracture",
    "Groundglass opacity",
    "Mass",
    "Nodule",
    "Other",
    "Pleural effusion",
    "Pleural thickening",
    "Pneumothorax",
    "Pulmonary edema",
    "Wide mediastinum",
    "Abnormal mediastinal contour",
    "Acute fracture",
    "Enlarged hilum",
    "Hiatal hernia",
    "High lung volume / emphysema",
    "Interstitial lung disease",
    "Lung nodule or mass",
    "Pleural abnormality",
]

In [7]:
# Preview the first five rows of the REFLACX metadata.
reflacx_meta_df.head(5)

Unnamed: 0.1,Unnamed: 0,id,split,eye_tracking_data_discarded,image,dicom_id,subject_id,image_size_x,image_size_y,Airway wall thickening,...,Support devices,Wide mediastinum,Abnormal mediastinal contour,Acute fracture,Enlarged hilum,Hiatal hernia,High lung volume / emphysema,Interstitial lung disease,Lung nodule or mass,Pleural abnormality
0,0,P102R108387,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,
1,1,P102R379837,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,
2,2,P102R558314,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,4.0,...,True,0.0,,,,,,,,
3,3,P102R765317,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,2.0,,,,,,,,
4,4,P102R915878,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,


In [8]:
# Names of the columns that contain at least one NaN value.
[name for name, has_nan in reflacx_meta_df.isna().any().items() if has_nan]

['Airway wall thickening',
 'Emphysema',
 'Fibrosis',
 'Fracture',
 'Mass',
 'Nodule',
 'Other',
 'Pleural effusion',
 'Pleural thickening',
 'Quality issue',
 'Wide mediastinum',
 'Abnormal mediastinal contour',
 'Acute fracture',
 'Enlarged hilum',
 'Hiatal hernia',
 'High lung volume / emphysema',
 'Interstitial lung disease',
 'Lung nodule or mass',
 'Pleural abnormality']

In [9]:
# Fill the missing values in every disease column with 0.
reflacx_meta_df[all_disease_cols] = reflacx_meta_df[all_disease_cols].fillna(0)

In [10]:
# Boolean flag columns: a missing entry is filled with False.
boolean_cols = ['Quality issue', 'Support devices']
reflacx_meta_df[boolean_cols] = reflacx_meta_df[boolean_cols].fillna(False)

In [11]:
## Re-check for missing values; an empty list means every NaN has been filled.
[name for name, has_nan in reflacx_meta_df.isna().any().items() if has_nan]

[]

In [12]:
# Max value version.
# Each umbrella label takes the row-wise maximum of itself and the fine-grained
# labels it absorbs; the absorbed columns are then removed from the frame.
absorbed_labels = {
    'Lung nodule or mass': ['Mass', 'Nodule'],
    'High lung volume / emphysema': ['Emphysema'],
    'Pleural abnormality': ['Pleural thickening', 'Pleural effusion'],
}

for umbrella_col, fine_cols in absorbed_labels.items():
    reflacx_meta_df[umbrella_col] = reflacx_meta_df[[umbrella_col, *fine_cols]].max(axis=1)
    for fine_col in fine_cols:
        del reflacx_meta_df[fine_col]

In [13]:
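# # Alternative (disabled): combine the repetitive labels into boolean flags instead of taking the max.
# # NOTE: `&` binds tighter than `>`, so every `x > 0` comparison below needs its own parentheses before this code is re-enabled.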
# reflacx_meta_df['Lung nodule or mass'] = (~reflacx_meta_df['Lung nodule or mass'].isna() & reflacx_meta_df['Lung nodule or mass'] > 0) | (~reflacx_meta_df['Mass'].isna() & reflacx_meta_df['Mass'] > 0) | (~reflacx_meta_df['Nodule'].isna() & reflacx_meta_df['Nodule'] > 0)
# del reflacx_meta_df['Mass']
# del reflacx_meta_df['Nodule']
# reflacx_meta_df['High lung volume / emphysema'] = (~reflacx_meta_df['High lung volume / emphysema'].isna() & reflacx_meta_df['High lung volume / emphysema']>0) | (~reflacx_meta_df['Emphysema'].isna() & reflacx_meta_df['Emphysema']>0) 
# del reflacx_meta_df['Emphysema']
# reflacx_meta_df['Pleural abnormality'] = (~reflacx_meta_df['Pleural abnormality'].isna() & reflacx_meta_df['Pleural abnormality'] > 0) | (~reflacx_meta_df['Pleural thickening'].isna() & reflacx_meta_df['Pleural thickening'] > 0) | (~reflacx_meta_df['Pleural effusion'].isna() & reflacx_meta_df['Pleural effusion'] > 0)
# del reflacx_meta_df['Pleural thickening']
# del reflacx_meta_df['Pleural effusion']

In [14]:
# Load the CXR metadata spreadsheet (resolved under the XAMI-MIMIC root) plus
# the patients.csv and triage.csv tables from their local MIMIC-IV folders.
# The Windows paths are raw strings so every backslash is literal; the previous
# literals relied on invalid escape sequences ("\m", "\c", "\p", "\e"), which
# newer Python versions warn about.
cxr_meta_df = pd.read_csv(TabularDataPaths.SpreadSheet.get_sreadsheet(XAMI_MIMIC_PATH, TabularDataPaths.SpreadSheet.cxr_meta))
core_patients_df = pd.read_csv(r"D:\mimic-iv-1.0\mimic-iv-1.0\core\patients.csv")
ed_triage_df = pd.read_csv(r"D:\mimic-iv-ed-1.0\mimic-iv-ed-1.0\ed\triage.csv")

In [15]:
# Left-join the REFLACX rows with the CXR metadata (on dicom_id), then the
# patients table (on subject_id), then the triage table (on stay_id).
# Clashing column names from the right-hand table receive the given suffix.
merged_df = (
    reflacx_meta_df
    .merge(cxr_meta_df, how='left', left_on='dicom_id', right_on='dicom_id', suffixes=("", "cxr"))
    .merge(core_patients_df, how='left', on="subject_id", suffixes=("", "_patient"))
    .merge(ed_triage_df, how='left', on="stay_id", suffixes=("", "_triage"))
)

In [16]:
# ! If eye-tracking data is needed, uncomment the line below.
# merged_df = merged_df[~merged_df['eye_tracking_data_discarded']]

## Only keep the rows that have a `stay_id`, so we can link to triage.csv.
## `.copy()` detaches the filtered frame, so the column assignments below do
## not write into a slice of the previous frame (avoids SettingWithCopyWarning).
merged_df = merged_df[~merged_df['stay_id'].isna()].copy()
merged_df['stay_id'] = merged_df['stay_id'].astype(int)

## Calculate the age at the time the CXR was taken: anchor_age shifted by the
## gap between the study year (first four characters of StudyDate) and anchor_year.
merged_df['age'] = merged_df['anchor_age'] + (merged_df['StudyDate'].apply(lambda x: int(str(x)[:4])) - merged_df['anchor_year'])

## Build the per-row file paths. Backslashes are escaped explicitly ("\\");
## the resulting strings are unchanged, but the literals no longer rely on
## invalid escape sequences such as "\p", "\C", "\s" and "\R".
merged_df['image_path'] = f"{XAMI_MIMIC_PATH}\\patient_" + merged_df["subject_id"].astype(str) + "\\CXR-JPG\\s" + merged_df["study_id"].astype(str) + "\\" + merged_df["dicom_id"].astype(str) + ".jpg"

merged_df['anomaly_location_ellipses_path'] = f"{XAMI_MIMIC_PATH}\\patient_" + merged_df["subject_id"].astype(str) + "\\REFLACX\\" + merged_df["id"].astype(str) + "\\anomaly_location_ellipses.csv"

In [17]:
# Columns kept in the final table, assembled group by group.
# 'Fracture' is intentionally left out: it doesn't have any positive case.
needed_columns = (
    # ids
    ["id", "dicom_id", "subject_id", "stay_id", "study_id", "split"]
    # image meta
    + [
        "image_path",
        "ViewPosition",
        "image_size_x",
        "image_size_y",
        "anomaly_location_ellipses_path",
    ]
    # patients
    + ["age", "gender"]
    # triage
    + ["temperature", "heartrate", "resprate", "o2sat", "sbp", "dbp", "pain", "acuity"]
    # labels
    + [
        "Airway wall thickening",
        "Atelectasis",
        "Consolidation",
        "Enlarged cardiac silhouette",
        "Fibrosis",
        "Groundglass opacity",
        "Other",
        "Pneumothorax",
        "Pulmonary edema",
        "Quality issue",
        "Support devices",
        "Wide mediastinum",
        "Abnormal mediastinal contour",
        "Acute fracture",
        "Enlarged hilum",
        "Hiatal hernia",
        "High lung volume / emphysema",
        "Interstitial lung disease",
        "Lung nodule or mass",
        "Pleural abnormality",
    ]
)


In [18]:
# Keep only the selected columns, in the order given by needed_columns.
merged_df = merged_df[needed_columns]

In [19]:
# Columns that contain NaN values, paired with their NaN count.
# The counts come from a single vectorised pass over the frame and are cast to
# plain ints so the displayed tuples do not depend on the NumPy scalar repr.
nan_counts = merged_df.isna().sum()
[(col, int(count)) for col, count in nan_counts.items() if count > 0]

[('temperature', 76),
 ('heartrate', 62),
 ('resprate', 70),
 ('o2sat', 66),
 ('sbp', 63),
 ('dbp', 65),
 ('pain', 83),
 ('acuity', 24)]

In [20]:
# Replace the missing values in each column with that column's mean, rounded
# to one decimal place.
# NOTE(review): the mean is taken over every row of merged_df, whatever its
# `split` value is - confirm this is intended.

for col in merged_df.columns:
    column = merged_df[col]
    if not column.isna().any():
        continue  # nothing to fill in this column
    print(f"filling missing value for {col}")
    column_mean = round(column.dropna().mean(), 1)
    merged_df[col] = column.fillna(column_mean)



filling missing value for temperature
filling missing value for heartrate
filling missing value for resprate
filling missing value for o2sat
filling missing value for sbp
filling missing value for dbp
filling missing value for pain
filling missing value for acuity


In [21]:
# Write the merged table to a CSV file (relative path, so it is resolved
# against the current working directory). With pandas' default settings the
# DataFrame index is written as well, as the first, unnamed column.
merged_df.to_csv("reflacx_with_clinical.csv")

In [22]:
# Display the final merged table.
merged_df

Unnamed: 0,id,dicom_id,subject_id,stay_id,study_id,split,image_path,ViewPosition,image_size_x,image_size_y,...,Support devices,Wide mediastinum,Abnormal mediastinal contour,Acute fracture,Enlarged hilum,Hiatal hernia,High lung volume / emphysema,Interstitial lung disease,Lung nodule or mass,Pleural abnormality
0,P102R108387,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,32067002,55032240,train,D:\XAMI-MIMIC\patient_18111516\CXR-JPG\s550322...,AP,2544,3056,...,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,P102R379837,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,32067002,55032240,train,D:\XAMI-MIMIC\patient_18111516\CXR-JPG\s550322...,AP,2544,3056,...,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,P102R558314,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,32067002,55032240,train,D:\XAMI-MIMIC\patient_18111516\CXR-JPG\s550322...,AP,2544,3056,...,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,P102R765317,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,32067002,55032240,train,D:\XAMI-MIMIC\patient_18111516\CXR-JPG\s550322...,AP,2544,3056,...,True,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,P102R915878,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,32067002,55032240,train,D:\XAMI-MIMIC\patient_18111516\CXR-JPG\s550322...,AP,2544,3056,...,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3020,P300R833708,a2fe8aae-2fe32131-b47c4e5b-090f4c13-88e7ac97,19875621,32582616,55102074,train,D:\XAMI-MIMIC\patient_19875621\CXR-JPG\s551020...,PA,2544,3056,...,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3023,P300R918521,a2a80c63-8b9575dc-bc08895e-40392d6a-d7fc17d5,19884194,36719043,57912042,train,D:\XAMI-MIMIC\patient_19884194\CXR-JPG\s579120...,AP,2544,3056,...,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3027,P300R611251,2b20dcdf-4077bc16-48fc8eb5-265ef218-f6552cb0,19906407,33352559,57296330,train,D:\XAMI-MIMIC\patient_19906407\CXR-JPG\s572963...,AP,2881,2544,...,True,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3028,P300R519683,92134f99-0e73faba-1280ad81-218c68ba-933a85c5,19907884,39112538,57427881,test,D:\XAMI-MIMIC\patient_19907884\CXR-JPG\s574278...,PA,2544,3056,...,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
