In [1]:
import os, json
import pandas as pd

from data.paths import TabularDataPaths
from utils import print as print_f
from utils.preprocessing import get_split_list
from data.constants import XAMI_MIMIC_PATH, SPREADSHEET_FOLDER
from utils.init import reproducibility

In [2]:
seed = 0
reproducibility(0)

XAMI_MIMIC_PATH_str = "{XAMI_MIMIC_PATH}"

In [3]:
reflacx_meta_df = pd.read_csv(
    TabularDataPaths.SpreadSheet.get_sreadsheet(
        XAMI_MIMIC_PATH, TabularDataPaths.SpreadSheet.REFLACX.metadata,
    )
)
reflacx_meta_df.head(5)


Unnamed: 0.1,Unnamed: 0,id,split,eye_tracking_data_discarded,image,dicom_id,subject_id,image_size_x,image_size_y,Airway wall thickening,...,Support devices,Wide mediastinum,Abnormal mediastinal contour,Acute fracture,Enlarged hilum,Hiatal hernia,High lung volume / emphysema,Interstitial lung disease,Lung nodule or mass,Pleural abnormality
0,0,P102R108387,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,
1,1,P102R379837,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,
2,2,P102R558314,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,4.0,...,True,0.0,,,,,,,,
3,3,P102R765317,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,2.0,,,,,,,,
4,4,P102R915878,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,


In [6]:
all_subjects = list(reflacx_meta_df['subject_id'].unique())

In [7]:
all_subjects

[18111516,
 12500924,
 15192710,
 14850453,
 13017215,
 17970010,
 10543994,
 13855491,
 16826047,
 16326093,
 10934681,
 14311521,
 13716770,
 13736002,
 18522520,
 19346228,
 12182106,
 15276846,
 14094298,
 12638488,
 17741319,
 12208824,
 10610928,
 16976729,
 13174810,
 16289699,
 18441078,
 14057989,
 12482083,
 19625397,
 19991135,
 10933609,
 14755867,
 13128687,
 19182863,
 17265926,
 16164648,
 15047777,
 10103748,
 10481158,
 14785071,
 17611292,
 16751749,
 11877319,
 15084163,
 10792610,
 16848073,
 19918077,
 14443106,
 10679138,
 17258978,
 18862368,
 17600369,
 14387068,
 14858737,
 17112432,
 12169013,
 17804936,
 16116913,
 17799242,
 13281743,
 14841168,
 19601036,
 12019744,
 16570377,
 13042648,
 16129000,
 11537996,
 14883411,
 10141364,
 17669276,
 11677801,
 16742247,
 13094477,
 19509694,
 10481689,
 13761048,
 19827413,
 10141577,
 11474034,
 15223112,
 12528429,
 10439781,
 11411448,
 11566800,
 19486351,
 19458616,
 13764208,
 11527061,
 10314359,
 15880873,

In [14]:
core_patients_df = pd.read_csv("D:\mimic-iv-1.0\mimic-iv-1.0\core\patients.csv")
core_admission_df = pd.read_csv("D:\mimic-iv-1.0\mimic-iv-1.0\core\\admissions.csv")

In [18]:
unique_patient_reflacx = reflacx_meta_df.drop_duplicates(subset=['subject_id'])

In [21]:
print(f"REFLACX has {len(reflacx_meta_df)} instances with {len(unique_patient_reflacx)} unique patients.") 

REFLACX has 3052 instances with 2199 unique patients.


In [54]:
unique_merge_df = unique_patient_reflacx.merge(core_admission_df.drop_duplicates(subset=['subject_id'])[["subject_id", "ethnicity"]], 'left', left_on='subject_id', right_on='subject_id')

In [55]:
len(unique_patient_reflacx)

2199

In [56]:
len(unique_merge_df)

2199

In [63]:
count_df = pd.DataFrame(unique_merge_df['ethnicity'].value_counts())

In [69]:
count_df.rename(columns={"ethnicity": "ethnicity (count)"}, inplace=True)
count_df["ethnicity (%)"] = (count_df['ethnicity (count)'] / sum(count_df['ethnicity (count)']))*100

In [71]:
print(f"REFLACX has {len(reflacx_meta_df)} instances with {len(unique_patient_reflacx)} unique patients.") 
count_df

REFLACX has 3052 instances with 2199 unique patients.


Unnamed: 0,ethnicity (count),ethnicity (%)
WHITE,1361,65.181992
BLACK/AFRICAN AMERICAN,345,16.522989
HISPANIC/LATINO,103,4.93295
OTHER,95,4.549808
UNKNOWN,90,4.310345
ASIAN,81,3.87931
UNABLE TO OBTAIN,7,0.335249
AMERICAN INDIAN/ALASKA NATIVE,6,0.287356


In [75]:
mimic_reflacx_df = pd.read_csv(os.path.join(SPREADSHEET_FOLDER, "reflacx_with_clinical.csv"))

In [None]:
mimic_reflacx_df

In [76]:
unique_patient_mimic_reflacx = mimic_reflacx_df.drop_duplicates(subset=['subject_id'])

In [77]:
print(f"MIMIC REFLACX has {len(mimic_reflacx_df)} instances with {len(unique_patient_mimic_reflacx)} unique patients.") 


MIMIC REFLACX has 670 instances with 540 unique patients.


In [78]:
unique_mimic_merge_df = unique_patient_mimic_reflacx.merge(core_admission_df.drop_duplicates(subset=['subject_id'])[["subject_id", "ethnicity"]], 'left', left_on='subject_id', right_on='subject_id')

In [87]:
def get_count_table(mimic_reflacx_df, core_admission_df):
    unique_patient_mimic_reflacx = mimic_reflacx_df.drop_duplicates(subset=['subject_id'])
    unique_mimic_merge_df = unique_patient_mimic_reflacx.merge(core_admission_df.drop_duplicates(subset=['subject_id'])[["subject_id", "ethnicity"]], 'left', left_on='subject_id', right_on='subject_id')
    mimic_count_df = pd.DataFrame(unique_mimic_merge_df['ethnicity'].value_counts())
    mimic_count_df.rename(columns={"ethnicity": "ethnicity (count)"}, inplace=True)
    mimic_count_df["ethnicity (%)"] = (mimic_count_df['ethnicity (count)'] / sum(mimic_count_df['ethnicity (count)']))*100
    return f"has {len(mimic_reflacx_df)} instances with {len(unique_patient_mimic_reflacx)} unique patients.", mimic_count_df

In [88]:
printing_str, count_table = get_count_table(mimic_reflacx_df, core_admission_df)

In [89]:
printing_str

'has 670 instances with 540 unique patients.'

In [90]:
count_table

Unnamed: 0,ethnicity (count),ethnicity (%)
WHITE,353,65.613383
BLACK/AFRICAN AMERICAN,105,19.516729
HISPANIC/LATINO,27,5.018587
ASIAN,22,4.089219
OTHER,16,2.973978
UNKNOWN,11,2.04461
AMERICAN INDIAN/ALASKA NATIVE,3,0.557621
UNABLE TO OBTAIN,1,0.185874


In [96]:
printing_str, count_table = get_count_table(mimic_reflacx_df[mimic_reflacx_df['split'] == 'train'], core_admission_df)
print(f"training subset {printing_str}")
count_table


training subset has 468 instances with 391 unique patients.


Unnamed: 0,ethnicity (count),ethnicity (%)
WHITE,254,65.29563
BLACK/AFRICAN AMERICAN,80,20.565553
HISPANIC/LATINO,21,5.398458
ASIAN,17,4.37018
OTHER,9,2.313625
UNKNOWN,7,1.799486
AMERICAN INDIAN/ALASKA NATIVE,1,0.257069


In [95]:
printing_str, count_table = get_count_table(mimic_reflacx_df[mimic_reflacx_df['split'] == 'val'], core_admission_df)
print(f"validation subset {printing_str}")
count_table

validation subset has 101 instances with 95 unique patients.


Unnamed: 0,ethnicity (count),ethnicity (%)
WHITE,65,69.148936
BLACK/AFRICAN AMERICAN,13,13.829787
HISPANIC/LATINO,6,6.382979
ASIAN,3,3.191489
OTHER,3,3.191489
UNKNOWN,3,3.191489
AMERICAN INDIAN/ALASKA NATIVE,1,1.06383


In [97]:
printing_str, count_table = get_count_table(mimic_reflacx_df[mimic_reflacx_df['split'] == 'test'], core_admission_df)
print(f"Test subset {printing_str}")
count_table

Test subset has 101 instances with 97 unique patients.


Unnamed: 0,ethnicity (count),ethnicity (%)
WHITE,66,68.75
BLACK/AFRICAN AMERICAN,17,17.708333
OTHER,4,4.166667
ASIAN,4,4.166667
HISPANIC/LATINO,2,2.083333
UNKNOWN,1,1.041667
AMERICAN INDIAN/ALASKA NATIVE,1,1.041667
UNABLE TO OBTAIN,1,1.041667


In [80]:
mimic_count_df = pd.DataFrame(unique_mimic_merge_df['ethnicity'].value_counts())

In [81]:
mimic_count_df.rename(columns={"ethnicity": "ethnicity (count)"}, inplace=True)
mimic_count_df["ethnicity (%)"] = (mimic_count_df['ethnicity (count)'] / sum(mimic_count_df['ethnicity (count)']))*100

In [82]:
print(f"MIMIC REFLACX has {len(mimic_reflacx_df)} instances with {len(unique_patient_mimic_reflacx)} unique patients.") 
mimic_count_df

MIMIC REFLACX has 670 instances with 540 unique patients.


Unnamed: 0,ethnicity (count),ethnicity (%)
WHITE,353,65.613383
BLACK/AFRICAN AMERICAN,105,19.516729
HISPANIC/LATINO,27,5.018587
ASIAN,22,4.089219
OTHER,16,2.973978
UNKNOWN,11,2.04461
AMERICAN INDIAN/ALASKA NATIVE,3,0.557621
UNABLE TO OBTAIN,1,0.185874


In [83]:
unique_mimic_merge_df

Unnamed: 0.1,Unnamed: 0,index,id,dicom_id,subject_id,stay_id,study_id,image_path,ViewPosition,image_size_x,...,Acute fracture,Enlarged hilum,Hiatal hernia,High lung volume / emphysema,Interstitial lung disease,Lung nodule or mass,Pleural abnormality,Fracture,split,ethnicity
0,0,0,P102R108387,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,32067002,55032240,{XAMI_MIMIC_PATH}\patient_18111516\CXR-JPG\s55...,AP,2544,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train,BLACK/AFRICAN AMERICAN
1,5,70,P116R051205,887ce638-036e6a6b-15d37e27-30515374-f78b68e9,18522520,35511308,59022382,{XAMI_MIMIC_PATH}\patient_18522520\CXR-JPG\s59...,PA,3056,...,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,train,WHITE
2,10,75,P117R192765,2aa0617f-e2078a66-aabf9e81-83a6e28c-0b16ad30,19346228,31703757,58752468,{XAMI_MIMIC_PATH}\patient_19346228\CXR-JPG\s58...,PA,2544,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,test,WHITE
3,15,85,P119R043258,85c363f0-f551672b-c20e32d9-0d7c5522-cc076373,15276846,35796840,57959166,{XAMI_MIMIC_PATH}\patient_15276846\CXR-JPG\s57...,PA,2544,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,val,WHITE
4,20,105,P123R085394,418c0cac-27616a53-94e8e56b-3041a288-0e90a16d,12208824,34131872,55766505,{XAMI_MIMIC_PATH}\patient_12208824\CXR-JPG\s55...,PA,2544,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,train,BLACK/AFRICAN AMERICAN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,665,3020,P300R833708,a2fe8aae-2fe32131-b47c4e5b-090f4c13-88e7ac97,19875621,32582616,55102074,{XAMI_MIMIC_PATH}\patient_19875621\CXR-JPG\s55...,PA,2544,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,val,HISPANIC/LATINO
536,666,3023,P300R918521,a2a80c63-8b9575dc-bc08895e-40392d6a-d7fc17d5,19884194,36719043,57912042,{XAMI_MIMIC_PATH}\patient_19884194\CXR-JPG\s57...,AP,2544,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train,WHITE
537,667,3027,P300R611251,2b20dcdf-4077bc16-48fc8eb5-265ef218-f6552cb0,19906407,33352559,57296330,{XAMI_MIMIC_PATH}\patient_19906407\CXR-JPG\s57...,AP,2881,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,val,WHITE
538,668,3028,P300R519683,92134f99-0e73faba-1280ad81-218c68ba-933a85c5,19907884,39112538,57427881,{XAMI_MIMIC_PATH}\patient_19907884\CXR-JPG\s57...,PA,2544,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train,WHITE


In [None]:
print(f"REFLACX has {len(reflacx_meta_df)} instances with {len(unique_patient_reflacx)} unique patients.") 

In [16]:
reflacx_meta_df

Unnamed: 0.1,Unnamed: 0,id,split,eye_tracking_data_discarded,image,dicom_id,subject_id,image_size_x,image_size_y,Airway wall thickening,...,Support devices,Wide mediastinum,Abnormal mediastinal contour,Acute fracture,Enlarged hilum,Hiatal hernia,High lung volume / emphysema,Interstitial lung disease,Lung nodule or mass,Pleural abnormality
0,0,P102R108387,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,
1,1,P102R379837,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,
2,2,P102R558314,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,4.0,...,True,0.0,,,,,,,,
3,3,P102R765317,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,2.0,,,,,,,,
4,4,P102R915878,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3047,2502,P300R939601,test,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,d9661ff6-877ac981-a20a8810-92309d46-173008ad,19991135,2544,3056,,...,False,,5.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3048,2503,P300R331925,test,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,d8f5555c-d8bcf97c-3a9d5596-17b9f854-6e15d081,19991135,1736,2022,,...,True,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3049,2504,P300R336111,test,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,d04bbcbd-5143439e-50ebe9a8-71380f67-44f8d127,19991135,2544,3056,,...,True,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3050,2505,P300R179177,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,7cfe7acc-65b0b548-feb77772-04c46d33-76a03230,19997367,2402,2712,,...,True,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


In [9]:
core_patients_df

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10000048,F,23,2126,2008 - 2010,
1,10002723,F,0,2128,2017 - 2019,
2,10003939,M,0,2184,2008 - 2010,
3,10004222,M,0,2161,2014 - 2016,
4,10005325,F,0,2154,2011 - 2013,
...,...,...,...,...,...,...
382273,19998203,M,29,2132,2011 - 2013,
382274,19998350,M,52,2127,2011 - 2013,
382275,19999068,M,63,2161,2011 - 2013,
382276,19999270,M,33,2184,2014 - 2016,


In [10]:
cxr_meta_df = pd.read_csv(
    TabularDataPaths.SpreadSheet.get_sreadsheet(
        XAMI_MIMIC_PATH, TabularDataPaths.SpreadSheet.cxr_meta
    )
)

In [12]:
cxr_meta_df.columns

Index(['Unnamed: 0', 'dicom_id', 'subject_id', 'study_id',
       'PerformedProcedureStepDescription', 'ViewPosition', 'Rows', 'Columns',
       'StudyDate', 'StudyTime', 'ProcedureCodeSequence_CodeMeaning',
       'ViewCodeSequence_CodeMeaning',
       'PatientOrientationCodeSequence_CodeMeaning', 'in_eye_gaze',
       'in_reflacx', 'StudyDateTime', 'stay_id'],
      dtype='object')