In [4]:
import os
import pandas as pd
from tqdm.notebook  import tqdm
from shutil import copytree, copy
from data_path import  REFLACX_FOLDER_PATH, XAMI_SPREADSHEET_FOLDER_PATH, XAMI_MIMIC_PATH
from utils.getter import get_cxr_match_by_dicom_ids

# Silence pandas' SettingWithCopyWarning for the whole notebook: later cells
# work on boolean-filtered slices of the metadata frames.
pd.set_option("mode.chained_assignment", None)

In [5]:
# Load the MIMIC-CXR metadata spreadsheet; the first CSV column is used as the
# row index. NOTE(review): judging by the file name, this table presumably holds
# only images that could be linked to a stay_id — confirm against the notebook
# that produces it.
cxr_meta_csv_path = os.path.join(
    XAMI_SPREADSHEET_FOLDER_PATH, "cxr_meta_with_stay_id_only.csv")
cxr_df_with_stay_id_only = pd.read_csv(cxr_meta_csv_path, index_col=0)

# Quick visual sanity check of the loaded table.
cxr_df_with_stay_id_only.head(5)

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,in_eye_gaze,in_reflacx,StudyDateTime,stay_id
181,4873aa08-977bfd31-fb492e64-6ef432d1-3f12cbe3,10002430,53254222,CHEST (PA AND LAT),PA,3056,2544,21250928,160736.171,CHEST (PA AND LAT),postero-anterior,Erect,True,False,21250930000000.0,31293660
266,dcdc4bd9-4301b111-2a65a814-ee8e7bc5-7f0b9a5a,10003400,56466802,CHEST (PORTABLE AP),AP,3056,2544,21361209,133738.015,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,21361210000000.0,33678912
497,5bdabba9-388f6646-ac06b5f5-f68b2fd2-3630de21,10011607,55814288,CHEST (PA AND LAT),AP,3056,2544,21820111,230933.25,CHEST (PA AND LAT),antero-posterior,Erect,False,True,21820110000000.0,37054412
539,4a629500-9c3281ca-90bab490-9b6ac9c1-e5e6a580,10012261,53728467,CHEST (PORTABLE AP),AP,2544,3056,21750906,191151.453,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,21750910000000.0,38668412
567,8106217e-c41ab813-c6002d3f-ed4ee98b-7b165bae,10012498,55812956,CHEST (PA AND LAT),PA,3056,2544,21470331,211642.296,CHEST (PA AND LAT),postero-anterior,Erect,True,False,21470330000000.0,37362927


In [6]:
# Stack the three per-phase REFLACX metadata files (phases 1-3) into one frame.
# Each file keeps its own row index, so index labels repeat across phases.
phase_csv_paths = [
    os.path.join(REFLACX_FOLDER_PATH, "main_data", f"metadata_phase_{phase}.csv")
    for phase in range(1, 4)
]
reflacx_meta_df = pd.concat([pd.read_csv(csv_path) for csv_path in phase_csv_paths])

# Persist the combined (not yet label-merged) metadata next to the other
# spreadsheets, creating the REFLACX sub-folder on first run.
reflacx_spreadsheet_dir = os.path.join(XAMI_SPREADSHEET_FOLDER_PATH, "REFLACX")
os.makedirs(reflacx_spreadsheet_dir, exist_ok=True)
reflacx_meta_df.to_csv(os.path.join(reflacx_spreadsheet_dir, "metadata.csv"))


In [7]:
# Label columns to fold together: each key is the column that is kept, and the
# listed columns are merged into it and then removed.
# NOTE(review): presumably the REFLACX phases used slightly different label
# vocabularies, which is why both variants exist after concatenation — confirm.
LABEL_MERGES = {
    "Lung nodule or mass": ["Mass", "Nodule"],
    "High lung volume / emphysema": ["Emphysema"],
    "Pleural abnormality": ["Pleural thickening", "Pleural effusion"],
}


def merge_label_columns(meta_df, merges):
    """Fold groups of label columns into a single column per group.

    For every ``target -> legacy columns`` entry in ``merges`` the target
    column becomes the row-wise maximum over the target and its legacy
    columns (NaN values are skipped by ``DataFrame.max``), and the legacy
    columns are dropped afterwards.

    Columns that are not present are ignored, so applying the function to an
    already-merged frame is a no-op. This keeps the notebook cell re-runnable;
    the previous ``del``-based version raised ``KeyError`` on a second run.

    Args:
        meta_df: REFLACX metadata frame; it is not modified.
        merges: mapping of target column name to a list of legacy column names.

    Returns:
        A new DataFrame with the merged columns.
    """
    merged_df = meta_df.copy()
    for target_column, legacy_columns in merges.items():
        source_columns = [
            column
            for column in (target_column, *legacy_columns)
            if column in merged_df.columns
        ]
        if not source_columns:
            # Nothing to merge for this group in this frame.
            continue
        merged_df[target_column] = merged_df[source_columns].max(axis=1)
        # errors="ignore" tolerates legacy columns that were already removed.
        merged_df = merged_df.drop(columns=legacy_columns, errors="ignore")
    return merged_df


reflacx_meta_df = merge_label_columns(reflacx_meta_df, LABEL_MERGES)

In [8]:
# Keep only the REFLACX readings whose image (dicom_id) also appears in the
# stay-linked CXR table loaded earlier in the notebook.
all_valid_dicom_id_with_stay_id_indentified = set(
    cxr_df_with_stay_id_only['dicom_id'])
has_identified_stay = reflacx_meta_df['dicom_id'].isin(
    all_valid_dicom_id_with_stay_id_indentified)
reflacx_meta__df_with_stay_id_indentified = reflacx_meta_df[has_identified_stay]

# Report how many REFLACX rows survive the filter.
usable_row_count = len(reflacx_meta__df_with_stay_id_indentified)
print(f"We have {usable_row_count} in the REFLACX dataset can be used with clinical data.")

We have 907 in the REFLACX dataset can be used with clinical data.


In [9]:
# Preview the first rows of the filtered REFLACX metadata.
reflacx_meta__df_with_stay_id_indentified.head(n=5)

Unnamed: 0,id,split,eye_tracking_data_discarded,image,dicom_id,subject_id,image_size_x,image_size_y,Airway wall thickening,Atelectasis,...,Support devices,Wide mediastinum,Abnormal mediastinal contour,Acute fracture,Enlarged hilum,Hiatal hernia,High lung volume / emphysema,Interstitial lung disease,Lung nodule or mass,Pleural abnormality
0,P102R108387,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,0.0,,0.0,0.0
1,P102R379837,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,0.0,,0.0,0.0
2,P102R558314,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,4.0,0.0,...,True,0.0,,,,,0.0,,0.0,0.0
3,P102R765317,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,2.0,,,,,0.0,,1.0,0.0
4,P102R915878,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,0.0,,0.0,0.0


In [10]:
# De-duplicated subject_ids of every patient with at least one usable REFLACX row.
subject_id_column = reflacx_meta__df_with_stay_id_indentified['subject_id']
all_patients_to_import = list(set(subject_id_column))

In [11]:
# Copy every usable REFLACX reading into the per-patient / per-stay layout:
#   <XAMI_MIMIC_PATH>/patient_<subject_id>/stay_<stay_id>/REFLACX/
# Each such folder receives a metadata.csv with the matching rows, plus one
# sub-folder per REFLACX reading (named by its `id`) copied from the raw dataset.
for subject_id in tqdm(all_patients_to_import):
    # dicom_ids of all REFLACX rows that belong to this patient.
    is_this_subject = (
        reflacx_meta__df_with_stay_id_indentified["subject_id"] == subject_id)
    all_dicom_ids = list(
        reflacx_meta__df_with_stay_id_indentified[is_this_subject]['dicom_id'])

    # NOTE(review): the helper presumably returns the rows of the CXR table for
    # these dicom_ids — verify in utils.getter.
    cxr_matches = get_cxr_match_by_dicom_ids(
        cxr_df_with_stay_id_only, all_dicom_ids)

    for stay_id in list(set(cxr_matches['stay_id'])):
        # REFLACX rows whose image was taken during this stay.
        is_this_stay = cxr_matches['stay_id'] == stay_id
        dicom_ids_for_this_stay = list(cxr_matches[is_this_stay]['dicom_id'])
        belongs_to_stay = reflacx_meta__df_with_stay_id_indentified['dicom_id'].isin(
            dicom_ids_for_this_stay)
        match_meta_df = reflacx_meta__df_with_stay_id_indentified[belongs_to_stay]

        save_folder_path = os.path.join(
            XAMI_MIMIC_PATH, f"patient_{subject_id}", f"stay_{stay_id}", "REFLACX")
        os.makedirs(save_folder_path, exist_ok=True)
        match_meta_df.to_csv(os.path.join(save_folder_path, "metadata.csv"))

        # One source folder per REFLACX reading id; dirs_exist_ok makes a
        # re-run overwrite into existing folders instead of failing.
        for reflacx_id in list(match_meta_df['id']):
            copytree(
                os.path.join(REFLACX_FOLDER_PATH, "main_data", reflacx_id),
                os.path.join(save_folder_path, reflacx_id),
                dirs_exist_ok=True,
                copy_function=copy,
            )


  0%|          | 0/739 [00:00<?, ?it/s]