In [1]:
import os
import pandas as pd
from tqdm.notebook  import tqdm
from shutil import copytree, copy
from data_path import  REFLACX_FOLDER_PATH, XAMI_SPREADSHEET_FOLDER_PATH, XAMI_MIMIC_PATH
from utils.getter import get_cxr_match_by_dicom_ids

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole
# notebook session; setting the option to None disables the check entirely.
pd.options.mode.chained_assignment  = None

In [2]:
# Load the CXR metadata spreadsheet, using the first CSV column as the index,
# then preview the first five rows.
cxr_meta_path = os.path.join(XAMI_SPREADSHEET_FOLDER_PATH, "cxr_meta.csv")
cxr_meta_df = pd.read_csv(cxr_meta_path, index_col=0)
cxr_meta_df.head()

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,in_eye_gaze,in_reflacx,StudyDateTime,stay_id
177,fa771fa1-d9571d07-bff8f655-327734a7-6e10b29d,10002428,59258773,CHEST (PORTABLE AP),AP,3056,2544,21560419,92717.109,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,2156042000000.0,
181,4873aa08-977bfd31-fb492e64-6ef432d1-3f12cbe3,10002430,53254222,CHEST (PA AND LAT),PA,3056,2544,21250928,160736.171,CHEST (PA AND LAT),postero-anterior,Erect,True,False,21250930000000.0,31293660.0
266,dcdc4bd9-4301b111-2a65a814-ee8e7bc5-7f0b9a5a,10003400,56466802,CHEST (PORTABLE AP),AP,3056,2544,21361209,133738.015,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,21361210000000.0,33678912.0
497,5bdabba9-388f6646-ac06b5f5-f68b2fd2-3630de21,10011607,55814288,CHEST (PA AND LAT),AP,3056,2544,21820111,230933.25,CHEST (PA AND LAT),antero-posterior,Erect,False,True,21820110000000.0,37054412.0
539,4a629500-9c3281ca-90bab490-9b6ac9c1-e5e6a580,10012261,53728467,CHEST (PORTABLE AP),AP,2544,3056,21750906,191151.453,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,21750910000000.0,38668412.0


In [3]:
# Merge the three REFLACX phase metadata files (metadata_phase_1..3.csv) into a
# single frame and save it under <XAMI_SPREADSHEET_FOLDER_PATH>/REFLACX/.
#
# ignore_index=True: each phase file is read separately and gets its own
# default row index, so a plain concat would leave the same row labels repeated
# in the combined frame (and in every CSV written from it). Renumbering gives
# each row a unique label.
reflacx_meta_df = pd.concat(
    [
        pd.read_csv(
            os.path.join(REFLACX_FOLDER_PATH, "main_data", f"metadata_phase_{i}.csv")
        )
        for i in range(1, 4)
    ],
    ignore_index=True,
)

reflacx_spreadsheet_dir = os.path.join(XAMI_SPREADSHEET_FOLDER_PATH, "REFLACX")
os.makedirs(reflacx_spreadsheet_dir, exist_ok=True)
reflacx_meta_df.to_csv(os.path.join(reflacx_spreadsheet_dir, "metadata.csv"))


In [4]:
# Count the REFLACX rows whose dicom_id is also listed in the CXR metadata.
known_dicom_ids = set(cxr_meta_df["dicom_id"])
has_clinical_mask = reflacx_meta_df["dicom_id"].isin(known_dicom_ids)
with_clinical_count = int(has_clinical_mask.sum())
print(f"We have {with_clinical_count} in the REFLACX dataset can be used with clinical data.")

We have 3052 in the REFLACX dataset can be used with clinical data.


In [5]:
# Distinct subject_id values present in the combined REFLACX metadata.
all_patients_to_import = [*set(reflacx_meta_df["subject_id"])]

In [6]:
# Distribute the REFLACX data into one folder per patient under XAMI_MIMIC_PATH:
#   patient_<subject_id>/REFLACX/metadata.csv      rows of this patient only
#   patient_<subject_id>/REFLACX/main_data/<id>/   copied from REFLACX main_data
#   patient_<subject_id>/REFLACX/gaze_data/<id>/   copied from REFLACX gaze_data
# Both subfolders are handled by the same code path, so they are listed once here.
REFLACX_SUBFOLDERS = ("main_data", "gaze_data")

for subject_id in tqdm(all_patients_to_import):
    # Metadata rows that belong to the current patient.
    match_meta_df = reflacx_meta_df[reflacx_meta_df["subject_id"] == subject_id]

    save_folder_path = os.path.join(XAMI_MIMIC_PATH, f"patient_{subject_id}", "REFLACX")

    # create folders
    os.makedirs(save_folder_path, exist_ok=True)
    for subfolder in REFLACX_SUBFOLDERS:
        os.makedirs(os.path.join(save_folder_path, subfolder), exist_ok=True)

    match_meta_df.to_csv(os.path.join(save_folder_path, "metadata.csv"))

    # Each value of the 'id' column names a folder that exists under both
    # main_data/ and gaze_data/ in the REFLACX source tree.
    for study_id in match_meta_df["id"]:
        for subfolder in REFLACX_SUBFOLDERS:
            copytree(
                os.path.join(REFLACX_FOLDER_PATH, subfolder, study_id),
                os.path.join(save_folder_path, subfolder, study_id),
                # Allow re-running the cell when the destination already exists.
                dirs_exist_ok=True,
                # shutil.copy instead of the default copy2.
                copy_function=copy,
            )

  0%|          | 0/2199 [00:00<?, ?it/s]