In [1]:
import os
import pandas as pd
from tqdm.notebook  import tqdm
from shutil import copytree, copy
from data_path import CXR_FOLDER_PATH, XAMI_SPREADSHEET_FOLDER_PATH, XAMI_MIMIC_PATH, CXR_DICOM_FOLDER_PATH

# Turn off pandas' chained-assignment check for the rest of the notebook.
pd.set_option("mode.chained_assignment", None)

In [2]:
# CXR metadata rows read from the XAMI spreadsheet folder; the first CSV
# column is used as the index.
cxr_df_with_stay_id_only = pd.read_csv(
    os.path.join(XAMI_SPREADSHEET_FOLDER_PATH, "cxr_meta_with_stay_id_only.csv"),
    index_col=0,
)

# Gzip-compressed tables read from CXR_FOLDER_PATH: CheXpert, NegBio and split.
cxr_chexpert_df = pd.read_csv(
    os.path.join(CXR_FOLDER_PATH, "mimic-cxr-2.0.0-chexpert.csv.gz"),
    compression='gzip',
    header=0,
    sep=',',
    quotechar='"',
)
cxr_negbio_df = pd.read_csv(
    os.path.join(CXR_FOLDER_PATH, "mimic-cxr-2.0.0-negbio.csv.gz"),
    compression='gzip',
    header=0,
    sep=',',
    quotechar='"',
)
cxr_split_df = pd.read_csv(
    os.path.join(CXR_FOLDER_PATH, "mimic-cxr-2.0.0-split.csv.gz"),
    compression='gzip',
    header=0,
    sep=',',
    quotechar='"',
)


In [3]:
# Preview the first five rows of the stay-linked CXR metadata.
cxr_df_with_stay_id_only.head()

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,in_eye_gaze,in_reflacx,StudyDateTime,stay_id
181,4873aa08-977bfd31-fb492e64-6ef432d1-3f12cbe3,10002430,53254222,CHEST (PA AND LAT),PA,3056,2544,21250928,160736.171,CHEST (PA AND LAT),postero-anterior,Erect,True,False,21250930000000.0,31293660
266,dcdc4bd9-4301b111-2a65a814-ee8e7bc5-7f0b9a5a,10003400,56466802,CHEST (PORTABLE AP),AP,3056,2544,21361209,133738.015,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,21361210000000.0,33678912
497,5bdabba9-388f6646-ac06b5f5-f68b2fd2-3630de21,10011607,55814288,CHEST (PA AND LAT),AP,3056,2544,21820111,230933.25,CHEST (PA AND LAT),antero-posterior,Erect,False,True,21820110000000.0,37054412
539,4a629500-9c3281ca-90bab490-9b6ac9c1-e5e6a580,10012261,53728467,CHEST (PORTABLE AP),AP,2544,3056,21750906,191151.453,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,21750910000000.0,38668412
567,8106217e-c41ab813-c6002d3f-ed4ee98b-7b165bae,10012498,55812956,CHEST (PA AND LAT),PA,3056,2544,21470331,211642.296,CHEST (PA AND LAT),postero-anterior,Erect,True,False,21470330000000.0,37362927


In [4]:
# Distinct subject ids present in the CXR metadata, in ascending order.
all_subjects_in_cxr = sorted(set(cxr_df_with_stay_id_only["subject_id"]))

In [6]:
# Build the XAMI-MIMIC folder tree, one folder per patient / stay / study.
# For every (subject, stay, study) combination in the CXR metadata this:
#   1. copies the study folder found under CXR_FOLDER_PATH into "CXR-JPG",
#   2. writes the study's metadata, CheXpert, NegBio and split rows as CSVs
#      next to the copied images,
#   3. copies the study folder found under CXR_DICOM_FOLDER_PATH into
#      "CXR-DICOM",
#   4. copies the study's "s<study_id>.txt" report file into "CXR-DICOM".
# copytree is called with dirs_exist_ok=True, so re-running the cell
# overwrites files in place instead of failing on existing folders.
for subject_id in tqdm(all_subjects_in_cxr):

    subject_match = cxr_df_with_stay_id_only[cxr_df_with_stay_id_only['subject_id'] == subject_id]
    all_stay_ids = list(set(subject_match["stay_id"]))

    # Both source trees use the layout files/p<first 2 chars of id>/p<id>/...,
    # so the patient-level relative path is computed once per subject.
    subject_rel_path = os.path.join(
        "files", f"p{str(subject_id)[:2]}", f"p{subject_id}")

    for stay_id in all_stay_ids:
        stay_match = subject_match[subject_match["stay_id"] == stay_id]
        all_study_ids = list(set(stay_match['study_id']))

        stay_folder_path = os.path.join(
            XAMI_MIMIC_PATH, f"patient_{subject_id}", f"stay_{stay_id}")

        for study_id in all_study_ids:
            # CXR-jpg
            source_path = os.path.join(
                CXR_FOLDER_PATH, subject_rel_path, f"s{study_id}")
            save_folder_path = os.path.join(stay_folder_path, "CXR-JPG")
            os.makedirs(save_folder_path, exist_ok=True)
            destination_path = os.path.join(
                save_folder_path, f"study_{study_id}")
            copytree(source_path, destination_path,
                     copy_function=copy, dirs_exist_ok=True)

            # CXR-meta: select this study's rows from each table.
            match_cxr_meta = cxr_df_with_stay_id_only[cxr_df_with_stay_id_only['study_id'] == study_id]
            match_cxr_chexpert = cxr_chexpert_df[cxr_chexpert_df["study_id"] == study_id]
            match_cxr_negbio = cxr_negbio_df[cxr_negbio_df["study_id"] == study_id]
            match_cxr_split = cxr_split_df[cxr_split_df["study_id"] == study_id]

            # The metadata gets its own file. It used to be written to
            # "cxr_chexpert.csv" and was overwritten by the CheXpert rows
            # straight away, so it never reached disk.
            match_cxr_meta.to_csv(os.path.join(
                destination_path, "cxr_meta.csv"))
            match_cxr_chexpert.to_csv(os.path.join(
                destination_path, "cxr_chexpert.csv"))
            match_cxr_negbio.to_csv(os.path.join(
                destination_path, "cxr_negbio.csv"))
            match_cxr_split.to_csv(os.path.join(
                destination_path, "cxr_split.csv"))

            # import the dicom
            source_path = os.path.join(
                CXR_DICOM_FOLDER_PATH, subject_rel_path, f"s{study_id}")
            save_folder_path = os.path.join(stay_folder_path, "CXR-DICOM")
            os.makedirs(save_folder_path, exist_ok=True)
            destination_path = os.path.join(save_folder_path, f"study_{study_id}")
            copytree(source_path, destination_path, copy_function=copy, dirs_exist_ok=True)

            # import report text: the file sits beside the study folder in
            # the DICOM tree and is copied into the stay's CXR-DICOM folder
            # (not into the study_<id> subfolder).
            source_path = os.path.join(
                CXR_DICOM_FOLDER_PATH, subject_rel_path, f"s{study_id}.txt")
            destination_path = save_folder_path
            copy(source_path, destination_path)


  0%|          | 0/1133 [00:00<?, ?it/s]