In [1]:
import os
import pandas as pd
from tqdm.notebook  import tqdm
from shutil import copytree, copy
from data_path import CXR_FOLDER_PATH, XAMI_SPREADSHEET_FOLDER_PATH, XAMI_MIMIC_PATH, CXR_DICOM_FOLDER_PATH

# Turn off pandas' chained-assignment check for the whole notebook
# (None means neither warn nor raise).
pd.set_option("mode.chained_assignment", None)

In [2]:
# Read the XAMI CXR metadata spreadsheet, using its first CSV column as the index.
cxr_meta_csv_path = os.path.join(XAMI_SPREADSHEET_FOLDER_PATH, "cxr_meta.csv")
cxr_meta_df = pd.read_csv(cxr_meta_csv_path, index_col=0)

In [2]:
# NOTE(review): this cell repeats the cxr_meta.csv read from the preceding cell,
# and the three gzip reads below appear again in a later cell -- presumably a
# leftover duplicate; confirm before removing either copy.

# XAMI CXR metadata spreadsheet; the first CSV column becomes the index.
cxr_meta_df = pd.read_csv(
    os.path.join(XAMI_SPREADSHEET_FOLDER_PATH, "cxr_meta.csv"),
    index_col=0,
)

# The three gzip-compressed tables under CXR_FOLDER_PATH share the same parse options.
gz_csv_options = dict(compression='gzip', header=0, sep=',', quotechar='"')

cxr_chexpert_df = pd.read_csv(
    os.path.join(CXR_FOLDER_PATH, "mimic-cxr-2.0.0-chexpert.csv.gz"),
    **gz_csv_options,
)
cxr_negbio_df = pd.read_csv(
    os.path.join(CXR_FOLDER_PATH, "mimic-cxr-2.0.0-negbio.csv.gz"),
    **gz_csv_options,
)
cxr_split_df = pd.read_csv(
    os.path.join(CXR_FOLDER_PATH, "mimic-cxr-2.0.0-split.csv.gz"),
    **gz_csv_options,
)


In [3]:
# Preview the first five metadata rows (head() defaults to n=5).
cxr_meta_df.head()

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,in_eye_gaze,in_reflacx,StudyDateTime,stay_id
177,fa771fa1-d9571d07-bff8f655-327734a7-6e10b29d,10002428,59258773,CHEST (PORTABLE AP),AP,3056,2544,21560419,92717.109,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,2156042000000.0,
181,4873aa08-977bfd31-fb492e64-6ef432d1-3f12cbe3,10002430,53254222,CHEST (PA AND LAT),PA,3056,2544,21250928,160736.171,CHEST (PA AND LAT),postero-anterior,Erect,True,False,21250930000000.0,31293660.0
266,dcdc4bd9-4301b111-2a65a814-ee8e7bc5-7f0b9a5a,10003400,56466802,CHEST (PORTABLE AP),AP,3056,2544,21361209,133738.015,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,21361210000000.0,33678912.0
497,5bdabba9-388f6646-ac06b5f5-f68b2fd2-3630de21,10011607,55814288,CHEST (PA AND LAT),AP,3056,2544,21820111,230933.25,CHEST (PA AND LAT),antero-posterior,Erect,False,True,21820110000000.0,37054412.0
539,4a629500-9c3281ca-90bab490-9b6ac9c1-e5e6a580,10012261,53728467,CHEST (PORTABLE AP),AP,2544,3056,21750906,191151.453,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,21750910000000.0,38668412.0


In [4]:
def _read_cxr_gz_table(file_name):
    """Read one gzip-compressed, comma-separated table from CXR_FOLDER_PATH.

    Args:
        file_name: Name of the ``.csv.gz`` file inside ``CXR_FOLDER_PATH``.

    Returns:
        The parsed table as a DataFrame; the first row of the file is the header.
    """
    table_path = os.path.join(CXR_FOLDER_PATH, file_name)
    return pd.read_csv(table_path, compression='gzip', header=0, sep=',', quotechar='"')


# CheXpert table, NegBio table, and the split table (dicom_id / study_id / subject_id / split).
cxr_chexpert_df = _read_cxr_gz_table("mimic-cxr-2.0.0-chexpert.csv.gz")
cxr_negbio_df = _read_cxr_gz_table("mimic-cxr-2.0.0-negbio.csv.gz")
cxr_split_df = _read_cxr_gz_table("mimic-cxr-2.0.0-split.csv.gz")


In [5]:
# Preview the first five rows of the split table (head() defaults to n=5).
cxr_split_df.head()

Unnamed: 0,dicom_id,study_id,subject_id,split
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,10000032,train
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,50414267,10000032,train
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,53189527,10000032,train
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,53189527,10000032,train
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,53911762,10000032,train


In [6]:
# Unique subject ids present in the metadata, as a list in ascending order.
all_subjects_in_cxr = sorted(set(cxr_meta_df["subject_id"]))

In [7]:
# For every patient in the metadata: write that patient's rows of the four
# spreadsheets into the patient's CXR-JPG folder, then copy the JPG folder,
# the DICOM folder and the report text of each of the patient's studies.
for subject_id in tqdm(all_subjects_in_cxr):

    # Rows of each spreadsheet that belong to this patient.
    match_cxr_meta = cxr_meta_df.loc[cxr_meta_df["subject_id"] == subject_id]
    match_cxr_chexpert = cxr_chexpert_df.loc[cxr_chexpert_df["subject_id"] == subject_id]
    match_cxr_negbio = cxr_negbio_df.loc[cxr_negbio_df["subject_id"] == subject_id]
    match_cxr_split = cxr_split_df.loc[cxr_split_df["subject_id"] == subject_id]

    # Destination layout: <XAMI_MIMIC_PATH>/patient_<id>/{CXR-JPG,CXR-DICOM}
    patient_folder = os.path.join(XAMI_MIMIC_PATH, f"patient_{subject_id}")
    cxr_jpg_save_folder_path = os.path.join(patient_folder, "CXR-JPG")
    cxr_dicom_save_folder_path = os.path.join(patient_folder, "CXR-DICOM")
    for save_folder in (cxr_jpg_save_folder_path, cxr_dicom_save_folder_path):
        os.makedirs(save_folder, exist_ok=True)

    # All four per-patient spreadsheets are written next to the JPG studies.
    patient_tables = (
        ("cxr_meta.csv", match_cxr_meta),
        ("cxr_chexpert.csv", match_cxr_chexpert),
        ("cxr_negbio.csv", match_cxr_negbio),
        ("cxr_split.csv", match_cxr_split),
    )
    for csv_name, table in patient_tables:
        table.to_csv(os.path.join(cxr_jpg_save_folder_path, csv_name))

    # Source sub-path shared by both trees: files/p<first two characters of id>/p<id>
    patient_rel_path = os.path.join(
        "files", f"p{str(subject_id)[:2]}", f"p{subject_id}")
    jpg_source_root = os.path.join(CXR_FOLDER_PATH, patient_rel_path)
    dicom_source_root = os.path.join(CXR_DICOM_FOLDER_PATH, patient_rel_path)

    # Only the studies listed in the metadata are copied, not every study of the patient.
    all_study_ids = list(set(match_cxr_meta["study_id"]))
    for study_id in all_study_ids:
        study_folder = f"s{study_id}"

        # JPG study folder (an existing destination is merged into).
        copytree(os.path.join(jpg_source_root, study_folder),
                 os.path.join(cxr_jpg_save_folder_path, study_folder),
                 copy_function=copy, dirs_exist_ok=True)

        # DICOM study folder.
        copytree(os.path.join(dicom_source_root, study_folder),
                 os.path.join(cxr_dicom_save_folder_path, study_folder),
                 copy_function=copy, dirs_exist_ok=True)

        # Report text sits beside the study folder in the DICOM tree as s<study_id>.txt.
        copy(os.path.join(dicom_source_root, f"s{study_id}.txt"),
             cxr_dicom_save_folder_path)

    ########### Disabled alternative: copy ALL CXRs belonging to the patient ###########

    # import the dicom
    # source_path = os.path.join(
    #     CXR_DICOM_FOLDER_PATH, "files",  f"p{str(subject_id)[:2]}", f"p{subject_id}")
    # destination_path = os.path.join(
    #     XAMI_MIMIC_PATH, f"patient_{subject_id}", "CXR-DICOM")
    # os.makedirs(destination_path, exist_ok=True)
    # copytree(source_path, destination_path, copy_function=copy, dirs_exist_ok=True)

    # # CXR-jpg
    # source_path = os.path.join(
    #     CXR_FOLDER_PATH, "files", f"p{str(subject_id)[:2]}", f"p{subject_id}")
    # save_folder_path = os.path.join(
    #     XAMI_MIMIC_PATH, f"patient_{subject_id}", "CXR-JPG")
    # os.makedirs(save_folder_path, exist_ok=True)
    
    # destination_path = os.path.join(
    #     save_folder_path)
    # copytree(source_path, destination_path,
    #             copy_function=copy, dirs_exist_ok=True)

  0%|          | 0/3192 [00:00<?, ?it/s]

In [9]:
## CXR-JPG spreadsheets: these contain all the information in the CXR DICOM spreadsheets.
spreadsheet_folder_path = os.path.join(XAMI_SPREADSHEET_FOLDER_PATH, "CXR-JPG")
os.makedirs(spreadsheet_folder_path, exist_ok=True)

# Write the full (unfiltered) tables as plain CSV files into that folder.
for csv_name, table in (
    ("cxr_chexpert.csv", cxr_chexpert_df),
    ("cxr_negbio.csv", cxr_negbio_df),
    ("cxr_split.csv", cxr_split_df),
):
    table.to_csv(os.path.join(spreadsheet_folder_path, csv_name))