In [None]:
import os
import pandas as pd
from tqdm.notebook  import tqdm
from shutil import copytree, copy
from data_path import CXR_FOLDER_PATH, XAMI_SPREADSHEET_FOLDER_PATH, XAMI_MIMIC_PATH, CXR_DICOM_FOLDER_PATH

# Turn off pandas' chained-assignment warning for the whole notebook session.
pd.set_option("mode.chained_assignment", None)

In [None]:
# Reader settings shared by the three gzipped tables that live under
# CXR_FOLDER_PATH (chexpert, negbio and split).
gz_csv_options = dict(compression='gzip', header=0, sep=',', quotechar='"')

# Stay-linked CXR metadata sheet; its first column is used as the index.
cxr_df_with_stay_id_only = pd.read_csv(
    os.path.join(XAMI_SPREADSHEET_FOLDER_PATH, "cxr_meta_with_stay_id_only.csv"),
    index_col=0,
)

cxr_chexpert_df = pd.read_csv(
    os.path.join(CXR_FOLDER_PATH, "mimic-cxr-2.0.0-chexpert.csv.gz"),
    **gz_csv_options,
)
cxr_negbio_df = pd.read_csv(
    os.path.join(CXR_FOLDER_PATH, "mimic-cxr-2.0.0-negbio.csv.gz"),
    **gz_csv_options,
)
cxr_split_df = pd.read_csv(
    os.path.join(CXR_FOLDER_PATH, "mimic-cxr-2.0.0-split.csv.gz"),
    **gz_csv_options,
)


In [None]:
# Preview the first five rows of the stay-linked metadata table.
cxr_df_with_stay_id_only.head()

In [None]:
# Distinct subject ids found in the metadata table, in ascending order.
all_subjects_in_cxr = sorted(set(cxr_df_with_stay_id_only["subject_id"]))

In [None]:
# Export every chest X-ray study into the XAMI-MIMIC folder tree:
#
#   <XAMI_MIMIC_PATH>/patient_<subject_id>/stay_<stay_id>/CXR-JPG/study_<study_id>/
#   <XAMI_MIMIC_PATH>/patient_<subject_id>/stay_<stay_id>/CXR-DICOM/study_<study_id>/
#
# Each CXR-JPG study folder additionally receives one CSV per table below,
# holding only the rows whose study_id matches. The CXR-DICOM folder
# additionally receives the study's s<study_id>.txt report file.

# Output file name -> table it is filtered from. Keeping the pairing in one
# place guarantees each table gets its own file; previously the metadata rows
# were written to "cxr_chexpert.csv" and immediately overwritten by the
# chexpert rows, so the metadata was never saved.
study_tables = {
    "cxr_meta.csv": cxr_df_with_stay_id_only,
    "cxr_chexpert.csv": cxr_chexpert_df,
    "cxr_negbio.csv": cxr_negbio_df,
    "cxr_split.csv": cxr_split_df,
}

for subject_id in tqdm(all_subjects_in_cxr):

    subject_match = cxr_df_with_stay_id_only[cxr_df_with_stay_id_only['subject_id'] == subject_id]
    all_stay_ids = list(set(subject_match["stay_id"]))

    # Both source trees are addressed as
    # files/p<first two characters of subject_id>/p<subject_id>/...
    subject_files_path = os.path.join(
        "files", f"p{str(subject_id)[:2]}", f"p{subject_id}")

    for stay_id in all_stay_ids:
        stay_match = subject_match[subject_match["stay_id"] == stay_id]
        all_study_ids = list(set(stay_match['study_id']))

        # Root of everything exported for this (patient, stay) pair.
        stay_folder_path = os.path.join(
            XAMI_MIMIC_PATH, f"patient_{subject_id}", f"stay_{stay_id}")

        for study_id in all_study_ids:
            # CXR-JPG: copy the whole study folder.
            source_path = os.path.join(
                CXR_FOLDER_PATH, subject_files_path, f"s{study_id}")
            save_folder_path = os.path.join(stay_folder_path, "CXR-JPG")
            os.makedirs(save_folder_path, exist_ok=True)
            destination_path = os.path.join(
                save_folder_path, f"study_{study_id}")
            # dirs_exist_ok=True lets the cell be re-run over an existing export.
            copytree(source_path, destination_path,
                     copy_function=copy, dirs_exist_ok=True)

            # CXR-meta: per-study slices of each table, stored next to the JPGs.
            for csv_name, table in study_tables.items():
                table[table["study_id"] == study_id].to_csv(
                    os.path.join(destination_path, csv_name))

            # CXR-DICOM: copy the whole study folder.
            source_path = os.path.join(
                CXR_DICOM_FOLDER_PATH, subject_files_path, f"s{study_id}")
            save_folder_path = os.path.join(stay_folder_path, "CXR-DICOM")
            os.makedirs(save_folder_path, exist_ok=True)
            destination_path = os.path.join(save_folder_path, f"study_{study_id}")
            copytree(source_path, destination_path,
                     copy_function=copy, dirs_exist_ok=True)

            # Report text: s<study_id>.txt sits beside the study folder in the
            # DICOM tree and is copied into the stay's CXR-DICOM folder.
            source_path = os.path.join(
                CXR_DICOM_FOLDER_PATH, subject_files_path, f"s{study_id}.txt")
            destination_path = save_folder_path
            copy(source_path, destination_path)
