In [1]:
import pandas as pd
import time
import numpy as np
import os
import random
from datetime import datetime   
import math
from tqdm.notebook  import tqdm
from shutil import copytree

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment  = None

In [2]:
# Root folders of the source datasets and of the export target.
# NOTE(review): the dataset paths are absolute and machine-specific; adjust them
# before running this notebook on another machine.
ED_FOLDER_PATH = "E:/AI-VR dataset/MIMIC-IV ED"
CLINICAL_FOLDER_PATH = "E:/AI-VR dataset/MIMIC-IV Clinical Database"
CXR_FOLDER_PATH = "E:/AI-VR dataset/MIMIC-CXR-JPG/physionet.org/files/mimic-cxr-jpg/2.0.0"
EYEGAZE_FOLDER_PATH = "E:/AI-VR dataset/eye-gaze-data-for-chest-x-rays-1.0.0"
# REFLACX release root; its `main_data` subfolder is read by later cells.
REFLACX_FOLDER_PATH = "E:/AI-VR dataset/reflacx-reports-and-eye-tracking-data-for-localization-of-abnormalities-in-chest-x-rays-1.0.0/reflacx-reports-and-eye-tracking-data-for-localization-of-abnormalities-in-chest-x-rays-1.0.0"
# Export root under which the per-patient / per-stay folders are created.
# Alternative location on the data drive (currently disabled):
# XAMI_MIMIC_PATH = "E:/AI-VR dataset/XAMI-MIMIC"
XAMI_MIMIC_PATH = "./XAMI-MIMIC"

In [3]:
# Load the CXR metadata table from the working directory; the first CSV column
# becomes the index. Later cells rely on its 'dicom_id' and 'stay_id' columns.
cxr_df_with_stay_id_only = pd.read_csv("cxr_meta_with_stay_id_only.csv", index_col=0)
# Preview the first five rows.
cxr_df_with_stay_id_only.head(5)

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,StudyDateTime,stay_id,in_eye_gaze,in_reflacx,in_ed,in_core
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect,21800510000000.0,33258284,False,False,True,True
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,10000032,50414267,CHEST (PA AND LAT),LATERAL,3056,2544,21800506,213014.531,CHEST (PA AND LAT),lateral,Erect,21800510000000.0,33258284,False,False,True,True
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,CHEST (PA AND LAT),postero-anterior,Erect,21800630000000.0,38112554,False,False,True,True
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,10000032,53189527,CHEST (PA AND LAT),LATERAL,3056,2544,21800626,165500.312,CHEST (PA AND LAT),lateral,Erect,21800630000000.0,38112554,False,False,True,True
6,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,10000032,56699142,CHEST (PORTABLE AP),AP,3056,2544,21800805,234424.765,CHEST (PORTABLE AP),antero-posterior,,21800810000000.0,35968195,False,False,True,True


In [4]:
# Read the three per-phase REFLACX metadata files (metadata_phase_1..3.csv) and
# stack them into a single DataFrame. Each file's own row index is kept
# (no ignore_index), so index labels can repeat across phases.
reflacx_meta_df =  pd.concat([pd.read_csv(f"{REFLACX_FOLDER_PATH}/main_data/metadata_phase_{i}.csv") for i in range(1,4)])

In [5]:
# Persist the combined REFLACX metadata (row index included) in the working directory.
reflacx_meta_df.to_csv("reflacx_metadata.csv")

In [6]:
def get_subjectId_and_stayId_by_dicomId(cxr_df, dicom_id):
    """Look up the subject and stay identifiers recorded for one DICOM image.

    Args:
        cxr_df: DataFrame with 'dicom_id', 'subject_id' and 'stay_id' columns.
        dicom_id: value compared against the 'dicom_id' column.

    Returns:
        A ``(subject_id, stay_id)`` tuple taken from the first matching row,
        or ``None`` when no row has the requested ``dicom_id``.
    """
    is_requested_image = cxr_df['dicom_id'] == dicom_id

    # Guard clause: nothing to report when the image is unknown.
    if not is_requested_image.any():
        return None

    # Only the first hit is used, even if several rows share the dicom_id.
    first_hit = cxr_df[is_requested_image].iloc[0]
    return first_hit['subject_id'], first_hit['stay_id']

In [7]:
def get_cxr_match_by_dicom_ids(cxr_df, dicom_ids):
    """Select the rows of ``cxr_df`` whose 'dicom_id' is one of ``dicom_ids``.

    Args:
        cxr_df: DataFrame with a 'dicom_id' column.
        dicom_ids: collection of dicom_id values to keep.

    Returns:
        The matching rows, in their original order and with their original
        index labels (empty when nothing matches).
    """
    wanted_rows = cxr_df['dicom_id'].isin(dicom_ids)
    return cxr_df.loc[wanted_rows]

In [8]:
# Set of every dicom_id present in the CXR table, used for membership filtering below.
all_valid_dicom_id_with_stay_id_indentified = set(cxr_df_with_stay_id_only['dicom_id'])

In [9]:
# Keep only the REFLACX rows whose dicom_id also appears in the CXR table.
reflacx_meta__df_with_stay_id_indentified =  reflacx_meta_df[reflacx_meta_df['dicom_id'].isin(all_valid_dicom_id_with_stay_id_indentified)]

In [10]:
# Display the filtered REFLACX metadata table.
reflacx_meta__df_with_stay_id_indentified

Unnamed: 0,id,split,eye_tracking_data_discarded,image,dicom_id,subject_id,image_size_x,image_size_y,Airway wall thickening,Atelectasis,...,Support devices,Wide mediastinum,Abnormal mediastinal contour,Acute fracture,Enlarged hilum,Hiatal hernia,High lung volume / emphysema,Interstitial lung disease,Lung nodule or mass,Pleural abnormality
0,P102R108387,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,
1,P102R379837,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,
2,P102R558314,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,4.0,0.0,...,True,0.0,,,,,,,,
3,P102R765317,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,2.0,,,,,,,,
4,P102R915878,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2475,P300R833708,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,a2fe8aae-2fe32131-b47c4e5b-090f4c13-88e7ac97,19875621,2544,3056,,0.0,...,False,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2478,P300R918521,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,a2a80c63-8b9575dc-bc08895e-40392d6a-d7fc17d5,19884194,2544,3056,,0.0,...,False,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2482,P300R611251,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,2b20dcdf-4077bc16-48fc8eb5-265ef218-f6552cb0,19906407,2881,2544,,0.0,...,True,,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2483,P300R519683,test,False,physionet.org/files/mimic-cxr/2.0.0/files/p19/...,92134f99-0e73faba-1280ad81-218c68ba-933a85c5,19907884,2544,3056,,0.0,...,False,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Report how many REFLACX metadata rows remain after the dicom_id filter.
print(f"We have {len(reflacx_meta__df_with_stay_id_indentified)} in the REFLACX dataset can be used with clinical data.")


We have 674 in the REFLACX dataset can be used with clinical data.


In [12]:
# Preview the first five filtered rows.
reflacx_meta__df_with_stay_id_indentified.head(5)

Unnamed: 0,id,split,eye_tracking_data_discarded,image,dicom_id,subject_id,image_size_x,image_size_y,Airway wall thickening,Atelectasis,...,Support devices,Wide mediastinum,Abnormal mediastinal contour,Acute fracture,Enlarged hilum,Hiatal hernia,High lung volume / emphysema,Interstitial lung disease,Lung nodule or mass,Pleural abnormality
0,P102R108387,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,
1,P102R379837,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,
2,P102R558314,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,4.0,0.0,...,True,0.0,,,,,,,,
3,P102R765317,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,2.0,,,,,,,,
4,P102R915878,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,0.0,...,True,0.0,,,,,,,,


In [13]:
# Unique subject_ids among the filtered REFLACX rows. set() removes duplicates,
# so the resulting list order is not guaranteed.
all_patients_to_import = list(set(reflacx_meta__df_with_stay_id_indentified['subject_id']))

In [14]:
# Export the REFLACX data into the per-patient / per-stay layout:
#   {XAMI_MIMIC_PATH}/patient_<subject_id>/stay_<stay_id>/REFLACX/
#       metadata.csv   - REFLACX metadata rows for the images of that stay
#       <id>/          - copy of {REFLACX_FOLDER_PATH}/main_data/<id>
# The cell is safe to re-run: folders that were already copied are skipped.
for subject_id in all_patients_to_import:
    # REFLACX rows of this patient, and the CXR rows for the same images.
    subject_match = reflacx_meta__df_with_stay_id_indentified[
        reflacx_meta__df_with_stay_id_indentified["subject_id"] == subject_id
    ]
    all_dicom_ids = list(subject_match['dicom_id'])
    cxr_matches = get_cxr_match_by_dicom_ids(cxr_df_with_stay_id_only, all_dicom_ids)
    all_stay_ids = list(set(cxr_matches['stay_id']))

    for stay_id in all_stay_ids:
        # Images of this stay, and the REFLACX rows that refer to them.
        dicom_ids_for_this_stay = list(cxr_matches[cxr_matches['stay_id'] == stay_id]['dicom_id'])
        match_meta_df = reflacx_meta__df_with_stay_id_indentified[
            reflacx_meta__df_with_stay_id_indentified['dicom_id'].isin(dicom_ids_for_this_stay)
        ]

        save_folder_path = f"{XAMI_MIMIC_PATH}/patient_{subject_id}/stay_{stay_id}/REFLACX"
        os.makedirs(save_folder_path, exist_ok=True)

        match_meta_df.to_csv(f"{save_folder_path}/metadata.csv")

        # Values of the 'id' column name the per-reading folders under main_data.
        all_study_ids = list(match_meta_df['id'])

        for study_id in all_study_ids:
            source_path = f"{REFLACX_FOLDER_PATH}/main_data/{study_id}"
            destination_path = f"{save_folder_path}/{study_id}"

            # copytree raises FileExistsError when the destination already exists,
            # which made this cell fail on any re-run. Skip folders exported earlier.
            # NOTE: a folder left half-copied by an interrupted run is not repaired
            # here; delete it manually to force a fresh copy.
            if os.path.exists(destination_path):
                continue

            copytree(source_path, destination_path)