In [1]:
import pandas as pd
import time
import numpy as np
import os
import random
from datetime import datetime   
import math
from tqdm.notebook  import tqdm
from shutil import copytree

# Turn off pandas' SettingWithCopy warning for the whole notebook.
pd.set_option("mode.chained_assignment", None)

In [2]:
# Define the path for all of the folders.
#
# Every source dataset lives under one root folder. The root defaults to the
# original location but can be overridden with the XAMI_DATASET_ROOT
# environment variable, so the notebook can run on another machine without
# editing this cell. Trailing slashes are stripped to avoid "//" in the paths.
DATASET_ROOT = os.environ.get("XAMI_DATASET_ROOT", "E:/AI-VR dataset").rstrip("/\\")

ED_FOLDER_PATH = f"{DATASET_ROOT}/MIMIC-IV ED"
CLINICAL_FOLDER_PATH = f"{DATASET_ROOT}/MIMIC-IV Clinical Database"
CXR_FOLDER_PATH = f"{DATASET_ROOT}/MIMIC-CXR-JPG/physionet.org/files/mimic-cxr-jpg/2.0.0"
EYEGAZE_FOLDER_PATH = f"{DATASET_ROOT}/eye-gaze-data-for-chest-x-rays-1.0.0"
# The REFLACX archive is nested inside a folder with the same name.
_REFLACX_DIR_NAME = "reflacx-reports-and-eye-tracking-data-for-localization-of-abnormalities-in-chest-x-rays-1.0.0"
REFLACX_FOLDER_PATH = f"{DATASET_ROOT}/{_REFLACX_DIR_NAME}/{_REFLACX_DIR_NAME}"

# Output folder for the assembled dataset, relative to the notebook.
# (An earlier setup wrote to "E:/AI-VR dataset/XAMI-MIMIC" instead.)
XAMI_MIMIC_PATH = "./XAMI-MIMIC"

In [3]:
# Load the CXR metadata table; the first CSV column is used as the row index.
cxr_df_with_stay_id_only = pd.read_csv(
    "cxr_meta_with_stay_id_only.csv",
    index_col=0,
)
# Preview the first five rows.
cxr_df_with_stay_id_only.head()

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,StudyDateTime,stay_id,in_eye_gaze,in_reflacx,in_ed,in_core
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect,21800510000000.0,33258284,False,False,True,True
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,10000032,50414267,CHEST (PA AND LAT),LATERAL,3056,2544,21800506,213014.531,CHEST (PA AND LAT),lateral,Erect,21800510000000.0,33258284,False,False,True,True
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,CHEST (PA AND LAT),postero-anterior,Erect,21800630000000.0,38112554,False,False,True,True
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,10000032,53189527,CHEST (PA AND LAT),LATERAL,3056,2544,21800626,165500.312,CHEST (PA AND LAT),lateral,Erect,21800630000000.0,38112554,False,False,True,True
4,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,10000032,56699142,CHEST (PORTABLE AP),AP,3056,2544,21800805,234424.765,CHEST (PORTABLE AP),antero-posterior,,21800810000000.0,35968195,False,False,True,True


In [4]:
def get_subjectId_and_stayId_by_dicomId(cxr_df, dicom_id):
    """Look up the subject and stay that a DICOM image belongs to.

    Args:
        cxr_df: DataFrame with 'dicom_id', 'subject_id' and 'stay_id' columns.
        dicom_id: value to search for in the 'dicom_id' column.

    Returns:
        A ``(subject_id, stay_id)`` tuple taken from the first matching row,
        or ``None`` when no row has the given dicom_id.
    """
    rows = cxr_df[cxr_df['dicom_id'] == dicom_id]

    # Guard clause: nothing matched.
    if len(rows) == 0:
        return None

    first_row = rows.iloc[0]
    return first_row['subject_id'], first_row['stay_id']

In [5]:
def get_cxr_match_by_dicom_ids(cxr_df, dicom_ids):
    """Return the rows of ``cxr_df`` whose 'dicom_id' is one of ``dicom_ids``.

    Args:
        cxr_df: DataFrame with a 'dicom_id' column.
        dicom_ids: collection of dicom ids to keep.

    Returns:
        The filtered DataFrame (original index and row order preserved).
    """
    is_requested = cxr_df['dicom_id'].isin(dicom_ids)
    return cxr_df[is_requested]

In [6]:
# Every dicom_id in the CXR metadata table, as a set for fast membership tests.
all_valid_dicom_id_with_stay_id_indentified = {
    *cxr_df_with_stay_id_only['dicom_id']
}

In [7]:
# Read the three EyeGaze tables (bounding boxes, fixations, master sheet)
# from the EyeGaze dataset folder.
eye_gaze_bounding_box_df, eye_gaze_fixation_df, eye_gaze_master_df = (
    pd.read_csv(f"{EYEGAZE_FOLDER_PATH}/{csv_name}")
    for csv_name in ("bounding_boxes.csv", "fixations.csv", "master_sheet.csv")
)

In [8]:
# Keep only the EyeGaze master-sheet rows whose dicom_id also appears in the
# CXR metadata table loaded above.
eye_gaze_master_df_with_stay_id_indentified = eye_gaze_master_df.loc[
    eye_gaze_master_df['dicom_id'].isin(all_valid_dicom_id_with_stay_id_indentified)
]

In [9]:
# Report how many EyeGaze master-sheet rows survived the filter.
usable_eye_gaze_count = len(eye_gaze_master_df_with_stay_id_indentified)
print(f"We have {usable_eye_gaze_count} in the EyeGaze dataset can be used with clinical data.")


We have 642 in the EyeGaze dataset can be used with clinical data.


In [10]:
# Preview the first five rows of the filtered master sheet.
eye_gaze_master_df_with_stay_id_indentified.head()

Unnamed: 0,dicom_id,path,study_id,patient_id,stay_id,gender,anchor_age,image_top_pad,image_bottom_pad,image_left_pad,...,fracture__chx,lung_lesion__chx,lung_opacity__chx,no_finding__chx,pleural_effusion__chx,pleural_other__chx,pneumonia__chx,pneumothorax__chx,support_devices__chx,cxr_exam_indication
0,24c7496c-d7635dfe-b8e0b87f-d818affc-78ff7cf4,files/p15/p15628804/s58573295/24c7496c-d7635df...,58573295,15628804,33811834,F,20 - 30,86,86,448,...,,,,,,,,,,___F with CHF and shortness of breath// ?Pulm...
1,78711a04-264d5305-d5feec9b-ebef1cec-fdc6db9c,files/p19/p19462352/s51900589/78711a04-264d530...,51900589,19462352,32954494,F,20 - 30,0,0,534,...,,,,,1.0,,1.0,,,"___F with hypertension, tachycardia"
2,a770d8d6-7b6a62ff-815ab876-c81709a8-9a654a54,files/p11/p11255143/s50941783/a770d8d6-7b6a62f...,50941783,11255143,34005408,F,20 - 30,0,0,534,...,,,,1.0,,,,,1.0,History of myocardial infarction. Shortness o...
3,8e457921-bc1af8aa-a65073c1-aaac8247-c5ceb780,files/p10/p10526322/s55981398/8e457921-bc1af8a...,55981398,10526322,36680301,F,20 - 30,0,0,534,...,,,,,,,,,,___F with SOB
4,62fe5d5a-1806ee3c-f4e742fa-f2b036ea-d390057a,files/p12/p12055181/s59722264/62fe5d5a-1806ee3...,59722264,12055181,30138691,M,20 - 30,0,0,534,...,,,1.0,,1.0,,-1.0,,,"___M with DOE, SOB in supine position // eval..."


In [11]:
# Distinct patient ids present in the filtered EyeGaze master sheet.
all_patients_to_import = [
    *set(eye_gaze_master_df_with_stay_id_indentified['patient_id'])
]

In [13]:
# Export the EyeGaze data of every patient into one folder per (patient, stay):
#   {XAMI_MIMIC_PATH}/patient_<subject_id>/stay_<stay_id>/EyeGaze
# Each folder receives the matching rows of the master sheet, bounding boxes
# and fixations, plus a copy of the per-image audio transcript folders.
for subject_id in all_patients_to_import:
    # EyeGaze master-sheet rows for this patient, and the images they cover.
    subject_match = eye_gaze_master_df_with_stay_id_indentified[
        eye_gaze_master_df_with_stay_id_indentified["patient_id"] == subject_id
    ]
    all_dicom_ids = list(subject_match['dicom_id'])

    # The stay ids are taken from the CXR metadata table, not from EyeGaze.
    cxr_matches = get_cxr_match_by_dicom_ids(cxr_df_with_stay_id_only, all_dicom_ids)
    all_stay_ids = list(set(cxr_matches['stay_id']))

    for stay_id in all_stay_ids:

        dicom_ids_for_this_stay = list(cxr_matches[cxr_matches['stay_id'] == stay_id]['dicom_id'])

        # Take an explicit copy: the stay_id overwrite below must only affect
        # the exported rows, and must not rely on the SettingWithCopy warning
        # being disabled for the implicit slice.
        match_master_sheet = eye_gaze_master_df[
            eye_gaze_master_df['dicom_id'].isin(dicom_ids_for_this_stay)
        ].copy()
        ## Replace the wrong stay_id in the eye_gaze dataset
        match_master_sheet.loc[:, 'stay_id'] = stay_id

        match_bounding_box_df = eye_gaze_bounding_box_df[eye_gaze_bounding_box_df['dicom_id'].isin(dicom_ids_for_this_stay)]
        # Note: the fixations table names this column 'DICOM_ID' (upper case).
        match_fixation_df = eye_gaze_fixation_df[eye_gaze_fixation_df['DICOM_ID'].isin(dicom_ids_for_this_stay)]

        save_folder_path = f"{XAMI_MIMIC_PATH}/patient_{subject_id}/stay_{stay_id}/EyeGaze"
        os.makedirs(save_folder_path, exist_ok=True)

        match_master_sheet.to_csv(f"{save_folder_path}/master_sheet.csv")
        match_bounding_box_df.to_csv(f"{save_folder_path}/bounding_boxes.csv")
        match_fixation_df.to_csv(f"{save_folder_path}/fixations.csv")

        for dicom_id in dicom_ids_for_this_stay:
            source_path = f"{EYEGAZE_FOLDER_PATH}/audio_segmentation_transcripts/{dicom_id}"
            destination_path = f"{save_folder_path}/audio_segmentation_transcripts/{dicom_id}"
            # copytree raises FileExistsError when the destination already
            # exists, which made this cell fail on any re-run. Skip transcript
            # folders that were already copied so the cell can be re-executed.
            if not os.path.isdir(destination_path):
                copytree(source_path, destination_path)
        