In [1]:
import pandas as pd
import os
from shutil import copytree, copy
from utils.getter import get_cxr_match_by_dicom_ids, get_stayId_string
from data_path import EYEGAZE_FOLDER_PATH, XAMI_SPREADSHEET_FOLDER_PATH, XAMI_MIMIC_PATH
from tqdm.notebook import tqdm

# Silence pandas' SettingWithCopyWarning for the whole notebook; a later cell
# assigns a column on a filtered slice of a DataFrame.
pd.set_option("mode.chained_assignment", None)

In [2]:
# Load the CXR metadata spreadsheet (presumably already restricted to images
# whose stay_id could be identified -- verify against the notebook that wrote it).
# The first CSV column is used as the row index.
cxr_meta_csv_path = os.path.join(XAMI_SPREADSHEET_FOLDER_PATH, "cxr_meta_with_stay_id_only.csv")
cxr_df_with_stay_id_only = pd.read_csv(cxr_meta_csv_path, index_col=0)
cxr_df_with_stay_id_only.head()

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,in_eye_gaze,in_reflacx,StudyDateTime,stay_id
181,4873aa08-977bfd31-fb492e64-6ef432d1-3f12cbe3,10002430,53254222,CHEST (PA AND LAT),PA,3056,2544,21250928,160736.171,CHEST (PA AND LAT),postero-anterior,Erect,True,False,21250930000000.0,31293660
266,dcdc4bd9-4301b111-2a65a814-ee8e7bc5-7f0b9a5a,10003400,56466802,CHEST (PORTABLE AP),AP,3056,2544,21361209,133738.015,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,21361210000000.0,33678912
497,5bdabba9-388f6646-ac06b5f5-f68b2fd2-3630de21,10011607,55814288,CHEST (PA AND LAT),AP,3056,2544,21820111,230933.25,CHEST (PA AND LAT),antero-posterior,Erect,False,True,21820110000000.0,37054412
539,4a629500-9c3281ca-90bab490-9b6ac9c1-e5e6a580,10012261,53728467,CHEST (PORTABLE AP),AP,2544,3056,21750906,191151.453,CHEST (PORTABLE AP),antero-posterior,Erect,False,True,21750910000000.0,38668412
567,8106217e-c41ab813-c6002d3f-ed4ee98b-7b165bae,10012498,55812956,CHEST (PA AND LAT),PA,3056,2544,21470331,211642.296,CHEST (PA AND LAT),postero-anterior,Erect,True,False,21470330000000.0,37362927


In [3]:
def _read_eye_gaze_csv(file_name):
    """Read one CSV file from the EyeGaze dataset folder into a DataFrame."""
    return pd.read_csv(os.path.join(EYEGAZE_FOLDER_PATH, file_name))


eye_gaze_bounding_box_df = _read_eye_gaze_csv("bounding_boxes.csv")
eye_gaze_fixation_df = _read_eye_gaze_csv("fixations.csv")
eye_gaze_master_df = _read_eye_gaze_csv("master_sheet.csv")

# Keep only the master-sheet rows whose DICOM id also appears in the CXR
# metadata table loaded above.
all_valid_dicom_id_with_stay_id_indentified = set(cxr_df_with_stay_id_only['dicom_id'])
has_known_dicom_id = eye_gaze_master_df['dicom_id'].isin(all_valid_dicom_id_with_stay_id_indentified)
eye_gaze_master_df_with_stay_id_indentified = eye_gaze_master_df[has_known_dicom_id]

usable_row_count = len(eye_gaze_master_df_with_stay_id_indentified)
print(f"We have {usable_row_count} in the EyeGaze dataset can be used with clinical data.")

We have 642 in the EyeGaze dataset can be used with clinical data.


In [4]:
# Preview the first five usable master-sheet rows.
eye_gaze_master_df_with_stay_id_indentified.head()

Unnamed: 0,dicom_id,path,study_id,patient_id,stay_id,gender,anchor_age,image_top_pad,image_bottom_pad,image_left_pad,...,fracture__chx,lung_lesion__chx,lung_opacity__chx,no_finding__chx,pleural_effusion__chx,pleural_other__chx,pneumonia__chx,pneumothorax__chx,support_devices__chx,cxr_exam_indication
0,24c7496c-d7635dfe-b8e0b87f-d818affc-78ff7cf4,files/p15/p15628804/s58573295/24c7496c-d7635df...,58573295,15628804,33811834,F,20 - 30,86,86,448,...,,,,,,,,,,___F with CHF and shortness of breath// ?Pulm...
1,78711a04-264d5305-d5feec9b-ebef1cec-fdc6db9c,files/p19/p19462352/s51900589/78711a04-264d530...,51900589,19462352,32954494,F,20 - 30,0,0,534,...,,,,,1.0,,1.0,,,"___F with hypertension, tachycardia"
2,a770d8d6-7b6a62ff-815ab876-c81709a8-9a654a54,files/p11/p11255143/s50941783/a770d8d6-7b6a62f...,50941783,11255143,34005408,F,20 - 30,0,0,534,...,,,,1.0,,,,,1.0,History of myocardial infarction. Shortness o...
3,8e457921-bc1af8aa-a65073c1-aaac8247-c5ceb780,files/p10/p10526322/s55981398/8e457921-bc1af8a...,55981398,10526322,36680301,F,20 - 30,0,0,534,...,,,,,,,,,,___F with SOB
4,62fe5d5a-1806ee3c-f4e742fa-f2b036ea-d390057a,files/p12/p12055181/s59722264/62fe5d5a-1806ee3...,59722264,12055181,30138691,M,20 - 30,0,0,534,...,,,1.0,,1.0,,-1.0,,,"___M with DOE, SOB in supine position // eval..."


In [5]:
# Distinct patient ids among the usable master-sheet rows; each one gets its own
# export folder in the loop below.
distinct_patient_ids = set(eye_gaze_master_df_with_stay_id_indentified['patient_id'])
all_patients_to_import = list(distinct_patient_ids)

In [6]:
# Export the EyeGaze data into one folder per (patient, stay):
#   <XAMI_MIMIC_PATH>/patient_<subject_id>/stay_<stay_id>/EyeGaze/
# Each folder receives the matching rows of the master sheet, bounding boxes and
# fixations as CSV, plus a copy of the per-DICOM audio transcript folders.

# Loop-invariant: root folder holding one transcript sub-folder per DICOM id.
audio_transcripts_root = os.path.join(EYEGAZE_FOLDER_PATH, "audio_segmentation_transcripts")

for subject_id in tqdm(all_patients_to_import):

    subject_match = eye_gaze_master_df_with_stay_id_indentified[eye_gaze_master_df_with_stay_id_indentified["patient_id"] == subject_id]
    all_dicom_ids = list(subject_match['dicom_id'])

    # Project helper; the result is used below as a table with 'stay_id' and
    # 'dicom_id' columns.
    cxr_matches = get_cxr_match_by_dicom_ids(cxr_df_with_stay_id_only, all_dicom_ids)
    all_stay_ids = list(set(cxr_matches['stay_id']))

    for stay_id in all_stay_ids:

        dicom_ids_for_this_stay = list(cxr_matches[cxr_matches['stay_id'] == stay_id]['dicom_id'])

        # Take an explicit copy of the filtered rows before writing to them, so the
        # assignment below is not a write into a slice of eye_gaze_master_df and does
        # not rely on SettingWithCopyWarning being silenced globally.
        match_master_sheet = eye_gaze_master_df[eye_gaze_master_df['dicom_id'].isin(dicom_ids_for_this_stay)].copy()
        ## Replace the wrong stay_id in the eye_gaze dataset
        match_master_sheet.loc[:, 'stay_id'] = stay_id

        match_bounding_box_df = eye_gaze_bounding_box_df[eye_gaze_bounding_box_df['dicom_id'].isin(dicom_ids_for_this_stay)]
        # Note: the fixation table names its DICOM column in upper case.
        match_fixation_df = eye_gaze_fixation_df[eye_gaze_fixation_df['DICOM_ID'].isin(dicom_ids_for_this_stay)]

        save_folder_path = os.path.join(XAMI_MIMIC_PATH, f"patient_{subject_id}", f"stay_{stay_id}", "EyeGaze")
        os.makedirs(save_folder_path, exist_ok=True)

        match_master_sheet.to_csv(os.path.join(save_folder_path, "master_sheet.csv"))
        match_bounding_box_df.to_csv(os.path.join(save_folder_path, "bounding_boxes.csv"))
        match_fixation_df.to_csv(os.path.join(save_folder_path, "fixations.csv"))

        for dicom_id in dicom_ids_for_this_stay:
            source_path = os.path.join(audio_transcripts_root, dicom_id)
            destination_path = os.path.join(save_folder_path, "audio_segmentation_transcripts", dicom_id)
            # dirs_exist_ok=True lets the cell be re-run over an existing export.
            # copytree raises if source_path does not exist.
            copytree(source_path, destination_path, dirs_exist_ok=True, copy_function=copy)
        

  0%|          | 0/605 [00:00<?, ?it/s]

In [7]:
# Write dataset-wide copies of the EyeGaze tables next to the other spreadsheets.
eye_gaze_spreadsheet_dir = os.path.join(XAMI_SPREADSHEET_FOLDER_PATH, "EyeGaze")
os.makedirs(eye_gaze_spreadsheet_dir, exist_ok=True)

bounding_boxes_out_path = os.path.join(eye_gaze_spreadsheet_dir, 'bounding_boxes.csv')
eye_gaze_bounding_box_df.to_csv(bounding_boxes_out_path)

fixations_out_path = os.path.join(eye_gaze_spreadsheet_dir, 'fixations.csv')
eye_gaze_fixation_df.to_csv(fixations_out_path)

# Update the eye-gaze dataset with the correct stayId.
# The master sheet is re-read from disk, then its stay_id column is replaced by the
# value get_stayId_string returns for each dicom_id (project helper; presumably a
# lookup in the CXR metadata table -- verify).
master_sheet_in_path = os.path.join(EYEGAZE_FOLDER_PATH, "master_sheet.csv")
eye_gaze_master_df = pd.read_csv(master_sheet_in_path)
updated_stayId = eye_gaze_master_df['dicom_id'].apply(
    lambda dicom_id: get_stayId_string(cxr_df_with_stay_id_only, dicom_id)
)
eye_gaze_master_df['stay_id'] = updated_stayId

master_sheet_out_path = os.path.join(eye_gaze_spreadsheet_dir, 'master_sheet_with_updated_stayId.csv')
eye_gaze_master_df.to_csv(master_sheet_out_path)
