In [1]:
import pandas as pd
import os
from shutil import copytree, copy
from utils.getter import get_cxr_match_by_dicom_ids, get_stayId_string
from data_path import EYEGAZE_FOLDER_PATH, XAMI_SPREADSHEET_FOLDER_PATH, XAMI_MIMIC_PATH
from tqdm.notebook import tqdm

# Disable pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole notebook.
pd.set_option("mode.chained_assignment", None)

In [2]:
# Load the CXR metadata spreadsheet; its first column is used as the index.
cxr_meta_csv_path = os.path.join(XAMI_SPREADSHEET_FOLDER_PATH, "cxr_meta.csv")
cxr_meta_df = pd.read_csv(cxr_meta_csv_path, index_col=0)

In [3]:
def _read_eye_gaze_csv(file_name):
    """Read one CSV file located directly under EYEGAZE_FOLDER_PATH."""
    return pd.read_csv(os.path.join(EYEGAZE_FOLDER_PATH, file_name))


def _lookup_stay_id(dicom_id):
    """Look up the stay id for `dicom_id` in the CXR metadata table."""
    return get_stayId_string(cxr_meta_df, dicom_id)


# Load the four raw EyeGaze spreadsheets.
eye_gaze_bounding_box_df = _read_eye_gaze_csv("bounding_boxes.csv")
eye_gaze_fixation_df = _read_eye_gaze_csv("fixations.csv")
eye_gaze_gaze_df = _read_eye_gaze_csv("eye_gaze.csv")
eye_gaze_master_df = _read_eye_gaze_csv("master_sheet.csv")

# Replace the master sheet's stay_id with the value looked up from the CXR
# metadata, keyed by dicom_id.
eye_gaze_master_df['stay_id'] = eye_gaze_master_df['dicom_id'].apply(_lookup_stay_id)

# Count the master-sheet rows whose dicom_id appears in the CXR metadata with
# a non-null stay_id.
dicom_ids_with_stay = set(
    cxr_meta_df.loc[cxr_meta_df['stay_id'].notnull(), 'dicom_id'])
has_clinical_mask = eye_gaze_master_df['dicom_id'].isin(dicom_ids_with_stay)
with_clinical_count = len(eye_gaze_master_df[has_clinical_mask])

print(f"We have {with_clinical_count} in the EyeGaze dataset can be used with clinical data.")


We have 883 in the EyeGaze dataset can be used with clinical data.


In [4]:
# Repair truncated DICOM ids in eye_gaze_gaze_df.
#
# A well-formed dicom_id has exactly DICOM_ID_LENGTH characters; every other
# value found in the DICOM_ID column is treated as "bugged".
#   * Too-long ids are deliberately left untouched (see the note in the loop).
#   * Too-short ids are assumed to be a truncated prefix of a well-formed id
#     that also appears in the column, and are rewritten to that id.
DICOM_ID_LENGTH = 44

all_bugged_dicom_ids = set(
    eye_gaze_gaze_df.loc[
        eye_gaze_gaze_df['DICOM_ID'].map(len) != DICOM_ID_LENGTH, 'DICOM_ID'
    ]
)

# `bugged_id` rather than `id`, so the builtin `id` is not shadowed.
for bugged_id in all_bugged_dicom_ids:
    if len(bugged_id) > DICOM_ID_LENGTH:
        # Intentionally not corrected: per the original author's note,
        # truncating it to a valid id would cause these rows to be included
        # in the dataset.
        continue

    # Well-formed ids that the truncated id is a prefix of. The truncated id
    # itself must NOT count as a candidate (it trivially starts with itself);
    # otherwise it could be picked as its own "correction". Sorted so that the
    # tie-break in max() below is deterministic.
    candidate_ids = sorted(
        dicom_id
        for dicom_id in eye_gaze_gaze_df['DICOM_ID'].unique()
        if len(dicom_id) == DICOM_ID_LENGTH and dicom_id.startswith(bugged_id)
    )

    if not candidate_ids:
        raise ValueError(
            f"Can't find a well-formed DICOM_ID matching the truncated id {bugged_id!r}.")

    # With several candidates, keep the one that has the most rows and drop
    # the rows of the others (same policy as before; with a single candidate
    # nothing is dropped).
    row_counts = eye_gaze_gaze_df['DICOM_ID'].value_counts()
    corrected_id = max(candidate_ids, key=lambda dicom_id: row_counts[dicom_id])
    ids_to_remove = [
        dicom_id for dicom_id in candidate_ids if dicom_id != corrected_id]
    if ids_to_remove:
        # .copy() gives an independent frame, so the assignment below is a
        # plain write and not a chained assignment on a filtered view.
        eye_gaze_gaze_df = eye_gaze_gaze_df[
            ~eye_gaze_gaze_df['DICOM_ID'].isin(ids_to_remove)].copy()

    # Rewrite the truncated id to the corrected one.
    eye_gaze_gaze_df.loc[
        eye_gaze_gaze_df['DICOM_ID'] == bugged_id, 'DICOM_ID'] = corrected_id

In [5]:
# Double-check whether any bugged ids still exist: a dicom_id should have
# exactly 44 characters.
has_bad_length = eye_gaze_gaze_df['DICOM_ID'].map(len) != 44
all_bugged_dicom_ids = set(eye_gaze_gaze_df.loc[has_bad_length, 'DICOM_ID'])
all_bugged_dicom_ids

{'320df379-5ac0ef5d-a88bd5b6-e0620b99-7ca7bbfe_calibration'}

In [6]:
# Preview the first five rows of the master sheet (head() defaults to 5 rows).
eye_gaze_master_df.head()

Unnamed: 0,dicom_id,path,study_id,patient_id,stay_id,gender,anchor_age,image_top_pad,image_bottom_pad,image_left_pad,...,fracture__chx,lung_lesion__chx,lung_opacity__chx,no_finding__chx,pleural_effusion__chx,pleural_other__chx,pneumonia__chx,pneumothorax__chx,support_devices__chx,cxr_exam_indication
0,24c7496c-d7635dfe-b8e0b87f-d818affc-78ff7cf4,files/p15/p15628804/s58573295/24c7496c-d7635df...,58573295,15628804,33237058.0,F,20 - 30,86,86,448,...,,,,,,,,,,___F with CHF and shortness of breath// ?Pulm...
1,78711a04-264d5305-d5feec9b-ebef1cec-fdc6db9c,files/p19/p19462352/s51900589/78711a04-264d530...,51900589,19462352,31569309.0,F,20 - 30,0,0,534,...,,,,,1.0,,1.0,,,"___F with hypertension, tachycardia"
2,a770d8d6-7b6a62ff-815ab876-c81709a8-9a654a54,files/p11/p11255143/s50941783/a770d8d6-7b6a62f...,50941783,11255143,35444327.0,F,20 - 30,0,0,534,...,,,,1.0,,,,,1.0,History of myocardial infarction. Shortness o...
3,8e457921-bc1af8aa-a65073c1-aaac8247-c5ceb780,files/p10/p10526322/s55981398/8e457921-bc1af8a...,55981398,10526322,33361557.0,F,20 - 30,0,0,534,...,,,,,,,,,,___F with SOB
4,62fe5d5a-1806ee3c-f4e742fa-f2b036ea-d390057a,files/p12/p12055181/s59722264/62fe5d5a-1806ee3...,59722264,12055181,37532814.0,M,20 - 30,0,0,534,...,,,1.0,,1.0,,-1.0,,,"___M with DOE, SOB in supine position // eval..."


In [7]:
# Save the spreadsheets into the shared "EyeGaze" spreadsheet folder.
eye_gaze_spreadsheet_dir = os.path.join(XAMI_SPREADSHEET_FOLDER_PATH, "EyeGaze")
os.makedirs(eye_gaze_spreadsheet_dir, exist_ok=True)

eye_gaze_bounding_box_df.to_csv(
    os.path.join(eye_gaze_spreadsheet_dir, 'bounding_boxes.csv'))
eye_gaze_fixation_df.to_csv(
    os.path.join(eye_gaze_spreadsheet_dir, 'fixations.csv'))
eye_gaze_master_df.to_csv(
    os.path.join(eye_gaze_spreadsheet_dir, 'master_sheet_with_updated_stayId.csv'))

# Write the gaze frame (with the repaired DICOM ids) to eye_gaze.csv.
# Previously the fixation frame was written here a second time, so the gaze
# data never reached disk and eye_gaze.csv was a duplicate of fixations.csv.
eye_gaze_gaze_df.to_csv(
    os.path.join(eye_gaze_spreadsheet_dir, 'eye_gaze.csv'))

In [8]:
# Every distinct patient_id in the master sheet, de-duplicated through a set.
unique_patient_ids = set(eye_gaze_master_df['patient_id'])
all_patients_to_import = list(unique_patient_ids)

In [9]:
# Export the EyeGaze data patient by patient: for each patient, write the
# matching rows of every spreadsheet into <XAMI_MIMIC_PATH>/patient_<id>/EyeGaze
# and copy the audio transcript folder of each of the patient's DICOMs.
transcripts_source_root = os.path.join(
    EYEGAZE_FOLDER_PATH, "audio_segmentation_transcripts")

for subject_id in tqdm(all_patients_to_import):

    match_master_sheet = eye_gaze_master_df[eye_gaze_master_df["patient_id"] == subject_id]

    # All dicom_ids that belong to this subject.
    all_dicom_ids_for_subject = list(match_master_sheet['dicom_id'])

    # Note the column naming: the bounding-box sheet uses 'dicom_id', while
    # the fixation and gaze sheets use 'DICOM_ID'.
    match_bounding_box_df = eye_gaze_bounding_box_df[
        eye_gaze_bounding_box_df['dicom_id'].isin(all_dicom_ids_for_subject)]
    match_fixation_df = eye_gaze_fixation_df[
        eye_gaze_fixation_df['DICOM_ID'].isin(all_dicom_ids_for_subject)]
    match_gaze_df = eye_gaze_gaze_df[
        eye_gaze_gaze_df['DICOM_ID'].isin(all_dicom_ids_for_subject)]

    save_folder_path = os.path.join(
        XAMI_MIMIC_PATH, f"patient_{subject_id}", "EyeGaze")
    os.makedirs(save_folder_path, exist_ok=True)

    match_master_sheet.to_csv(os.path.join(save_folder_path, "master_sheet.csv"))
    match_bounding_box_df.to_csv(os.path.join(save_folder_path, "bounding_boxes.csv"))
    match_fixation_df.to_csv(os.path.join(save_folder_path, "fixations.csv"))
    match_gaze_df.to_csv(os.path.join(save_folder_path, "eye_gaze.csv"))

    for dicom_id in all_dicom_ids_for_subject:
        source_path = os.path.join(transcripts_source_root, dicom_id)
        destination_path = os.path.join(
            save_folder_path, "audio_segmentation_transcripts", dicom_id)
        # dirs_exist_ok=True makes re-running the cell safe when the
        # destination folder already exists.
        copytree(source_path, destination_path,
                 dirs_exist_ok=True, copy_function=copy)

        # The copied index.html is removed from the per-patient folder. A
        # transcript folder without one must not abort the whole export, so a
        # missing file is tolerated.
        try:
            os.remove(os.path.join(destination_path, "index.html"))
        except FileNotFoundError:
            pass
        

  0%|          | 0/1038 [00:00<?, ?it/s]