In [1]:
### Testing if we can load the file from D:\

import pandas as pd

In [2]:
# Yes, we can load the data in D:\

triage_df = pd.read_csv("D:/AI-VR dataset/MIMIC-IV ED/triage.csv")

In [3]:
triage_df.head(5)

####################
# Qeustions:
# 1. Which field do we need? (Should be able to remove chiefcomplaint)
# 2. How to deal with missing value?
###################

Unnamed: 0,subject_id,stay_id,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,chiefcomplaint
0,15585360,37573921,97.0,87.0,18.0,100.0,150.0,71.0,10.0,3.0,
1,17192424,34160628,98.6,82.0,,100.0,111.0,81.0,3.0,3.0,
2,15248757,32172727,97.1,112.0,20.0,100.0,147.0,97.0,8.0,4.0,
3,16648037,38946064,98.5,59.0,18.0,99.0,160.0,86.0,2.0,2.0,
4,13492931,39828574,100.6,90.0,16.0,96.0,107.0,55.0,0.0,3.0,'


In [4]:
# List out all the path for ED, Clinical and Image

ED_FOLDER_PATH = "D:/AI-VR dataset/MIMIC-IV ED"
CLINICAL_FOLDER_PATH = "D:/AI-VR dataset/MIMIC-IV Clinical Database"
CXR_FOLDER_PATH = "D:/AI-VR dataset/MIMIC-CXR-JPG"
EYETRACKING_FOLDER_PATH = "D:/AI-VR dataset/eye-gaze-data-for-chest-x-rays-1.0.0"


In [5]:
# Determine which data I need to load 
# 1. Eye tracking data
# 2. Triage
# 3. 
# Start loading data and perform preprocessing.

###### Some other important table ######
# 1. Diagnosis <ED & Clinical-Hosp> (But we have label already in eye_gaze dataset)
# 2. We have "insurance, language, marital_status, ethnicity" in admissions table <Clinical-Core>
# 3. We have "gender, anchor_age" in patients table <Clinical-Core>


In [6]:
### Using the stayId in eye tracking master_sheet to find the information in the triage.csv

In [7]:
triage_df = pd.read_csv("D:/AI-VR dataset/MIMIC-IV ED/triage.csv")
eye_gaze_master_sheet = pd.read_csv("D:/AI-VR dataset/eye-gaze-data-for-chest-x-rays-1.0.0/master_sheet.csv")


In [11]:
# For the basic extraction, we need to select the eye tracking dataset.

from enum import Enum


class TabularData():
    class _ED (Enum):
        edstays = "edstays"
        medrecon = "medrecon"
        pyxis = "pyxis"
        triage = "triage"
        vitalsign = "vitalsign"

    class _Clinical():
        class _Core(Enum):
            admissions = "admissions"
            patients = "patients"
            transfers = "transfers"

        class _Hosp (Enum):
            poe_detail = "poe_detail"
            pharmacy = "pharmacy"
            emar = "emar"
            microbiologyevents = "microbiologyevents"
            labevents = "labevents"
            d_labitems = "d_labitems"
            prescriptions = "prescriptions"
            procedures_icd = "procedures_icd"
            poe = "poe"
            d_hcpcs = "d_hcpcs"
            diagnoses_icd = "diagnoses_icd"
            services = "services"
            hcpcsevents = "hcpcsevents"
            drgcodes = "drgcodes"
            d_icd_diagnoses = "d_icd_diagnoses"
            d_icd_procedures = "d_icd_procedures"
            emar_detail = "emar_detail"

        class _ICU(Enum):
            d_items = "d_items"
            procedureevents = "procedureevents"
            inputevents = "inputevents"
            datetimeevents = "datetimeevents"
            chartevents = "chartevents"
            outputevents = "outputevents"
            icustays = "icustays"

        Core = _Core
        Hosp = _Hosp
        ICU = _ICU

    class _EyeGaze(Enum):
        master_sheet = "master_sheet"
        fixations = "fixations"
        eye_gaze = "eye_gaze"
        bounding_boxes = "bounding_boxes"

    Clinical = _Clinical
    EyeGaze = _EyeGaze
    ED = _ED


class TranscriptsData(Enum):
    EyeGaze = "EyeGaze"


class SegmentationData(Enum):
    EyeGaze = "EyeGaze"


class CXRData(Enum):
    JPEG = "JPEG"





In [54]:
from typing import List


def load_tabular_data(tabular_data):
    linking_features = ["stay_id", "subject_id", "patient_id", "dicom_id"] # order through priority
    all_dataframes = []

    # distribute to different data folder
    for sheet_name in tabular_data:

        # Dealing with Clinical dataset
        if (sheet_name in TabularData._Clinical._Core):
            # load the data from clinical path, we will also have the value (name of the csv).
            all_dataframes.append(pd.read_csv(
                f"{CLINICAL_FOLDER_PATH}/core/{sheet_name.value}.csv"))
        elif (sheet_name in TabularData._Clinical._Hosp):
            all_dataframes.append(pd.read_csv(
                f"{CLINICAL_FOLDER_PATH}/hosp/{sheet_name.value}.csv"))
        elif (sheet_name in TabularData._Clinical._ICU):
            all_dataframes.append(pd.read_csv(
                f"{CLINICAL_FOLDER_PATH}/icu/{sheet_name.value}.csv"))

        # Dealing with ED dataset.
        elif (sheet_name in TabularData._ED):
            all_dataframes.append(pd.read_csv(
                f"{ED_FOLDER_PATH}/{sheet_name.value}.csv"))

        # Dealing with eye gaze dataset
        elif (sheet_name in TabularData._EyeGaze):
            all_dataframes.append(pd.read_csv(
                f"{EYETRACKING_FOLDER_PATH}/{sheet_name.value}.csv"))

    if (len(all_dataframes) > 1):
        # Perform join table
        main_table: pd.DataFrame = all_dataframes[0]
        join_tables: List(pd.DataFrame) = all_dataframes[1:]

        # checking if the main_table has at least one linking_code:
        if not any([linking_feature in main_table.columns for linking_feature in linking_features]):
            raise Exception("No available linking feature in the")

        # if we have patient_id available but not the subject_id
        if  ("patient_id" in main_table) and  (not "subject_id"  in main_table):
            # add the subject_id column
            main_table["subject_id"] = main_table["patient_id"]
        
        # if we have subject_id available but not the patient_id
        if  ("subject_id" in main_table) and  (not "patient_id"  in main_table):
            # add the subject_id column
            main_table["patient_id"] = main_table["subject_id"]

        for sheet_name, table_to_join in zip(tabular_data, join_tables):
            for linking_feature in linking_features:
                if linking_feature in table_to_join.columns:
                    main_table.join(table_to_join, linking_feature)
                    print(f"Table {sheet_name} has been joined through {linking_feature}")
                    break 
                
                # If the last feature is not the linking feature as well.
                if linking_feature == linking_features[-1]:
                    print(f"{sheet_name} doesn't have subjectId or patientId to join")

        return main_table
    elif len(all_dataframes) == 0:
        return all_dataframes[0]
    else:
        raise Exception("No table found.")

    # Then we join all the tables according to the patient and subjectId. Priority => subject > patient


In [55]:
df = load_tabular_data([TabularData.EyeGaze.bounding_boxes, TabularData.EyeGaze.master_sheet, TabularData.ED.edstays, TabularData.Clinical.Core.admissions, TabularData.Clinical.Hosp.drgcodes, TabularData.Clinical.ICU.inputevents])

KeyError: 'stay_id'

In [None]:
tabular_data = [TabularData.EyeGaze.bounding_boxes, TabularData.EyeGaze.master_sheet, TabularData.ED.edstays, TabularData.Clinical.Core.admissions, TabularData.Clinical.Hosp.drgcodes, TabularData.Clinical.ICU.inputevents]

linking_features = ["stay_id", "subject_id", "patient_id", "dicom_id"] # order through priority
all_dataframes = []

# distribute to different data folder
for sheet_name in tabular_data:

    # Dealing with Clinical dataset
    if (sheet_name in TabularData._Clinical._Core):
        # load the data from clinical path, we will also have the value (name of the csv).
        all_dataframes.append(pd.read_csv(
            f"{CLINICAL_FOLDER_PATH}/core/{sheet_name.value}.csv"))
    elif (sheet_name in TabularData._Clinical._Hosp):
        all_dataframes.append(pd.read_csv(
            f"{CLINICAL_FOLDER_PATH}/hosp/{sheet_name.value}.csv"))
    elif (sheet_name in TabularData._Clinical._ICU):
        all_dataframes.append(pd.read_csv(
            f"{CLINICAL_FOLDER_PATH}/icu/{sheet_name.value}.csv"))

    # Dealing with ED dataset.
    elif (sheet_name in TabularData._ED):
        all_dataframes.append(pd.read_csv(
            f"{ED_FOLDER_PATH}/{sheet_name.value}.csv"))

    # Dealing with eye gaze dataset
    elif (sheet_name in TabularData._EyeGaze):
        all_dataframes.append(pd.read_csv(
            f"{EYETRACKING_FOLDER_PATH}/{sheet_name.value}.csv"))

if (len(all_dataframes) > 1):
    # Perform join table
    main_table: pd.DataFrame = all_dataframes[0]
    join_tables: List(pd.DataFrame) = all_dataframes[1:]

    # checking if the main_table has at least one linking_code:
    if not any([linking_feature in main_table.columns for linking_feature in linking_features]):
        raise Exception("No available linking feature in the")

    # if we have patient_id available but not the subject_id
    if  ("patient_id" in main_table) and  (not "subject_id"  in main_table):
        # add the subject_id column
        main_table["subject_id"] = main_table["patient_id"]
    
    # if we have subject_id available but not the patient_id
    if  ("subject_id" in main_table) and  (not "patient_id"  in main_table):
        # add the subject_id column
        main_table["patient_id"] = main_table["subject_id"]

    for sheet_name, table_to_join in zip(tabular_data, join_tables):
        for linking_feature in linking_features:
            if linking_feature in table_to_join.columns:
                main_table.join(table_to_join, linking_feature)
                print(f"Table {sheet_name} has been joined through {linking_feature}")
                break 
            
            # If the last feature is not the linking feature as well.
            if linking_feature == linking_features[-1]:
                print(f"{sheet_name} doesn't have subjectId or patientId to join")
    raise Exception("No table found.")

In [None]:
class MIMICDataLoader():
    def __init__(self, tabular_data=[], cxr_data=[], transcripts_data=[], segmentation_data=[]) -> None:
        pass

    def load_tabular_data(self, tabular_data):
        # distribute to different data folder
        for sheet_name in tabular_data:
            if (sheet_name in TabularData._Clinical._Core): 
                pass
            elif (sheet_name in TabularData._Clinical._Core):




        pass

    def load_cxr_data(self, cxr_data):
        pass

    

In [1]:
import pandas as pd
transfers_df_0dot4 = pd.read_csv(f"transfers-0.4.csv.gz", compression='gzip', header=0, sep=',', quotechar='"')

In [4]:
len(transfers_df_0dot4[transfers_df_0dot4["eventtype"] == "ED"])

662573

In [7]:
len(ED_edstays_df)

448972

In [6]:
ED_FOLDER_PATH = "E:/AI-VR dataset/MIMIC-IV ED"
ED_edstays_df = pd.read_csv(f"{ED_FOLDER_PATH}/edstays.csv")

In [2]:
len(transfers_df_0dot4)

2192963

In [66]:
TableData.Clinical is TableData._Clinical

True

In [None]:
transfer_ed_df = transfers_df_0dot4[transfers_df_0dot4["eventtype"] == "ED"]

In [8]:
new_to_older_stay_id={}

In [9]:
new_to_older_stay_id.values()

dict_values([])

In [None]:


for ed_stay_instance in ED_edstays_df.iloc:
    subject_id = ed_stay_instance['subject_id']
    matches_transfered = transfer_ed_df[
        (transfer_ed_df['subject_id'] == subject_id) &
        (transfer_ed_df['intime'] == ed_stay_instance['intime']) &
        (transfer_ed_df['outtime'] == ed_stay_instance['outtime'])
    ]

    if (len(matches_transfered) > 0) and (not (matches_transfered.iloc[0]['transfer_id'] in new_to_older_stay_id.values())):
        new_to_older_stay_id[ed_stay_instance['stay_id']
                             ] = matches_transfered.iloc[0]['transfer_id']


In [63]:
TableData.Clinical.Core in TableData.Clinical

TypeError: argument of type 'type' is not iterable

In [38]:
json.dumps(TableData.Clinical.Core.admissions)

TypeError: Object of type Core is not JSON serializable

In [28]:
TableData.ED.edstays

<ED.edstays: 'edstays'>

In [31]:
TableData.Clinical.Core.patients

<enum 'Core'>

In [None]:
# 1. Keep the data structure 
# 2. Let's check if we have the field that repetitive? Not really, we should be able to load the selected eye gaze data.
# 3. I will use a data loader class to access the whole data.
# 4. Firstly, download the reflex dataset, and check the difference btw eye-gaze and reflex.
