# Preprocessing MIMIC-IV

This notebook aggregates the relevant MIMIC-IV data tables and stores the result as one .csv file per ethnic group. Because the source files are large, we use Dask to process them.

In [1]:
import os
import dask
from dask import dataframe as dd

## Core Data

Imports all data about patients (age, ethnicity, etc.). We remove all columns that the doctor does not need or cannot know while making a diagnosis. For example, when a patient arrives and the doctor makes a diagnosis, the doctor does not yet know what the 'dischtime' will be.

In [2]:
def load_core_data():
    """Build the core (admissions + patients) Dask dataframe.

    Reads the gzip-compressed admissions and patients tables, merges them on
    ``subject_id`` and keeps only the columns that are meant to be known when
    the patient arrives.

    Returns:
        A Dask dataframe indexed by ``hadm_id``.
    """
    # gzip archives cannot be read in parallel blocks, hence blocksize=None.
    admissions = dd.read_csv(
        "mimic-iv-0.4/core/admissions.csv.gz",
        blocksize=None,
        dtype={'deathtime': 'object'},
    )
    patients = dd.read_csv("mimic-iv-0.4/core/patients.csv.gz", blocksize=None)

    # Columns retained after the merge; everything else is dropped.
    arrival_columns = [
        'subject_id', 'hadm_id', 'admittime', 'admission_type', 'admission_location',
        'insurance', 'marital_status', 'ethnicity', 'edregtime',
        'gender', 'anchor_age', 'anchor_year',
    ]

    merged = admissions.merge(patients, on="subject_id")
    return merged[arrival_columns].set_index("hadm_id")

### Split ethnicities

Create a dataframe for each ethnic group. The goal is to output one aggregated .csv for each ethnicity.

In [3]:
def split_ethnicities(core):
    """Print per-ethnicity row counts and split ``core`` into one frame per group.

    Args:
        core: Dataframe with an ``ethnicity`` column. The counts are
            materialised with ``.compute()``, so a Dask dataframe (or any
            object whose grouped count offers ``.compute()``) is expected.

    Returns:
        A tuple of eight filtered dataframes, in this fixed order:
        UNKNOWN, WHITE, OTHER, ASIAN, HISPANIC/LATINO,
        BLACK/AFRICAN AMERICAN, UNABLE TO OBTAIN,
        AMERICAN INDIAN/ALASKA NATIVE.
        Rows whose ethnicity is not one of these labels appear in none of
        the returned frames.
    """
    print("Number of data subjects per ethnicity:")

    # Number of rows (with a non-null ethnicity) per ethnicity label.
    count = core.groupby('ethnicity').ethnicity.count().compute()
    print(count)

    # The order of this tuple defines the order of the returned frames.
    ethnicity_labels = (
        'UNKNOWN',
        'WHITE',
        'OTHER',
        'ASIAN',
        'HISPANIC/LATINO',
        'BLACK/AFRICAN AMERICAN',
        'UNABLE TO OBTAIN',
        'AMERICAN INDIAN/ALASKA NATIVE',
    )

    # One filtered dataframe per ethnicity label.
    return tuple(core[core['ethnicity'] == label] for label in ethnicity_labels)

## Hosp Data

### Add icd
"International Statistical Classification of Diseases and Related Health Problems (ICD) serves a broad range of uses globally and provides critical knowledge on the extent, causes and consequences of human disease and death worldwide via data that is reported and coded with the ICD."

In [4]:
def add_icd(ethnic_group):
    """Attach per-admission ICD diagnosis codes to an ethnic-group dataframe.

    Reads the diagnoses table, collapses it to one row per ``hadm_id`` and
    left-merges the result onto ``ethnic_group``. The ICD description table
    and the procedures table are not joined here.

    Args:
        ethnic_group: Dataframe that can be merged on ``hadm_id``.

    Returns:
        ``ethnic_group`` extended with ``icd_code`` (list of codes for the
        admission) and ``icd_code_count`` (number of non-null codes).
    """
    # Source table columns:
    # ['subject_id', 'hadm_id', 'seq_num', 'icd_code', 'icd_version']
    codes = dd.read_csv(
        "mimic-iv-0.4/hosp/diagnoses_icd.csv.gz",
        blocksize=None,
        dtype={'icd_code': 'object'},
    )[['hadm_id', 'icd_code']]

    codes_by_admission = codes.groupby('hadm_id')

    # Number of codes and the list of codes for every hadm_id
    code_counts = codes_by_admission.count().rename(columns={'icd_code': 'icd_code_count'})
    code_lists = codes_by_admission.agg(list)
    icd_summary = code_lists.merge(code_counts, how='left', on='hadm_id')

    # Left merge keeps admissions that have no diagnosis rows
    return ethnic_group.merge(icd_summary, how='left', on='hadm_id')

### Add emar
"The primary function of eMAR is to electronically track and record resident administration of medication and treatments."

In [5]:
def add_emar(core_hosp_prelim):
    """Attach per-admission eMAR (medication administration) data.

    Reads the emar table, drops rows without a ``hadm_id``, aggregates the
    remaining rows per ``hadm_id`` and left-merges the aggregate onto
    ``core_hosp_prelim``.

    Args:
        core_hosp_prelim: Dataframe that can be merged on ``hadm_id``.

    Returns:
        ``core_hosp_prelim`` extended with:
        ``emar_count`` (number of non-null medication entries),
        ``emar_charttime`` (minimum charttime of the admission),
        ``emar_medications`` (list of medications) and
        ``emar_events`` (list of event texts).
    """
    # Source table columns:
    # ['subject_id', 'hadm_id', 'emar_id', 'emar_seq', 'poe_id', 'pharmacy_id', 'charttime', 'medication', 'event_txt', 'scheduletime', 'storetime']
    emar = dd.read_csv("mimic-iv-0.4/hosp/emar.csv")
    emar = emar[['hadm_id', 'charttime', 'medication', 'event_txt']]
    emar = emar.rename(columns={'charttime':'emar_charttime', 'medication':'emar_medication', 'event_txt':'emar_event'})

    # Drop NaN in hadm_id and convert to int64 (like all other tables).
    # `subset` is passed as a list: a bare string is only accepted by newer
    # pandas versions, a list works everywhere.
    emar = emar.dropna(subset=["hadm_id"])
    emar = emar.astype({'hadm_id': 'int64'})

    # Group once and reuse the grouping for every aggregate below
    emar_by_hadm_id = emar.groupby('hadm_id')

    # Count of non-null medication entries per hadm_id, as a dataframe
    emar_prelim = emar_by_hadm_id['emar_medication'].count().to_frame()
    emar_prelim = emar_prelim.rename(columns={'emar_medication':'emar_count'})

    # Earliest charttime per hadm_id (the original author noted that an
    # admission appeared to have a single charttime anyway; not verified here)
    emar_prelim['emar_charttime'] = emar_by_hadm_id['emar_charttime'].min()

    # Medication list for each hadm_id
    emar_prelim['emar_medications'] = emar_by_hadm_id['emar_medication'].agg(list)

    # Event list for each hadm_id
    emar_prelim['emar_events'] = emar_by_hadm_id['emar_event'].agg(list)

    # Left merge keeps admissions without any eMAR rows
    return core_hosp_prelim.merge(emar_prelim, how="left", on=["hadm_id"])

### Add labevents

Measurements obtained outside of the hospital from specimens such as blood or urine.

In [6]:
def add_labevents(core_hosp_prelim):
    """Attach per-admission lab event data.

    Reads the labevents table, drops rows without a ``hadm_id``, aggregates
    the remaining rows per ``hadm_id`` and left-merges the aggregate onto
    ``core_hosp_prelim``.

    Args:
        core_hosp_prelim: Dataframe that can be merged on ``hadm_id``.

    Returns:
        ``core_hosp_prelim`` extended with:
        ``lab_count`` (number of lab rows with a non-null flag; rows
        without a flag are not counted),
        ``lab_charttime`` (minimum charttime of the admission),
        ``lab_flag``, ``lab_priority`` and ``lab_comments`` (lists of the
        respective values for the admission).
    """
    # Source table columns:
    # ['labevent_id', 'subject_id', 'hadm_id', 'specimen_id', 'itemid','charttime', 'storetime', 'value', 'valuenum', 'valueuom','ref_range_lower', 'ref_range_upper', 'flag', 'priority', 'comments']
    labevents = dd.read_csv("mimic-iv-0.4/hosp/labevents.csv", dtype={'comments': 'object', 'hadm_id': 'float64', 'value': 'object', 'valueuom': 'object', 'flag': 'object'})
    labevents = labevents[['hadm_id', 'charttime', 'flag', 'priority', 'comments']]
    labevents = labevents.rename(columns={'charttime': 'lab_charttime', 'flag': 'lab_flag', 'priority': 'lab_priority', 'comments': 'lab_comments'})

    # Drop NaN in hadm_id and convert to int64 (like all other tables).
    # `subset` is passed as a list: a bare string is only accepted by newer
    # pandas versions, a list works everywhere.
    labevents = labevents.dropna(subset=["hadm_id"])
    labevents = labevents.astype({'hadm_id': 'int64'})

    # Group once and reuse the grouping for every aggregate below
    labevents_by_hadm_id = labevents.groupby('hadm_id')

    # Count of non-null flags per hadm_id, as a dataframe
    labevents_prelim = labevents_by_hadm_id['lab_flag'].count().to_frame()
    labevents_prelim = labevents_prelim.rename(columns={'lab_flag':'lab_count'})

    # Earliest charttime per hadm_id (the original author noted that an
    # admission appeared to have a single charttime anyway; not verified here)
    labevents_prelim['lab_charttime'] = labevents_by_hadm_id['lab_charttime'].min()

    # Flag list for each hadm_id
    labevents_prelim['lab_flag'] = labevents_by_hadm_id['lab_flag'].agg(list)

    # Priority list for each hadm_id
    labevents_prelim['lab_priority'] = labevents_by_hadm_id['lab_priority'].agg(list)

    # Comments list for each hadm_id
    labevents_prelim['lab_comments'] = labevents_by_hadm_id['lab_comments'].agg(list)

    # Left merge keeps admissions without any lab rows
    return core_hosp_prelim.merge(labevents_prelim, how="left", on=["hadm_id"])

## Run everything

In [7]:
# Dataframe with selected columns of core data
# (patients' age, ethnicity, etc.)
core = load_core_data()
print("Core data loaded...")

# One dataframe with core data per ethnic group
unknown, white, other, asian, hispanic_latino, black_african_american, unable_to_obtain, american_indian_alaska_native = split_ethnicities(core)
# List of dataframes
ethnic_groups = [unknown, white, other, asian, hispanic_latino, black_african_american, unable_to_obtain, american_indian_alaska_native]
# List of ethnic group names (same order as ethnic_groups)
ethnic_group_names = ['unknown', 'white', 'other', 'asian', 'hispanic_latino', 'black_african_american', 'unable_to_obtain', 'american_indian_alaska_native']

# Create the output directory up front so that a missing folder cannot make
# the save step fail after the aggregation work has been set up.
output_dir = "data/preprocessing_I"
os.makedirs(output_dir, exist_ok=True)

# Add hospital data to each ethnic group dataframe
# Save dataframes as separate csv files
for group_name, group_frame in zip(ethnic_group_names, ethnic_groups):
    # Add icd data
    # (classification of diseases)
    print("Adding icd data for group: ", group_name, "...")
    core_hosp_prelim = add_icd(group_frame)

    # Add emar data
    # (medication and treatment)
    print("Adding emar data for group: ", group_name, "...")
    core_hosp_prelim = add_emar(core_hosp_prelim)

    # Add lab data
    # (blood/urine/etc. inspection)
    print("Adding lab data for group: ", group_name, "...")
    core_hosp_prelim = add_labevents(core_hosp_prelim)

    # Save as .csv (one file per ethnic group)
    print("Saving .csv file for group: ", group_name, "...")
    path = output_dir + "/" + group_name + ".csv"
    core_hosp_prelim.to_csv(path, single_file = True)

Core data loaded...
Number of data subjects per ethnicity:
ethnicity
AMERICAN INDIAN/ALASKA NATIVE      1536
ASIAN                             24522
BLACK/AFRICAN AMERICAN            80526
HISPANIC/LATINO                   29887
OTHER                             26844
UNABLE TO OBTAIN                   3742
UNKNOWN                           19419
WHITE                            338044
Name: ethnicity, dtype: int64
Adding icd data for group:  unknown ...
Adding emar data for group:  unknown ...
Adding lab data for group:  unknown ...
Saving .csv file for group:  unknown ...
Adding icd data for group:  white ...
Adding emar data for group:  white ...
Adding lab data for group:  white ...
Saving .csv file for group:  white ...
Adding icd data for group:  other ...
Adding emar data for group:  other ...
Adding lab data for group:  other ...
Saving .csv file for group:  other ...
Adding icd data for group:  asian ...
Adding emar data for group:  asian ...
Adding lab data for group:  asian 