# 23-08-11

An initial exploration of the MIMIC-III / synthesised RPH dataset.

In [5]:
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
# List the contents of the current working directory
!ls

In [6]:
# Base directory for every data path below. Built with pathlib so the path
# separators stay OS-agnostic; '..' is relative to the working directory.
ROOT_DIR = Path('..')

# Note: to run the main notebook from the root directory, use instead:
#ROOT_DIR = Path('')

## Loading Data

**ADMISSIONS**: 
* Each row corresponds to a specific hospital admission, uniquely identified by `HADM_ID`

**DIAGNOSES_ICD**: 
* Each row corresponds to a diagnosis code associated with a specific hospital admission. 
* `SEQ_NUM` indicates the order in which the diagnosis codes were recorded during that specific patient's admission. 
* We could use this to provide context about the chronological sequence of diagnoses given to a patient during their hospital stay. We would have to look at what point in time these were updated.

In [8]:
# ADMISSIONS table: patient admissions (58976 recorded admissions)
df_admissions = pd.read_csv(ROOT_DIR.joinpath('data', 'ADMISSIONS.csv'))

In [9]:
# DIAGNOSES_ICD table: diagnosis codes recorded against each admission
df_disease = pd.read_csv(ROOT_DIR.joinpath('data', 'DIAGNOSES_ICD.csv'))

In [10]:
# D_ICD_DIAGNOSES table: dictionary giving the titles for each ICD9 code
df_ICD_desc = pd.read_csv(ROOT_DIR.joinpath('data', 'D_ICD_DIAGNOSES.csv'))

In [11]:
# LABEVENTS holds the lab results; D_LABITEMS is the dictionary of lab items
df_labevents = pd.read_csv(ROOT_DIR.joinpath('data', 'LABEVENTS.csv'))
df_lab_desc = pd.read_csv(ROOT_DIR.joinpath('data', 'D_LABITEMS.csv'))

In [None]:
# Attach the ICD9 descriptions to every diagnosis row. This is an inner join
# (the pandas default), so diagnosis rows whose code has no dictionary entry
# are dropped.
full_df = df_disease.merge(df_ICD_desc, how='inner', on='ICD9_CODE')
full_df

## ICD9 Codes

A single admission can have multiple ICD codes associated with it. We will filter for ICD9 codes related to sepsis and identify the proportion of patients with sepsis.


In [None]:
# All diagnosis rows recorded for admission 172335
df_disease.loc[df_disease['HADM_ID'].eq(172335)]

In [12]:
# ICD9 codes related to sepsis.
# Vectorised, case-insensitive match on the short title; na=False treats a
# missing SHORT_TITLE as "no match" instead of raising inside a per-row lambda.
df_ICD_desc[df_ICD_desc['SHORT_TITLE'].str.contains('sepsis', case=False, na=False)]

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
9049,9050,77181,NB septicemia [sepsis],Septicemia [sepsis] of newborn
10304,11403,99591,Sepsis,Sepsis
10305,11404,99592,Severe sepsis,Severe sepsis
13293,13564,67020,Puerperal sepsis-unsp,"Puerperal sepsis, unspecified as to episode of..."
13294,13565,67022,Puerprl sepsis-del w p/p,"Puerperal sepsis, delivered, with mention of p..."
13295,13566,67024,Puerperl sepsis-postpart,"Puerperal sepsis, postpartum condition or comp..."


In [14]:
# Store the sepsis-related ICD-9 codes in an array.
# The title match is done column-wise (case-insensitive substring 'seps')
# rather than with a row-wise apply; na=False skips rows with no short title.
icd_sepsis = df_ICD_desc.loc[
    df_ICD_desc['SHORT_TITLE'].str.contains('seps', case=False, na=False, regex=False),
    'ICD9_CODE',
].to_numpy()
icd_sepsis

array(['77181', '99591', '99592', '67020', '67022', '67024'], dtype=object)

In [15]:
# Filter the diagnosis rows down to those carrying a sepsis code.
# Series.isin performs the membership test in one vectorised pass, replacing a
# Python-level lambda that scanned the code array once for every row.
df_t01 = df_disease[df_disease['ICD9_CODE'].isin(icd_sepsis)]

In [18]:
# Preview the first rows of the sepsis-coded diagnosis subset
df_t01.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
87,1547,117,164853,16.0,99592
144,1604,124,138376,6.0,99592
276,505,64,172056,3.0,99591
450,679,85,112077,18.0,99591
497,726,92,142807,2.0,77181


In [None]:
# First figure: number of diagnosis rows carrying a sepsis code.
# Second figure: number of distinct SUBJECT_ID values among those rows.
print(len(df_t01.index), 'incidences of sepsis')
print(len(df_t01['SUBJECT_ID'].unique()), 'individuals that have had sepsis')

In [16]:
# Patients who have been admitted on multiple occasions.
# Count the admissions per SUBJECT_ID once, then count how many patients have
# two or more; this avoids building the same groupby/filter frame twice.
admissions_per_patient = df_admissions['SUBJECT_ID'].value_counts()
print(int((admissions_per_patient >= 2).sum()), 'patients have been admitted on multiple occassions')

7537 patients have been admitted on multiple occassions


In [None]:
# Getting patients who have had sepsis.
# case=False so that titles spelled 'Sepsis' are matched as well as 'sepsis';
# na=False keeps a missing title from producing an invalid boolean mask.
full_df.loc[
    full_df['SHORT_TITLE'].str.contains('sepsis', case=False, na=False),
    'SUBJECT_ID',
].unique()

In [None]:
# Merge with admissions, because we don't have a time variable

In [None]:
# multiple admissions per patient. Each admission has multiple ICD Codes
# We need to use graph analysis to identify pathways

## Sweetviz EDA

In [None]:
import sweetviz

In [None]:
# Profile only the first 10000 lab events instead of processing all instances
# (the full-frame call would be sweetviz.analyze(df_labevents)).
# The lab-event frame in this notebook is `df_labevents`; `df_lab` is not
# defined anywhere, so referencing it raises NameError.

analysis = sweetviz.analyze(df_labevents.iloc[:10000])

In [None]:
# Show the Sweetviz report inside the notebook.
# Click 'association' for all the correlation matrices

analysis.show_notebook()

# LABEVENTS

Abnormalities in test results are referenced in the `FLAG` feature. 

In [None]:
# Display the lab-item dictionary loaded from D_LABITEMS.csv
df_lab_desc

In [None]:
# Attach the lab-item dictionary entry to each lab event via the shared ITEMID key
df_labevents.merge(df_lab_desc, how='inner', on='ITEMID')

In [None]:
# Reference-range value:
# - FLAG is the indicator for abnormalities
# - Open question: is there a reference range field in MIMIC-III?

# List the distinct values that FLAG takes
df_labevents['FLAG'].unique()

In [None]:
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3307229/

# Proportion of each flag label in lab events. The denominator is the total
# number of rows, so rows without any flag are included in it.
print("Proportion of abnormal and delta flagged tests: ", df_labevents['FLAG'].value_counts() / df_labevents.shape[0])


# Abnormal flags are recorded when comparing lab values against threshold ranges. The threshold ranges are not static, may vary based on reagents, and unfortunately are not known to us at present.

# Select the delta-flagged events.
# A row with no flag holds NaN, and `'delta' in NaN` raises TypeError, so the
# NaN-aware vectorised matcher is used instead of a row-wise lambda.
df_labevents[df_labevents['FLAG'].str.contains('delta', na=False, regex=False)]

# LABITEMS
`MICROBIOLOGYEVENTS.csv` tells you about organisms in the blood.

In [None]:

# Distinct lab item identifiers present in the lab-item dictionary
df_lab_desc['ITEMID'].unique()

In [None]:
# Join the lab events to an item table, matching SPEC_ITEMID on the left to
# ITEM_ID on the right.
# NOTE(review): `df_items` is not defined in any cell visible here, and the
# two key column names could not be checked against the loaded frames —
# confirm both before relying on this cell.
pd.merge(df_labevents, df_items, left_on='SPEC_ITEMID', right_on='ITEM_ID')

In [None]:
# Items whose LINKSTO value is 'microbiologyevents'
df_items.loc[df_items['LINKSTO'].eq('microbiologyevents')]

In [None]:
# CAREVU is the bedside monitor

In [None]:
# Distinct LINKSTO values in the item table
df_items['LINKSTO'].unique()

In [None]:
# Sepsis patients: diagnosis rows whose short title mentions sepsis.
# Vectorised, case-insensitive match; na=False treats a missing title as
# "no match" rather than raising inside a per-row lambda.
df_sepsis = full_df[full_df['SHORT_TITLE'].str.contains('sepsis', case=False, na=False)]
df_sepsis

In [None]:
# NOTE(review): this cell looks unfinished. `full_df[full_df]` indexes the
# frame with itself rather than with a boolean condition, and `.apply()` is
# called without the function argument it requires.
# Presumably the goal is a summary per (SUBJECT_ID, HADM_ID) pair — TODO confirm
# the intended filter and aggregation.
entity_relation = full_df[full_df]
entity_relation.groupby(['SUBJECT_ID','HADM_ID']).apply()

# Missing Data

## HADM_ID

In [None]:
# Lab events with no HADM_ID. .copy() yields an independent frame, so the
# HADM_ID assignment further down does not write into a slice of df_labevents
# (which triggers SettingWithCopyWarning).
missing_hadm_df = df_labevents[df_labevents['HADM_ID'].isnull()].copy()

# Join admission details onto the lab events that already have a HADM_ID.
# The admissions frame in this notebook is named `df_admissions`.
joined_df = pd.merge(df_labevents[df_labevents['HADM_ID'].notnull()], df_admissions, on=['SUBJECT_ID', 'HADM_ID'], how='left')

def find_hadm_id(row):
    """Infer which admission a lab event belongs to from its chart time.

    Args:
        row: a lab-event row providing 'SUBJECT_ID' and 'CHARTTIME'.

    Returns:
        The HADM_ID of the first admission of the same subject (in
        df_admissions row order) whose ADMITTIME..DISCHTIME window contains
        CHARTTIME, inclusive at both ends. None when no admission matches or
        when CHARTTIME is missing.
    """
    chart_time = row['CHARTTIME']
    if pd.isna(chart_time):
        # No timestamp to place inside an admission window.
        return None

    possible_admissions = df_admissions[df_admissions['SUBJECT_ID'] == row['SUBJECT_ID']]
    # NOTE(review): timestamps are compared in the form read_csv loaded them
    # (no date parsing is requested in the load cells); this assumes that form
    # orders chronologically — confirm.
    in_window = (
        (possible_admissions['ADMITTIME'] <= chart_time)
        & (chart_time <= possible_admissions['DISCHTIME'])
    )
    matches = possible_admissions.loc[in_window, 'HADM_ID']
    return matches.iloc[0] if not matches.empty else None

# Fill in HADM_ID for the rows that lacked one; it stays missing where no
# admission window contains the event.
# NOTE(review): this runs find_hadm_id once per row, which may be slow on the
# full LABEVENTS table.
missing_hadm_df['HADM_ID'] = missing_hadm_df.apply(find_hadm_id, axis=1)

# Combine the dataframes. joined_df carries the extra admission columns from
# the merge; those columns are empty for rows coming from missing_hadm_df.
final_df = pd.concat([joined_df, missing_hadm_df], ignore_index=True)

print(final_df)