# PREDICTING SEPSIS RISK DURING IN-PATIENT ADMISSIONS
*Client: Royal Perth Hospital*

*Team: Group 7*

# Readme
The following libraries need to be installed in order to run the source code.

In [1]:
import pandas as pd
import numpy as np, warnings
from pathlib import Path
import os
import sweetviz as sv
from importlib import reload
import matplotlib.pyplot as plt
import seaborn as sns

np.warnings = warnings

In [2]:
import sys
# Add the parent directory to the module search path so the `src` package
# imported below can be resolved (presumably the notebook lives one level
# below the project root -- confirm against the repository layout).
sys.path.append("..")

# import utility functions
# reload() re-executes an already-imported module, so edits made to the helper
# files are picked up without restarting the kernel.
import src.utils as util
reload(util)

import src.load_parquet as load_parquet
reload(load_parquet)

# setup OS agnostic pathnames
# ROOT_DIR is the base from which every data path in this notebook is built.
ROOT_DIR = Path('..')

# Note: to run main notebook from root directory, use:
# ROOT_DIR = Path('')

# 1. Dataset Processing

Load the dataset. The following tables are loaded for this project:
- PATIENTS
- ADMISSIONS
- DIAGNOSES_ICD
- LABEVENTS
- MICROBIOLOGYEVENTS
- D_ICD_DIAGNOSES
- D_LABITEMS
- D_ITEMS

In [3]:
# Locations of the raw CSV tables; every file sits in the `data` folder under ROOT_DIR.
path_patients = ROOT_DIR.joinpath('data', 'PATIENTS.csv')
path_admissions = ROOT_DIR.joinpath('data', 'ADMISSIONS.csv')
path_diagnoses_icd = ROOT_DIR.joinpath('data', 'DIAGNOSES_ICD.csv')
path_labevents = ROOT_DIR.joinpath('data', 'LABEVENTS.csv')
path_microbiologyevents = ROOT_DIR.joinpath('data', 'MICROBIOLOGYEVENTS.csv')
# Description (lookup) tables
path_desc_icd = ROOT_DIR.joinpath('data', 'D_ICD_DIAGNOSES.csv')
path_desc_labitems = ROOT_DIR.joinpath('data', 'D_LABITEMS.csv')
path_desc_items = ROOT_DIR.joinpath('data', 'D_ITEMS.csv')

In [4]:
# Read the transaction tables, in the same order as their paths were declared.
(
    df_patients,
    df_admissions,
    df_diagnoses_icd,
    df_labevents,
    df_microbiologyevents,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        path_patients,
        path_admissions,
        path_diagnoses_icd,
        path_labevents,
        path_microbiologyevents,
    )
)

# Read the description (lookup) tables.
df_desc_icd, df_desc_labitems, df_desc_items = (
    pd.read_csv(csv_path)
    for csv_path in (path_desc_icd, path_desc_labitems, path_desc_items)
)

## 1.1 Data Cleaning

In [5]:
# Parse the timestamp columns below into pandas datetimes using one explicit format.
#   patients:           DOB (only the date part matters; the birth time is not used)
#   admissions:         ADMITTIME, DISCHTIME (EDREGTIME and EDOUTTIME are left unconverted)
#   labevents:          CHARTTIME
#   microbiologyevents: CHARTDATE, CHARTTIME
for _frame, _columns in (
    (df_patients, ['DOB']),
    (df_admissions, ['ADMITTIME', 'DISCHTIME']),
    (df_labevents, ['CHARTTIME']),
    (df_microbiologyevents, ['CHARTDATE', 'CHARTTIME']),
):
    for _column in _columns:
        _frame[_column] = pd.to_datetime(_frame[_column], format='%Y-%m-%d %H:%M:%S')

The DIAGNOSES_ICD table has a column, ICD9_CODE, which holds the code of each disease diagnosed for the patient.

Sepsis is represented by 6 codes: ['77181', '99591', '99592', '67020', '67022', '67024']

We'll introduce a new column, IS_SEPSIS, as a binary indicator (1 if the ICD9 code is one of the 6 sepsis codes, 0 otherwise) that serves as the target variable.

In [6]:
# retrieve all sepsis icd codes: every entry of the ICD description table whose
# SHORT_TITLE mentions "sepsis" (case-insensitive). A missing title counts as
# "no match" instead of raising.
sepsis_icd = df_desc_icd.loc[
    df_desc_icd['SHORT_TITLE'].str.lower().str.contains('sepsis', regex=False, na=False),
    'ICD9_CODE',
].values

# add new binary classifier target variable: 1 when the diagnosis code is one of
# the sepsis codes, 0 otherwise. A vectorised membership test replaces the
# row-by-row apply, which is very slow on a table of this size.
df_diagnoses_icd['IS_SEPSIS'] = df_diagnoses_icd['ICD9_CODE'].isin(sepsis_icd).astype('int64')

In [7]:
# helper: does a given admission carry at least one sepsis diagnosis?
def check_sepsis(subject_id, hadm_id, df_diagnoses_icd):
    """Return 1 if the admission has any sepsis diagnosis, else 0.

    Parameters
    ----------
    subject_id : value compared against the SUBJECT_ID column
    hadm_id : value compared against the HADM_ID column
    df_diagnoses_icd : DataFrame with SUBJECT_ID, HADM_ID and IS_SEPSIS columns

    Returns
    -------
    int
        1 when the IS_SEPSIS values of the matching rows sum to more than 0,
        otherwise 0 (also when no row matches).
    """
    same_subject = df_diagnoses_icd['SUBJECT_ID'] == subject_id
    same_admission = df_diagnoses_icd['HADM_ID'] == hadm_id
    sepsis_flags = df_diagnoses_icd.loc[same_subject & same_admission, 'IS_SEPSIS']
    if sum(sepsis_flags) > 0:
        return 1
    return 0

In [8]:
# create demographic df and save to csv file for future load
pathname_demographic = ROOT_DIR / 'data' / 'demographic.csv'
if pathname_demographic.exists():
    # A cached copy from a previous run exists: reuse it. Testing for the file
    # explicitly (instead of a bare `except:`) lets genuine read errors surface
    # rather than silently triggering a rebuild.
    df_demographic = pd.read_csv(pathname_demographic)
else:
    # merge the patients and admission tables to a demographic dataframe
    df_demographic = pd.merge(df_admissions, df_patients[['SUBJECT_ID', 'GENDER', 'DOB', 'EXPIRE_FLAG']], on='SUBJECT_ID')
    # create an age column to each case: the admission-minus-birth date difference is
    # floor-divided by 365 and then expressed in days, i.e. approximate whole years.
    # NOTE(review): the subtraction is done on python `date` objects, presumably to avoid
    # datetime64 overflow for very distant birth dates -- confirm before simplifying.
    df_demographic['AGE'] = (((df_demographic['ADMITTIME'].dt.date - df_demographic['DOB'].dt.date) // 365) / pd.Timedelta(days=1)).astype('int16')
    # add column IS_SEPSIS to demographic data indicating which case is diagnosed with sepsis.
    # One groupby over the diagnoses replaces a per-admission scan of the whole table:
    # an admission is flagged when its IS_SEPSIS values sum to more than 0.
    sepsis_per_admission = (
        df_diagnoses_icd.groupby(['SUBJECT_ID', 'HADM_ID'])['IS_SEPSIS']
        .sum()
        .gt(0)
        .astype('int64')
        .reset_index()
    )
    df_demographic = df_demographic.merge(sepsis_per_admission, on=['SUBJECT_ID', 'HADM_ID'], how='left')
    # admissions without any diagnosis row get 0
    df_demographic['IS_SEPSIS'] = df_demographic['IS_SEPSIS'].fillna(0).astype('int64')
    util.save_csv(df_demographic, pathname_demographic)

# convert admittime and dischtime to datetime (they are plain strings when the
# table comes from the cached CSV)
df_demographic['ADMITTIME'] = pd.to_datetime(df_demographic['ADMITTIME'], format='%Y-%m-%d %H:%M:%S')
df_demographic['DISCHTIME'] = pd.to_datetime(df_demographic['DISCHTIME'], format='%Y-%m-%d %H:%M:%S')

In [9]:
# create age category
# NOTE(review): categorise_age lives in src/utils.py, which is not shown here; presumably it
# derives an age-band column from AGE and returns the frame -- confirm against the helper.
df_demographic = util.categorise_age(df_demographic)

In [12]:
# Replace every AGE above 100 with median_age.
# NOTE(review): median_age is not defined in any cell visible in this notebook (the execution
# counts jump from In [9] to In [12]), so this line raises NameError on a fresh kernel unless
# the cell that computed it is restored -- confirm how it was derived.
df_demographic.loc[df_demographic.AGE>100, 'AGE'] = median_age

Load unpivoted labevents dataframe:

In [13]:
# Folder holding the transformed labevents parquet files, read through the project loader
output_dir = ROOT_DIR.joinpath('data', 'labevents_transform_parquet')
df_labevents_processed = load_parquet.load_labevents_dask(output_dir)

2023-09-16 01:26:59.686586 Start


2023-09-16 01:27:00.299363 2045381
2023-09-16 01:27:03.987933 End


## Extracting Dataset

### 1. t0

In [45]:
# Build the t = 0 input table: one row per admission, one column per key lab test.
# NOTE(review): combined_df is not defined in any cell visible in this notebook -- confirm
# which cell produces it before running on a fresh kernel.
key_features = [
    51279, 51222, 51221, 50931, 51006,
    51244, 51256, 51301, 51274, 50912,
    51237, 50893, 50882, 50813, 50821,
    51265, 50820, 50818, 51275, 51493,
]

# Keep the first test (TEST_NO == 1) of every key lab item, restricted to the relevant columns
is_key_item = combined_df['ITEMID'].isin(key_features)
is_first_test = combined_df['TEST_NO'] == 1
key_df = combined_df.loc[
    is_key_item & is_first_test,
    ['SUBJECT_ID', 'HADM_ID', 'ITEMID', 'VALUENUM', 'IS_SEPSIS', 'TEST_NO', 'UPDATED_STAY'],
]

# Attach the label and fluid of each lab item
key_df = key_df.merge(
    df_desc_labitems[['ITEMID', 'LABEL', 'FLUID']],
    on=['ITEMID'],
    how='left',
)

# Keep the measured value only when UPDATED_STAY == 0; otherwise use the -999 sentinel
key_df['NEW_VALUENUM'] = np.where(key_df['UPDATED_STAY'] == 0, key_df['VALUENUM'], -999)
# Readable column key: <FLUID>_<LABEL>_<ITEMID>
key_df['ITEMID'] = (
    key_df['FLUID'] + '_' + key_df['LABEL'] + '_' + key_df['ITEMID'].astype(str)
)

# Reshape to wide format: one column per lab item, taking the first value per cell
key_df_pivot = pd.pivot_table(
    key_df,
    index=['SUBJECT_ID', 'HADM_ID', 'IS_SEPSIS', 'TEST_NO', 'UPDATED_STAY'],
    columns='ITEMID',
    values='NEW_VALUENUM',
    aggfunc='first',
).reset_index()

# Drop the leftover "ITEMID" name on the column axis
key_df_pivot.columns.name = None


In [46]:
# Fill -999 to missing values in the lab-test columns. The first five columns are the
# identifier columns (SUBJECT_ID, HADM_ID, IS_SEPSIS, TEST_NO, UPDATED_STAY); every
# column after them is a lab test.
columns_to_replace = key_df_pivot.columns[5:]

# Assign the filled block back to the frame. Calling fillna(inplace=True) on a column
# selected with key_df_pivot[col] works on an intermediate object: pandas warns about
# this chained assignment and, with Copy-on-Write enabled, the frame is left unchanged.
key_df_pivot[columns_to_replace] = key_df_pivot[columns_to_replace].fillna(-999)

# Removes irrelevant columns
key_df_pivot = key_df_pivot.drop(columns=['TEST_NO', 'UPDATED_STAY'])

In [47]:
# Attach the age and gender of each admission to the t0 lab features
t0_df = key_df_pivot.merge(
    df_demographic[['SUBJECT_ID', 'HADM_ID', 'AGE', 'GENDER']],
    on=['SUBJECT_ID', 'HADM_ID'],
    how='left',
)

# Encode GENDER as a binary number (M -> 0, F -> 1). Series.map yields a numeric column
# directly, whereas replace() on a string column depends on pandas' deprecated automatic
# downcasting to end up numeric. Any value other than 'M'/'F' becomes NaN.
t0_df['GENDER_NUM'] = t0_df['GENDER'].map({'M': 0, 'F': 1})
t0_df = t0_df.drop(columns=["GENDER"])
# Keep adult admissions only
t0_df = t0_df[t0_df['AGE'] >= 18]
t0_df

Unnamed: 0,SUBJECT_ID,HADM_ID,IS_SEPSIS,Blood_Bicarbonate_50882,"Blood_Calcium, Total_50893",Blood_Creatinine_50912,Blood_Glucose_50931,Blood_Hematocrit_51221,Blood_Hemoglobin_51222,Blood_INR(PT)_51237,...,Blood_Platelet Count_51265,Blood_Red Blood Cells_51279,Blood_Urea Nitrogen_51006,Blood_White Blood Cells_51301,Blood_pCO2_50818,Blood_pH_50820,Blood_pO2_50821,Urine_RBC_51493,AGE,GENDER_NUM
1,3,145834,0,25.00,8.20,3.20,91.00,30.20,10.00,1.30,...,282.00,3.44,53.00,12.70,-999.00,-999.00,-999.00,-999.00,76,0
2,4,185777,0,24.00,8.90,0.50,140.00,34.20,11.50,-999.00,...,207.00,3.80,9.00,9.70,-999.00,-999.00,-999.00,-999.00,47,1
4,6,107064,0,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,...,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,65,1
7,9,150750,0,30.00,-999.00,1.20,129.00,45.20,15.40,1.10,...,258.00,5.46,16.00,7.50,-999.00,-999.00,-999.00,-999.00,41,0
9,11,194540,0,25.00,-999.00,0.70,121.00,36.90,12.50,1.10,...,229.00,4.31,12.00,8.50,-999.00,-999.00,-999.00,-999.00,50,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54780,99985,176670,1,-999.00,-999.00,-999.00,-999.00,36.40,12.40,-999.00,...,349.00,4.20,-999.00,13.00,-999.00,-999.00,-999.00,-999.00,53,0
54781,99991,151118,1,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,...,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,47,0
54782,99992,197084,0,22.00,7.10,1.20,206.00,12.90,4.70,1.10,...,188.00,1.35,56.00,21.40,-999.00,-999.00,-999.00,-999.00,65,1
54783,99995,137810,0,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,...,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,88,1


In [48]:
# Write the t0 table into the project's data folder. The path is built from ROOT_DIR so the
# notebook runs on any machine instead of depending on one user's absolute directory.
t0_df.to_csv(ROOT_DIR / 'data' / 't0_v3.csv', index=False)

### 2. t4

In [49]:
def get_admittime(x):
  """Return the effective admission time of one admission's lab rows.

  `x` is a DataFrame with CHARTTIME and ADMITTIME columns. The row with the
  earliest CHARTTIME is inspected: if that chart time precedes the row's
  ADMITTIME, the chart time is returned, otherwise the ADMITTIME.
  """
  earliest_row = x.sort_values(['CHARTTIME']).iloc[0]
  first_charttime = earliest_row.CHARTTIME
  recorded_admittime = earliest_row.ADMITTIME
  return first_charttime if first_charttime < recorded_admittime else recorded_admittime

# Read the candidate lab events from file and keep the ITEMIDs flagged with GPT == 'Y'
pontential_events = pd.read_csv(ROOT_DIR / 'data/potential_events.csv')
# pontential_events['ITEMID'] = pontential_events['ITEMID'].str.replace('"', '')
feature_list = pontential_events.loc[pontential_events['GPT'] == 'Y', 'ITEMID']

# get all labevents rows filtered by itemid, t=n
def create_labevents_processed(hours=4):
  """Return the selected lab events recorded within `hours` hours of admission.

  Reads the notebook-level frames df_labevents, df_demographic and
  df_desc_labitems, and the feature_list of ITEMIDs.

  Parameters
  ----------
  hours : number, default 4
      Rows are kept when their TIME (hours since the effective admission
      time, rounded up) is less than or equal to this value.

  Returns
  -------
  DataFrame
      One row per retained lab event, with the demographic columns, the
      NEW_ADMITTIME and TIME columns, the LABEL/FLUID of the lab item, and
      GENDER_NUM in place of GENDER.
  """
  demographic_cols = ['SUBJECT_ID', 'HADM_ID', 'AGE', 'GENDER', 'ADMITTIME', 'IS_SEPSIS']
  potential_cases = df_labevents[df_labevents.ITEMID.isin(feature_list)].merge(
      df_demographic[demographic_cols], on=['SUBJECT_ID', 'HADM_ID'])

  # remove all patients with AGE < 18
  potential_cases = potential_cases[potential_cases['AGE'] >= 18]

  # if CHARTTIME < ADMITTIME => ADMITTIME = CHARTTIME
  # (a lab event charted before the recorded admission moves the reference time back)
  new_admittime = (
      potential_cases.groupby(['SUBJECT_ID', 'HADM_ID'])
      .apply(get_admittime)
      .reset_index(name='NEW_ADMITTIME')
  )
  potential_cases = potential_cases.merge(new_admittime, on=['SUBJECT_ID', 'HADM_ID'])

  # calculate TIME of test since ADMITTIME, in hours rounded up
  elapsed = potential_cases.CHARTTIME - potential_cases.NEW_ADMITTIME
  potential_cases['TIME'] = np.ceil(elapsed.dt.total_seconds() / 3600)
  potential_cases = potential_cases[(potential_cases.TIME <= hours)]
  potential_cases = potential_cases.merge(df_desc_labitems[['ITEMID', 'LABEL', 'FLUID']], on=['ITEMID'])

  # Encode GENDER as a binary number (M -> 0, F -> 1). Series.map yields a numeric column
  # directly, whereas replace() on a string column depends on pandas' deprecated automatic
  # downcasting to end up numeric. Any value other than 'M'/'F' becomes NaN.
  potential_cases['GENDER_NUM'] = potential_cases['GENDER'].map({'M': 0, 'F': 1})
  potential_cases = potential_cases.drop(columns=["GENDER"])

  return potential_cases

# Lab events within the default window (hours=4); this frame feeds the t4 feature table
a = create_labevents_processed()

In [53]:
def compute_feature_values(x):
  """Return the VALUENUM of the most recent row that has a VALUE.

  Parameters
  ----------
  x : DataFrame
      Lab rows with CHARTTIME, VALUE and VALUENUM columns. The frame is not
      modified.

  Returns
  -------
  The VALUENUM of the row with the latest CHARTTIME among rows whose VALUE
  is not missing (this may itself be NaN), or -999 when no row has a VALUE.

  Raises
  ------
  KeyError
      If a required column is absent. This is no longer hidden behind the
      -999 sentinel.
  """
  # Sort a copy: an in-place sort would reorder the caller's frame as a side effect
  latest_first = x.sort_values(['CHARTTIME'], ascending=False)
  with_value = latest_first.loc[latest_first['VALUE'].notna()]
  if with_value.empty:
    return -999
  return with_value.iloc[0]['VALUENUM']

# Start from one row per admission carrying the demographic columns and the target
df_final = a.drop_duplicates(subset=['SUBJECT_ID', 'HADM_ID'], ignore_index=True)
df_final = df_final[['SUBJECT_ID', 'HADM_ID', 'AGE', 'IS_SEPSIS','GENDER_NUM']]

# Add one column per lab item holding the value chosen by compute_feature_values
for itemid in feature_list:
  new_col = f'ITEMID_{itemid}'
  item_rows = a[a.ITEMID == itemid]
  per_admission = item_rows.groupby(['SUBJECT_ID', 'HADM_ID', 'ITEMID']).apply(compute_feature_values)
  temp_df = per_admission.reset_index(name=new_col)
  df_final = df_final.merge(
      temp_df[['SUBJECT_ID', 'HADM_ID', new_col]],
      on=['SUBJECT_ID', 'HADM_ID'],
      how='left',
  )

# Admissions without a value for an item receive the -999 sentinel
df_final = df_final.fillna(-999)
#my_util.save_csv(df_final, ROOT_DIR / 'data/output_csv/train_data.csv')

In [54]:
# Preview the t4 feature table built in the previous cell
df_final


Unnamed: 0,SUBJECT_ID,HADM_ID,AGE,IS_SEPSIS,GENDER_NUM,ITEMID_51279,ITEMID_51222,ITEMID_51221,ITEMID_50931,ITEMID_51006,...,ITEMID_50912,ITEMID_51237,ITEMID_50893,ITEMID_50882,ITEMID_50813,ITEMID_50821,ITEMID_51265,ITEMID_50820,ITEMID_50818,ITEMID_51275
0,3,145834.00,76,0,0,2.79,8.20,25.60,281.00,43.00,...,2.50,1.70,7.40,11.00,8.00,329.00,253.00,7.26,28.00,46.40
1,4,185777.00,47,0,1,3.80,11.50,34.20,140.00,9.00,...,0.50,1.00,8.90,24.00,-999.00,-999.00,207.00,-999.00,-999.00,31.30
2,9,150750.00,41,0,0,5.46,15.40,45.20,129.00,16.00,...,1.20,1.10,-999.00,30.00,-999.00,-999.00,258.00,-999.00,-999.00,21.70
3,11,194540.00,50,0,1,4.31,12.50,36.90,121.00,12.00,...,0.70,1.10,-999.00,25.00,-999.00,-999.00,229.00,-999.00,-999.00,28.30
4,13,143045.00,39,0,1,4.08,12.30,35.60,169.00,13.00,...,0.60,1.20,8.90,23.00,-999.00,-999.00,216.00,-999.00,-999.00,44.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46059,8666,129517.00,43,0,0,-999.00,-999.00,-999.00,-999.00,-999.00,...,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,7.41,-999.00,-999.00
46060,11861,120052.00,21,0,1,-999.00,-999.00,-999.00,-999.00,-999.00,...,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,7.22,-999.00,-999.00
46061,13542,115692.00,84,0,1,-999.00,-999.00,-999.00,-999.00,-999.00,...,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,7.51,-999.00,-999.00
46062,25337,116249.00,81,0,1,-999.00,-999.00,-999.00,-999.00,-999.00,...,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,7.44,-999.00,-999.00


In [55]:
# Write the t4 table into the project's data folder. The path is built from ROOT_DIR so the
# notebook runs on any machine instead of depending on one user's absolute directory.
df_final.to_csv(ROOT_DIR / 'data' / 't4.csv', index=False)

### 3. t2

In [57]:
# Same construction as the t4 table, restricted to lab events within 2 hours of admission
a = create_labevents_processed(2)

# One row per admission carrying the demographic columns and the target.
# NOTE(review): the column order here (AGE, GENDER_NUM, IS_SEPSIS) differs from the t4 cell
# (AGE, IS_SEPSIS, GENDER_NUM); harmless if downstream code selects columns by name -- confirm.
df_final = a.drop_duplicates(subset=['SUBJECT_ID', 'HADM_ID'], ignore_index=True)
df_final = df_final[['SUBJECT_ID', 'HADM_ID', 'AGE', 'GENDER_NUM', 'IS_SEPSIS']]

# Add one column per lab item holding the value chosen by compute_feature_values
for itemid in feature_list:
  new_col = f'ITEMID_{itemid}'
  item_rows = a[a.ITEMID == itemid]
  per_admission = item_rows.groupby(['SUBJECT_ID', 'HADM_ID', 'ITEMID']).apply(compute_feature_values)
  temp_df = per_admission.reset_index(name=new_col)
  df_final = df_final.merge(
      temp_df[['SUBJECT_ID', 'HADM_ID', new_col]],
      on=['SUBJECT_ID', 'HADM_ID'],
      how='left',
  )

# Admissions without a value for an item receive the -999 sentinel
df_final = df_final.fillna(-999)
df_final

Unnamed: 0,SUBJECT_ID,HADM_ID,AGE,GENDER_NUM,IS_SEPSIS,ITEMID_51279,ITEMID_51222,ITEMID_51221,ITEMID_50931,ITEMID_51006,...,ITEMID_50912,ITEMID_51237,ITEMID_50893,ITEMID_50882,ITEMID_50813,ITEMID_50821,ITEMID_51265,ITEMID_50820,ITEMID_50818,ITEMID_51275
0,3,145834.00,76,0,0,3.44,10.00,30.20,91.00,53.00,...,3.20,1.30,8.20,25.00,-999.00,-999.00,282.00,-999.00,-999.00,30.70
1,4,185777.00,47,1,0,3.80,11.50,34.20,140.00,9.00,...,0.50,1.00,8.90,24.00,-999.00,-999.00,207.00,-999.00,-999.00,31.30
2,9,150750.00,41,0,0,5.46,15.40,45.20,129.00,16.00,...,1.20,1.10,-999.00,30.00,-999.00,-999.00,258.00,-999.00,-999.00,21.70
3,11,194540.00,50,1,0,4.31,12.50,36.90,121.00,12.00,...,0.70,1.10,-999.00,25.00,-999.00,-999.00,229.00,-999.00,-999.00,28.30
4,13,143045.00,39,1,0,4.08,12.30,35.60,169.00,13.00,...,0.60,1.20,8.90,23.00,-999.00,-999.00,216.00,-999.00,-999.00,44.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42739,2136,174241.00,64,0,0,-999.00,-999.00,-999.00,-999.00,-999.00,...,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,7.23,-999.00,-999.00
42740,11861,120052.00,21,1,0,-999.00,-999.00,-999.00,-999.00,-999.00,...,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,7.22,-999.00,-999.00
42741,25337,116249.00,81,1,0,-999.00,-999.00,-999.00,-999.00,-999.00,...,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,7.44,-999.00,-999.00
42742,30268,199151.00,42,0,0,-999.00,-999.00,-999.00,-999.00,-999.00,...,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,-999.00,7.36,-999.00,-999.00


In [58]:
# Write the t2 table into the project's data folder. The path is built from ROOT_DIR so the
# notebook runs on any machine instead of depending on one user's absolute directory.
df_final.to_csv(ROOT_DIR / 'data' / 't2.csv', index=False)