# PREDICTING SEPSIS RISK DURING IN-PATIENT ADMISSIONS
*Client: Royal Perth Hospital*

*Team: Group 7*

# Readme
The following libraries need to be installed in order to run the source code.

In [1]:
import os
import pathlib
import warnings

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
import sweetviz as sv

# Expose the standard-library `warnings` module under the name `np.warnings`.
# NOTE(review): presumably a compatibility shim for a dependency (likely sweetviz) that still
# looks up `np.warnings`, which newer NumPy releases no longer provide — confirm.
np.warnings = warnings

In [62]:
# Set up OS-agnostic path names.
# `Path` is reached through the imported `pathlib` module: only `import pathlib`
# is in scope, so a bare `Path(...)` would raise NameError.
ROOT_DIR = pathlib.Path('..')

# Note: to run the main notebook from the root directory, use:
# ROOT_DIR = pathlib.Path('')

# Event tables: every input CSV is read from <ROOT_DIR>/data.
path_patients = ROOT_DIR / 'data' / "PATIENTS.csv"
path_admissions = ROOT_DIR / 'data' / "ADMISSIONS.csv"
path_diagnoses_icd = ROOT_DIR / 'data' / "DIAGNOSES_ICD.csv"
path_labevents = ROOT_DIR / 'data' / "LABEVENTS.csv"
path_microbiologyevents = ROOT_DIR / 'data' / "MICROBIOLOGYEVENTS.csv"
# Description (dictionary) tables.
path_desc_icd = ROOT_DIR / 'data' / "D_ICD_DIAGNOSES.csv"
path_desc_labitems = ROOT_DIR / 'data' / "D_LABITEMS.csv"
path_desc_items = ROOT_DIR / 'data' / "D_ITEMS.csv"

# 1.Dataset Processing

## 1.1 Data Cleaning

Load the dataset. The following tables are loaded for this project:
- PATIENTS
- ADMISSIONS
- DIAGNOSES_ICD
- LABEVENTS
- MICROBIOLOGYEVENTS
- D_ICD_DIAGNOSES
- D_LABITEMS
- D_ITEMS

In [63]:
# Event ("transaction") tables: one DataFrame per CSV, read in the order listed.
(
    df_patients,
    df_admissions,
    df_diagnoses_icd,
    df_labevents,
    df_microbiologyevents,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        path_patients,
        path_admissions,
        path_diagnoses_icd,
        path_labevents,
        path_microbiologyevents,
    )
)

# Description (dictionary) tables.
df_desc_icd, df_desc_labitems, df_desc_items = (
    pd.read_csv(csv_path)
    for csv_path in (path_desc_icd, path_desc_labitems, path_desc_items)
)

In [3]:
# Parse the timestamp columns below into pandas datetimes, overwriting each column in place.
# NOTE(review): only the columns listed here are converted; the admissions columns
# EDREGTIME and EDOUTTIME are not touched by this cell.
for frame, column, fmt in (
    # patients: DOB is parsed with its full timestamp format (the time of birth is not used)
    (df_patients, 'DOB', '%Y-%m-%d %H:%M:%S'),
    # admissions: ADMITTIME, DISCHTIME
    (df_admissions, 'ADMITTIME', '%Y-%m-%d %H:%M:%S'),
    (df_admissions, 'DISCHTIME', '%Y-%m-%d %H:%M:%S'),
    # labevents: CHARTTIME
    (df_labevents, 'CHARTTIME', '%Y-%m-%d %H:%M:%S'),
    # microbiologyevents: CHARTDATE is date-only, CHARTTIME is a full timestamp
    (df_microbiologyevents, 'CHARTDATE', '%Y-%m-%d'),
    (df_microbiologyevents, 'CHARTTIME', '%Y-%m-%d %H:%M:%S'),
):
    frame[column] = pd.to_datetime(frame[column], format=fmt)

The DIAGNOSES_ICD table has a column, ICD9_CODE, which holds the code of each disease diagnosed for the patient.

Sepsis is covered by 6 ICD-9 codes: ['77181', '99591', '99592', '67020', '67022', '67024']

We'll introduce a new column, IS_SEPSIS, as the binary target variable (1 if the diagnosis is one of the 6 sepsis ICD-9 codes, 0 otherwise).

In [4]:
# Retrieve every ICD-9 code whose short title mentions "sepsis" (case-insensitive).
# Vectorised string matching replaces the row-wise `apply`; `na=False` treats a missing
# SHORT_TITLE as "no match" instead of failing on `.lower()` of a float NaN.
is_sepsis_title = (
    df_desc_icd['SHORT_TITLE']
    .str.lower()
    .str.contains('sepsis', regex=False, na=False)
)
sepsis_icd = df_desc_icd.loc[is_sepsis_title, 'ICD9_CODE'].values

# Add the binary target variable: 1 when the diagnosis code is one of the sepsis codes,
# 0 otherwise. `isin` does the membership test for the whole column at once instead of
# running a Python-level `in` check against the array for every row.
df_diagnoses_icd['IS_SEPSIS'] = df_diagnoses_icd['ICD9_CODE'].isin(sepsis_icd).astype(int)

In [9]:
# Attach the selected patient columns to every admission row, matching on SUBJECT_ID
# (inner join: admissions without a matching patient row are dropped).
patient_columns = ['SUBJECT_ID', 'GENDER', 'DOB', 'EXPIRE_FLAG']
df_demographic = df_admissions.merge(df_patients[patient_columns], how='inner', on='SUBJECT_ID')

In [5]:
# Select the rows of the Labevents table whose HADM_ID is missing.
hadm_id_is_missing = df_labevents['HADM_ID'].isna()
missing_hadm_df = df_labevents[hadm_id_is_missing]

## 1.2 Data Exploration

In [81]:
# Names of the variables of interest for exploration.
# NOTE(review): the names mix upper and lower case — confirm they match the column
# names of the frame they are meant to be used with.
variables = ['CARDIACOUTPUT', 'TVSET', 'CHLORIDE', 'HEMATOCRIT', 'HEMOGLOBIN', 'ck_mb', 'fibrinogen', 'po2_bloodgas', 'troponin_t']

FLAG
abnormal    0.351101
delta       0.002318
Name: count, dtype: float64

In [None]:
# Kernel density of `cardiacoutput`, one curve per `gender` value.
# `fill=True` replaces seaborn's deprecated `shade=True` keyword.
# NOTE(review): `full_labvitals_static_variables` is only built further down the notebook,
# so this cell fails on a fresh top-to-bottom run — confirm the intended cell order.
sns.kdeplot(data=full_labvitals_static_variables, x='cardiacoutput', hue='gender', fill=True)
plt.show()

In [None]:
def kdeplot_features_static_variable(df, static_feature, features, figsize=(10, 65)):
    '''
    Draw one kernel-density plot per feature, stacked vertically, split by a static variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Data passed to seaborn as `data`.
    static_feature : str
        Column used as `hue`, i.e. one density curve per distinct value.
    features : iterable of str
        Columns to plot on the x axis, one subplot per column.
    figsize : tuple of (float, float), optional
        Size of the whole figure in inches. Defaults to the previously
        hard-coded (10, 65).

    Returns
    -------
    None
        The figure is displayed with `plt.show()`. Nothing is drawn when
        `features` is empty.
    '''
    features = list(features)
    if not features:
        # plt.subplots cannot build a grid with zero rows; there is nothing to draw.
        return

    # squeeze=False always yields a 2-D array of Axes, so a single feature works too
    # (with the default squeezing, one subplot comes back as a bare Axes object).
    _fig, axes = plt.subplots(len(features), 1, figsize=figsize, squeeze=False)

    for ax, feature in zip(axes[:, 0], features):
        # `fill` replaces seaborn's deprecated `shade` keyword.
        sns.kdeplot(data=df, x=feature, hue=static_feature, fill=True, legend=True, ax=ax)
    plt.show()

# Left-join the static variables onto `full_labvitals` (keys: icustay_id, subject_id),
# then draw one density plot per feature, split by gender.
# NOTE(review): `full_labvitals`, `static_variables_df` and `all_features` are not defined
# in the visible cells — confirm they exist before this cell on a fresh run.
full_labvitals_static_variables = pd.merge(
    full_labvitals,
    static_variables_df,
    how='left',
    on=['icustay_id', 'subject_id'],
)
kdeplot_features_static_variable(full_labvitals_static_variables, 'gender', all_features)

In [66]:

# Static variables related
def get_duplicates_count_ids(df, id_columns=None):
    '''
    Returns a dictionary with the count of duplicated ids

    A row counts as duplicated when an earlier row already holds the same
    values in the given id column(s), so the first occurrence is not counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with the ids to be counted
    id_columns : list of list of str, optional
        Column combinations to check. A plain string is treated as a single
        column. Defaults to SUBJECT_ID, HADM_ID and the (SUBJECT_ID, HADM_ID) pair.

    Returns
    -------
    dict
        Maps the string form of each column list (e.g. "['SUBJECT_ID']") to
        the number of duplicated rows, as a plain Python int.

    Raises
    ------
    KeyError
        If a requested column is not present in `df`.
    '''
    if id_columns is None:
        id_columns = [['SUBJECT_ID'],
                      ['HADM_ID'],
                      ['SUBJECT_ID', 'HADM_ID']]

    duplicated_per_id = {}

    for columns in id_columns:
        # Normalise to a list so the dictionary key format stays "['COL', ...]".
        columns = [columns] if isinstance(columns, str) else list(columns)
        # int(...) turns the NumPy scalar into a built-in int for clean display/serialisation.
        duplicated_per_id[str(columns)] = int(df.duplicated(subset=columns).sum())

    return duplicated_per_id

# Count duplicated IDs in the admissions table (displayed as the cell output).
get_duplicates_count_ids(df_admissions)

# NOTE(review): the commented-out call and the blood-gas notes below look like leftovers
# unrelated to this cell — confirm and remove them if they are no longer needed.
#duplicated_ids_barplot(static_variables_ids_duplicates, 'Unique entries by ID')

#-- count the number of blood gas measurements
#-- abg_count - number of labs with pH/PCO2/PO2
#-- vbg_count - number of times VBG appears in chartevents


{"['SUBJECT_ID']": 12456, "['HADM_ID']": 0, "['SUBJECT_ID', 'HADM_ID']": 0}

Identifying missing time series values

In [None]:
# One missingno bar chart per table, each shown as its own figure.
# missingno is imported as `msno` in the import cell at the top of the notebook,
# together with the other dependencies, instead of in the middle of the analysis.
# NOTE(review): `df_cases_labvitals` and `df_controls_labvitals` are not defined in the
# visible cells — confirm they exist before this cell on a fresh run.
for frame in (df_labevents, df_cases_labvitals, df_controls_labvitals):
    msno.bar(frame)
    plt.show()

### Visualise univariate features


In [None]:
# ICU stay used as the worked example in both plots below.
example_icustay_id = 200087

# Rows of that single stay, passed to the univariate plot together with the label
# column and the vital-sign features.
# NOTE(review): `plot_univariate_features`, `plot_variables_evolution`, `full_labvitals`
# and `vitals_features` are not defined in the visible cells — confirm before running.
example_stay = full_labvitals[full_labvitals['icustay_id'] == example_icustay_id]
plot_univariate_features(example_stay, ['label'] + vitals_features, '')

variables = ['sysbp', 'diabp', 'resprate', 'heartrate', 'spo2_pulsoxy', 'tempc', 'fio2', 'totalpeeplevel']

# The title is a runtime string and is kept exactly as written
# (Spanish for "Time series of several vital signs and laboratory results.").
plot_variables_evolution(
    full_labvitals,
    example_icustay_id,
    variables,
    'Series temporales de diversas constantes vitales y resultados de laboratorio.',
    show=True,
)

## 1.3 Feature Selection

# 2. Sepsis Risk Prediction Model Implementation

## 2.1 Logistic Regression Model

## 2.2 Random Forest Model

## 2.3 Gradient Boosted Model

## 2.4 LSTM Model

## 2.5 LSTM + Attention Model

# 3.Clustering Model

# 4.Model Testing

# 5.Local and Global Feature Explanation