In [1]:
import numpy as np
import pandas as pd
import os
import random
import pyarrow
from tensorflow import keras
from sklearn.model_selection import train_test_split
#import sktime

In [2]:
# Let pandas render up to 999 rows before truncating long outputs.
pd.set_option("display.max_rows", 999)

In [3]:
# Location of the prepared PhysioNet parquet file.
# The PHYSIONET_PARQUET environment variable overrides the path so the notebook can run
# on another machine without editing this cell; when unset, the original location is used.
raw_data_p = os.environ.get(
    'PHYSIONET_PARQUET',
    r'/h/dsmith/physionet_data/prepped/physionet_data.parquet',
)

In [4]:
# Read the prepared table with the pyarrow engine and drop exact duplicate rows in one chain.
df = (
    pd.read_parquet(raw_data_p, engine='pyarrow')
    .drop_duplicates()
)

In [5]:
# (rows, columns) of the de-duplicated frame.
df.shape

(1429854, 43)

In [6]:
# Preview the first five rows (the default for head()).
df.head()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,SepsisEver,id
0,60.0,98.0,36.2,135.0,109.0,72.0,17.0,33.017979,-0.441724,24.382587,...,194.0,57.0,0.0,0.492043,0.507957,-12.02,1.0,0.0,0,27338
1,60.0,98.0,36.2,135.0,109.0,72.0,17.0,33.017979,-0.441724,24.382587,...,194.0,57.0,0.0,0.492043,0.507957,-12.02,2.0,0.0,0,27338
2,56.0,98.0,36.225,127.0,105.0,73.0,15.0,33.017979,-0.441724,24.382587,...,194.0,57.0,0.0,0.492043,0.507957,-12.02,3.0,0.0,0,27338
3,54.0,98.0,36.25,119.0,100.0,76.0,13.0,33.017979,-0.441724,24.382587,...,194.0,57.0,0.0,0.492043,0.507957,-12.02,4.0,0.0,0,27338
4,52.0,98.0,36.275,119.0,87.0,67.0,12.0,33.017979,-0.441724,24.382587,...,194.0,57.0,0.0,0.492043,0.507957,-12.02,5.0,0.0,0,27338


In [7]:
# Count the distinct subject ids in the table.
df['id'].nunique()

36612

In [9]:
# Find the time-varying (non-static) columns and remember them for the modelling steps.
# A column is kept when its within-subject variance, summed over all subjects, exceeds `eps`.
eps = 0
var_mtx = df.groupby('id').var().reset_index(drop=True)
var_keep = var_mtx.columns[var_mtx.sum().gt(eps)].tolist()

In [10]:
# Columns whose within-subject variance is non-zero for at least one subject.
var_keep

['HR',
 'O2Sat',
 'Temp',
 'SBP',
 'MAP',
 'DBP',
 'Resp',
 'EtCO2',
 'BaseExcess',
 'HCO3',
 'FiO2',
 'pH',
 'PaCO2',
 'SaO2',
 'AST',
 'BUN',
 'Alkalinephos',
 'Calcium',
 'Chloride',
 'Creatinine',
 'Bilirubin_direct',
 'Glucose',
 'Lactate',
 'Magnesium',
 'Phosphate',
 'Potassium',
 'Bilirubin_total',
 'TroponinI',
 'Hct',
 'Hgb',
 'PTT',
 'WBC',
 'Fibrinogen',
 'Platelets',
 'ICULOS',
 'SepsisLabel']

Starter Model:
1. use min_len = min(time series length for subject i, for i = 1, 2, ..., N) as the unified length of the input time series
2. keep the last min_len readings of each subject
3. include all non-static variables

In [45]:
# Length of the shortest per-subject series; used as the common window length for every subject.
# GroupBy.size() counts the rows per id directly instead of calling a Python lambda per group.
# (To inspect the full distribution: df.groupby('id').size().describe([x/10 for x in range(11)]))
min_len = df.groupby('id').size().min()

In [46]:
# Shortest per-subject series length; every subject is truncated to this many rows below.
print(min_len)

11


In [14]:
#df.groupby('id').apply(lambda x:x.shape[0]).describe()

In [47]:
# Keep only the last `min_len` rows of every subject, so all series share one length.
# The stable sort groups rows by id while preserving their existing order within each id,
# and GroupBy.tail() then selects the trailing rows without a per-group Python call.
# The result has the same rows in the same order as groupby('id').apply(lambda g: g.tail(min_len)),
# but does not depend on apply() passing the grouping column through (deprecated in recent pandas).
# NOTE(review): "last" means last in the stored row order — presumably rows are already
# chronological within each id; confirm against the data preparation step.
df_short = (
    df.sort_values('id', kind='stable')
    .groupby('id')
    .tail(min_len)
    .reset_index(drop=True)
)

In [49]:
# Inspect the truncated frame, in which each subject contributes its last min_len rows.
df_short.head(20)

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,SepsisEver,id
0,89.0,99.0,38.28,164.0,85.0,57.0,27.0,33.017979,-2.0,18.608696,...,197.0,63.18,0.0,0.492043,0.507957,-0.02,88.0,0.0,1.0,1.0
1,88.0,98.0,38.056667,147.0,77.0,51.0,24.0,33.017979,-2.0,18.652174,...,197.0,63.18,0.0,0.492043,0.507957,-0.02,89.0,1.0,1.0,1.0
2,84.5,99.0,37.833333,140.5,74.0,50.0,30.0,33.017979,-2.0,18.695652,...,197.0,63.18,0.0,0.492043,0.507957,-0.02,90.0,1.0,1.0,1.0
3,83.0,99.5,37.61,133.5,70.0,49.0,28.0,33.017979,-2.0,18.73913,...,197.0,63.18,0.0,0.492043,0.507957,-0.02,91.0,1.0,1.0,1.0
4,88.0,99.0,37.644,83.75,55.17,48.0,27.0,33.017979,-2.0,18.782609,...,197.0,63.18,0.0,0.492043,0.507957,-0.02,92.0,1.0,1.0,1.0
5,87.0,97.0,37.678,130.0,67.33,54.0,28.0,33.017979,-1.8,18.826087,...,197.0,63.18,0.0,0.492043,0.507957,-0.02,93.0,1.0,1.0,1.0
6,90.0,97.0,37.712,137.0,75.0,51.0,28.0,33.017979,-1.6,18.869565,...,197.0,63.18,0.0,0.492043,0.507957,-0.02,94.0,1.0,1.0,1.0
7,90.0,98.0,37.746,148.5,83.25,55.5,26.0,33.017979,-1.4,18.913043,...,197.0,63.18,0.0,0.492043,0.507957,-0.02,95.0,1.0,1.0,1.0
8,90.0,99.0,37.78,160.0,91.5,60.0,29.5,33.017979,-1.2,18.956522,...,197.0,63.18,0.0,0.492043,0.507957,-0.02,96.0,1.0,1.0,1.0
9,82.0,98.0,37.78,129.0,66.0,43.0,25.0,33.017979,-1.0,19.0,...,197.0,63.18,0.0,0.492043,0.507957,-0.02,97.0,1.0,1.0,1.0


In [71]:
# Feature columns: every time-varying column except the target itself.
# NOTE(review): `x_var` was not defined in any surviving cell, so this cell raised NameError
# on a fresh kernel. The definition below is reconstructed from the recorded shapes
# (35 features = the 36 `var_keep` columns minus 'SepsisLabel') — confirm it matches the intent.
x_var = [col for col in var_keep if col != 'SepsisLabel']

# Group once so X and y are built from the same subject order (groupby sorts by id).
subject_groups = df_short.groupby('id')

# X: one (min_len, n_features) block per subject, stacked to (n_subjects, min_len, n_features).
X = np.stack([grp[x_var].to_numpy() for _, grp in subject_groups])

# y: 1 when any retained reading of the subject carries a positive SepsisLabel, otherwise 0.
y = (subject_groups['SepsisLabel'].sum() >= 1).astype(int).to_numpy()

In [73]:
# Hold out 30% of the subjects for testing with a fixed seed.
# stratify=y keeps the positive/negative ratio identical in both splits; without it a random
# split can shift the share of positives (presumably the minority class here) between splits.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [74]:
# Report the split shapes, one per line: train features, train labels, test features, test labels.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, sep="\n")

(27685, 11, 35)
(27685,)
(11866, 11, 35)
(11866,)


In [77]:
# Make the (samples, timesteps, features) layout explicit before feeding the arrays to Conv1D.
x_train = x_train.reshape(len(x_train), min_len, len(x_var))
x_test = x_test.reshape(len(x_test), min_len, len(x_var))

In [78]:
# Number of distinct labels present in the training targets; sizes the softmax output layer.
num_classes = np.unique(y_train).size

In [79]:
# Shuffle the training set with a fixed seed so that reruns produce the same ordering.
# The order matters: Keras' `validation_split` (used in the fit cell) holds out the trailing
# fraction of the arrays as passed, so an unseeded shuffle changes the validation set per run.
shuffle_rng = np.random.default_rng(42)
idx = shuffle_rng.permutation(len(x_train))
x_train = x_train[idx]
y_train = y_train[idx]

In [80]:
def make_model(input_shape, n_classes=None, n_blocks=3, filters=64, kernel_size=3):
    """Build a small fully-convolutional 1D classifier.

    The network stacks `n_blocks` identical blocks of Conv1D (padding="same") ->
    BatchNormalization -> ReLU, averages over the time axis with
    GlobalAveragePooling1D, and ends in a softmax Dense layer.

    Args:
        input_shape: Shape of one sample, without the batch dimension.
        n_classes: Number of output units. When None, the module-level
            `num_classes` is used, which keeps `make_model(input_shape)` working
            exactly as before.
        n_blocks: Number of convolutional blocks (default 3).
        filters: Number of filters in each Conv1D layer (default 64).
        kernel_size: Kernel width of each Conv1D layer (default 3).

    Returns:
        An uncompiled `keras.models.Model` mapping the input to class probabilities.
    """
    if n_classes is None:
        # Backward-compatible fallback to the notebook-level value derived from y_train.
        n_classes = num_classes

    input_layer = keras.layers.Input(input_shape)

    # Layers are created in the same order as the original hand-unrolled version.
    features = input_layer
    for _ in range(n_blocks):
        features = keras.layers.Conv1D(
            filters=filters, kernel_size=kernel_size, padding="same"
        )(features)
        features = keras.layers.BatchNormalization()(features)
        features = keras.layers.ReLU()(features)

    # Collapse the time axis so the classifier head sees one vector per sample.
    gap = keras.layers.GlobalAveragePooling1D()(features)

    output_layer = keras.layers.Dense(n_classes, activation="softmax")(gap)

    return keras.models.Model(inputs=input_layer, outputs=output_layer)


# Instantiate the network for one sample's shape (everything after the batch axis) and
# draw its layer graph with tensor shapes.
model = make_model(x_train.shape[1:])
keras.utils.plot_model(model=model, show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [81]:
# Re-create the model so that training below starts from freshly initialised weights,
# then render the architecture diagram including tensor shapes.
sample_shape = x_train.shape[1:]
model = make_model(input_shape=sample_shape)
keras.utils.plot_model(model, show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [82]:
# Upper bound on training length; the EarlyStopping callback below may end the run sooner.
epochs = 500
batch_size = 32

# Integer class labels with a softmax output, hence the sparse categorical loss and metric.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["sparse_categorical_accuracy"],
)

# All three callbacks monitor the validation loss.
callbacks = [
    # Write the model to disk only when val_loss improves.
    keras.callbacks.ModelCheckpoint(
        "best_model.h5", monitor="val_loss", save_best_only=True
    ),
    # Halve the learning rate after 20 epochs without improvement, never below 1e-4.
    keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", patience=20, factor=0.5, min_lr=0.0001
    ),
    # Stop training after 50 epochs without improvement.
    keras.callbacks.EarlyStopping(monitor="val_loss", verbose=1, patience=50),
]

# 20% of the training arrays are held out for validation.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=callbacks,
    verbose=1,
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500


Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 60: early stopping


In [83]:
# Evaluate the checkpoint written by ModelCheckpoint rather than the last-epoch model.
model = keras.models.load_model(filepath="best_model.h5")

# With a single compiled metric, evaluate() yields the loss followed by that metric.
test_loss, test_acc = model.evaluate(x=x_test, y=y_test)

print("Test accuracy", test_acc)
print("Test loss", test_loss)

Test accuracy 0.9559245109558105
Test loss 0.14151933789253235
