# Inputs

In [28]:
splitPartCount = 5
splitSeed = 27
hoursPerWindow = 1

# Preproccess

## read data

In [29]:
from utils.class_patient import Patients


patients = Patients.loadPatients()
len(patients)

1213

## remove missing

In [30]:
# fill measures whose null represent false value

from constants import NULLABLE_MEASURES


nullableMeasures = NULLABLE_MEASURES

for measureName in nullableMeasures:
    patients.fillMissingMeasureValue(measureName, 0)

In [31]:
# remove measures with less than 80% of data

measures = patients.getMeasures()

for measure, count in measures.items():
    if count < len(patients) * 80 / 100:
        patients.removeMeasures([measure])
        print(measure, count)

pco2 917
ph 954
po2 917
albumin 406
hba1c 326
lymphocyte 446
height 415
urine-ketone 294
crp 19


In [32]:
# remove patients with less than 80% of data

patients.removePatientByMissingFeatures()
len(patients)

1209

## split patients

In [33]:
splitedPatients = patients.split(splitPartCount, splitSeed)

len(splitedPatients[0])

242

In [34]:
from pandas import Timedelta


splitedPatients[0].getMeasuresBetween(Timedelta(-6), Timedelta(1))

Unnamed: 0,subject_id,hadm_id,stay_id,akd,ag,age,bg,bicarbonate,bun,calcium,...,race,rr,saps2,sbp,scr,sofa,use_NaHCO3,uti,wbc,weight
0,19054290,20046699,30643955,False,,65,,,,,...,WHITE,,34,,,3,0.0,0,,74.50
0,14866589,20066152,33060916,False,,37,,,,,...,WHITE,,22,,,2,0.0,0,,
0,14849360,20152551,32817197,False,,59,,,,,...,UNABLE TO OBTAIN,,31,,,7,0.0,0,,
0,16171124,20167052,38239449,False,,37,,,,,...,HISPANIC/LATINO - DOMINICAN,,15,,,2,0.0,0,,
0,16815664,20240670,37751533,False,,42,,,,,...,BLACK/AFRICAN AMERICAN,,27,,,2,,0,,43.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,15517352,29801212,32814591,False,,20,,,,,...,WHITE,,30,,,4,,0,,
0,10383045,29899941,39755932,False,,58,,,,,...,WHITE,,23,,,1,0.0,0,,59.40
0,15679298,29905973,39834726,False,,50,,,,,...,WHITE,,32,,,3,0.0,0,,47.40
0,10912213,29911812,34040470,False,,47,,,,,...,WHITE,,23,,,2,0.0,0,,89.05


In [35]:
splitedPatients = patients.split(splitPartCount, splitSeed)


def trainTest():
    for i in range(splitedPatients.__len__()):
        testPatients = splitedPatients[i]

        trainPatientsList = splitedPatients[:i] + splitedPatients[i + 1 :]
        trainPatients = Patients(patients=[])
        for trainPatientsElem in trainPatientsList:
            trainPatients += trainPatientsElem

        yield trainPatients, testPatients


def trainValTest():
    for i in range(splitedPatients.__len__()):
        testPatients = splitedPatients[i]

        trainPatientsList = splitedPatients[:i] + splitedPatients[i + 1 :]
        trainPatients = Patients(patients=[])
        for trainPatientsElem in trainPatientsList:
            trainPatients += trainPatientsElem

        *trainPatients, valPatients = trainPatients.split(5, 27)
        tmpPatients = Patients(patients=[])
        for trainPatientsElem in trainPatients:
            tmpPatients += trainPatientsElem
        trainPatients = tmpPatients

        yield trainPatients, valPatients, testPatients

In [36]:
for trainPatients, testPatients in trainTest():
    print(len(trainPatients.patientList), len(testPatients.patientList))

967 242
967 242
967 242
967 242
968 241


In [37]:
from constants import CATEGORICAL_MEASURES, TEMP_PATH
from utils.prepare_data import patientsToNumpy


dataset = []
for i, (trainPatients, testPatients) in enumerate(trainTest()):
    # trainPatients.fillMissingMeasureValue(CATEGORICAL_MEASURES, 0)
    npTrainX, categoryEncoder, numericEncoder, oulier, columns = patientsToNumpy(
        trainPatients, hoursPerWindow, CATEGORICAL_MEASURES
    )

    # testPatients.fillMissingMeasureValue(CATEGORICAL_MEASURES, 0)
    npTestX, *_ = patientsToNumpy(
        testPatients,
        hoursPerWindow,
        CATEGORICAL_MEASURES,
        columns,
        categoryEncoder,
        numericEncoder,
        oulier,
    )
    trainY = [p.akdPositive for p in trainPatients]
    testY = [p.akdPositive for p in testPatients]

    dataset.append((npTrainX, trainY, npTestX, testY))

    print(f"Dataset {i} created")
    pass

Dataset 0 created
Dataset 1 created
Dataset 2 created
Dataset 3 created
Dataset 4 created


In [38]:
import pickle


(TEMP_PATH / f"train-test-set-{splitPartCount}-{splitSeed}-{hoursPerWindow}.pkl").write_bytes(
    pickle.dumps(dataset)
)

41441534

In [39]:
from constants import CATEGORICAL_MEASURES, TEMP_PATH
from utils.prepare_data import patientsToNumpy


datavalset = []
for i, (trainPatients, valPatients, testPatients) in enumerate(trainValTest()):
    # trainPatients.fillMissingMeasureValue(CATEGORICAL_MEASURES, 0)
    npTrainX, categoryEncoder, numericEncoder, oulier, columns = patientsToNumpy(
        trainPatients, hoursPerWindow, CATEGORICAL_MEASURES
    )

    # testPatients.fillMissingMeasureValue(CATEGORICAL_MEASURES, 0)
    npTestX, *_ = patientsToNumpy(
        testPatients,
        hoursPerWindow,
        CATEGORICAL_MEASURES,
        columns,
        categoryEncoder,
        numericEncoder,
        oulier,
    )

    npValX, *_ = patientsToNumpy(
        valPatients,
        hoursPerWindow,
        CATEGORICAL_MEASURES,
        columns,
        categoryEncoder,
        numericEncoder,
        oulier,
    )

    trainY = [p.akdPositive for p in trainPatients]
    testY = [p.akdPositive for p in testPatients]
    valY = [p.akdPositive for p in valPatients]

    datavalset.append((npTrainX, trainY, npValX, valY, npTestX, testY))

    print(f"Dataset {i} created")
    pass

Dataset 0 created
Dataset 1 created
Dataset 2 created
Dataset 3 created
Dataset 4 created


In [40]:
import pickle

(TEMP_PATH / f"train-val-test-set-{splitPartCount}-{splitSeed}-{hoursPerWindow}.pkl").write_bytes(
    pickle.dumps(datavalset)
)

40861464

# LSTM

In [5]:
import pickle
from constants import TEMP_PATH

# load if not exist dataset
if "dataset" not in locals():
    dataset = pickle.loads(
        (TEMP_PATH / f"dataset-{splitPartCount}-{splitSeed}-{hoursPerWindow}.pkl").read_bytes()
    )
   

In [5]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

def createModel(timeSteps, features):
    model = Sequential()
    model.add(LSTM(64, input_shape=(timeSteps, features), return_sequences=True))
    model.add(Dropout(0.5))
    model.add(LSTM(32))
    model.add(Dropout(0.5))
    model.add(Dense(16, activation="relu"))
    model.add(Dense(1, activation="sigmoid"))

    optimizer = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["AUC", "accuracy", "precision", "recall"])
    return model

In [6]:
# crossfold
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping


loses = []
aucs = []
accuracies = []
precisions = []
recals = []
for i, (npTrainX, trainY, npTestX, testY) in enumerate(dataset):
    npTrainX = np.nan_to_num(npTrainX, nan=0)
    npTestX = np.nan_to_num(npTestX, nan=0)

    model = createModel(npTrainX.shape[1], npTrainX.shape[2])

    neg, pos = np.bincount(trainY)
    weight0 = (1 / neg) * (len(trainY)) / 2.0
    weight1 = (1 / pos) * (len(trainY)) / 2.0
    weight = {0: weight0, 1: weight1}

    early_stopping = EarlyStopping(
        monitor="val_loss", patience=10, restore_best_weights=True
    )

    trainY = np.array(trainY)
    model.fit(
        npTrainX,
        trainY,
        epochs=50,
        batch_size=32,
        validation_split=0.2,
        class_weight=weight,
        callbacks=[early_stopping],
    )

    testY = np.array(testY)
    loss, auc, accuracy, precison, recal = model.evaluate(npTestX, testY)
    loses.append(loss)
    aucs.append(auc)
    accuracies.append(accuracy)
    precisions.append(precison)
    recals.append(recal)

    pass

print("Loses:", loses, np.mean(loses), np.std(loses))
print("AUCs:", aucs, np.mean(aucs), np.std(aucs))
print("Accuracies:", accuracies, np.mean(accuracies), np.std(accuracies))
print("Precisions:", precisions, np.mean(precisions), np.std(precisions))
print("Recals:", recals, np.mean(recals), np.std(recals))

Epoch 1/20


2024-06-06 23:34:21.396009: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-06 23:34:21.411522: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
  super().__init__(**kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - AUC: 0.6890 - accuracy: 0.6496 - loss: 0.6459 - precision: 0.5193 - recall: 0.6205 - val_AUC: 0.7377 - val_accuracy: 0.7010 - val_loss: 0.6261 - val_precision: 0.5882 - val_recall: 0.6849
Epoch 2/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - AUC: 0.7681 - accuracy: 0.7341 - loss: 0.5813 - precision: 0.6412 - recall: 0.7308 - val_AUC: 0.7502 - val_accuracy: 0.7062 - val_loss: 0.6010 - val_precision: 0.5952 - val_recall: 0.6849
Epoch 3/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - AUC: 0.8011 - accuracy: 0.7255 - loss: 0.5484 - precision: 0.6442 - recall: 0.7338 - val_AUC: 0.7517 - val_accuracy: 0.6856 - val_loss: 0.5873 - val_precision: 0.5769 - val_recall: 0.6164
Epoch 4/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - AUC: 0.8368 - accuracy: 0.7740 - loss: 0.5012 - precision: 0.6571 - recall: 0.7909 - val_AUC: 0.7496 -

## Static validate 

In [46]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam


def createModel(timeSteps, features):
    model = Sequential()
    model.add(LSTM(64, input_shape=(timeSteps, features), return_sequences=True))
    model.add(Dropout(0.5))
    model.add(LSTM(32))
    model.add(Dropout(0.5))
    model.add(Dense(16, activation="relu"))
    model.add(Dense(1, activation="sigmoid"))

    optimizer = Adam(learning_rate=0.001)
    model.compile(
        optimizer=optimizer,
        loss="binary_crossentropy",
        metrics=["AUC", "accuracy", "precision", "recall"],
    )
    return model

In [51]:
# crossfold
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping

if "datavalset" not in locals():
    datavalset = pickle.loads(
        (TEMP_PATH / f"train-val-test-set-{splitPartCount}-{splitSeed}-{hoursPerWindow}.pkl").read_bytes()
    )

loses = []
aucs = []
accuracies = []
precisions = []
recals = []
for i, (npTrainX, trainY, npValX, valY, npTestX, testY) in enumerate(datavalset):
    npTrainX = np.nan_to_num(npTrainX, nan=0)
    npValX = np.nan_to_num(npValX, nan=0)
    npTestX = np.nan_to_num(npTestX, nan=0)

    model = createModel(npTrainX.shape[1], npTrainX.shape[2])

    neg, pos = np.bincount(trainY)
    weight0 = (1 / neg) * (len(trainY)) / 2.0
    weight1 = (1 / pos) * (len(trainY)) / 2.0
    weight = {0: weight0, 1: weight1}

    early_stopping = EarlyStopping(
        monitor="val_loss", patience=10, restore_best_weights=True
    )

    trainY = np.array(trainY)
    valY = np.array(valY)
    testY = np.array(testY)

    model.fit(
        npTrainX,
        trainY,
        epochs=50,
        batch_size=32,
        validation_data=(npValX, valY),
        class_weight=weight,
        callbacks=[early_stopping],
    )

    loss, auc, accuracy, precison, recal = model.evaluate(npTestX, testY)
    loses.append(loss)
    aucs.append(auc)
    accuracies.append(accuracy)
    precisions.append(precison)
    recals.append(recal)

    pass

print("Loses:", loses, np.mean(loses), np.std(loses))
print("AUCs:", aucs, np.mean(aucs), np.std(aucs))
print("Accuracies:", accuracies, np.mean(accuracies), np.std(accuracies))
print("Precisions:", precisions, np.mean(precisions), np.std(precisions))
print("Recals:", recals, np.mean(recals), np.std(recals))

Epoch 1/50


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - AUC: 0.5657 - accuracy: 0.5604 - loss: 0.6889 - precision: 0.4332 - recall: 0.3177 - val_AUC: 0.7348 - val_accuracy: 0.6736 - val_loss: 0.5963 - val_precision: 0.5844 - val_recall: 0.5921
Epoch 2/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - AUC: 0.7931 - accuracy: 0.7341 - loss: 0.5698 - precision: 0.6421 - recall: 0.7410 - val_AUC: 0.7294 - val_accuracy: 0.6736 - val_loss: 0.6125 - val_precision: 0.5765 - val_recall: 0.6447
Epoch 3/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - AUC: 0.8137 - accuracy: 0.7500 - loss: 0.5279 - precision: 0.6505 - recall: 0.7925 - val_AUC: 0.7290 - val_accuracy: 0.6839 - val_loss: 0.6329 - val_precision: 0.5904 - val_recall: 0.6447
Epoch 4/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - AUC: 0.8157 - accuracy: 0.7640 - loss: 0.5260 - precision: 0.6751 - recall: 0.7715 - val_AUC: 0.7191 -