# Load data

In [1]:
import pandas as pd
from constants import NULLABLE_MEASURES
from utils.class_patient import Patients

patients = Patients.loadPatients()
patients.fillMissingMeasureValue(NULLABLE_MEASURES, 0)

# Remove missing data

## Remove features with more than 20% missing

In [2]:
# remove measures with less than 80% of data

measures = patients.getMeasures()

for measure, count in measures.items():
    if count < len(patients) * 80 / 100:
        patients.removeMeasures([measure])
        print(measure, count)

pco2 935
po2 935
albumin 558
hba1c 444
lymphocyte 585
height 415
urine-ketone 343
crp 57


## Remove patients with more than 20% missing features

In [3]:
patients.removePatientByMissingFeatures()
len(patients)

1209

## Display insight

In [4]:
# dfData = patients.getMeasuresBetween(pd.Timedelta(hours=-6), pd.Timedelta(hours=24), "first")

with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(patients.getMeasures())

Counter({'ag': 1209,
         'age': 1209,
         'aki': 1209,
         'bg': 1209,
         'bicarbonate': 1209,
         'bun': 1209,
         'calcium': 1209,
         'chloride': 1209,
         'chronic_pulmonary_disease': 1209,
         'ckd_stage': 1209,
         'congestive_heart_failure': 1209,
         'dka_type': 1209,
         'egfr': 1209,
         'gender': 1209,
         'history_aci': 1209,
         'history_ami': 1209,
         'hypertension': 1209,
         'liver_disease': 1209,
         'macroangiopathy': 1209,
         'malignant_cancer': 1209,
         'mechanical_ventilation': 1209,
         'microangiopathy': 1209,
         'oasis': 1209,
         'phosphate': 1209,
         'potassium': 1209,
         'preiculos': 1209,
         'race': 1209,
         'saps2': 1209,
         'scr': 1209,
         'sodium': 1209,
         'sofa': 1209,
         'use_NaHCO3': 1209,
         'uti': 1209,
         'hr': 1208,
         'dbp': 1207,
         'gcs': 1207,
         'g

In [5]:
akdCount = sum([p.akdPositive for p in patients.patientList])

akdCount / len(patients)

0.5070306038047974

# Machine learning

In [6]:
from constants import CATEGORICAL_MEASURES


idColumns = ["subject_id", "hadm_id", "stay_id"]
categoryColumns = CATEGORICAL_MEASURES
labelColumn = "akd"

## Split train-test

In [7]:
splitedPatients = patients.split(5, 27)


def trainTest():
    for i in range(splitedPatients.__len__()):
        testPatients = splitedPatients[i]

        trainPatientsList = splitedPatients[:i] + splitedPatients[i + 1 :]
        trainPatients = Patients(patients=[])
        for trainPatientsElem in trainPatientsList:
            trainPatients += trainPatientsElem

        yield trainPatients, testPatients


def trainValTest():
    for i in range(splitedPatients.__len__()):
        testPatients = splitedPatients[i]

        trainPatientsList = splitedPatients[:i] + splitedPatients[i + 1 :]
        trainPatients = Patients(patients=[])
        for trainPatientsElem in trainPatientsList:
            trainPatients += trainPatientsElem

        *trainPatients, valPatients = trainPatients.split(5, 27)
        tmpPatients = Patients(patients=[])
        for trainPatientsElem in trainPatients:
            tmpPatients += trainPatientsElem
        trainPatients = tmpPatients

        yield trainPatients, valPatients, testPatients

## Define model

In [8]:
import xgboost as xgb

how = "first"

params = {
    "validate_parameters": True,
    "device": "gpu",  # Use GPU acceleration
    "n_jobs": -1,  # Use all CPU cores
    "eval_metric": "logloss",  # Evaluation metric
    "max_depth": 6,  # Maximum depth of a tree
    "min_child_weight": 1,  # Minimum sum of instance weight (hessian) needed in a child
    "gamma": 0,  # Minimum loss reduction required to make a further partition on a leaf node
    "subsample": 0.8,  # Subsample ratio of the training instances
    "colsample_bytree": 0.8,  # Subsample ratio of columns when constructing each tree
    "reg_alpha": 0.01,  # L1 regularization term on weights
    "reg_lambda": 1,  # L2 regularization term on weights
    "objective": "binary:logistic",  # Binary classification objective
    "n_estimators": 1000,  # Number of trees
    "learning_rate": 0.01,  # Learning rate
    "early_stopping_rounds": 10,  # Early stopping
}


def createModel():
    model = xgb.XGBClassifier(**params)
    return model

## Without validate

### Without fill missing data

In [9]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from utils.prepare_data import normalizeData


accuracy_score_list = []
precision_score_list = []
recall_score_list = []
auc_score_list = []
for trainPatients, testPatients in trainTest():
    dfTrain = trainPatients.getMeasuresBetween(
        pd.Timedelta(hours=-6), pd.Timedelta(hours=24), how
    )
    dfTrain = dfTrain.drop(columns=idColumns)

    dfTest = testPatients.getMeasuresBetween(
        pd.Timedelta(hours=-6), pd.Timedelta(hours=24), how
    )
    dfTest = dfTest.drop(columns=idColumns)

    dfTrain, dfTest, _ = normalizeData(dfTrain, dfTest)

    X_train = dfTrain.drop(columns=[labelColumn])
    y_train = dfTrain[labelColumn]

    X_test = dfTest.drop(columns=[labelColumn])
    y_test = dfTest[labelColumn]

    model = createModel()
    model.fit(X_train, y_train, eval_set=[(X_train, y_train)], verbose=False)

    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # For AUC

    accuracy_score_list.append(accuracy_score(y_test, y_pred))
    precision_score_list.append(precision_score(y_test, y_pred))
    recall_score_list.append(recall_score(y_test, y_pred))
    auc_score_list.append(roc_auc_score(y_test, y_pred_proba))



mixedColumns ['akd', 'ag', 'age', 'bg', 'bicarbonate', 'bun', 'calcium', 'chloride', 'chronic_pulmonary_disease', 'congestive_heart_failure', 'dbp', 'egfr', 'gcs', 'gcs_unable', 'hb', 'hematocrit', 'history_aci', 'history_ami', 'hr', 'hypertension', 'macroangiopathy', 'malignant_cancer', 'mch', 'mchc', 'mcv', 'mechanical_ventilation', 'microangiopathy', 'oasis', 'ph', 'phosphate', 'plt', 'potassium', 'preiculos', 'rbc', 'rdw', 'rr', 'saps2', 'sbp', 'scr', 'sodium', 'sofa', 'use_NaHCO3', 'uti', 'wbc', 'weight']


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




mixedColumns ['akd', 'ag', 'age', 'bg', 'bicarbonate', 'bun', 'calcium', 'chloride', 'chronic_pulmonary_disease', 'congestive_heart_failure', 'dbp', 'egfr', 'gcs', 'gcs_unable', 'hb', 'hematocrit', 'history_aci', 'history_ami', 'hr', 'hypertension', 'macroangiopathy', 'malignant_cancer', 'mch', 'mchc', 'mcv', 'mechanical_ventilation', 'microangiopathy', 'oasis', 'ph', 'phosphate', 'plt', 'potassium', 'preiculos', 'rbc', 'rdw', 'rr', 'saps2', 'sbp', 'scr', 'sodium', 'sofa', 'use_NaHCO3', 'uti', 'wbc', 'weight']
mixedColumns ['akd', 'ag', 'age', 'bg', 'bicarbonate', 'bun', 'calcium', 'chloride', 'chronic_pulmonary_disease', 'congestive_heart_failure', 'dbp', 'egfr', 'gcs', 'gcs_unable', 'hb', 'hematocrit', 'history_aci', 'history_ami', 'hr', 'hypertension', 'macroangiopathy', 'malignant_cancer', 'mch', 'mchc', 'mcv', 'mechanical_ventilation', 'microangiopathy', 'oasis', 'ph', 'phosphate', 'plt', 'potassium', 'preiculos', 'rbc', 'rdw', 'rr', 'saps2', 'sbp', 'scr', 'sodium', 'sofa', 'use_N

In [12]:

print(f"Average AUC: {np.mean(auc_score_list)}")
print(f"Average Accuracy: {np.mean(accuracy_score_list)}")
print(f"Average Precision: {np.mean(precision_score_list)}")
print(f"Average Recall: {np.mean(recall_score_list)}")

Average AUC: 0.7666866278698605
Average Accuracy: 0.6939062446418162
Average Precision: 0.6977940623369691
Average Recall: 0.699720111955218


### Fill missing with knn

In [13]:
from sklearn.metrics import roc_auc_score
from utils.prepare_data import normalizeAndFillData


accuracy_score_list_knn = []
precision_score_list_knn = []
recall_score_list_knn = []
auc_score_list_knn = []
for trainPatients, testPatients in trainTest():
    dfTrain = trainPatients.getMeasuresBetween(
        pd.Timedelta(hours=-6), pd.Timedelta(hours=24), how
    )
    dfTrain = dfTrain.drop(columns=idColumns)

    dfTest = testPatients.getMeasuresBetween(
        pd.Timedelta(hours=-6), pd.Timedelta(hours=24), how
    )
    dfTest = dfTest.drop(columns=idColumns)

    dfTrain, dfTest, _ = normalizeAndFillData(dfTrain, dfTest)

    X_train = dfTrain.drop(columns=[labelColumn])
    y_train = dfTrain[labelColumn]

    X_test = dfTest.drop(columns=[labelColumn])
    y_test = dfTest[labelColumn]

    model = createModel()
    model.fit(X_train, y_train, eval_set=[(X_train, y_train)], verbose=False)

    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # For AUC

    accuracy_score_list_knn.append(accuracy_score(y_test, y_pred))
    precision_score_list_knn.append(precision_score(y_test, y_pred))
    recall_score_list_knn.append(recall_score(y_test, y_pred))
    auc_score_list_knn.append(roc_auc_score(y_test, y_pred_proba))



mixedColumns ['akd', 'ag', 'age', 'bg', 'bicarbonate', 'bun', 'calcium', 'chloride', 'chronic_pulmonary_disease', 'congestive_heart_failure', 'dbp', 'egfr', 'gcs', 'gcs_unable', 'hb', 'hematocrit', 'history_aci', 'history_ami', 'hr', 'hypertension', 'macroangiopathy', 'malignant_cancer', 'mch', 'mchc', 'mcv', 'mechanical_ventilation', 'microangiopathy', 'oasis', 'ph', 'phosphate', 'plt', 'potassium', 'preiculos', 'rbc', 'rdw', 'rr', 'saps2', 'sbp', 'scr', 'sodium', 'sofa', 'use_NaHCO3', 'uti', 'wbc', 'weight']
mixedColumns ['akd', 'ag', 'age', 'bg', 'bicarbonate', 'bun', 'calcium', 'chloride', 'chronic_pulmonary_disease', 'congestive_heart_failure', 'dbp', 'egfr', 'gcs', 'gcs_unable', 'hb', 'hematocrit', 'history_aci', 'history_ami', 'hr', 'hypertension', 'macroangiopathy', 'malignant_cancer', 'mch', 'mchc', 'mcv', 'mechanical_ventilation', 'microangiopathy', 'oasis', 'ph', 'phosphate', 'plt', 'potassium', 'preiculos', 'rbc', 'rdw', 'rr', 'saps2', 'sbp', 'scr', 'sodium', 'sofa', 'use_N

In [14]:

print(f"Average AUC: {np.mean(auc_score_list_knn)}")
print(f"Average Accuracy: {np.mean(accuracy_score_list_knn)}")
print(f"Average Precision: {np.mean(precision_score_list_knn)}")
print(f"Average Recall: {np.mean(recall_score_list_knn)}")

Average AUC: 0.7650229488036718
Average Accuracy: 0.6947464078735297
Average Precision: 0.6968032786885245
Average Recall: 0.7046114887378382


## With validate

### Without fill missing data

In [15]:
from utils.prepare_data import normalizeData


accuracy_score_list_val = []
precision_score_list_val = []
recall_score_list_val = []
auc_score_list_val = []
for trainPatients, valPatients, testPatients in trainValTest():
    dfTrain = trainPatients.getMeasuresBetween(
        pd.Timedelta(hours=-6), pd.Timedelta(hours=24), how
    )
    dfTrain = dfTrain.drop(columns=idColumns)

    dfVal = valPatients.getMeasuresBetween(
        pd.Timedelta(hours=-6), pd.Timedelta(hours=24), how
    )
    dfVal = dfVal.drop(columns=idColumns)

    dfTest = testPatients.getMeasuresBetween(
        pd.Timedelta(hours=-6), pd.Timedelta(hours=24), how
    )
    dfTest = dfTest.drop(columns=idColumns)

    dfTrain, dfTest, dfVal = normalizeData(dfTrain, dfTest, dfVal)

    X_train = dfTrain.drop(columns=[labelColumn])
    y_train = dfTrain[labelColumn]

    X_val = dfVal.drop(columns=[labelColumn]) # type: ignore
    y_val = dfVal[labelColumn] # type: ignore

    X_test = dfTest.drop(columns=[labelColumn])
    y_test = dfTest[labelColumn]

    model = createModel()
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # For AUC

    accuracy_score_list_val.append(accuracy_score(y_test, y_pred))
    precision_score_list_val.append(precision_score(y_test, y_pred))
    recall_score_list_val.append(recall_score(y_test, y_pred))
    auc_score_list_val.append(roc_auc_score(y_test, y_pred_proba))


mixedColumns ['akd', 'ag', 'age', 'bg', 'bicarbonate', 'bun', 'calcium', 'chloride', 'chronic_pulmonary_disease', 'congestive_heart_failure', 'dbp', 'egfr', 'gcs', 'gcs_unable', 'hb', 'hematocrit', 'history_aci', 'history_ami', 'hr', 'hypertension', 'macroangiopathy', 'malignant_cancer', 'mch', 'mchc', 'mcv', 'mechanical_ventilation', 'microangiopathy', 'oasis', 'ph', 'phosphate', 'plt', 'potassium', 'preiculos', 'rbc', 'rdw', 'rr', 'saps2', 'sbp', 'scr', 'sodium', 'sofa', 'use_NaHCO3', 'uti', 'wbc', 'weight']
[0]	validation_0-logloss:0.69059
[1]	validation_0-logloss:0.68932
[2]	validation_0-logloss:0.68754
[3]	validation_0-logloss:0.68634
[4]	validation_0-logloss:0.68483
[5]	validation_0-logloss:0.68356
[6]	validation_0-logloss:0.68183
[7]	validation_0-logloss:0.68023
[8]	validation_0-logloss:0.67902
[9]	validation_0-logloss:0.67746
[10]	validation_0-logloss:0.67615
[11]	validation_0-logloss:0.67433
[12]	validation_0-logloss:0.67325
[13]	validation_0-logloss:0.67208
[14]	validation_0-

In [16]:


print(f"Average AUC: {np.mean(auc_score_list_val)}")
print(f"Average Accuracy: {np.mean(accuracy_score_list_val)}")
print(f"Average Precision: {np.mean(precision_score_list_val)}")
print(f"Average Recall: {np.mean(recall_score_list_val)}")

Average AUC: 0.770582786493246
Average Accuracy: 0.699711944034841
Average Precision: 0.7059201683591928
Average Recall: 0.6998000799680127


### Fill missing with knn

In [17]:
from sklearn.metrics import roc_auc_score
from utils.prepare_data import normalizeAndFillData


accuracy_score_list_val_knn = []
precision_score_list_val_knn = []
recall_score_list_val_knn = []
auc_score_list_val_knn = []
metric_dic_list_val_knn = []
for trainPatients, valPatients, testPatients in trainValTest():
    dfTrain = trainPatients.getMeasuresBetween(
        pd.Timedelta(hours=-6), pd.Timedelta(hours=24), how
    )
    dfTrain = dfTrain.drop(columns=idColumns)

    dfVal = valPatients.getMeasuresBetween(
        pd.Timedelta(hours=-6), pd.Timedelta(hours=24), how
    )
    dfVal = dfVal.drop(columns=idColumns)

    dfTest = testPatients.getMeasuresBetween(
        pd.Timedelta(hours=-6), pd.Timedelta(hours=24), how
    )
    dfTest = dfTest.drop(columns=idColumns)

    dfTrain, dfTest, dfVal = normalizeAndFillData(dfTrain, dfTest, dfVal)

    X_train = dfTrain.drop(columns=[labelColumn])
    y_train = dfTrain[labelColumn]

    X_val = dfVal.drop(columns=[labelColumn])  # type: ignore
    y_val = dfVal[labelColumn]  # type: ignore

    X_test = dfTest.drop(columns=[labelColumn])
    y_test = dfTest[labelColumn]

    model = createModel()
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # For AUC

    accuracy_score_list_val_knn.append(accuracy_score(y_test, y_pred))
    precision_score_list_val_knn.append(precision_score(y_test, y_pred))
    recall_score_list_val_knn.append(recall_score(y_test, y_pred))
    auc_score_list_val_knn.append(roc_auc_score(y_test, y_pred_proba))



mixedColumns ['akd', 'ag', 'age', 'bg', 'bicarbonate', 'bun', 'calcium', 'chloride', 'chronic_pulmonary_disease', 'congestive_heart_failure', 'dbp', 'egfr', 'gcs', 'gcs_unable', 'hb', 'hematocrit', 'history_aci', 'history_ami', 'hr', 'hypertension', 'macroangiopathy', 'malignant_cancer', 'mch', 'mchc', 'mcv', 'mechanical_ventilation', 'microangiopathy', 'oasis', 'ph', 'phosphate', 'plt', 'potassium', 'preiculos', 'rbc', 'rdw', 'rr', 'saps2', 'sbp', 'scr', 'sodium', 'sofa', 'use_NaHCO3', 'uti', 'wbc', 'weight']
[0]	validation_0-logloss:0.69099
[1]	validation_0-logloss:0.68915
[2]	validation_0-logloss:0.68759
[3]	validation_0-logloss:0.68630
[4]	validation_0-logloss:0.68441
[5]	validation_0-logloss:0.68255
[6]	validation_0-logloss:0.68078
[7]	validation_0-logloss:0.67907
[8]	validation_0-logloss:0.67714
[9]	validation_0-logloss:0.67612
[10]	validation_0-logloss:0.67522
[11]	validation_0-logloss:0.67367
[12]	validation_0-logloss:0.67290
[13]	validation_0-logloss:0.67153
[14]	validation_0-

In [18]:
print(f"Average AUC: {np.mean(auc_score_list_val_knn)}")
print(f"Average Accuracy: {np.mean(accuracy_score_list_val_knn)}")
print(f"Average Precision: {np.mean(precision_score_list_val_knn)}")
print(f"Average Recall: {np.mean(recall_score_list_val_knn)}")

Average AUC: 0.7683684705389553
Average Accuracy: 0.6922670690305546
Average Precision: 0.6917541363768761
Average Recall: 0.7112088497934159
