# Load data

In [1]:
import pandas as pd
from constants import NULLABLE_MEASURES
from utils.class_patient import Patients

patients = Patients.loadPatients()
patients.fillMissingMeasureValue(NULLABLE_MEASURES, 0)

# Remove missing data

## Remove features with more than 20% missing

In [2]:
# remove measures with less than 80% of data

measures = patients.getMeasures()

for measure, count in measures.items():
    if count < len(patients) * 80 / 100:
        patients.removeMeasures([measure])
        print(measure, count)

aki 621
pco2 935
po2 935
albumin 558
hba1c 444
lymphocyte 585
height 415
urine-ketone 343
crp 57


## Remove patients with more than 20% missing features

In [3]:
patients.removePatientByMissingFeatures()
print(len(patients))
sum([p.akdPositive for p in patients])

1203


613

## Display insight

In [4]:
# dfData = patients.getMeasuresBetween(pd.Timedelta(hours=-6), pd.Timedelta(hours=24), "first")

with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(patients.getMeasures())

Counter({'ag': 1203,
         'age': 1203,
         'bg': 1203,
         'bicarbonate': 1203,
         'bun': 1203,
         'calcium': 1203,
         'chloride': 1203,
         'chronic_pulmonary_disease': 1203,
         'ckd_stage': 1203,
         'congestive_heart_failure': 1203,
         'dka_type': 1203,
         'egfr': 1203,
         'gender': 1203,
         'history_aci': 1203,
         'history_ami': 1203,
         'hypertension': 1203,
         'liver_disease': 1203,
         'macroangiopathy': 1203,
         'malignant_cancer': 1203,
         'mechanical_ventilation': 1203,
         'microangiopathy': 1203,
         'oasis': 1203,
         'phosphate': 1203,
         'potassium': 1203,
         'preiculos': 1203,
         'race': 1203,
         'saps2': 1203,
         'scr': 1203,
         'sodium': 1203,
         'sofa': 1203,
         'use_NaHCO3': 1203,
         'uti': 1203,
         'hr': 1202,
         'dbp': 1201,
         'gcs': 1201,
         'gcs_unable': 1201,
    

In [5]:
akdCount = sum([p.akdPositive for p in patients.patientList])

akdCount / len(patients)

0.5095594347464671

# Machine learning

In [6]:
from constants import CATEGORICAL_MEASURES


idColumns = ["subject_id", "hadm_id", "stay_id"]
categoryColumns = CATEGORICAL_MEASURES
labelColumn = "akd"

## Split train-test

In [7]:
splitedPatients = patients.split(5, 27)


def trainTest():
    for i in range(splitedPatients.__len__()):
        testPatients = splitedPatients[i]

        trainPatientsList = splitedPatients[:i] + splitedPatients[i + 1 :]
        trainPatients = Patients(patients=[])
        for trainPatientsElem in trainPatientsList:
            trainPatients += trainPatientsElem

        yield trainPatients, testPatients


def trainValTest():
    for i in range(splitedPatients.__len__()):
        testPatients = splitedPatients[i]

        trainPatientsList = splitedPatients[:i] + splitedPatients[i + 1 :]
        trainPatients = Patients(patients=[])
        for trainPatientsElem in trainPatientsList:
            trainPatients += trainPatientsElem

        *trainPatients, valPatients = trainPatients.split(5, 27)
        tmpPatients = Patients(patients=[])
        for trainPatientsElem in trainPatients:
            tmpPatients += trainPatientsElem
        trainPatients = tmpPatients

        yield trainPatients, valPatients, testPatients

488 962 125 241
500 962 113 241
482 962 131 241
485 963 128 240
497 963 116 240
393 770 95 192 125 241
406 770 94 192 113 241
386 770 96 192 131 241
387 771 98 192 128 240
399 771 98 192 116 240


## Define model

In [8]:
import xgboost as xgb

how = "first"


def createModel():
    return xgb.XGBClassifier(device="cuda", n_jobs=-1)

## Without validate

### Without fill missing data

In [9]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from utils.prepare_data import normalizeData


accuracy_score_list = []
precision_score_list = []
recall_score_list = []
auc_score_list = []
for trainPatients, testPatients in trainTest():
    dfTrain = trainPatients.getMeasuresBetween(
        pd.Timedelta(hours=-6), pd.Timedelta(hours=24), how
    )
    dfTrain = dfTrain.drop(columns=idColumns)

    dfTest = testPatients.getMeasuresBetween(
        pd.Timedelta(hours=-6), pd.Timedelta(hours=24), how
    )
    dfTest = dfTest.drop(columns=idColumns)

    dfTrain, dfTest, _ = normalizeData(dfTrain, dfTest)

    X_train = dfTrain.drop(columns=[labelColumn])
    y_train = dfTrain[labelColumn]

    X_test = dfTest.drop(columns=[labelColumn])
    y_test = dfTest[labelColumn]

    model = createModel()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # For AUC

    accuracy_score_list.append(accuracy_score(y_test, y_pred))
    precision_score_list.append(precision_score(y_test, y_pred))
    recall_score_list.append(recall_score(y_test, y_pred))
    auc_score_list.append(roc_auc_score(y_test, y_pred_proba))



Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [11]:

print(f"Average AUC: {np.mean(auc_score_list)}")
print(f"Average Accuracy: {np.mean(accuracy_score_list)}")
print(f"Average Precision: {np.mean(precision_score_list)}")
print(f"Average Recall: {np.mean(recall_score_list)}")

Average AUC: 0.748472459662092
Average Accuracy: 0.6741874135546335
Average Precision: 0.6740103653815537
Average Recall: 0.6927366900232246


### Fill missing with knn

In [12]:
from sklearn.metrics import roc_auc_score
from utils.prepare_data import normalizeAndFillData


accuracy_score_list_knn = []
precision_score_list_knn = []
recall_score_list_knn = []
auc_score_list_knn = []
for trainPatients, testPatients in trainTest():
    dfTrain = trainPatients.getMeasuresBetween(
        pd.Timedelta(hours=-6), pd.Timedelta(hours=24), how
    )
    dfTrain = dfTrain.drop(columns=idColumns)

    dfTest = testPatients.getMeasuresBetween(
        pd.Timedelta(hours=-6), pd.Timedelta(hours=24), how
    )
    dfTest = dfTest.drop(columns=idColumns)

    dfTrain, dfTest, _ = normalizeAndFillData(dfTrain, dfTest)

    X_train = dfTrain.drop(columns=[labelColumn])
    y_train = dfTrain[labelColumn]

    X_test = dfTest.drop(columns=[labelColumn])
    y_test = dfTest[labelColumn]

    model = createModel()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # For AUC

    accuracy_score_list_knn.append(accuracy_score(y_test, y_pred))
    precision_score_list_knn.append(precision_score(y_test, y_pred))
    recall_score_list_knn.append(recall_score(y_test, y_pred))
    auc_score_list_knn.append(roc_auc_score(y_test, y_pred_proba))

In [13]:

print(f"Average AUC: {np.mean(auc_score_list_knn)}")
print(f"Average Accuracy: {np.mean(accuracy_score_list_knn)}")
print(f"Average Precision: {np.mean(precision_score_list_knn)}")
print(f"Average Recall: {np.mean(recall_score_list_knn)}")

Average AUC: 0.7388790639712485
Average Accuracy: 0.6899965421853389
Average Precision: 0.694049472198302
Average Recall: 0.7000251294297288


## With validate

### Without fill missing data

In [14]:
from utils.prepare_data import normalizeData


accuracy_score_list_val = []
precision_score_list_val = []
recall_score_list_val = []
auc_score_list_val = []
for trainPatients, valPatients, testPatients in trainValTest():
    dfTrain = trainPatients.getMeasuresBetween(
        pd.Timedelta(hours=-6), pd.Timedelta(hours=24), how
    )
    dfTrain = dfTrain.drop(columns=idColumns)

    dfVal = valPatients.getMeasuresBetween(
        pd.Timedelta(hours=-6), pd.Timedelta(hours=24), how
    )
    dfVal = dfVal.drop(columns=idColumns)

    dfTest = testPatients.getMeasuresBetween(
        pd.Timedelta(hours=-6), pd.Timedelta(hours=24), how
    )
    dfTest = dfTest.drop(columns=idColumns)

    dfTrain, dfTest, dfVal = normalizeData(dfTrain, dfTest, dfVal)

    X_train = dfTrain.drop(columns=[labelColumn])
    y_train = dfTrain[labelColumn]

    X_val = dfVal.drop(columns=[labelColumn]) # type: ignore
    y_val = dfVal[labelColumn] # type: ignore

    X_test = dfTest.drop(columns=[labelColumn])
    y_test = dfTest[labelColumn]

    model = createModel()
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # For AUC

    accuracy_score_list_val.append(accuracy_score(y_test, y_pred))
    precision_score_list_val.append(precision_score(y_test, y_pred))
    recall_score_list_val.append(recall_score(y_test, y_pred))
    auc_score_list_val.append(roc_auc_score(y_test, y_pred_proba))

[0]	validation_0-logloss:0.64500
[1]	validation_0-logloss:0.62233
[2]	validation_0-logloss:0.60092
[3]	validation_0-logloss:0.59846
[4]	validation_0-logloss:0.59958
[5]	validation_0-logloss:0.59689
[6]	validation_0-logloss:0.59518
[7]	validation_0-logloss:0.60162
[8]	validation_0-logloss:0.60965
[9]	validation_0-logloss:0.62753
[10]	validation_0-logloss:0.63023
[11]	validation_0-logloss:0.62668
[12]	validation_0-logloss:0.63296
[13]	validation_0-logloss:0.65176
[14]	validation_0-logloss:0.65065
[15]	validation_0-logloss:0.64478
[16]	validation_0-logloss:0.65092
[17]	validation_0-logloss:0.65535
[18]	validation_0-logloss:0.67234
[19]	validation_0-logloss:0.67689
[20]	validation_0-logloss:0.68227
[21]	validation_0-logloss:0.69439
[22]	validation_0-logloss:0.68886
[23]	validation_0-logloss:0.69129
[24]	validation_0-logloss:0.69651
[25]	validation_0-logloss:0.70071
[26]	validation_0-logloss:0.69713
[27]	validation_0-logloss:0.69694
[28]	validation_0-logloss:0.70404
[29]	validation_0-loglos

In [15]:


print(f"Average AUC: {np.mean(auc_score_list_val)}")
print(f"Average Accuracy: {np.mean(accuracy_score_list_val)}")
print(f"Average Precision: {np.mean(precision_score_list_val)}")
print(f"Average Recall: {np.mean(recall_score_list_val)}")

Average AUC: 0.7483221106936571
Average Accuracy: 0.6733298755186722
Average Precision: 0.672726399096668
Average Recall: 0.7049002216465907


### Fill missing with knn

In [16]:
from sklearn.metrics import roc_auc_score
from utils.prepare_data import normalizeAndFillData


accuracy_score_list_val_knn = []
precision_score_list_val_knn = []
recall_score_list_val_knn = []
auc_score_list_val_knn = []
metric_dic_list_val_knn = []
for trainPatients, valPatients, testPatients in trainValTest():
    dfTrain = trainPatients.getMeasuresBetween(
        pd.Timedelta(hours=-6), pd.Timedelta(hours=24), how
    )
    dfTrain = dfTrain.drop(columns=idColumns)

    dfVal = valPatients.getMeasuresBetween(
        pd.Timedelta(hours=-6), pd.Timedelta(hours=24), how
    )
    dfVal = dfVal.drop(columns=idColumns)

    dfTest = testPatients.getMeasuresBetween(
        pd.Timedelta(hours=-6), pd.Timedelta(hours=24), how
    )
    dfTest = dfTest.drop(columns=idColumns)

    dfTrain, dfTest, dfVal = normalizeAndFillData(dfTrain, dfTest, dfVal)

    X_train = dfTrain.drop(columns=[labelColumn])
    y_train = dfTrain[labelColumn]

    X_val = dfVal.drop(columns=[labelColumn])  # type: ignore
    y_val = dfVal[labelColumn]  # type: ignore

    X_test = dfTest.drop(columns=[labelColumn])
    y_test = dfTest[labelColumn]

    model = createModel()
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # For AUC

    accuracy_score_list_val_knn.append(accuracy_score(y_test, y_pred))
    precision_score_list_val_knn.append(precision_score(y_test, y_pred))
    recall_score_list_val_knn.append(recall_score(y_test, y_pred))
    auc_score_list_val_knn.append(roc_auc_score(y_test, y_pred_proba))



[0]	validation_0-logloss:0.65358
[1]	validation_0-logloss:0.65147
[2]	validation_0-logloss:0.63219
[3]	validation_0-logloss:0.64100
[4]	validation_0-logloss:0.65571
[5]	validation_0-logloss:0.66279
[6]	validation_0-logloss:0.66809
[7]	validation_0-logloss:0.66693
[8]	validation_0-logloss:0.66930
[9]	validation_0-logloss:0.67871
[10]	validation_0-logloss:0.68033
[11]	validation_0-logloss:0.67845
[12]	validation_0-logloss:0.68159
[13]	validation_0-logloss:0.67824
[14]	validation_0-logloss:0.68968
[15]	validation_0-logloss:0.68867
[16]	validation_0-logloss:0.68828
[17]	validation_0-logloss:0.69504
[18]	validation_0-logloss:0.70154
[19]	validation_0-logloss:0.70062
[20]	validation_0-logloss:0.70141
[21]	validation_0-logloss:0.70010
[22]	validation_0-logloss:0.70097
[23]	validation_0-logloss:0.70177
[24]	validation_0-logloss:0.70029
[25]	validation_0-logloss:0.70059
[26]	validation_0-logloss:0.70775
[27]	validation_0-logloss:0.70329
[28]	validation_0-logloss:0.69875
[29]	validation_0-loglos

In [17]:
print(f"Average AUC: {np.mean(auc_score_list_val_knn)}")
print(f"Average Accuracy: {np.mean(accuracy_score_list_val_knn)}")
print(f"Average Precision: {np.mean(precision_score_list_val_knn)}")
print(f"Average Recall: {np.mean(recall_score_list_val_knn)}")

Average AUC: 0.7451317367214016
Average Accuracy: 0.6783160442600276
Average Precision: 0.680417896556309
Average Recall: 0.6976789199591414
