In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import interp

from sklearn import metrics
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Raise pandas' display limits for both rows and columns to 200.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, 200)

In [None]:
# Adapted from: https://towardsdatascience.com/how-to-calibrate-undersampled-model-scores-8f3319c1ea5b

def calibration(data, no_patients, no_patients_with_complication, downsampled_no_patients, downsampled_no_patients_with_complications):
    """Rescale scores of a model fitted on downsampled data to the original prevalence.

    The positive and negative parts of each score are re-weighted by the ratio
    between the outcome rate in the original population and the outcome rate in
    the downsampled population, and the result is re-normalised.

    Parameters
    ----------
    data : scalar or array-like
        Predicted score(s) for the positive class. Anything supporting
        element-wise arithmetic works (e.g. a NumPy array or pandas Series).
    no_patients : int
        Size of the original (not downsampled) population.
    no_patients_with_complication : int
        Number of positive cases in the original population.
    downsampled_no_patients : int
        Size of the downsampled population.
    downsampled_no_patients_with_complications : int
        Number of positive cases in the downsampled population.

    Returns
    -------
    Same type/shape as ``data``
        The re-scaled score(s).
    """
    original_rate = no_patients_with_complication / no_patients
    downsampled_rate = downsampled_no_patients_with_complications / downsampled_no_patients

    # Weight of the positive class after correcting for the changed prevalence.
    positive_term = data * original_rate / downsampled_rate
    # Weight of the negative class after the same correction.
    negative_term = (1 - data) * (1 - original_rate) / (1 - downsampled_rate)

    return positive_term / (negative_term + positive_term)

In [None]:
# Load the combined table; the first CSV column is used as the index.
qs_post = pd.read_csv("qs_2016_2017_prediction_mortality_211213.csv", index_col=0)

# Rows whose "dtjahr_a" equals 2016, with that column removed afterwards.
qs_post_2016 = (
    qs_post.loc[qs_post["dtjahr_a"] == 2016]
    .copy()
    .drop(columns=["dtjahr_a"])
)

# Rows whose "dtjahr_a" equals 2017, with that column removed afterwards.
qs_post_2017 = (
    qs_post.loc[qs_post["dtjahr_a"] == 2017]
    .copy()
    .drop(columns=["dtjahr_a"])
)

In [None]:
# ---------------------------------------------------------------------------
# Repeated balanced downsampling of the 2016 data, kNN model selection via an
# inner grid search, and evaluation on a 2016 hold-out split and on all of 2017.
# ---------------------------------------------------------------------------

no_downsamples = 100  # number of independent downsamples / model fits
inner_cv = 5          # number of folds used by GridSearchCV

# Per-iteration hard predictions and the matching true labels.
all_preds_2016 = []
all_y_2016 = []
all_preds_2017 = []
all_y_2017 = []

# Per-iteration ROC curves (interpolated onto `mean_fpr`) and AUC values.
tprs_2016 = []
tprs_2017 = []
test_auc_2016 = []
test_auc_2017 = []
test_auc_2017_cal = []

# Per-iteration calibration-curve points and Brier scores on the 2017 data.
fraction_pos_all_2017_cal = []
mean_pred_all_2017_cal = []
brier_score_all = []

# Common false-positive-rate grid so ROC curves of different runs are comparable.
mean_fpr = np.linspace(0, 1, 100)

# --- Loop-invariant preparation (identical in every iteration) --------------

# Split the 2016 data by the value of the outcome column "tot".
qs_post_2016_0 = qs_post_2016[qs_post_2016["tot"]==0].copy()
qs_post_2016_1 = qs_post_2016[qs_post_2016["tot"]==1].copy()

# Both classes are sampled down to this many rows (the sum of "tot" in 2016).
no_positives_2016 = int(np.sum(qs_post_2016["tot"]))

# The 2017 data is never resampled: labels and features are fixed.
y_2017 = np.array(qs_post_2017["tot"])
qs_post_2017_joint = qs_post_2017.drop(["tot"], axis=1)

# Unfitted model template and hyper-parameter grid; a new GridSearchCV is
# built from them in every iteration.
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('classify', KNeighborsClassifier()),
])

parameters = [{
    'classify__n_neighbors': [1,5,10,50],
}]

# --- Repeated downsampling ---------------------------------------------------

for no in range(no_downsamples):
    # Draw a balanced sample. The draws are seeded with the iteration index
    # (the same seed already used for train_test_split below) so that a
    # re-run of the notebook reproduces the same downsamples.
    qs_post_2016_0_sample = qs_post_2016_0.sample(no_positives_2016, replace=False, random_state=no)
    qs_post_2016_1_sample = qs_post_2016_1.sample(no_positives_2016, replace=False, random_state=no)

    qs_post_2016_joint = pd.concat([qs_post_2016_0_sample, qs_post_2016_1_sample])

    y = np.array(qs_post_2016_joint["tot"])
    qs_post_2016_joint = qs_post_2016_joint.drop(["tot"], axis=1)

    clf = GridSearchCV(pipe, param_grid=parameters, cv=inner_cv)

    X_train, X_test, y_train, y_test = train_test_split(qs_post_2016_joint, y, test_size=0.2, random_state=no)

    clf.fit(X_train, y_train)

    # Hard class predictions on the 2016 hold-out split and on all of 2017.
    preds = clf.predict(X_test)
    preds_2017 = clf.predict(qs_post_2017_joint)
    all_preds_2016.append(preds)
    all_y_2016.append(y_test)
    all_preds_2017.append(preds_2017)
    all_y_2017.append(y_2017)

    # ROC / AUC on the 2016 hold-out split (column 1 = second class in clf.classes_).
    y_proba_2016 = clf.predict_proba(X_test)[:, 1]

    fpr_2016, tpr_2016, thresholds_2016 = roc_curve(y_test, y_proba_2016)
    tprs_2016.append(np.interp(mean_fpr, fpr_2016, tpr_2016))
    test_auc_2016.append(roc_auc_score(y_test, y_proba_2016))

    # ROC / AUC on the complete 2017 data.
    y_proba_2017 = clf.predict_proba(qs_post_2017_joint)[:, 1]

    fpr_2017, tpr_2017, thresholds_2017 = roc_curve(y_2017, y_proba_2017)
    tprs_2017.append(np.interp(mean_fpr, fpr_2017, tpr_2017))
    test_auc_2017.append(roc_auc_score(y_2017, y_proba_2017))

    # Brier score of the uncalibrated 2017 probabilities.
    brier_score = brier_score_loss(y_2017, y_proba_2017, pos_label=y_2017.max())
    brier_score_all.append(brier_score)

    # Rescale the 2017 probabilities from the downsampled prevalence to the
    # 2017 prevalence, then record the calibration curve and AUC.
    # NOTE(review): the downsampled prevalence is taken from the whole
    # downsample `y`, while the model is fitted on `y_train` only (an
    # unstratified 80% split) - confirm this is intended.
    calibrated_y_proba_2017 = calibration(y_proba_2017, len(y_2017), np.sum(y_2017), len(y), np.sum(y))
    fraction_of_positives_cal, mean_predicted_value_cal = calibration_curve(
        y_2017, calibrated_y_proba_2017, n_bins=10, strategy='quantile'
    )
    fraction_pos_all_2017_cal.append(fraction_of_positives_cal)
    mean_pred_all_2017_cal.append(mean_predicted_value_cal)
    test_auc_2017_cal.append(roc_auc_score(y_2017, calibrated_y_proba_2017))


In [None]:
# Saving prediction scores

# Create both output directories up front ("predictions" and "predictions/kNN")
# so the writes below do not fail on a fresh checkout.
Path("predictions/kNN").mkdir(parents=True, exist_ok=True)

# One row per downsampling iteration: AUCs and the 2017 Brier score.
scores = pd.DataFrame({
    "validation_auc_2016": test_auc_2016,
    "auc_2017": test_auc_2017,
    "auc_cal_2017": test_auc_2017_cal,
    "brier_2017": brier_score_all,
})
scores.to_csv("predictions/kNN_mortality_noICU_1111.csv")

# Interpolated 2017 true-positive rates, one row per iteration.
tprs_long = pd.DataFrame(tprs_2017)
tprs_long.to_csv("predictions/kNN_mortality_tprs_noICU_1111.csv")

# Calibration-curve points of the calibrated 2017 probabilities.
fraction_pos_all_2017 = pd.DataFrame(fraction_pos_all_2017_cal)
fraction_pos_all_2017.to_csv("predictions/kNN_mortality_fraction_pos_cal_noICU_1111.csv")

mean_pred_all_2017 = pd.DataFrame(mean_pred_all_2017_cal)
mean_pred_all_2017.to_csv("predictions/kNN_mortality_mean_pred_cal_noICU_1111.csv")

# Hard predictions and the matching true labels, one row per iteration.
pd.DataFrame(all_preds_2016).to_csv("predictions/kNN/kNN_mortality_all_pred_2016_noICU_1111.csv")
pd.DataFrame(all_preds_2017).to_csv("predictions/kNN/kNN_mortality_all_pred_2017_noICU_1111.csv")
pd.DataFrame(all_y_2016).to_csv("predictions/kNN/kNN_mortality_all_y_2016_noICU_1111.csv")
pd.DataFrame(all_y_2017).to_csv("predictions/kNN/kNN_mortality_all_y_2017_noICU_1111.csv")