In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from scipy import interp


from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss

import pickle

# Show up to 200 rows and 200 columns when a DataFrame is displayed.
pd.set_option(
    "display.max_rows", 200,
    "display.max_columns", 200,
)

In [2]:
# Adapted from: https://towardsdatascience.com/how-to-calibrate-undersampled-model-scores-8f3319c1ea5b

def calibration(data, no_patients, no_patients_with_complication, downsampled_no_patients, downsampled_no_patients_with_complications):
    """Rescale a score from the downsampled class balance to the original one.

    The score is re-weighted by the ratio of the complication rate in the
    original population to the rate in the downsampled population, and the
    result is normalised so that the positive and negative parts sum to one.

    Parameters
    ----------
    data
        Uncalibrated score for the positive class. Only ``*``, ``/``, ``-``
        and ``+`` are applied to it.
    no_patients
        Number of patients in the original population.
    no_patients_with_complication
        Number of patients with the complication in the original population.
    downsampled_no_patients
        Number of patients in the downsampled population.
    downsampled_no_patients_with_complications
        Number of patients with the complication in the downsampled population.

    Returns
    -------
    The calibrated score.
    """
    # Complication rates before and after downsampling.
    original_rate = no_patients_with_complication / no_patients
    downsampled_rate = downsampled_no_patients_with_complications / downsampled_no_patients

    # Re-weighted mass for the positive and the negative class.
    positive_part = data * original_rate / downsampled_rate
    negative_part = (1 - data) * (1 - original_rate) / (1 - downsampled_rate)

    return positive_part / (negative_part + positive_part)

In [3]:
# Load the previously trained model that is used to generate new predictions.
# NOTE: unpickling can execute arbitrary code, so 'clf_dvt.pkl' must come from
# a trusted source.
# The context manager closes the file handle as soon as the model is read,
# instead of leaving it open until garbage collection.
with open('clf_dvt.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)

In [4]:
# Example input for a new patient (simulated; does not correspond to any real patient).
# Binary indicators use 0 = no / 1 = yes unless stated otherwise.

# --- Sociodemographic aspects ---
Age = 87
Sex_female = 1  # 0 male, 1 female
care_at_home = 0
nursing_home = 1

# --- Symptoms and scores at admission ---
Paresis = 1
Language_impairment = 1
Speech_impairment = 1
Swallowing_impairment = 0
Consciousness = 0
Ranking_Scale = 3  # NOTE(review): presumably the (modified) Rankin Scale - confirm
NIHSS = 15
Barthel_index_bladder_function = 5
Barthel_index_transition_bed_chair = 5
Barthel_index_mobility = 5

# --- Comorbidities ---
Comorbidity_diabetes_mellitus = 1
Comorbidity_hypertension = 1
Comorbidity_myocardial_infarct = 0
Comorbidity_previous_stroke = 0
Comorbidity_hypercholesterinaemia = 1
Comorbidity_atrial_fibrillation = 0
Comorbidity_atrial_fibrillation_newly_diagnosed = 0

# --- Therapies ---
Thrombolysis_iv = 1
Thrombectomy_thrombolysis_ia = 1

# --- Month of admission ---
# January is assigned here but is not part of the feature vector below
# (presumably the reference category of the encoding - confirm against training code).
January = 0
February = 1
March = 0
April = 0
May = 0
June = 0
July = 0
August = 0
September = 0
October = 0
November = 0
December = 0

# --- Stroke etiology ---
# unknown_cause is assigned here but is not part of the feature vector below.
atherothrombotic = 1
embolic = 0
microangiopathic = 0
other_cause = 0
competing_causes = 0
unknown_cause = 0

# --- Further stroke characteristics ---
# Stenosis is assigned here but is not part of the feature vector below.
Stenosis = 1  # Large vessel occlusion
No_stenosis = 0
Stenosis_not_evaluated = 0
Imaging_before_admission = 0

# --- Time from onset to admission ---
Time_onset_admission_below_1h = 1
Time_onset_admission_1_2h = 0
Time_onset_admission_2_3h = 0
Time_onset_admission_3_3h30mins = 0
Time_onset_admission_3_30_4h = 0
Time_onset_admission_4_6h = 0
Time_onset_admission_6_24h = 0
Time_onset_admission_24_48h = 0
Time_onset_admission_48h = 0

# One-row frame holding the 51 feature values. The position of each value is
# significant because the columns carry no names, so the order below is kept
# exactly as it was.
patient = pd.DataFrame([[
    # symptoms and scores
    Paresis,
    Language_impairment,
    Speech_impairment,
    Swallowing_impairment,
    Consciousness,
    Ranking_Scale,
    Barthel_index_bladder_function,
    Barthel_index_transition_bed_chair,
    Barthel_index_mobility,
    # comorbidities (first part)
    Comorbidity_diabetes_mellitus,
    Comorbidity_hypertension,
    Comorbidity_myocardial_infarct,
    Comorbidity_previous_stroke,
    Comorbidity_hypercholesterinaemia,
    # severity, age, therapy, sex
    NIHSS,
    Age,
    Thrombectomy_thrombolysis_ia,
    Sex_female,
    # month of admission (without January)
    February,
    March,
    April,
    May,
    June,
    July,
    August,
    September,
    October,
    November,
    December,
    # etiology (without unknown_cause)
    atherothrombotic,
    embolic,
    microangiopathic,
    other_cause,
    competing_causes,
    # care situation
    care_at_home,
    nursing_home,
    # vessel status (without Stenosis) and imaging
    No_stenosis,
    Stenosis_not_evaluated,
    Imaging_before_admission,
    # time from onset to admission
    Time_onset_admission_below_1h,
    Time_onset_admission_1_2h,
    Time_onset_admission_2_3h,
    Time_onset_admission_3_3h30mins,
    Time_onset_admission_3_30_4h,
    Time_onset_admission_4_6h,
    Time_onset_admission_6_24h,
    Time_onset_admission_24_48h,
    Time_onset_admission_48h,
    # comorbidities (second part) and iv thrombolysis
    Comorbidity_atrial_fibrillation,
    Comorbidity_atrial_fibrillation_newly_diagnosed,
    Thrombolysis_iv,
]])

In [5]:
# Class counts of the population the score is calibrated to (the test population).
no_patients = 71313
no_patients_with_dvt = 269

# Class counts of the downsampled training population (337 of 674 with DVT).
downsampled_no_patients = 674
downsampled_no_patients_with_dvt = 337

# Uncalibrated prediction: the feature values are passed as a single-row array.
new_prediction = pickled_model.predict_proba(patient.values.reshape(1, -1))

# Calibrated prediction: rescale the positive-class score (row 0, column 1)
# from the downsampled class balance to that of the test population.
new_prediction_proba_calibrated = calibration(
    data=new_prediction[0][1],
    no_patients=no_patients,
    no_patients_with_complication=no_patients_with_dvt,
    downsampled_no_patients=downsampled_no_patients,
    downsampled_no_patients_with_complications=downsampled_no_patients_with_dvt,
)
print("Probability for adverse outcome: %s" % (new_prediction_proba_calibrated,))

Probability for adverse outcome: 0.003495395105912328
