In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#pd.set_option('max_columns',100)

import matplotlib as mpl
mpl.style.use('ggplot')
sns.set_style('white')

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier

import sys
import os
import tempfile
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import accuracy_score
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss
from sklearn.metrics import (precision_score, recall_score,f1_score)
from sklearn.calibration import CalibratedClassifierCV, calibration_curve

In [2]:
from sklearn.ensemble import GradientBoostingClassifier
from bayes_opt import BayesianOptimization

In [3]:
# Load the two cohorts from CSV files resolved relative to the current working
# directory. dev_df is the frame the tuning functions below train on;
# val_df is loaded from 'val_df.csv' (presumably the held-out validation
# cohort - confirm against the data preparation notebook).
dev_df = pd.read_csv('train_df.csv')
val_df = pd.read_csv('val_df.csv')

In [4]:
# features = [
# 'hospstay_seq',
#  'icustay_rank',
#  'previous_mdrneg_lessthan90d',
#  'potassium',
#  'bicarbonate',
#  'creatinine',
#  'po2',
#  'eosinophils',
#  'monocytes_abs',
#  'lymphocytes',
#  'ptt',
#  'hematocrit',
#  'mch',
#  'mchc',
#  'platelet',
#  'rbc',
#  'rdw',
#  'history_dialysis',
#  'charlson_comorbidity_index',
#  'ant_duration',
#  'ant_1stday',
#  'immu_1stday',
#  'ent_nut',

#   'CEFTRIAXONE_anttest_beforeicu90d',
#   'TRIMETHOPRIM.SULFA_anttest_beforeicu90d',
#   'CEFEPIME_anttest_beforeicu90d',
#   'CEFTAZIDIME_anttest_beforeicu90d',
#   'CIPROFLOXACIN_anttest_beforeicu90d',
#   'PIPERACILLIN.TAZO_anttest_beforeicu90d',
#   'MEROPENEM_anttest_beforeicu90d',
#   'GENTAMICIN_anttest_beforeicu90d',
#   'TOBRAMYCIN_anttest_beforeicu90d'
# ]
# labels = ['CEFEPIME', 'CEFTAZIDIME', 'CIPROFLOXACIN','PIPERACILLIN.TAZO', 'MEROPENEM','GENTAMICIN', 'TOBRAMYCIN']

In [5]:
# Maps each outcome label column to the ordered list of predictor columns used
# to model it. The tuning functions iterate over this mapping and index the
# data frames with both the key (target) and the value (feature columns).
label_features_dict = {
    'stay_chart_included_mdr': [
        'sodium', 'RPR', 'creatinine', 'monocytes_abs', 'mcv',
        'INR', 'mchc', 'APPT', 'fibrinogen', 'Temperature',
        'shock_index', 'bilirubin', 'D_dimer', 'PT', 'p',
        'gamma_GT', 'BMI', 'BNP', 'PCT', 'urea',
        'CRP', 'ck', 'IL_6', 'HR', 'los_hosp_beforeicu',
    ],
}

In [6]:
# label_features_dict = {
# 'stay_chart_included_mdr':['AGE',
#  'mcv',
#  'INR',
#  'creatinine',
#  'PA',
#  'rdw',
#  'APPT',
#  'ca',
#  'wbc',
#  'fibrinogen',
#  'RR',
#  'mpv',
#  'Temperature',
#  'BNP',
#  'sodium',
#  'urea',
#  'bilirubin',
#  'ck',
#  'BMI',
#  'gamma_GT',
#  'PCT',
#  'IL_6',
#  'HR',
#  'los_hosp_beforeicu',
#  'CRP'
# ]
# }

In [7]:
#gbdt
def gbdt_cv(n_estimators, min_samples_split, max_features, max_depth):
    """Objective function for tuning a gradient-boosting classifier.

    The optimiser proposes continuous values, so the integer-valued
    hyper-parameters are truncated with ``int`` and ``max_features`` is capped
    at 0.999 before the estimator is built.

    NOTE(review): ``x_train`` and ``y_train`` are read from the notebook's
    global namespace; they must be defined before this function is called.

    Returns:
        Mean ROC-AUC of the ``cv=5`` cross-validation scores.
    """
    hyperparams = dict(
        n_estimators=int(n_estimators),
        min_samples_split=int(min_samples_split),
        max_features=min(max_features, 0.999),  # float => fraction of features
        max_depth=int(max_depth),
        random_state=2024,
    )
    estimator = GradientBoostingClassifier(**hyperparams)
    fold_scores = cross_val_score(estimator, x_train, y_train, scoring='roc_auc', cv=5)
    return fold_scores.mean()

# Optimiser for the GBDT objective; each bound is a (lower, upper) pair.
gbdt_op = BayesianOptimization(
    f=gbdt_cv,
    pbounds={
        'n_estimators': (10, 250),
        'min_samples_split': (2, 25),
        'max_features': (0.1, 0.999),
        'max_depth': (5, 15),
    },
    random_state=2024,
)

#random forest
def rf_cv(n_estimators, min_samples_split, max_features, max_depth):
    """Objective function for tuning a random-forest classifier.

    Integer-valued hyper-parameters are truncated with ``int`` and
    ``max_features`` is capped at 0.999 before the estimator is built.

    NOTE(review): ``x_train`` and ``y_train`` are read from the notebook's
    global namespace; they must be defined before this function is called.

    Returns:
        Mean ROC-AUC of the ``cv=5`` cross-validation scores.
    """
    hyperparams = dict(
        n_estimators=int(n_estimators),
        min_samples_split=int(min_samples_split),
        max_features=min(max_features, 0.999),  # float => fraction of features
        max_depth=int(max_depth),
        random_state=2024,
    )
    forest = RandomForestClassifier(**hyperparams)
    fold_scores = cross_val_score(forest, x_train, y_train, scoring='roc_auc', cv=5)
    return fold_scores.mean()

# Optimiser for the random-forest objective; each bound is a (lower, upper) pair.
rf_op = BayesianOptimization(
    f=rf_cv,
    pbounds={
        'n_estimators': (10, 250),
        'min_samples_split': (2, 25),
        'max_features': (0.1, 0.999),
        'max_depth': (3, 15),
    },
    random_state=2024,
)

#xgboost
def xgb_cv(n_estimators, min_child_weight, subsample, colsample_bytree, max_depth):
    """Objective function for tuning an XGBoost classifier.

    Integer-valued hyper-parameters are truncated with ``int``; the two
    sampling fractions are capped at 0.999. The learning rate is fixed at 0.01.
    ``gamma`` and ``n_jobs`` are not passed, so the library defaults apply.

    NOTE(review): ``x_train`` and ``y_train`` are read from the notebook's
    global namespace; they must be defined before this function is called.

    Returns:
        Mean ROC-AUC of the ``cv=5`` cross-validation scores.
    """
    booster_kwargs = dict(
        n_estimators=int(n_estimators),
        min_child_weight=int(min_child_weight),
        subsample=min(subsample, 0.999),
        colsample_bytree=min(colsample_bytree, 0.999),
        max_depth=int(max_depth),
        learning_rate=0.01,
        random_state=2024,
    )
    booster = XGBClassifier(**booster_kwargs)
    fold_scores = cross_val_score(booster, x_train, y_train, scoring='roc_auc', cv=5)
    return fold_scores.mean()

# Optimiser for the XGBoost objective; each bound is a (lower, upper) pair.
# 'gamma' is not part of the search space.
xgb_op = BayesianOptimization(
    f=xgb_cv,
    pbounds={
        'n_estimators': (10, 250),
        'min_child_weight': (2, 25),
        'subsample': (0.1, 0.999),
        'colsample_bytree': (0.1, 0.999),
        'max_depth': (3, 15),
    },
    random_state=2024,
)

In [8]:
def optimize_rf(n_iter):
    """Tune a random forest for every label with Bayesian optimisation.

    For each ``label -> features`` entry of the global ``label_features_dict``
    the training data is taken from the global ``dev_df`` and a
    ``BayesianOptimization`` run maximises the mean ``cv=5`` ROC-AUC.
    Only ``dev_df`` is touched; no validation data is needed for tuning.

    Parameters
    ----------
    n_iter : int
        Forwarded to ``BayesianOptimization.maximize`` as ``n_iter``.

    Returns
    -------
    dict
        Maps each label to the optimiser's ``max`` record for that label.
        Its ``'params'`` entry holds the values as proposed by the optimiser,
        i.e. not yet truncated with ``int`` the way the objective does.
    """
    models_dict = {}

    def rf_cv(n_estimators, min_samples_split, max_features, max_depth):
        # x_train / y_train are closed over from the loop below, so every
        # label is scored on its own feature matrix and target.
        return cross_val_score(
            RandomForestClassifier(
                n_estimators=int(n_estimators),
                min_samples_split=int(min_samples_split),
                max_features=min(max_features, 0.999),  # float => fraction of features
                max_depth=int(max_depth),
                random_state=2024,
            ),
            x_train, y_train, scoring='roc_auc', cv=5
        ).mean()

    for label, features in label_features_dict.items():
        x_train = dev_df[features]
        y_train = dev_df[label]

        rf_op = BayesianOptimization(
            rf_cv,
            {'n_estimators': (10, 300),
             'min_samples_split': (2, 25),
             'max_features': (0.1, 0.999),
             'max_depth': (3, 20)},
            random_state=2024,
        )

        print('---------------------------特征为：%s-------------------------------'%label)
        print('---------------------------模型为：Random Forest-------------------------------')
        rf_op.maximize(n_iter=n_iter)
        print(rf_op.max)
        models_dict[label] = rf_op.max
        print('----------------------------------------------------------------------------------------------------')

    return models_dict
    
def optimize_xgb(n_iter):
    """Tune an XGBoost classifier for every label with Bayesian optimisation.

    For each ``label -> features`` entry of the global ``label_features_dict``
    the training data is taken from the global ``dev_df`` and a
    ``BayesianOptimization`` run maximises the mean ``cv=5`` ROC-AUC.
    Only ``dev_df`` is touched; no validation data is needed for tuning.

    Parameters
    ----------
    n_iter : int
        Forwarded to ``BayesianOptimization.maximize`` as ``n_iter``.

    Returns
    -------
    dict
        Maps each label to the optimiser's ``max`` record for that label.
        Its ``'params'`` entry holds the values as proposed by the optimiser,
        i.e. not yet truncated with ``int`` the way the objective does.
    """
    models_dict = {}

    def xgb_cv(n_estimators, min_child_weight, subsample, colsample_bytree, max_depth):
        # x_train / y_train are closed over from the loop below, so every
        # label is scored on its own feature matrix and target.
        # gamma and n_jobs are not passed: the library defaults apply.
        return cross_val_score(
            XGBClassifier(
                n_estimators=int(n_estimators),
                min_child_weight=int(min_child_weight),
                subsample=min(subsample, 0.999),
                colsample_bytree=min(colsample_bytree, 0.999),
                max_depth=int(max_depth),
                learning_rate=0.01,
                random_state=2024,
            ),
            x_train, y_train, scoring='roc_auc', cv=5
        ).mean()

    for label, features in label_features_dict.items():
        x_train = dev_df[features]
        y_train = dev_df[label]

        xgb_op = BayesianOptimization(
            xgb_cv,
            {'n_estimators': (10, 250),
             'min_child_weight': (2, 25),
             'subsample': (0.1, 0.999),
             'colsample_bytree': (0.1, 0.999),
             'max_depth': (3, 15)},
            random_state=2024,
        )

        print('---------------------------特征为：%s-------------------------------'%label)
        print('---------------------------模型为：XGBOOST-------------------------------')
        xgb_op.maximize(n_iter=n_iter)
        print(xgb_op.max)
        models_dict[label] = xgb_op.max
        print('----------------------------------------------------------------------------------------------------')

    return models_dict

In [9]:
# Run the random-forest search for every label in label_features_dict with
# n_iter=45; the result maps each label to the optimiser's best record.
models_dict_rf = optimize_rf(45)

---------------------------特征为：stay_chart_included_mdr-------------------------------
---------------------------模型为：Random Forest-------------------------------
|   iter    |  target   | max_depth | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m0.715    [0m | [0m13.0     [0m | [0m0.7285   [0m | [0m6.327    [0m | [0m22.7     [0m |
| [95m2        [0m | [95m0.764    [0m | [95m6.485    [0m | [95m0.1954   [0m | [95m18.73    [0m | [95m207.0    [0m |
| [0m3        [0m | [0m0.745    [0m | [0m11.06    [0m | [0m0.503    [0m | [0m2.439    [0m | [0m228.3    [0m |
| [0m4        [0m | [0m0.7334   [0m | [0m13.24    [0m | [0m0.9646   [0m | [0m17.28    [0m | [0m185.9    [0m |
| [0m5        [0m | [0m0.7467   [0m | [0m10.64    [0m | [0m0.3026   [0m | [0m17.41    [0m | [0m223.4    [0m |
| [0m6        [0m | [0m0.7538   [0m | [0m3.645    [0m | [0m0.6906  

In [10]:
# Run the XGBoost search for every label in label_features_dict with
# n_iter=45; the result maps each label to the optimiser's best record.
models_dict_xgb = optimize_xgb(45)

---------------------------特征为：stay_chart_included_mdr-------------------------------
---------------------------模型为：XGBOOST-------------------------------
|   iter    |  target   | colsam... | max_depth | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.7543   [0m | [0m0.6286   [0m | [0m11.39    [0m | [0m6.327    [0m | [0m20.51    [0m | [0m0.2843   [0m |
| [95m2        [0m | [95m0.7633   [0m | [95m0.1954   [0m | [95m11.73    [0m | [95m17.63    [0m | [95m123.7    [0m | [95m0.503    [0m |
| [0m3        [0m | [0m0.7607   [0m | [0m0.1172   [0m | [0m12.03    [0m | [0m15.86    [0m | [0m240.8    [0m | [0m0.6973   [0m |
| [95m4        [0m | [95m0.7665   [0m | [95m0.6454   [0m | [95m8.39     [0m | [95m7.183    [0m | [95m170.8    [0m | [95m0.7615   [0m |
| [0m5        [0m | [0m0.7538   [0m | [0m0.3319   [0m | [0m4.147    [0m | [0m24.1

In [11]:
# Copy one level deeper than dict.copy(): the following cells cast values inside
# each entry's 'params' dict in place. A shallow copy would share those nested
# dicts with models_dict_rf / models_dict_xgb, so the raw optimiser results
# would be rewritten as well.
models_dict_rf_mod = {
    label: {**best, 'params': dict(best['params'])}
    for label, best in models_dict_rf.items()
}
models_dict_xgb_mod = {
    label: {**best, 'params': dict(best['params'])}
    for label, best in models_dict_xgb.items()
}

In [12]:
# Convert the tuned random-forest values to the form used when building the
# model: counts and depths are truncated with int(), the feature fraction is
# rounded to three decimals. The 'params' dicts are updated in place.
for tuned in models_dict_rf_mod.values():
    rf_params = tuned['params']
    rf_params['max_depth'] = int(rf_params['max_depth'])
    rf_params['max_features'] = round(rf_params['max_features'], 3)
    rf_params['min_samples_split'] = int(rf_params['min_samples_split'])
    rf_params['n_estimators'] = int(rf_params['n_estimators'])

In [13]:
# Convert the tuned XGBoost values to the form used when building the model:
# counts and depths are truncated with int(), the sampling fractions are
# rounded to three decimals. The 'params' dicts are updated in place.
for tuned in models_dict_xgb_mod.values():
    xgb_params = tuned['params']
    xgb_params['max_depth'] = int(xgb_params['max_depth'])
    xgb_params['colsample_bytree'] = round(xgb_params['colsample_bytree'], 3)
    xgb_params['subsample'] = round(xgb_params['subsample'], 3)
    xgb_params['min_child_weight'] = int(xgb_params['min_child_weight'])
    xgb_params['n_estimators'] = int(xgb_params['n_estimators'])

In [14]:
# Persist the tuned parameter dicts next to the notebook. np.save stores a
# dict as a pickled 0-d object array, so reading it back requires
# np.load(path, allow_pickle=True).item() (as in the commented loader in the
# next cell). Only load such files from a trusted source.
np.save('rf_params.npy',models_dict_rf_mod)
np.save('xgb_params.npy',models_dict_xgb_mod)

In [15]:
#models_dict_rf_mod = np.load('rf_params.npy',allow_pickle=True).item()

In [16]:
# total_param_dict = {}
# models_dict = {}
# n_iter = 195
# for i in label_features_dict:
#     label = i
#     features = label_features_dict[i]
#     x_train = dev_df[features]
#     y_train = dev_df[label]
#     x_test = val_df[features]
#     y_test = val_df[label]
    
# #     gbdt_op = BayesianOptimization(
# #         gbdt_cv,
# #         {'n_estimators': (10, 250),
# #         'min_samples_split': (2, 25),
# #         'max_features': (0.1, 0.999),
# #         'max_depth': (5, 15)},
# #     random_state=2024)

#     rf_op = BayesianOptimization(
#         rf_cv,
#         {'n_estimators': (10, 250),
#         'min_samples_split': (2, 25),
#         'max_features': (0.1, 0.999),
#         'max_depth': (3, 15)},
#     random_state = 2024)
    
#     xgb_op = BayesianOptimization(
#         xgb_cv,
#         {'n_estimators': (10, 250),
#         'min_child_weight': (2, 25),
#         #'gama': (0.1, 0.999),
#         'subsample': (0.1, 0.999),
#         'colsample_bytree': (0.1, 0.999),
#         'max_depth': (3, 15)},
#     random_state = 2024)
    
#     print('---------------------------特征为：%s-------------------------------'%label)
# #     print('---------------------------模型为：GBDT-------------------------------')
# #     gbdt_op.maximize(n_iter=n_iter)
# #     print(gbdt_op.max)
# #     models_dict['gbdt'] = gbdt_op.max
#     print('---------------------------模型为：Random Forest-------------------------------')
#     rf_op.maximize(n_iter=n_iter)
#     print(rf_op.max)
#     models_dict['rf'] = rf_op.max
#     print('---------------------------模型为：XGBOOST-------------------------------')
#     xgb_op.maximize(n_iter=n_iter)
#     print(xgb_op.max)
#     models_dict['xgboost'] = xgb_op.max
#     total_param_dict[label] = models_dict
#     print('----------------------------------------------------------------------------------------------------')