In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#pd.set_option('max_columns',100)

import matplotlib as mpl
mpl.style.use('ggplot')
sns.set_style('white')

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier

import sys
import os
import tempfile
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import accuracy_score
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss
from sklearn.metrics import (precision_score, recall_score,f1_score)
from sklearn.calibration import CalibratedClassifierCV, calibration_curve

In [2]:
from sklearn.ensemble import GradientBoostingClassifier
from bayes_opt import BayesianOptimization

In [3]:
# Load the two cohorts from CSV files in the working directory.
# dev_df supplies the training data for hyperparameter tuning below;
# val_df is the separate validation frame.
dev_df = pd.read_csv(filepath_or_buffer='dev_df_revise.csv')
val_df = pd.read_csv(filepath_or_buffer='val_df_revise.csv')

In [4]:
# features = [
# 'hospstay_seq',
#  'icustay_rank',
#  'previous_mdrneg_lessthan90d',
#  'potassium',
#  'bicarbonate',
#  'creatinine',
#  'po2',
#  'eosinophils',
#  'monocytes_abs',
#  'lymphocytes',
#  'ptt',
#  'hematocrit',
#  'mch',
#  'mchc',
#  'platelet',
#  'rbc',
#  'rdw',
#  'history_dialysis',
#  'charlson_comorbidity_index',
#  'ant_duration',
#  'ant_1stday',
#  'immu_1stday',
#  'ent_nut',

#   'CEFTRIAXONE_anttest_beforeicu90d',
#   'TRIMETHOPRIM.SULFA_anttest_beforeicu90d',
#   'CEFEPIME_anttest_beforeicu90d',
#   'CEFTAZIDIME_anttest_beforeicu90d',
#   'CIPROFLOXACIN_anttest_beforeicu90d',
#   'PIPERACILLIN.TAZO_anttest_beforeicu90d',
#   'MEROPENEM_anttest_beforeicu90d',
#   'GENTAMICIN_anttest_beforeicu90d',
#   'TOBRAMYCIN_anttest_beforeicu90d'
# ]
# labels = ['CEFEPIME', 'CEFTAZIDIME', 'CIPROFLOXACIN','PIPERACILLIN.TAZO', 'MEROPENEM','GENTAMICIN', 'TOBRAMYCIN']

In [5]:
# Maps each outcome (label) column to the ordered list of feature columns
# used to model it. The tuning functions iterate over this dict and select
# both the label and the feature columns from dev_df / val_df.
label_features_dict = {
    'stay_chart_included_mdr': [
        'dementia', 'platelet', 'phosphate', 'heart_rate', 'weight',
        'creatinine', 'basophils_abs', 'lymphocytes_abs', 'icustay_rank',
        'temperature', 'sodium', 'history_difficulty_swallowing',
        'los_hosp_beforeicu', 'hospstay_seq', 'neutrophils_abs', 'wbc',
        'mg', 'NLR', 'admission_age',
        'history_assistive_devices_Wheelchair', 'mchc',
        'history_past_COPD', 'bun', 'rdw', 'previous_mdr_90d',
    ],
}

In [6]:
# Gradient-boosted trees objective
def gbdt_cv(n_estimators, min_samples_split, max_features, max_depth):
    """Return the mean 5-fold cross-validated ROC AUC of a GBDT classifier.

    ``n_estimators``, ``min_samples_split`` and ``max_depth`` are truncated
    with ``int()``; ``max_features`` is capped at 0.999 and passed as a float.

    The training data is not a parameter: ``x_train`` and ``y_train`` are
    looked up in the module namespace when the function is called.
    NOTE(review): no visible cell defines them at module level, so calling
    this on a fresh kernel would raise NameError -- confirm before use.
    """
    estimator = GradientBoostingClassifier(
        n_estimators=int(n_estimators),
        min_samples_split=int(min_samples_split),
        max_features=min(max_features, 0.999),
        max_depth=int(max_depth),
        random_state=2023,
    )
    fold_scores = cross_val_score(
        estimator, x_train, y_train, scoring='roc_auc', cv=5
    )
    return fold_scores.mean()

# Optimiser over the GBDT search space; each bound is a (low, high) pair.
gbdt_op = BayesianOptimization(
    f=gbdt_cv,
    pbounds={
        'n_estimators': (10, 250),
        'min_samples_split': (2, 25),
        'max_features': (0.1, 0.999),
        'max_depth': (5, 15),
    },
    random_state=2023,
)

# Random forest objective
def rf_cv(n_estimators, min_samples_split, max_features, max_depth):
    """Return the mean 5-fold cross-validated ROC AUC of a random forest.

    ``n_estimators``, ``min_samples_split`` and ``max_depth`` are truncated
    with ``int()``; ``max_features`` is capped at 0.999 and passed as a float.

    The training data is not a parameter: ``x_train`` and ``y_train`` are
    looked up in the module namespace when the function is called.
    NOTE(review): no visible cell defines them at module level -- confirm
    before use.
    """
    estimator = RandomForestClassifier(
        n_estimators=int(n_estimators),
        min_samples_split=int(min_samples_split),
        max_features=min(max_features, 0.999),
        max_depth=int(max_depth),
        random_state=2023,
    )
    fold_scores = cross_val_score(
        estimator, x_train, y_train, scoring='roc_auc', cv=5
    )
    return fold_scores.mean()

# Optimiser over the random-forest search space; each bound is (low, high).
rf_op = BayesianOptimization(
    f=rf_cv,
    pbounds={
        'n_estimators': (10, 250),
        'min_samples_split': (2, 25),
        'max_features': (0.1, 0.999),
        'max_depth': (3, 15),
    },
    random_state=2023,
)

# XGBoost objective
def xgb_cv(n_estimators, min_child_weight,  subsample, colsample_bytree, max_depth):
    """Return the mean 5-fold cross-validated ROC AUC of an XGBoost classifier.

    ``n_estimators``, ``min_child_weight`` and ``max_depth`` are truncated
    with ``int()``; ``subsample`` and ``colsample_bytree`` are capped at
    0.999. The learning rate is fixed at 0.01 and is not tuned.

    The training data is not a parameter: ``x_train`` and ``y_train`` are
    looked up in the module namespace when the function is called.
    NOTE(review): no visible cell defines them at module level -- confirm
    before use.
    """
    estimator = XGBClassifier(
        n_estimators=int(n_estimators),
        min_child_weight=int(min_child_weight),
        subsample=min(subsample, 0.999),
        colsample_bytree=min(colsample_bytree, 0.999),
        max_depth=int(max_depth),
        learning_rate=0.01,
        random_state=2023,
    )
    fold_scores = cross_val_score(
        estimator, x_train, y_train, scoring='roc_auc', cv=5
    )
    return fold_scores.mean()

# Optimiser over the XGBoost search space; each bound is a (low, high) pair.
xgb_op = BayesianOptimization(
    f=xgb_cv,
    pbounds={
        'n_estimators': (10, 250),
        'min_child_weight': (2, 25),
        'subsample': (0.1, 0.999),
        'colsample_bytree': (0.1, 0.999),
        'max_depth': (3, 15),
    },
    random_state=2023,
)

In [7]:
def optimize_rf(n_iter):
    """Tune a random forest for every label with Bayesian optimisation.

    For each ``label -> features`` entry of the module-level
    ``label_features_dict``, the training data is taken from the module-level
    ``dev_df`` and a 5-fold cross-validated ROC AUC is maximised over
    ``n_estimators``, ``min_samples_split``, ``max_features`` and
    ``max_depth``. Only ``dev_df`` is read; the validation frame is not
    needed for tuning.

    Parameters
    ----------
    n_iter : int
        Number of optimisation steps, forwarded to ``maximize(n_iter=...)``.

    Returns
    -------
    dict
        Maps each label to the optimiser's ``max`` result, a dict whose
        ``'params'`` entry holds the best raw (uncast) hyperparameter values.
    """
    def make_objective(x_train, y_train):
        """Bind one label's training data into a cross-validation objective."""
        def rf_cv(n_estimators, min_samples_split, max_features, max_depth):
            # The optimiser proposes real values; integer hyperparameters
            # are truncated and max_features is capped at 0.999 (a float).
            estimator = RandomForestClassifier(
                n_estimators=int(n_estimators),
                min_samples_split=int(min_samples_split),
                max_features=min(max_features, 0.999),
                max_depth=int(max_depth),
                random_state=2023,
            )
            return cross_val_score(
                estimator, x_train, y_train, scoring='roc_auc', cv=5
            ).mean()
        return rf_cv

    models_dict = {}
    for label, features in label_features_dict.items():
        # Data is passed explicitly so the objective never depends on
        # whichever loop iteration happens to be current when it is called.
        objective = make_objective(dev_df[features], dev_df[label])

        rf_op = BayesianOptimization(
            objective,
            {'n_estimators': (10, 300),
             'min_samples_split': (2, 25),
             'max_features': (0.1, 0.999),
             'max_depth': (3, 20)},
            random_state=2023,
        )

        # Progress banners: label name, then model name.
        print('---------------------------特征为：%s-------------------------------'%label)
        print('---------------------------模型为：Random Forest-------------------------------')
        rf_op.maximize(n_iter=n_iter)
        best = rf_op.max
        print(best)
        models_dict[label] = best
        print('----------------------------------------------------------------------------------------------------')

    return models_dict
    
def optimize_xgb(n_iter):
    """Tune an XGBoost classifier for every label with Bayesian optimisation.

    For each ``label -> features`` entry of the module-level
    ``label_features_dict``, the training data is taken from the module-level
    ``dev_df`` and a 5-fold cross-validated ROC AUC is maximised over
    ``n_estimators``, ``min_child_weight``, ``subsample``,
    ``colsample_bytree`` and ``max_depth``. The learning rate is fixed at
    0.01. Only ``dev_df`` is read; the validation frame is not needed for
    tuning.

    Parameters
    ----------
    n_iter : int
        Number of optimisation steps, forwarded to ``maximize(n_iter=...)``.

    Returns
    -------
    dict
        Maps each label to the optimiser's ``max`` result, a dict whose
        ``'params'`` entry holds the best raw (uncast) hyperparameter values.
    """
    def make_objective(x_train, y_train):
        """Bind one label's training data into a cross-validation objective."""
        def xgb_cv(n_estimators, min_child_weight, subsample, colsample_bytree, max_depth):
            # The optimiser proposes real values; integer hyperparameters
            # are truncated and the sampling fractions are capped at 0.999.
            estimator = XGBClassifier(
                n_estimators=int(n_estimators),
                min_child_weight=int(min_child_weight),
                subsample=min(subsample, 0.999),
                colsample_bytree=min(colsample_bytree, 0.999),
                max_depth=int(max_depth),
                learning_rate=0.01,
                random_state=2023,
            )
            return cross_val_score(
                estimator, x_train, y_train, scoring='roc_auc', cv=5
            ).mean()
        return xgb_cv

    models_dict = {}
    for label, features in label_features_dict.items():
        # Data is passed explicitly so the objective never depends on
        # whichever loop iteration happens to be current when it is called.
        objective = make_objective(dev_df[features], dev_df[label])

        xgb_op = BayesianOptimization(
            objective,
            {'n_estimators': (10, 250),
             'min_child_weight': (2, 25),
             'subsample': (0.1, 0.999),
             'colsample_bytree': (0.1, 0.999),
             'max_depth': (3, 15)},
            random_state=2023,
        )

        # Progress banners: label name, then model name.
        print('---------------------------特征为：%s-------------------------------'%label)
        print('---------------------------模型为：XGBOOST-------------------------------')
        xgb_op.maximize(n_iter=n_iter)
        best = xgb_op.max
        print(best)
        models_dict[label] = best
        print('----------------------------------------------------------------------------------------------------')

    return models_dict

In [8]:
# Run the random-forest search with 45 optimisation steps per label.
models_dict_rf = optimize_rf(n_iter=45)

---------------------------特征为：stay_chart_included_mdr-------------------------------
---------------------------模型为：Random Forest-------------------------------
|   iter    |  target   | max_depth | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m0.6939   [0m | [0m8.474    [0m | [0m0.9005   [0m | [0m15.53    [0m | [0m46.71    [0m |
| [95m2        [0m | [95m0.6943   [0m | [95m5.403    [0m | [95m0.5206   [0m | [95m2.508    [0m | [95m220.9    [0m |
| [95m3        [0m | [95m0.6982   [0m | [95m11.91    [0m | [95m0.5899   [0m | [95m12.5     [0m | [95m155.4    [0m |
| [95m4        [0m | [95m0.7009   [0m | [95m9.706    [0m | [95m0.2359   [0m | [95m10.3     [0m | [95m57.0     [0m |
| [0m5        [0m | [0m0.6945   [0m | [0m8.745    [0m | [0m0.2621   [0m | [0m10.99    [0m | [0m20.34    [0m |
| [95m6        [0m | [95m0.7034   [0m | [95m8.893    [0m

In [9]:
# Run the XGBoost search with 45 optimisation steps per label.
models_dict_xgb = optimize_xgb(n_iter=45)

---------------------------特征为：stay_chart_included_mdr-------------------------------
---------------------------模型为：XGBOOST-------------------------------
|   iter    |  target   | colsam... | max_depth | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.6986   [0m | [0m0.3895   [0m | [0m13.69    [0m | [0m15.53    [0m | [0m40.38    [0m | [0m0.2271   [0m |
| [0m2        [0m | [0m0.6961   [0m | [0m0.5206   [0m | [0m3.265    [0m | [0m18.73    [0m | [0m135.9    [0m | [0m0.5899   [0m |
| [95m3        [0m | [95m0.7058   [0m | [95m0.5103   [0m | [95m9.017    [0m | [95m11.07    [0m | [95m46.28    [0m | [95m0.4244   [0m |
| [0m4        [0m | [0m0.7009   [0m | [0m0.2457   [0m | [0m7.056    [0m | [0m6.147    [0m | [0m103.8    [0m | [0m0.132    [0m |
| [95m5        [0m | [95m0.706    [0m | [95m0.6078   [0m | [95m5.442    [0m | [95m9.374 

In [10]:
# Copy each per-label result together with its nested 'params' dict.
# A plain dict.copy() is shallow: the *_mod dicts would share the inner
# 'params' dicts with the originals, so casting/rounding values in the
# *_mod dicts would also overwrite the raw optimiser output kept in
# models_dict_rf / models_dict_xgb.
models_dict_rf_mod = {
    label: {**best, 'params': dict(best['params'])}
    for label, best in models_dict_rf.items()
}
models_dict_xgb_mod = {
    label: {**best, 'params': dict(best['params'])}
    for label, best in models_dict_xgb.items()
}

In [11]:
# Convert the tuned random-forest values into the form the estimator takes:
# integer hyperparameters are truncated with int(), max_features is rounded
# to three decimals. The 'params' dicts are updated in place.
for best in models_dict_rf_mod.values():
    params = best['params']
    for key in ('max_depth', 'min_samples_split', 'n_estimators'):
        params[key] = int(params[key])
    params['max_features'] = round(params['max_features'], 3)

In [12]:
# Convert the tuned XGBoost values into the form the estimator takes:
# integer hyperparameters are truncated with int(), the sampling fractions
# are rounded to three decimals. The 'params' dicts are updated in place.
for best in models_dict_xgb_mod.values():
    params = best['params']
    for key in ('max_depth', 'min_child_weight', 'n_estimators'):
        params[key] = int(params[key])
    for key in ('colsample_bytree', 'subsample'):
        params[key] = round(params[key], 3)

In [13]:
# Persist the post-processed tuning results next to the notebook.
# The objects are plain dicts, so they are reloaded with
# np.load(path, allow_pickle=True).item().
np.save(file='rf_params_revise.npy', arr=models_dict_rf_mod)
np.save(file='xgb_params_revise.npy', arr=models_dict_xgb_mod)

In [14]:
#models_dict_rf_mod = np.load('rf_params.npy',allow_pickle=True).item()

In [15]:
# total_param_dict = {}
# models_dict = {}
# n_iter = 195
# for i in label_features_dict:
#     label = i
#     features = label_features_dict[i]
#     x_train = dev_df[features]
#     y_train = dev_df[label]
#     x_test = val_df[features]
#     y_test = val_df[label]
    
# #     gbdt_op = BayesianOptimization(
# #         gbdt_cv,
# #         {'n_estimators': (10, 250),
# #         'min_samples_split': (2, 25),
# #         'max_features': (0.1, 0.999),
# #         'max_depth': (5, 15)},
# #     random_state=2023)

#     rf_op = BayesianOptimization(
#         rf_cv,
#         {'n_estimators': (10, 250),
#         'min_samples_split': (2, 25),
#         'max_features': (0.1, 0.999),
#         'max_depth': (3, 15)},
#     random_state = 2023)
    
#     xgb_op = BayesianOptimization(
#         xgb_cv,
#         {'n_estimators': (10, 250),
#         'min_child_weight': (2, 25),
#         #'gama': (0.1, 0.999),
#         'subsample': (0.1, 0.999),
#         'colsample_bytree': (0.1, 0.999),
#         'max_depth': (3, 15)},
#     random_state = 2023)
    
#     print('---------------------------特征为：%s-------------------------------'%label)
# #     print('---------------------------模型为：GBDT-------------------------------')
# #     gbdt_op.maximize(n_iter=n_iter)
# #     print(gbdt_op.max)
# #     models_dict['gbdt'] = gbdt_op.max
#     print('---------------------------模型为：Random Forest-------------------------------')
#     rf_op.maximize(n_iter=n_iter)
#     print(rf_op.max)
#     models_dict['rf'] = rf_op.max
#     print('---------------------------模型为：XGBOOST-------------------------------')
#     xgb_op.maximize(n_iter=n_iter)
#     print(xgb_op.max)
#     models_dict['xgboost'] = xgb_op.max
#     total_param_dict[label] = models_dict
#     print('----------------------------------------------------------------------------------------------------')