# Packages

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('max_columns',200)

In [3]:
import matplotlib as mpl
mpl.style.use('ggplot')
sns.set_style('white')

In [4]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

import sys
import os
import tempfile
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import accuracy_score

# Read  the data

In [5]:
dev = pd.read_csv('train_df.csv')
val = pd.read_csv('val_df.csv')

In [6]:
dev.columns

Index(['GENDER', 'AGE', 'previous_mdrpos_lessthan90d',
       'previous_mdrneg_lessthan90d', 'icustay_rank', 'los_hosp_beforeicu',
       'myocardial_infarct', 'congestive_heart_failure',
       'peripheral_vascular_disease', 'cerebrovascular_disease', 'dementia',
       'chronic_pulmonary_disease', 'rheumatic_disease',
       'peptic_ulcer_disease', 'severe_liver_disease', 'mild_liver_disease',
       'diabetes_with_cc', 'diabetes_without_cc', 'paraplegia',
       'metastatic_solid_tumor', 'malignant_cancer', 'aids', 'ALP', 'ALT',
       'APPT', 'AST', 'BNP', 'CRP', 'D_dimer', 'INR', 'LYN', 'PA', 'PCT',
       'IL_6', 'PT', 'PTT', 'albumin', 'amylase', 'bands', 'basophil',
       'bilirubin', 'bilirubin_direct', 'ca', 'chloride', 'ck', 'ck_mb',
       'creatinine', 'eosinophil', 'fibrinogen', 'gamma_GT', 'glucose',
       'hct_pcv', 'hemoglobin', 'lipase', 'mb', 'mch', 'mchc', 'mcv', 'mg',
       'monocytes', 'mpv', 'p', 'platelet', 'potassium', 'rbc', 'rdw',
       'sodium', 'urea', 

In [7]:
features = [
'los_hosp_beforeicu',
 'ALP',
 'BNP',
 'CRP',
 'INR',
 'PCT',
 'IL_6',
 'bilirubin',
 'ck',
 'fibrinogen',
 'rbc',
 'sodium',
 'urea',
 'HR',
 'SBP'
]
target = ['stay_chart_included_mdr']

In [8]:
x_train = dev[features]
y_train = dev[target]

x_extval = val[features]
y_extval = val[target]

In [9]:
y_extval.value_counts()

stay_chart_included_mdr
0                          1048
1                           336
dtype: int64

In [10]:
train_tot = dev[features + target]
extval_tot = val[features + target]

In [11]:
def lookup_miss(df):
    nadf = pd.DataFrame(columns=['nan_nums','nan_ratios'])
    for i in df.columns:
        nadf.loc[i,'nan_nums'] = pd.isna(df[i]).sum()
        nadf.loc[i,'nan_ratios'] = (nadf.loc[i,'nan_nums']/df.shape[0])*100
    return nadf.sort_values(by = 'nan_ratios', ascending = False)

In [12]:
lookup_miss(x_train)

Unnamed: 0,nan_nums,nan_ratios
los_hosp_beforeicu,0,0.0
ALP,0,0.0
BNP,0,0.0
CRP,0,0.0
INR,0,0.0
PCT,0,0.0
IL_6,0,0.0
bilirubin,0,0.0
ck,0,0.0
fibrinogen,0,0.0


In [13]:
scaler = StandardScaler()

x_train_scaler = scaler.fit_transform(x_train)
x_extval_scaler = scaler.transform(x_extval)

In [14]:
skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

## Logistic Regression

In [15]:
lr_model = LogisticRegression(random_state=2021)
         

param_dict1 = {'penalty':['l1','l2'],
             'C':[0.00001,0.0001,0.001,0.01,0.1,1,10,100]
             }
estimator_grid_lr = GridSearchCV(lr_model,param_grid=param_dict1,cv=skf,n_jobs = -1,scoring = 'roc_auc')

estimator_grid_lr.fit(x_train_scaler,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LogisticRegression(random_state=2021), n_jobs=-1,
             param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                         'penalty': ['l1', 'l2']},
             scoring='roc_auc')

In [16]:
print("----------------------------------------")
print('最佳参数：\n',estimator_grid_lr.best_params_)
print('最佳结果：\n',estimator_grid_lr.best_score_)
print('最佳估计器：\n',estimator_grid_lr.best_estimator_)

----------------------------------------
最佳参数：
 {'C': 1e-05, 'penalty': 'l2'}
最佳结果：
 0.7576767124068454
最佳估计器：
 LogisticRegression(C=1e-05, random_state=2021)


In [17]:
roc_auc_score(y_extval, estimator_grid_lr.predict_proba(x_extval_scaler)[:,1])

0.7517436841148674

## Random Forest

In [19]:
forest_model1 = RandomForestClassifier(random_state=2021)
         

param_dict1 = {'n_estimators':[100,150,200,250,300],
             'max_depth':[3,4,5,6,7],
            'min_samples_leaf':[7,9,11,12,13,15]
             }
estimator_grid_forest1 = GridSearchCV(forest_model1,param_grid=param_dict1,cv=skf,n_jobs = -1,scoring = 'roc_auc')

estimator_grid_forest1.fit(x_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(random_state=2021), n_jobs=-1,
             param_grid={'max_depth': [3, 4, 5, 6, 7],
                         'min_samples_leaf': [7, 9, 11, 12, 13, 15],
                         'n_estimators': [100, 150, 200, 250, 300]},
             scoring='roc_auc')

In [20]:
print("----------------------------------------")
print('最佳参数：\n',estimator_grid_forest1.best_params_)
print('最佳结果：\n',estimator_grid_forest1.best_score_)
print('最佳估计器：\n',estimator_grid_forest1.best_estimator_)

----------------------------------------
最佳参数：
 {'max_depth': 6, 'min_samples_leaf': 15, 'n_estimators': 150}
最佳结果：
 0.7836145312306251
最佳估计器：
 RandomForestClassifier(max_depth=6, min_samples_leaf=15, n_estimators=150,
                       random_state=2021)


In [21]:
roc_auc_score(y_extval, estimator_grid_forest1.predict_proba(x_extval)[:,1])

0.7919762131952018

In [22]:
forest_model1 = RandomForestClassifier(random_state=2021)
         

param_dict1 = {'n_estimators':[110,120,130,140,150,160,170,180,190,200],
             'max_depth':[5,6,7],
            'min_samples_leaf':[14,15,16,17,18,19]
             }
estimator_grid_forest1 = GridSearchCV(forest_model1,param_grid=param_dict1,cv=skf,n_jobs = 5,scoring = 'roc_auc')

estimator_grid_forest1.fit(x_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(random_state=2021), n_jobs=5,
             param_grid={'max_depth': [5, 6, 7],
                         'min_samples_leaf': [14, 15, 16, 17, 18, 19],
                         'n_estimators': [110, 120, 130, 140, 150, 160, 170,
                                          180, 190, 200]},
             scoring='roc_auc')

In [24]:
print("----------------------------------------")
print('最佳参数：\n',estimator_grid_forest1.best_params_)
print('最佳结果：\n',estimator_grid_forest1.best_score_)
print('最佳估计器：\n',estimator_grid_forest1.best_estimator_)

----------------------------------------
最佳参数：
 {'max_depth': 7, 'min_samples_leaf': 14, 'n_estimators': 110}
最佳结果：
 0.7846550690892862
最佳估计器：
 RandomForestClassifier(max_depth=7, min_samples_leaf=14, n_estimators=110,
                       random_state=2021)


In [25]:
roc_auc_score(y_extval, estimator_grid_forest1.predict_proba(x_extval)[:,1])

0.7945945792439113

In [29]:
forest_model1 = RandomForestClassifier(random_state=2021)
         

param_dict1 = {'n_estimators':[50,60,80,90,100,110,120],
             'max_depth':[5,6,7,8],
            'min_samples_leaf':[12,13,14,15]
             }
estimator_grid_forest1 = GridSearchCV(forest_model1,param_grid=param_dict1,cv=skf,n_jobs = 5,scoring = 'roc_auc')

estimator_grid_forest1.fit(x_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(random_state=2021), n_jobs=5,
             param_grid={'max_depth': [5, 6, 7, 8],
                         'min_samples_leaf': [12, 13, 14, 15],
                         'n_estimators': [50, 60, 80, 90, 100, 110, 120]},
             scoring='roc_auc')

In [30]:
print("----------------------------------------")
print('最佳参数：\n',estimator_grid_forest1.best_params_)
print('最佳结果：\n',estimator_grid_forest1.best_score_)
print('最佳估计器：\n',estimator_grid_forest1.best_estimator_)

----------------------------------------
最佳参数：
 {'max_depth': 7, 'min_samples_leaf': 14, 'n_estimators': 80}
最佳结果：
 0.7870885844798696
最佳估计器：
 RandomForestClassifier(max_depth=7, min_samples_leaf=14, n_estimators=80,
                       random_state=2021)


In [31]:
roc_auc_score(y_extval, estimator_grid_forest1.predict_proba(x_extval)[:,1])

0.792351076881134

## XGBOOST

In [33]:
xgboost_model1_1 = XGBClassifier(random_state = 2021)
         

param_dict1_1 = {
              'n_estimators':[15,20,25,35,45,50,70,80],
            'max_depth':[2,3,4,5,6],
            'min_child_weight':[1,2,4,6],
            'learning_rate':[0.001,0.01,0.1]
             }
estimator_grid_xgb1_1 = GridSearchCV(xgboost_model1_1,param_grid=param_dict1_1,cv=skf, n_jobs = -1,scoring = 'roc_auc',verbose = 20)

estimator_grid_xgb1_1.fit(x_train,y_train)

Fitting 5 folds for each of 480 candidates, totalling 2400 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     mis...
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=2021,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                           

In [34]:
print("----------------------------------------")
print('最佳参数：\n',estimator_grid_xgb1_1.best_params_)
print('最佳结果：\n',estimator_grid_xgb1_1.best_score_)
print('最佳估计器：\n',estimator_grid_xgb1_1.best_estimator_)

----------------------------------------
最佳参数：
 {'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 4, 'n_estimators': 45}
最佳结果：
 0.7764848287949386
最佳估计器：
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=2,
              min_child_weight=4, missing=nan, monotone_constraints='()',
              n_estimators=45, n_jobs=40, num_parallel_tree=1,
              random_state=2021, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)


In [35]:
roc_auc_score(y_extval, estimator_grid_xgb1_1.predict_proba(x_extval)[:,1])

0.7822127181025083

In [36]:
xgboost_model1_1 = XGBClassifier(random_state = 2021)
         

param_dict1_1 = {
              'n_estimators':[40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55],
            'max_depth':[2,3,4,5],
            'min_child_weight':[2,3,4,5],
     'learning_rate':[0.01,0.1]
             }
estimator_grid_xgb1_1 = GridSearchCV(xgboost_model1_1,param_grid=param_dict1_1,cv=skf, n_jobs = 10,scoring = 'roc_auc',verbose = 20)

estimator_grid_xgb1_1.fit(x_train,y_train)

Fitting 5 folds for each of 512 candidates, totalling 2560 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     mis...
                                     num_parallel_tree=None, random_state=2021,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, validate_parameters=None,
              

In [37]:
print("----------------------------------------")
print('最佳参数：\n',estimator_grid_xgb1_1.best_params_)
print('最佳结果：\n',estimator_grid_xgb1_1.best_score_)
print('最佳估计器：\n',estimator_grid_xgb1_1.best_estimator_)

----------------------------------------
最佳参数：
 {'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 4, 'n_estimators': 45}
最佳结果：
 0.7764848287949386
最佳估计器：
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=2,
              min_child_weight=4, missing=nan, monotone_constraints='()',
              n_estimators=45, n_jobs=40, num_parallel_tree=1,
              random_state=2021, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)


In [38]:
roc_auc_score(y_extval, estimator_grid_xgb1_1.predict_proba(x_extval)[:,1])

0.7822127181025083

## SVM

In [41]:
svm_model = SVC(kernel='linear', probability = True, random_state = 2021)
         

param_dict1 = {
             'C':[1e-06, 0.00001,0.0001,0.001,0.01,0.1,1,10,100]
             }
estimator_grid_svm = GridSearchCV(svm_model,param_grid=param_dict1,cv=skf,n_jobs = 5,scoring = 'roc_auc')

estimator_grid_svm.fit(x_train_scaler,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=SVC(kernel='linear', probability=True,
                           random_state=2021),
             n_jobs=5,
             param_grid={'C': [1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10,
                               100]},
             scoring='roc_auc')

In [42]:
print("----------------------------------------")
print('最佳参数：\n',estimator_grid_svm.best_params_)
print('最佳结果：\n',estimator_grid_svm.best_score_)
print('最佳估计器：\n',estimator_grid_svm.best_estimator_)

----------------------------------------
最佳参数：
 {'C': 100}
最佳结果：
 0.652689567058008
最佳估计器：
 SVC(C=100, kernel='linear', probability=True, random_state=2021)


In [43]:
roc_auc_score(y_extval, estimator_grid_svm.predict_proba(x_extval_scaler)[:,1]

0.662524268966846

## KNN

In [61]:
knn_model = KNeighborsClassifier()
         

param_dict1 = {
             'n_neighbors':[5,20,35,40,45,50,60,70,80,100]
             }
estimator_grid_knn = GridSearchCV(knn_model,param_grid=param_dict1,cv=skf,n_jobs = 5,scoring = 'roc_auc')

estimator_grid_knn.fit(x_train_scaler,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=KNeighborsClassifier(), n_jobs=5,
             param_grid={'n_neighbors': [5, 20, 35, 40, 45, 50, 60, 70, 80,
                                         100]},
             scoring='roc_auc')

In [62]:
print("----------------------------------------")
print('最佳参数：\n',estimator_grid_knn.best_params_)
print('最佳结果：\n',estimator_grid_knn.best_score_)
print('最佳估计器：\n',estimator_grid_knn.best_estimator_)

----------------------------------------
最佳参数：
 {'n_neighbors': 50}
最佳结果：
 0.6695932887175513
最佳估计器：
 KNeighborsClassifier(n_neighbors=50)


In [63]:
roc_auc_score(y_extval, estimator_grid_knn.predict_proba(x_extval_scaler)[:,1])

0.733389652157159

In [77]:
knn_model = KNeighborsClassifier()
         

param_dict2 = {
             'n_neighbors':[46,47,48,49,50,51,52,53,54,55,56,57,58,59]
             }
estimator_grid_knn2 = GridSearchCV(knn_model,param_grid=param_dict2,cv=skf,n_jobs = 5,scoring = 'roc_auc')

estimator_grid_knn2.fit(x_train_scaler,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=KNeighborsClassifier(), n_jobs=5,
             param_grid={'n_neighbors': [46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
                                         56, 57, 58, 59]},
             scoring='roc_auc')

In [78]:
print("----------------------------------------")
print('最佳参数：\n',estimator_grid_knn2.best_params_)
print('最佳结果：\n',estimator_grid_knn2.best_score_)
print('最佳估计器：\n',estimator_grid_knn2.best_estimator_)

----------------------------------------
最佳参数：
 {'n_neighbors': 48}
最佳结果：
 0.6707857389834735
最佳估计器：
 KNeighborsClassifier(n_neighbors=48)


In [79]:
roc_auc_score(y_extval, estimator_grid_knn2.predict_proba(x_extval_scaler)[:,1])

0.7288878514368711