In [1]:
!pip install hyperopt

import pandas as pd
import numpy as np
import time
import eli5
from sklearn import *

# model algorithams
from sklearn.ensemble import (RandomForestClassifier, 
                              AdaBoostClassifier, 
                              GradientBoostingClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.svm import SVC
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
from feature_engine.wrappers import SklearnTransformerWrapper

#Common model helpers
from sklearn.preprocessing import (StandardScaler,
                                   LabelEncoder,
                                   OneHotEncoder)
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, 
                             auc, 
                             precision_score,
                             recall_score,
                             f1_score, 
                             roc_auc_score,
                             confusion_matrix)
from sklearn.model_selection import (GridSearchCV,
                                     StratifiedKFold,
                                     cross_val_score)
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from pycaret.classification import *
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe



In [2]:
# load data
df = pd.read_csv("data_strokes_prediction.csv")


# drop useless column
df=df.drop(columns='id')


# fill null values with regression on numerical values
DT_bmi_pipe = Pipeline( steps=[
                               ('scale',StandardScaler()),
                               ('lr',DecisionTreeRegressor(random_state=42))
    
                              ])
X = df[['age','gender','bmi']].copy()
X.gender = X.gender.replace({'Male':0,'Female':1,'Other':-1}).astype(np.uint8)
Missing = X[X.bmi.isna()]
X = X[~X.bmi.isna()]
Y = X.pop('bmi')
DT_bmi_pipe.fit(X,Y)
predicted_bmi = pd.Series(DT_bmi_pipe.predict(Missing[['age','gender']]),index=Missing.index)
df.loc[Missing.index,'bmi'] = round(predicted_bmi)

# ENCODE types object into categories
le = LabelEncoder()
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
for col in cat_cols:
       le.fit(df[col])
       df[col] = le.transform(df[col])


 
    
# define X as features, y as labels to predict
X,y = df.drop('stroke', axis = 1), df['stroke']




   # undersampling + oversampling

over = SMOTE(sampling_strategy = 1)
under = RandomUnderSampler(sampling_strategy = 0.1)

print(X.shape,y.shape)
X_res_u, y_res_u = under.fit_resample(X, y)
print(X_res_u.shape,y_res_u.shape)
X_res, y_res = over.fit_resample(X_res_u, y_res_u)
print(X_res.shape,y_res.shape) 
    





#split
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# AUGMENT : artificially get the same amount of stroke and non stroke cases for perfect balance
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

(5110, 10) (5110,)
(2739, 10) (2739,)
(4980, 10) (4980,)


In [3]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
       'learning_rate' : hp.quniform('learning_rate', 0,1,0.01),
       'subsample' : hp.quniform('subsample', 0,2,0.1),
       'nthread': hp.quniform('nthread', 0,10,1),
       'scale_pos_weight': hp.quniform('scale_pos_weight',0,5,1),
        'n_estimators': 100,
        'seed': 0,

    }

#        'subsample' : hp.quniform('subsample', 0,2,0.1),
#        'nthread': hp.quniform('nthread', 0,10,1),
#        'scale_pos_weight': hp.quniform('scale_pos_weight',0,5,1),
#        'n_estimators': hp.quniform('n_estimators', 1,500,10),

In [4]:
def objective(space):
    clf=XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred>0.5)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

In [5]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

SCORE:                                                                                                                 
0.7821285140562249                                                                                                     
SCORE:                                                                                                                 
0.7891566265060241                                                                                                     
SCORE:                                                                                                                 
0.7801204819277109                                                                                                     
SCORE:                                                                                                                 
0.8102409638554217                                                                                                     
SCORE:                                  

SCORE:                                                                                                                 
0.8182730923694779                                                                                                     
SCORE:                                                                                                                 
0.8182730923694779                                                                                                     
SCORE:                                                                                                                 
0.8112449799196787                                                                                                     
SCORE:                                                                                                                 
0.8052208835341366                                                                                                     
SCORE:                                  

In [6]:
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

The best hyperparameters are :  

{'colsample_bytree': 0.6966652866539669, 'gamma': 5.1222111958093395, 'learning_rate': 0.28, 'max_depth': 16.0, 'min_child_weight': 1.0, 'nthread': 3.0, 'reg_alpha': 61.0, 'reg_lambda': 0.5855571439718152, 'scale_pos_weight': 4.0, 'subsample': 0.9}
