In [1]:
# !pip install jupyterthemes
# !jt -t oceans16 -T -N
# from jupyterthemes import get_themes
# import jupyterthemes as jt
# from jupyterthemes.stylefx import set_nb_theme
# set_nb_theme('oceans16',-T -N -kl)

In [2]:
!pip install feature-engine

import pandas as pd
import numpy as np
import time
import eli5
from sklearn import *

# model algorithams
from sklearn.ensemble import (RandomForestClassifier, 
                              AdaBoostClassifier, 
                              GradientBoostingClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.svm import SVC, LinearSVC
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
from feature_engine.wrappers import SklearnTransformerWrapper

#Common model helpers
from sklearn.preprocessing import (StandardScaler,
                                   LabelEncoder,
                                   OneHotEncoder)
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, 
                             auc, 
                             precision_score,
                             recall_score,
                             f1_score, 
                             roc_auc_score,
                             confusion_matrix)
from sklearn.model_selection import (GridSearchCV,
                                     StratifiedKFold,
                                     cross_val_score)
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from pycaret.classification import *
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils import shuffle



In [3]:
random_state = 2023

# load data
df = pd.read_csv("data_strokes_prediction.csv")

# drop outliers in age : 69768 (stroke at 1.32), 49669 (stroke at 14)
outlier1 = df.loc[df['id'] == 69768]
outlier2 = df.loc[df['id'] == 49669]
# print('outlier1',outlier1)
# print('outlier2',outlier2)
df=df.drop(df[df['id'] == 69768].index)
df=df.drop(df[df['id'] == 49669].index)

# drop useless column
df=df.drop(columns='id')


In [4]:
# fill null values with regression on numerical values
DT_bmi_pipe = Pipeline( steps=[
                               ('scale',StandardScaler()),
                               ('lr',DecisionTreeRegressor(random_state=random_state))
    
                              ])
X = df[['age','gender','bmi']].copy()
X.gender = X.gender.replace({'Male':0,'Female':1,'Other':-1}).astype(np.uint8)
Missing = X[X.bmi.isna()]
X = X[~X.bmi.isna()]
Y = X.pop('bmi')
DT_bmi_pipe.fit(X,Y)
predicted_bmi = pd.Series(DT_bmi_pipe.predict(Missing[['age','gender']]),index=Missing.index)
df.loc[Missing.index,'bmi'] = round(predicted_bmi)

In [5]:
# ENCODE types object into categories
le = LabelEncoder()
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
for col in cat_cols:
       le.fit(df[col])
       df[col] = le.transform(df[col])

In [6]:
df.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [7]:
df.dtypes

gender                 int32
age                  float64
hypertension           int64
heart_disease          int64
ever_married           int32
work_type              int32
Residence_type         int32
avg_glucose_level    float64
bmi                  float64
smoking_status         int32
stroke                 int64
dtype: object

In [8]:

# # feature selection
# df=df[['age', 'avg_glucose_level', 'bmi', 'stroke']]


# define X as features, y as labels to predict





X,y = df.drop('stroke', axis = 1), df['stroke']
# # shuffle data
# X, y = shuffle(X, y)


# undersampling + oversampling

over = SMOTE(sampling_strategy = 1)
under = RandomUnderSampler(sampling_strategy = 0.1)

print(X.shape,y.shape)
X_res_u, y_res_u = under.fit_resample(X, y)
print(X_res_u.shape,y_res_u.shape)
X_res, y_res = over.fit_resample(X_res_u, y_res_u)
print(X_res.shape,y_res.shape)


# X_train_res, y_train_res

(5108, 10) (5108,)
(2717, 10) (2717,)
(4940, 10) (4940,)


In [9]:
# # UNDERSAMPLE
# rus = RandomUnderSampler(random_state=42) #sample coan be biased so test many times without random state
# # rus = RandomUnderSampler()
# X_res, y_res = rus.fit_resample(X, y)

In [10]:
#split
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.15,random_state=random_state)
# X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)


In [11]:
# #RESCALE
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)

In [12]:
# # AUGMENT : artificially get the same amount of stroke and non stroke cases for perfect balance
# sm = SMOTE(random_state=random_state)
# X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [13]:




# XGB_BEST_PARAMS = {'random_state': random_state,
#                    'colsample_bytree': 0.6470331952059594,
#                    'gamma': 8.746218586664071,
#                    'learning_rate': 0.6,
#                    'max_depth': 17.0,
#                    'min_child_weight':4.0,
#                    'nthread': 8.0,
#                    'reg_alpha': 96.0,
#                    'reg_lambda': 0.4345180176753971,
#                    'scale_pos_weight': 2.0,
#                    'subsample': 1.2000000000000002,
#                 'n_estimators': 100,
#                   'seed': 0}

XGB_BEST_PARAMS = {'random_state': random_state}


# LG_BEST_PARAMS ={'C': 4.635, 'penalty': 'l2', 'tol': 0.0001}
LR_BEST_PARAMS = {}

# XGB_BEST_PARAMS = {}

# SVC_BEST_PARAMS = {'kernel': 'rbf', 'gamma': 0.001, 'C': 1000}
SVC_BEST_PARAMS = {}








def test_main_classifiers(x_set,y_set):
    t1 = time.time()
    print('Classification Process Starts....')
    accuracy,precision,recall,f1,auc,conf_mat= [],[],[],[],[],[]
        
    
#     random_state = None
    
    ##classifiers list 
    classifiers = []
    classifiers.append(SVC(random_state=random_state, probability = True).set_params(**SVC_BEST_PARAMS))
    classifiers.append(DecisionTreeClassifier(random_state=random_state))
    classifiers.append(AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state)))
    classifiers.append(RandomForestClassifier(random_state=random_state))
    classifiers.append(GradientBoostingClassifier(random_state=random_state))
    classifiers.append(KNeighborsClassifier())
#     classifiers.append(LogisticRegression(random_state=random_state))
    classifiers.append(LogisticRegression(random_state=random_state).set_params(**LR_BEST_PARAMS))
#     classifiers.append(XGBClassifier(random_state=random_state))
    classifiers.append(XGBClassifier(params=XGB_BEST_PARAMS))
    classifiers.append(LGBMClassifier(random_state = random_state,learning_rate = 0.067))


    for classifier in classifiers:
        
        t =time.time()
        print('fitting on classifier with parameters: {}'.format(classifier))
        
        #classifier and fitting
        clf = classifier
        clf.fit(x_set,y_set)
        
        #predictions
        y_preds = clf.predict(X_test)
        y_probs = clf.predict_proba(X_test)
        
        # metrics
        accuracy.append((round(accuracy_score(y_test,y_preds),2))*100)
        precision.append((round(precision_score(y_test,y_preds),2))*100)
        recall.append((round(recall_score(y_test,y_preds),2))*100)
        f1.append((round(f1_score(y_test,y_preds),2))*100)
        auc.append((round (roc_auc_score(y_test,y_probs[:,1]), 2))*100)
        conf_mat.append(confusion_matrix(y_test,y_preds))
        
        elapsed = time.time() - t
        print('Done and elapsed time is {}seconds'.format(round(elapsed,3)))
        print('\n')
        
        
    results_df = pd.DataFrame({"Accuracy Score":accuracy,"Precision Score":precision,
                        "Recall Score":recall, "f1 Score":f1,"AUC Score":auc,
                        "Confusion Matrix":conf_mat,
                        "Algorithm":["SVC","DecisionTree","AdaBoost",
                                     "RandomForest","GradientBoosting",
                                     "KNeighboors","LogisticRegression",
                                     "XGBoost", "LightGBM"]})
    
    results_df = (results_df.sort_values(by = 'Algorithm', ascending = False)
                  .reset_index(drop =  True))
    t2 = time.time() - t1
    print('\nClassification is Completed and results are strored in dataframe.\ntotal time elapsed is {}seconds'.format(t2))
    print('***************************************************************\n\n')
    
    return results_df

In [14]:
# orig_results = test_main_classifiers(X_train,y_train)
# resamp_results = test_main_classifiers(X_train_res,y_train_res)
# orig_results = test_main_classifiers(X,y)
resamp_results = test_main_classifiers(X_train,y_train)

Classification Process Starts....
fitting on classifier with parameters: SVC(probability=True, random_state=2023)
Done and elapsed time is 2.308seconds


fitting on classifier with parameters: DecisionTreeClassifier(random_state=2023)
Done and elapsed time is 0.02seconds


fitting on classifier with parameters: AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=2023))
Done and elapsed time is 0.029seconds


fitting on classifier with parameters: RandomForestClassifier(random_state=2023)
Done and elapsed time is 0.597seconds


fitting on classifier with parameters: GradientBoostingClassifier(random_state=2023)
Done and elapsed time is 0.557seconds


fitting on classifier with parameters: KNeighborsClassifier()
Done and elapsed time is 0.037seconds


fitting on classifier with parameters: LogisticRegression(random_state=2023)
Done and elapsed time is 0.039seconds


fitting on classifier with parameters: XGBClassifier(base_score=None, booster=None, callbacks=None,
          

In [15]:
# orig_results.sort_values(by='Recall Score', ascending=False)

In [16]:
resamp_results.sort_values(by='f1 Score', ascending=False)

Unnamed: 0,Accuracy Score,Precision Score,Recall Score,f1 Score,AUC Score,Confusion Matrix,Algorithm
0,93.0,92.0,94.0,93.0,98.0,"[[337, 30], [21, 353]]",XGBoost
2,92.0,90.0,95.0,92.0,98.0,"[[327, 40], [20, 354]]",RandomForest
4,91.0,89.0,94.0,92.0,97.0,"[[324, 43], [22, 352]]",LightGBM
5,89.0,82.0,99.0,90.0,94.0,"[[288, 79], [5, 369]]",KNeighboors
7,87.0,86.0,90.0,88.0,87.0,"[[311, 56], [39, 335]]",DecisionTree
8,88.0,86.0,90.0,88.0,88.0,"[[313, 54], [37, 337]]",AdaBoost
6,87.0,84.0,91.0,87.0,93.0,"[[303, 64], [35, 339]]",GradientBoosting
1,78.0,74.0,86.0,80.0,84.0,"[[252, 115], [51, 323]]",SVC
3,79.0,78.0,83.0,80.0,87.0,"[[279, 88], [65, 309]]",LogisticRegression


In [17]:
# grid search cross validation


# # LogisticRegression()
# model,name = LogisticRegression(), 'LR'
# C = [4.55, 4.60,4.61,4.62,4.625,4.629,4.630,4.631,4.61,4.635,4.64,4.65]
# penalty = ['l1','l2', 'elasticnet']
# tol = [ 0.0001, 0.001, 0.01]
# param_grid = {'penalty': penalty,'C': C, 'tol':tol} 


# # XGBClassifier()
model,name = XGBClassifier(),'XGBC'
param_grid = {'learning_rate' : [1,0.20,0.32,0.329,0.33,0.331, 0.34,0.35, 0.36,2],
              'gamma': [1,10,50],
              'max_depth': [1,6,10]}


# # SVC()
# model,name = SVC(cache_size=2000), 'SVC'
# # param_grid = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf']}
# param_grid = {'C':[500,800,1000],'gamma':[0.0001,0.0005], 'kernel':['linear','rbf']}
# # param_grid = {'C':[1,10,1000],'gamma':[1,0.1], 'kernel':['linear','rbf']}
# # param_grid_2 = {'degree' : [0, 1, 2, 3, 4, 5, 6], 'kernel':['poly']}



# grid = GridSearchCV(model,param_grid, n_jobs=-1, cv=10)
# # grid = RandomizedSearchCV(model,param_grid, n_jobs=-1, random_state=random_state) # RandomizedSearchCV for slow models like SVC + n_jobs
# grid.fit(X_train_res, y_train_res)
# grid.best_params_


In [18]:
SVC_BEST_PARAMS = {'kernel': 'rbf', 'gamma': 0.001, 'C': 1000}
LR_BEST_PARAMS ={'C': 4.55, 'penalty': 'l2', 'tol': 0.0001, 
                 'solver':'lbfgs',
                 'class_weight': {0: 1, 1 : 2.1},
                'warm_start':True}
XGB_BEST_PARAMS = {'colsample_bytree': 0.6966652866539669,
                   'gamma': 5.1222111958093395,
                   'learning_rate': 0.28,
                   'max_depth': 16.0,
                   'min_child_weight': 1.0,
                   'nthread': 3.0, 'reg_alpha': 61.0,
                   'reg_lambda': 0.5855571439718152, 'scale_pos_weight': 4.0, 'subsample': 0.9}

In [19]:
# pipeline = Pipeline(steps = [('scale',StandardScaler()),('LR',LogisticRegression(random_state=random_state).set_params(**LR_BEST_PARAMS))])
# pipeline.fit(X_train_res,y_train_res)

pipeline = Pipeline(steps = [('scale',StandardScaler()), ("XGBC",XGBClassifier(random_state=random_state, params=XGB_BEST_PARAMS))])
pipeline.fit(X_train,y_train)

# pipeline = Pipeline(steps = [('scale',StandardScaler()), ("SVC",SVC(random_state=random_state, probability=True).set_params(**SVC_BEST_PARAMS))])
# pipeline.fit(X_train,y_train)



tuned_pred   = pipeline.predict(X_test)
# tuned_pred = (pipeline.predict_proba(X_test)[:,1] >= 0.6).astype(bool) # set threshold as 0.6 

Parameters: { "params" } are not used.



In [20]:




print(classification_report(y_test,tuned_pred))
print('Accuracy Score: ',accuracy_score(y_test,tuned_pred))
print('\nConfusion matrix: \n',confusion_matrix(y_test, tuned_pred))
print('\n Model :',name)


              precision    recall  f1-score   support

           0       0.94      0.92      0.93       367
           1       0.92      0.94      0.93       374

    accuracy                           0.93       741
   macro avg       0.93      0.93      0.93       741
weighted avg       0.93      0.93      0.93       741

Accuracy Score:  0.9311740890688259

Confusion matrix: 
 [[337  30]
 [ 21 353]]

 Model : XGBC


In [21]:
print(X.columns)

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status'],
      dtype='object')


In [22]:
columns = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status']
# columns=['age', 'avg_glucose_level', 'bmi']
eli5.show_weights(pipeline.named_steps[name], feature_names=columns)

Weight,Feature
0.1785,age
0.153,work_type
0.1272,ever_married
0.111,gender
0.1018,heart_disease
0.0822,hypertension
0.0751,Residence_type
0.0603,avg_glucose_level
0.0561,bmi
0.0549,smoking_status


In [23]:
#can changing the predictions threshold help?
# we see more true predictions but at the cost of having more false positives

for i in range(1,9):
    
    cm1=0
    y_pred1 = pipeline.predict_proba(X_test)[:,1]
    y_pred1 = y_pred1.reshape(-1,1)
    y_pred2 = preprocessing.binarize(X=y_pred1, threshold=i/10)
    y_pred2 = np.where(y_pred2 == 1, 1, 0)
    cm1 = confusion_matrix(y_test, y_pred2)
        
    print ('With',i/10,'threshold the Confusion Matrix is ','\n\n',cm1,'\n\n',
            'with',cm1[0,0]+cm1[1,1],'correct predictions, ', '\n\n', 
           
            cm1[0,1],'Type I errors( False Positives), ','\n\n',           
            cm1[1,0],'Type II errors( False Negatives), ','\n\n',
           
           'Accuracy score: ', (accuracy_score(y_test, y_pred2)), '\n\n',
           'F1 score: ', (f1_score(y_test, y_pred2)), '\n\n',
           'Sensitivity: ',cm1[1,1]/(float(cm1[1,1]+cm1[1,0])), '\n\n',
           
           'Specificity: ',cm1[0,0]/(float(cm1[0,0]+cm1[0,1])),'\n\n',
          
            '====================================================', '\n\n')

With 0.1 threshold the Confusion Matrix is  

 [[282  85]
 [  6 368]] 

 with 650 correct predictions,  

 85 Type I errors( False Positives),  

 6 Type II errors( False Negatives),  

 Accuracy score:  0.8771929824561403 

 F1 score:  0.8899637243047159 

 Sensitivity:  0.983957219251337 

 Specificity:  0.7683923705722071 



With 0.2 threshold the Confusion Matrix is  

 [[302  65]
 [ 10 364]] 

 with 666 correct predictions,  

 65 Type I errors( False Positives),  

 10 Type II errors( False Negatives),  

 Accuracy score:  0.8987854251012146 

 F1 score:  0.9066002490660025 

 Sensitivity:  0.9732620320855615 

 Specificity:  0.8228882833787466 



With 0.3 threshold the Confusion Matrix is  

 [[319  48]
 [ 14 360]] 

 with 679 correct predictions,  

 48 Type I errors( False Positives),  

 14 Type II errors( False Negatives),  

 Accuracy score:  0.9163292847503374 

 F1 score:  0.9207161125319694 

 Sensitivity:  0.9625668449197861 

 Specificity:  0.8692098092643051 



Wit

In [24]:
# y_pred = (pipeline.predict_proba(X_test)[:,1] >= 0.8).astype(bool) # set threshold as 0.8 