In [1]:
import os
import pandas as pd
import numpy as np
import joblib
import warnings
import pickle
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(os.path.join('..','data','healthcare-dataset-stroke-data.csv'))
df = df.dropna()
df = df.drop('id',axis=1)

In [3]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [4]:
df_dum = pd.get_dummies(df)
df_dum

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
2,80.0,0,1,105.92,32.5,1,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
3,49.0,0,0,171.23,34.4,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1
4,79.0,1,0,174.12,24.0,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
5,81.0,0,0,186.21,29.0,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,13.0,0,0,103.08,18.6,0,1,0,0,1,...,0,0,0,1,1,0,1,0,0,0
5106,81.0,0,0,125.20,40.0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,1,0
5107,35.0,0,0,82.99,30.6,0,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
5108,51.0,0,0,166.29,25.6,0,0,1,0,0,...,0,1,0,0,1,0,0,1,0,0


In [5]:
from sklearn.model_selection import train_test_split
X = df_dum.drop('stroke',axis=1)
y = df_dum.stroke.values.reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
y_train, y_test = y_train.ravel(), y_test.ravel()

In [6]:
%%time

from lazypredict.Supervised import LazyClassifier

clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
models = models.reset_index()
models = models.sort_values(['Accuracy'], ascending=False)
models.to_csv(os.path.join('..','models','models.csv'), index=False)
models

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:05<00:00,  5.54it/s]

Wall time: 6.91 s





Unnamed: 0,Model,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
15,RandomForestClassifier,0.95,0.51,0.51,0.93,0.36
20,LogisticRegression,0.95,0.5,0.5,0.93,0.05
21,CalibratedClassifierCV,0.95,0.5,0.5,0.93,0.83
19,AdaBoostClassifier,0.95,0.5,0.5,0.93,0.21
18,RidgeClassifierCV,0.95,0.5,0.5,0.93,0.02
17,RidgeClassifier,0.95,0.5,0.5,0.93,0.03
6,BaggingClassifier,0.95,0.54,0.54,0.93,0.11
22,LinearSVC,0.95,0.5,0.5,0.93,0.22
23,SVC,0.95,0.5,0.5,0.93,0.18
12,XGBClassifier,0.95,0.52,0.52,0.93,0.23


In [7]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score
def get_metrics(y_test, preds):
    accu_score = accuracy_score(y_test, preds)
    balance_score = balanced_accuracy_score(y_test, preds)
    f1_score_ = f1_score(y_test, preds)
    roc_auc_score_ = roc_auc_score(y_test, preds)
   
    scores_string = f'''
    accuracy_score: {accu_score *100}%
    balanced_accuracy_score: {balance_score*100}%
    f1_score: {f1_score_*100}%
    roc_auc_score: {roc_auc_score_*100}%
    '''
    
    print(scores_string)
    
    

In [8]:
# building an XGBoost classification model

from xgboost import XGBClassifier

xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)
xgb_clf_preds = xgb_clf.predict(X_test) 
print(get_metrics(y_test, xgb_clf_preds))
xgb_clf.save_model(os.path.join('..','models','xgboost_stroke_pred_model.json'))


    accuracy_score: 94.87654320987654%
    balanced_accuracy_score: 52.27272727272727%
    f1_score: 8.791208791208792%
    roc_auc_score: 52.27272727272728%
    
None


In [9]:
def dump_and_test_model(model, X_test, y_test):
    '''
    This is for sklearn models
    '''
    save_model = os.path.join('..','models',f'{model}.sav')
    pickle.dump(model, open(save_model, 'wb'))
    loaded_model = pickle.load(open(save_model, 'rb'))
    pred = loaded_model.predict(X_test)
    metrics = get_metrics(y_test, pred)
    print(metrics)
    

In [10]:
# SVM
from sklearn.svm import SVC

svm_clf = SVC()
svm_clf.fit(X_train, y_train)
dump_and_test_model(svm_clf, X_test, y_test)
# svm_clf_preds = svm_clf.predict(X_test).ravel()
# print(get_metrics(y_test, svm_clf_preds))
# pickle.dump(svm_clf, open('svm_clf.sav', 'wb'))


    accuracy_score: 95.06172839506173%
    balanced_accuracy_score: 50.0%
    f1_score: 0.0%
    roc_auc_score: 50.0%
    
None


In [11]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

random_forest_clf = RandomForestClassifier()
random_forest_clf.fit(X_train, y_train)
dump_and_test_model(random_forest_clf, X_test, y_test)
# random_forest_clf_preds = random_forest_clf.predict(X_test).ravel()
# get_metrics(y_test, random_forest_clf_preds)
# joblib.dump(random_forest_clf, 'random_forest_clf.pkl', compress=9)


    accuracy_score: 95.0%
    balanced_accuracy_score: 49.967532467532465%
    f1_score: 0.0%
    roc_auc_score: 49.967532467532465%
    
None


In [12]:
# Bayesian
from sklearn.naive_bayes import BernoulliNB

bern_NB_clf = BernoulliNB()
bern_NB_clf.fit(X_train, y_train)
dump_and_test_model(bern_NB_clf, X_test, y_test)
# bern_NB_clf_preds = bern_NB_clf.predict(X_test).ravel()
# get_metrics(y_test, bern_NB_clf_preds)
# joblib.dump(bern_NB_clf_preds, 'bern_NB_clf_preds.pkl', compress=9)


    accuracy_score: 94.5679012345679%
    balanced_accuracy_score: 52.1103896103896%
    f1_score: 8.333333333333334%
    roc_auc_score: 52.1103896103896%
    
None


In [13]:
# Perceptron
from sklearn.linear_model import Perceptron

per_clf = Perceptron()
per_clf.fit(X_train, y_train)
dump_and_test_model(per_clf, X_test, y_test)
# per_clf_preds = per_clf.predict(X_test).ravel()
# get_metrics(y_test, per_clf_preds)
# joblib.dump(per_clf_preds, 'per_clf_preds.pkl', compress=9)


    accuracy_score: 95.06172839506173%
    balanced_accuracy_score: 50.0%
    f1_score: 0.0%
    roc_auc_score: 50.0%
    
None
