In [1]:
import os
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(os.path.join('..','data','healthcare-dataset-stroke-data.csv'))
df = df.dropna()
df = df.drop('id',axis=1)

In [3]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [4]:
df_dum = pd.get_dummies(df)
df_dum

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
2,80.0,0,1,105.92,32.5,1,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
3,49.0,0,0,171.23,34.4,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1
4,79.0,1,0,174.12,24.0,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
5,81.0,0,0,186.21,29.0,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,13.0,0,0,103.08,18.6,0,1,0,0,1,...,0,0,0,1,1,0,1,0,0,0
5106,81.0,0,0,125.20,40.0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,1,0
5107,35.0,0,0,82.99,30.6,0,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
5108,51.0,0,0,166.29,25.6,0,0,1,0,0,...,0,1,0,0,1,0,0,1,0,0


In [5]:
from sklearn.model_selection import train_test_split
X = df_dum.drop('smoking_status_smokes',axis=1)
y = df_dum.smoking_status_smokes.values.reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
y_train, y_test = y_train.ravel(), y_test.ravel()

In [6]:
%%time

from lazypredict.Supervised import LazyClassifier

clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
models = models.reset_index()
models.sort_values(['Balanced Accuracy','Time Taken'], ascending=False)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:05<00:00,  5.11it/s]

Wall time: 7.36 s





Unnamed: 0,Model,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
13,CalibratedClassifierCV,1.0,1.0,1.0,1.0,1.4
11,LinearSVC,1.0,1.0,1.0,1.0,0.37
0,AdaBoostClassifier,1.0,1.0,1.0,1.0,0.25
2,XGBClassifier,1.0,1.0,1.0,1.0,0.15
3,SVC,1.0,1.0,1.0,1.0,0.15
14,LGBMClassifier,1.0,1.0,1.0,1.0,0.1
1,BaggingClassifier,1.0,1.0,1.0,1.0,0.07
10,LogisticRegression,1.0,1.0,1.0,1.0,0.04
7,QuadraticDiscriminantAnalysis,1.0,1.0,1.0,1.0,0.03
12,DecisionTreeClassifier,1.0,1.0,1.0,1.0,0.03


In [7]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, average_precision_score, brier_score_loss, f1_score, log_loss, precision_score, recall_score, confusion_matrix, classification_report

def get_metrics(y_test, preds):
    accu_score = accuracy_score(y_test, preds)
    balance_score = balanced_accuracy_score(y_test, preds)
    avg_precision_score = average_precision_score(y_test, preds)
    brier_sl = brier_score_loss(y_test, preds)
    f1_score_ = f1_score(y_test, preds)
    log_loss_ = log_loss(y_test, preds)
    prec_score = precision_score(y_test, preds)
    rec_score = recall_score(y_test, preds)
    confusion_mat = confusion_matrix(y_test, preds)
    class_report = classification_report(y_test, preds, target_names=['had a stroke','not had a stroke'])
    
    scores_string = f'''
    accuracy_score: {accu_score *100}%
    balanced_accuracy_score: {balance_score*100}%
    average_precision_score: {avg_precision_score*100}%
    brier_score_loss: {brier_sl*100}%
    f1_score: {f1_score_*100}%
    log_loss: {log_loss_*100}%
    precision_score {prec_score*100}%
    recall_score: {rec_score*100}%
    confusion_matrix : {confusion_mat}
    classification_report: 
    {class_report}
    '''
    
    print(scores_string)
    
    

In [8]:
# SVM
from sklearn.svm import SVC
svm_clf = SVC()
svm_clf.fit(X_train, y_train)

svm_clf_preds = svm_clf.predict(X_test).ravel()

# print(get_metrics(y_test, svm_clf_preds))

def get_svm_preds(X_train, X_test, y_train):
    svm_clf = SVC()
    svm_clf.fit(X_train, y_train)

    svm_clf_preds = svm_clf.predict(X_test).ravel()
    return svm_clf_preds

svm_clf_preds = get_svm_preds(X_train, X_test, y_train)
get_metrics(y_test, svm_clf_preds)


    accuracy_score: 85.74074074074073%
    balanced_accuracy_score: 50.0%
    average_precision_score: 14.25925925925926%
    brier_score_loss: 14.25925925925926%
    f1_score: 0.0%
    log_loss: 492.49736711261534%
    precision_score 0.0%
    recall_score: 0.0%
    confusion_matrix : [[1389    0]
 [ 231    0]]
    classification_report: 
                      precision    recall  f1-score   support

    had a stroke       0.86      1.00      0.92      1389
not had a stroke       0.00      0.00      0.00       231

        accuracy                           0.86      1620
       macro avg       0.43      0.50      0.46      1620
    weighted avg       0.74      0.86      0.79      1620

    


In [9]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

forrest_clf = RandomForestClassifier()
forrest_clf.fit(X_train, y_train)
forrest_clf_preds = forrest_clf.predict(X_test).ravel()
get_metrics(y_test, forrest_clf_preds)


    accuracy_score: 99.93827160493827%
    balanced_accuracy_score: 99.78354978354979%
    average_precision_score: 99.62882796216131%
    brier_score_loss: 0.06172839506172839%
    f1_score: 99.78308026030369%
    log_loss: 2.1320232342538454%
    precision_score 100.0%
    recall_score: 99.56709956709958%
    confusion_matrix : [[1389    0]
 [   1  230]]
    classification_report: 
                      precision    recall  f1-score   support

    had a stroke       1.00      1.00      1.00      1389
not had a stroke       1.00      1.00      1.00       231

        accuracy                           1.00      1620
       macro avg       1.00      1.00      1.00      1620
    weighted avg       1.00      1.00      1.00      1620

    


In [10]:
# Neural Network

from sklearn.neural_network import MLPClassifier

NN_clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
NN_clf.fit(X_train, y_train)
NN_clf_preds = NN_clf.predict(X_test).ravel()
get_metrics(y_test, NN_clf_preds)


    accuracy_score: 99.87654320987654%
    balanced_accuracy_score: 99.74755266331941%
    average_precision_score: 99.19780155710893%
    brier_score_loss: 0.12345679012345678%
    f1_score: 99.56709956709958%
    log_loss: 4.2640958263736515%
    precision_score 99.56709956709958%
    recall_score: 99.56709956709958%
    confusion_matrix : [[1388    1]
 [   1  230]]
    classification_report: 
                      precision    recall  f1-score   support

    had a stroke       1.00      1.00      1.00      1389
not had a stroke       1.00      1.00      1.00       231

        accuracy                           1.00      1620
       macro avg       1.00      1.00      1.00      1620
    weighted avg       1.00      1.00      1.00      1620

    


In [11]:
# Bayesian
from sklearn.naive_bayes import BernoulliNB

bern_NB_clf = BernoulliNB()
bern_NB_clf.fit(X_train, y_train)
bern_NB_clf_preds = bern_NB_clf.predict(X_test).ravel()
get_metrics(y_test, bern_NB_clf_preds)


    accuracy_score: 93.39506172839506%
    balanced_accuracy_score: 76.83982683982684%
    average_precision_score: 60.284591951258626%
    brier_score_loss: 6.604938271604937%
    f1_score: 69.85915492957747%
    log_loss: 228.12648606515089%
    precision_score 100.0%
    recall_score: 53.67965367965368%
    confusion_matrix : [[1389    0]
 [ 107  124]]
    classification_report: 
                      precision    recall  f1-score   support

    had a stroke       0.93      1.00      0.96      1389
not had a stroke       1.00      0.54      0.70       231

        accuracy                           0.93      1620
       macro avg       0.96      0.77      0.83      1620
    weighted avg       0.94      0.93      0.93      1620

    


In [12]:
# XGBoost
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)
xgb_clf_preds = xgb_clf.predict(X_test).ravel()
get_metrics(y_test, xgb_clf_preds)


    accuracy_score: 100.0%
    balanced_accuracy_score: 100.0%
    average_precision_score: 100.0%
    brier_score_loss: 0.0%
    f1_score: 100.0%
    log_loss: 9.992007221626413e-14%
    precision_score 100.0%
    recall_score: 100.0%
    confusion_matrix : [[1389    0]
 [   0  231]]
    classification_report: 
                      precision    recall  f1-score   support

    had a stroke       1.00      1.00      1.00      1389
not had a stroke       1.00      1.00      1.00       231

        accuracy                           1.00      1620
       macro avg       1.00      1.00      1.00      1620
    weighted avg       1.00      1.00      1.00      1620

    


In [13]:
from sklearn.linear_model import Perceptron
per_clf = Perceptron()
per_clf.fit(X_train, y_train)
per_clf_preds = per_clf.predict(X_test).ravel()

get_metrics(y_test, per_clf_preds)


    accuracy_score: 91.9753086419753%
    balanced_accuracy_score: 78.7186895178256%
    average_precision_score: 52.93364095623983%
    brier_score_loss: 8.024691358024691%
    f1_score: 68.13725490196079%
    log_loss: 277.1648960518974%
    precision_score 78.53107344632768%
    recall_score: 60.17316017316018%
    confusion_matrix : [[1351   38]
 [  92  139]]
    classification_report: 
                      precision    recall  f1-score   support

    had a stroke       0.94      0.97      0.95      1389
not had a stroke       0.79      0.60      0.68       231

        accuracy                           0.92      1620
       macro avg       0.86      0.79      0.82      1620
    weighted avg       0.91      0.92      0.92      1620

    
