In [1]:
import os
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(os.path.join('..','data','healthcare-dataset-stroke-data.csv'))
df = df.dropna()

In [3]:
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [4]:
df_dum = pd.get_dummies(df)
df_dum

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,67.0,0,1,228.69,36.6,1,0,1,0,...,0,1,0,0,0,1,0,1,0,0
2,31112,80.0,0,1,105.92,32.5,1,0,1,0,...,0,1,0,0,1,0,0,0,1,0
3,60182,49.0,0,0,171.23,34.4,1,1,0,0,...,0,1,0,0,0,1,0,0,0,1
4,1665,79.0,1,0,174.12,24.0,1,1,0,0,...,0,0,1,0,1,0,0,0,1,0
5,56669,81.0,0,0,186.21,29.0,1,0,1,0,...,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,13.0,0,0,103.08,18.6,0,1,0,0,...,0,0,0,1,1,0,1,0,0,0
5106,44873,81.0,0,0,125.20,40.0,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0
5107,19723,35.0,0,0,82.99,30.6,0,1,0,0,...,0,0,1,0,1,0,0,0,1,0
5108,37544,51.0,0,0,166.29,25.6,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0


In [5]:
from sklearn.model_selection import train_test_split
X = df_dum.drop('smoking_status_smokes',axis=1)
y = df_dum.smoking_status_smokes.values.reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
y_train, y_test = y_train.ravel(), y_test.ravel()

In [6]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, average_precision_score, brier_score_loss, f1_score, log_loss, precision_score, recall_score

def get_metrics(y_test, preds):
    accu_score = accuracy_score(y_test, preds)
    balance_score = balanced_accuracy_score(y_test, preds)
    avg_precision_score = average_precision_score(y_test, preds)
    brier_sl = brier_score_loss(y_test, preds)
    f1_score_ = f1_score(y_test, preds)
    log_loss_ = log_loss(y_test, preds)
    prec_score = precision_score(y_test, preds)
    rec_score = recall_score(y_test, preds)
    
    scores_string = f'''
    accuracy_score: {accu_score *100}%
    balanced_accuracy_score: {balance_score*100}%
    average_precision_score: {avg_precision_score*100}%
    brier_score_loss: {brier_sl*100}%
    f1_score: {f1_score_*100}%
    log_loss: {log_loss_*100}%
    precision_score {prec_score*100}%
    recall_score: {rec_score*100}%
    '''
    
    print(scores_string)

In [7]:
# SVM
from sklearn.svm import SVC
svm_clf = SVC()
svm_clf.fit(X_train, y_train)

svm_clf_preds = svm_clf.predict(X_test).ravel()

get_metrics(y_test, svm_clf_preds)


    accuracy_score: 85.74074074074073%
    balanced_accuracy_score: 50.0%
    average_precision_score: 14.25925925925926%
    brier_score_loss: 14.25925925925926%
    f1_score: 0.0%
    log_loss: 492.49736711261534%
    precision_score 0.0%
    recall_score: 0.0%
    


In [8]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

forrest_clf = RandomForestClassifier()
forrest_clf.fit(X_train, y_train)
forrest_clf_preds = forrest_clf.predict(X_test).ravel()
get_metrics(y_test, forrest_clf_preds)


    accuracy_score: 99.93827160493827%
    balanced_accuracy_score: 99.78354978354979%
    average_precision_score: 99.62882796216131%
    brier_score_loss: 0.06172839506172839%
    f1_score: 99.78308026030369%
    log_loss: 2.1320232342538454%
    precision_score 100.0%
    recall_score: 99.56709956709958%
    


In [9]:
# Neural Network

from sklearn.neural_network import MLPClassifier

NN_clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
NN_clf.fit(X_train, y_train)
NN_clf_preds = NN_clf.predict(X_test).ravel()
get_metrics(y_test, NN_clf_preds)


    accuracy_score: 85.74074074074073%
    balanced_accuracy_score: 50.0%
    average_precision_score: 14.25925925925926%
    brier_score_loss: 14.25925925925926%
    f1_score: 0.0%
    log_loss: 492.49736711261534%
    precision_score 0.0%
    recall_score: 0.0%
    


In [10]:
# Bayesian
from sklearn.naive_bayes import BernoulliNB

bern_NB_clf = BernoulliNB()
bern_NB_clf.fit(X_train, y_train)
bern_NB_clf_preds = bern_NB_clf.predict(X_test).ravel()
get_metrics(y_test, bern_NB_clf_preds)


    accuracy_score: 93.39506172839506%
    balanced_accuracy_score: 76.83982683982684%
    average_precision_score: 60.284591951258626%
    brier_score_loss: 6.604938271604937%
    f1_score: 69.85915492957747%
    log_loss: 228.12648606515089%
    precision_score 100.0%
    recall_score: 53.67965367965368%
    


In [11]:
# XGBoost
from xgboost import XGBClassifier

xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)
xgb_clf_preds = xgb_clf.predict(X_test).ravel()
get_metrics(y_test, bern_NB_clf_preds)


    accuracy_score: 93.39506172839506%
    balanced_accuracy_score: 76.83982683982684%
    average_precision_score: 60.284591951258626%
    brier_score_loss: 6.604938271604937%
    f1_score: 69.85915492957747%
    log_loss: 228.12648606515089%
    precision_score 100.0%
    recall_score: 53.67965367965368%
    
