In [1]:
import os
import pandas as pd
import numpy as np
import joblib
import warnings
import pickle
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(os.path.join('..','data','healthcare-dataset-stroke-data.csv'))
df = df.dropna()
df = df.drop('id',axis=1)

In [3]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [4]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
list_of_cat_feat = ['gender','ever_married','work_type','Residence_type','smoking_status']
for i in list_of_cat_feat:
    df[f'{i}_encoded'] = le.fit_transform(df[f"{i}"])
    
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,gender_encoded,ever_married_encoded,work_type_encoded,Residence_type_encoded,smoking_status_encoded
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,1,1,2,1,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,1,1,2,0,2
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,0,1,2,1,3
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,0,1,3,0,2
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1,1,1,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0,0,0,4,0,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0,0,1,3,1,2
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0,0,1,3,0,2
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0,1,1,2,0,1


In [5]:
df_cat = df.drop(list_of_cat_feat, axis=1)
df_cat

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_encoded,ever_married_encoded,work_type_encoded,Residence_type_encoded,smoking_status_encoded
0,67.0,0,1,228.69,36.6,1,1,1,2,1,1
2,80.0,0,1,105.92,32.5,1,1,1,2,0,2
3,49.0,0,0,171.23,34.4,1,0,1,2,1,3
4,79.0,1,0,174.12,24.0,1,0,1,3,0,2
5,81.0,0,0,186.21,29.0,1,1,1,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,13.0,0,0,103.08,18.6,0,0,0,4,0,0
5106,81.0,0,0,125.20,40.0,0,0,1,3,1,2
5107,35.0,0,0,82.99,30.6,0,0,1,3,0,2
5108,51.0,0,0,166.29,25.6,0,1,1,2,0,1


In [6]:
from sklearn.model_selection import train_test_split
X = df_cat.drop('stroke',axis=1)
y = df_cat.stroke.values.reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
y_train, y_test = y_train.ravel(), y_test.ravel()

In [7]:
%%time

from lazypredict.Supervised import LazyClassifier

clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
models = models.reset_index()
models = models.sort_values(['Accuracy','F1 Score'], ascending=False)
models.to_csv(os.path.join('..','models','models_dum.csv'), index=False)
models

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:03<00:00,  7.66it/s]

Wall time: 5.92 s





Unnamed: 0,Model,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
8,BaggingClassifier,0.95,0.54,0.54,0.93,0.1
13,RandomForestClassifier,0.95,0.51,0.51,0.93,0.38
17,RidgeClassifierCV,0.95,0.5,0.5,0.93,0.02
18,SGDClassifier,0.95,0.5,0.5,0.93,0.03
19,SVC,0.95,0.5,0.5,0.93,0.14
20,LinearSVC,0.95,0.5,0.5,0.93,0.12
21,RidgeClassifier,0.95,0.5,0.5,0.93,0.02
22,LogisticRegression,0.95,0.5,0.5,0.93,0.03
23,CalibratedClassifierCV,0.95,0.5,0.5,0.93,0.46
24,AdaBoostClassifier,0.95,0.5,0.5,0.93,0.2


In [8]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([
    ('scaler',StandardScaler()),
    ('model',RandomForestClassifier())
    ])

mod = GridSearchCV(
                estimator=pipe,
                param_grid={
                     'model__n_estimators':np.arange(1,50,1),
                     'model__max_depth':np.arange(2,15)
                },
                cv = 5, 
                verbose = 1,
                n_jobs = -1)
mod.fit(X,y)

pd.DataFrame(mod.cv_results_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 637 candidates, totalling 3185 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 656 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 1656 tasks      | elapsed:   48.7s
[Parallel(n_jobs=-1)]: Done 3056 tasks      | elapsed:  1.6min


Wall time: 1min 41s


[Parallel(n_jobs=-1)]: Done 3185 out of 3185 | elapsed:  1.7min finished


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_depth,param_model__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.02,0.00,0.00,0.00,2,1,"{'model__max_depth': 2, 'model__n_estimators': 1}",0.96,0.96,0.96,0.96,0.96,0.96,0.00,25
1,0.04,0.02,0.01,0.02,2,2,"{'model__max_depth': 2, 'model__n_estimators': 2}",0.96,0.96,0.96,0.96,0.96,0.96,0.00,25
2,0.02,0.00,0.01,0.00,2,3,"{'model__max_depth': 2, 'model__n_estimators': 3}",0.96,0.96,0.96,0.96,0.96,0.96,0.00,25
3,0.03,0.00,0.00,0.00,2,4,"{'model__max_depth': 2, 'model__n_estimators': 4}",0.96,0.96,0.96,0.96,0.96,0.96,0.00,25
4,0.02,0.00,0.00,0.00,2,5,"{'model__max_depth': 2, 'model__n_estimators': 5}",0.96,0.96,0.96,0.96,0.96,0.96,0.00,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
632,0.21,0.00,0.01,0.00,14,45,"{'model__max_depth': 14, 'model__n_estimators'...",0.96,0.96,0.96,0.96,0.96,0.96,0.00,423
633,0.21,0.01,0.02,0.00,14,46,"{'model__max_depth': 14, 'model__n_estimators'...",0.96,0.96,0.96,0.96,0.96,0.96,0.00,232
634,0.22,0.01,0.01,0.00,14,47,"{'model__max_depth': 14, 'model__n_estimators'...",0.95,0.96,0.96,0.96,0.96,0.96,0.00,350
635,0.22,0.01,0.01,0.00,14,48,"{'model__max_depth': 14, 'model__n_estimators'...",0.96,0.96,0.96,0.96,0.96,0.96,0.00,446


In [9]:
from sklearn.metrics import accuracy_score
best_etimator = mod.best_estimator_
print(best_etimator)
preds = best_etimator.predict(X_test)
print(accuracy_score(y_test, preds))

import joblib
file_path = os.path.join('..','models','random_forest_clf_cat.pkl')
joblib.dump(mod.best_estimator_, file_path, compress = 1)

Pipeline(steps=[('scaler', StandardScaler()),
                ('model',
                 RandomForestClassifier(max_depth=8, n_estimators=11))])
0.9567901234567902


['..\\models\\random_forest_clf_cat.pkl']

In [10]:
loaded_model = joblib.load(file_path)
result = loaded_model.score(X_test, y_test)
print(result)

0.9567901234567902


## Old dummy method not ideal for Streamlit app

In [11]:
df_dum = pd.get_dummies(df)
df_dum

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_encoded,ever_married_encoded,work_type_encoded,Residence_type_encoded,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.00,0,1,228.69,36.60,1,1,1,2,1,...,0,1,0,0,0,1,0,1,0,0
2,80.00,0,1,105.92,32.50,1,1,1,2,0,...,0,1,0,0,1,0,0,0,1,0
3,49.00,0,0,171.23,34.40,1,0,1,2,1,...,0,1,0,0,0,1,0,0,0,1
4,79.00,1,0,174.12,24.00,1,0,1,3,0,...,0,0,1,0,1,0,0,0,1,0
5,81.00,0,0,186.21,29.00,1,1,1,2,1,...,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,13.00,0,0,103.08,18.60,0,0,0,4,0,...,0,0,0,1,1,0,1,0,0,0
5106,81.00,0,0,125.20,40.00,0,0,1,3,1,...,0,0,1,0,0,1,0,0,1,0
5107,35.00,0,0,82.99,30.60,0,0,1,3,0,...,0,0,1,0,1,0,0,0,1,0
5108,51.00,0,0,166.29,25.60,0,1,1,2,0,...,0,1,0,0,1,0,0,1,0,0


In [12]:
from sklearn.model_selection import train_test_split
X = df_dum.drop('stroke',axis=1)
y = df_dum.stroke.values.reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
y_train, y_test = y_train.ravel(), y_test.ravel()

In [13]:
%%time

from lazypredict.Supervised import LazyClassifier

clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
models = models.reset_index()
models = models.sort_values(['Accuracy','F1 Score'], ascending=False)
models.to_csv(os.path.join('..','models','models_dum.csv'), index=False)
models

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:05<00:00,  5.27it/s]

Wall time: 5.54 s





Unnamed: 0,Model,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
16,RidgeClassifier,0.95,0.5,0.5,0.93,0.03
17,RidgeClassifierCV,0.95,0.5,0.5,0.93,0.03
18,SVC,0.95,0.5,0.5,0.93,0.18
19,AdaBoostClassifier,0.95,0.5,0.5,0.93,0.26
20,LogisticRegression,0.95,0.5,0.5,0.93,0.04
21,CalibratedClassifierCV,0.95,0.5,0.5,0.93,0.85
22,LinearSVC,0.95,0.5,0.5,0.93,0.22
9,LGBMClassifier,0.95,0.54,0.54,0.93,0.14
12,BaggingClassifier,0.95,0.52,0.52,0.93,0.11
13,KNeighborsClassifier,0.95,0.52,0.52,0.93,0.31


In [14]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([
    ('scaler',StandardScaler()),
    ('model',RandomForestClassifier())
    ])

mod = GridSearchCV(
                estimator=pipe,
                param_grid={
                     'model__n_estimators':np.arange(1,50,1),
                     'model__max_depth':np.arange(2,15)
                },
                cv = 5, 
                verbose = 1,
                n_jobs = -1)
mod.fit(X,y)

pd.DataFrame(mod.cv_results_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 637 candidates, totalling 3185 fits


[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 728 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done 1728 tasks      | elapsed:   50.0s
[Parallel(n_jobs=-1)]: Done 2832 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 3178 out of 3185 | elapsed:  1.8min remaining:    0.1s


Wall time: 1min 48s


[Parallel(n_jobs=-1)]: Done 3185 out of 3185 | elapsed:  1.8min finished


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_depth,param_model__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.03,0.00,0.00,0.00,2,1,"{'model__max_depth': 2, 'model__n_estimators': 1}",0.96,0.96,0.96,0.96,0.96,0.96,0.00,56
1,0.04,0.02,0.00,0.00,2,2,"{'model__max_depth': 2, 'model__n_estimators': 2}",0.96,0.96,0.96,0.96,0.96,0.96,0.00,56
2,0.03,0.01,0.01,0.00,2,3,"{'model__max_depth': 2, 'model__n_estimators': 3}",0.96,0.96,0.96,0.96,0.96,0.96,0.00,56
3,0.03,0.00,0.00,0.00,2,4,"{'model__max_depth': 2, 'model__n_estimators': 4}",0.96,0.96,0.96,0.96,0.96,0.96,0.00,56
4,0.03,0.00,0.00,0.00,2,5,"{'model__max_depth': 2, 'model__n_estimators': 5}",0.96,0.96,0.96,0.96,0.96,0.96,0.00,56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
632,0.26,0.07,0.02,0.00,14,45,"{'model__max_depth': 14, 'model__n_estimators'...",0.96,0.96,0.96,0.96,0.96,0.96,0.00,294
633,0.24,0.00,0.01,0.00,14,46,"{'model__max_depth': 14, 'model__n_estimators'...",0.95,0.96,0.96,0.96,0.96,0.96,0.00,419
634,0.22,0.01,0.02,0.00,14,47,"{'model__max_depth': 14, 'model__n_estimators'...",0.96,0.96,0.96,0.96,0.96,0.96,0.00,481
635,0.22,0.01,0.02,0.00,14,48,"{'model__max_depth': 14, 'model__n_estimators'...",0.96,0.96,0.96,0.95,0.96,0.96,0.00,486


In [15]:
mod.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('model', RandomForestClassifier(max_depth=8, n_estimators=8))])

In [16]:
from sklearn.metrics import accuracy_score
best_etimator = mod.best_estimator_
preds = best_etimator.predict(X_test)
accuracy_score(y_test, preds)

0.9574074074074074

In [17]:
import joblib
joblib.dump(mod.best_estimator_, os.path.join('..','models','random_forest_clf_dum.pkl'), compress = 1)

['..\\models\\random_forest_clf_dum.pkl']

In [18]:
loaded_model = joblib.load(os.path.join('..','models','random_forest_clf_dum.pkl'))
result = loaded_model.score(X_test, y_test)
print(result)

0.9574074074074074
