# Importing the required files

In [1]:
import os
import pandas as pd
import numpy as np
import joblib
import warnings
import pickle
warnings.filterwarnings('ignore')
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Loading the stroke dataset from a local source

In [2]:
df = pd.read_csv(os.path.join('..','data','healthcare-dataset-stroke-data.csv'))
df = df.dropna()
df = df.drop('id',axis=1)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.00,0,1,Yes,Private,Urban,228.69,36.60,formerly smoked,1
2,Male,80.00,0,1,Yes,Private,Rural,105.92,32.50,never smoked,1
3,Female,49.00,0,0,Yes,Private,Urban,171.23,34.40,smokes,1
4,Female,79.00,1,0,Yes,Self-employed,Rural,174.12,24.00,never smoked,1
5,Male,81.00,0,0,Yes,Private,Urban,186.21,29.00,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,13.00,0,0,No,children,Rural,103.08,18.60,Unknown,0
5106,Female,81.00,0,0,Yes,Self-employed,Urban,125.20,40.00,never smoked,0
5107,Female,35.00,0,0,Yes,Self-employed,Rural,82.99,30.60,never smoked,0
5108,Male,51.00,0,0,Yes,Private,Rural,166.29,25.60,formerly smoked,0


# Used a label encodeder to give categorical features numerical values

In [3]:
le = LabelEncoder()
list_of_cat_feat = ['gender','ever_married','work_type','Residence_type','smoking_status']
for i in list_of_cat_feat:
    df[f'{i}_encoded'] = le.fit_transform(df[f"{i}"])
    
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,gender_encoded,ever_married_encoded,work_type_encoded,Residence_type_encoded,smoking_status_encoded
0,Male,67.00,0,1,Yes,Private,Urban,228.69,36.60,formerly smoked,1,1,1,2,1,1
2,Male,80.00,0,1,Yes,Private,Rural,105.92,32.50,never smoked,1,1,1,2,0,2
3,Female,49.00,0,0,Yes,Private,Urban,171.23,34.40,smokes,1,0,1,2,1,3
4,Female,79.00,1,0,Yes,Self-employed,Rural,174.12,24.00,never smoked,1,0,1,3,0,2
5,Male,81.00,0,0,Yes,Private,Urban,186.21,29.00,formerly smoked,1,1,1,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,13.00,0,0,No,children,Rural,103.08,18.60,Unknown,0,0,0,4,0,0
5106,Female,81.00,0,0,Yes,Self-employed,Urban,125.20,40.00,never smoked,0,0,1,3,1,2
5107,Female,35.00,0,0,Yes,Self-employed,Rural,82.99,30.60,never smoked,0,0,1,3,0,2
5108,Male,51.00,0,0,Yes,Private,Rural,166.29,25.60,formerly smoked,0,1,1,2,0,1


# Droped the categorical values

In [4]:
df_cat = df.drop(list_of_cat_feat, axis=1)
df_cat

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_encoded,ever_married_encoded,work_type_encoded,Residence_type_encoded,smoking_status_encoded
0,67.00,0,1,228.69,36.60,1,1,1,2,1,1
2,80.00,0,1,105.92,32.50,1,1,1,2,0,2
3,49.00,0,0,171.23,34.40,1,0,1,2,1,3
4,79.00,1,0,174.12,24.00,1,0,1,3,0,2
5,81.00,0,0,186.21,29.00,1,1,1,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,13.00,0,0,103.08,18.60,0,0,0,4,0,0
5106,81.00,0,0,125.20,40.00,0,0,1,3,1,2
5107,35.00,0,0,82.99,30.60,0,0,1,3,0,2
5108,51.00,0,0,166.29,25.60,0,1,1,2,0,1


# Split the data up into the features (X) and the target (y)

In [5]:
X = df_cat.drop('stroke',axis=1)
y = df_cat.stroke.values.reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
y_train, y_test = y_train.ravel(), y_test.ravel()

# Used Lazy Classifier to test out various binary classification machine learning models and sorted them by accuracy and F1 score

In [6]:
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
models = models.reset_index()
models = models.sort_values(['Accuracy','F1 Score'], ascending=False)
models

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:03<00:00,  8.54it/s]


Unnamed: 0,Model,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
8,BaggingClassifier,0.95,0.54,0.54,0.93,0.07
13,RandomForestClassifier,0.95,0.51,0.51,0.93,0.36
17,RidgeClassifierCV,0.95,0.5,0.5,0.93,0.02
18,SGDClassifier,0.95,0.5,0.5,0.93,0.02
19,SVC,0.95,0.5,0.5,0.93,0.14
20,LinearSVC,0.95,0.5,0.5,0.93,0.12
21,RidgeClassifier,0.95,0.5,0.5,0.93,0.02
22,LogisticRegression,0.95,0.5,0.5,0.93,0.03
23,CalibratedClassifierCV,0.95,0.5,0.5,0.93,0.42
24,AdaBoostClassifier,0.95,0.5,0.5,0.93,0.18


# Build a Sci-Kit pipeline with a standard scaler and using the random forest classifier as a model.
# Used Gridsearch with cross validation to find the most optimal model for this classifier
# Explored the results in a DataFrame

In [7]:
pipe = Pipeline([
    ('scaler',StandardScaler()),
    ('model',RandomForestClassifier())
    ])

mod = GridSearchCV(
                estimator=pipe,
                param_grid={
                    'model__n_estimators':np.arange(1,50,1),
                    'model__max_depth':np.arange(2,15),
                    'model__max_features':['auto', 'sqrt'],
                    'model__min_samples_split':np.arange(2,20),
                    'model__bootstrap':[True, False]
                },
                cv = 5, 
                verbose = 1,
                n_jobs = -1)
mod.fit(X,y)

pd.DataFrame(mod.cv_results_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 45864 candidates, totalling 229320 fits


[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 668 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done 1668 tasks      | elapsed:   36.8s
[Parallel(n_jobs=-1)]: Done 3068 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 4868 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 7068 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 9668 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 12668 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 16068 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 19868 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 24068 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 28668 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 33668 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 39068 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done 44868 tasks      | elapsed: 16.9min
[Parallel(n_jobs=-1)]: Done 51068 tasks      | el

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__bootstrap,param_model__max_depth,param_model__max_features,param_model__min_samples_split,param_model__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.01,0.00,0.00,0.00,True,2,auto,2,1,"{'model__bootstrap': True, 'model__max_depth':...",0.96,0.96,0.96,0.96,0.96,0.96,0.00,25168
1,0.02,0.00,0.00,0.00,True,2,auto,2,2,"{'model__bootstrap': True, 'model__max_depth':...",0.96,0.96,0.96,0.96,0.96,0.96,0.00,1907
2,0.02,0.00,0.00,0.00,True,2,auto,2,3,"{'model__bootstrap': True, 'model__max_depth':...",0.96,0.96,0.96,0.96,0.96,0.96,0.00,1907
3,0.02,0.00,0.00,0.00,True,2,auto,2,4,"{'model__bootstrap': True, 'model__max_depth':...",0.96,0.96,0.96,0.96,0.96,0.96,0.00,1907
4,0.02,0.00,0.00,0.00,True,2,auto,2,5,"{'model__bootstrap': True, 'model__max_depth':...",0.96,0.96,0.96,0.96,0.96,0.96,0.00,1907
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45859,0.21,0.00,0.01,0.00,False,14,sqrt,19,45,"{'model__bootstrap': False, 'model__max_depth'...",0.96,0.96,0.96,0.96,0.96,0.96,0.00,19026
45860,0.22,0.00,0.01,0.00,False,14,sqrt,19,46,"{'model__bootstrap': False, 'model__max_depth'...",0.96,0.96,0.96,0.96,0.96,0.96,0.00,31431
45861,0.22,0.00,0.01,0.00,False,14,sqrt,19,47,"{'model__bootstrap': False, 'model__max_depth'...",0.96,0.96,0.96,0.95,0.96,0.96,0.00,34577
45862,0.23,0.01,0.01,0.00,False,14,sqrt,19,48,"{'model__bootstrap': False, 'model__max_depth'...",0.96,0.96,0.96,0.96,0.96,0.96,0.00,31431


# Finding the best model and viewing its accuracy as well as saving the model as a pkl file

In [8]:
best_etimator = mod.best_estimator_
print(best_etimator)
preds = best_etimator.predict(X_test)
print(accuracy_score(y_test, preds))

file_path = os.path.join('..','models','random_forest_clf_cat.pkl')
joblib.dump(mod.best_estimator_, file_path, compress = 1)

Pipeline(steps=[('scaler', StandardScaler()),
                ('model',
                 RandomForestClassifier(max_depth=11, max_features='sqrt',
                                        min_samples_split=5,
                                        n_estimators=25))])
0.9654320987654321


['..\\models\\random_forest_clf_cat.pkl']