# --------------------------------------Stroke_Detectors_Data_Modeling_&_Evaluation-----------------------------------------

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings

In [2]:
warnings.filterwarnings(action='ignore')#for Ignoring warnings if any generated
df = pd.read_csv('Preprocessed_Stroke_Detector_Dataset.csv')#Reading the Dataset
df.head()#Displaying first 5 rows of the Dataset

Unnamed: 0,gender,hypertension,heart_disease,work_type,Residence_type,smoking_status,avg_glucose_level,bmi,age,stroke
0,1,0,1,2,1,1,2.706375,1.066746,1.051434,1
1,0,0,0,3,0,2,2.121559,0.013363,0.78607,1
2,1,0,1,2,0,2,-0.005028,0.506346,1.62639,1
3,0,0,0,2,1,3,1.437358,0.766044,0.255342,1
4,0,1,0,3,0,2,1.501184,-0.655458,1.582163,1


# Model Selection and Evaluation

## Splitting the dataset into training and testing datasets

In [3]:
x=df.drop(['stroke'], axis=1)
y=df['stroke']
# Models
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
# Evaluation
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state= 80)

## models to be trained: decision tree, logistic regression, random forest, support vector machine, k nearest neighbor, naive bayes, and k means clustering.
## We will compare the results obtained from training these models and report the one with the best performance

In [4]:
from sklearn.model_selection import cross_validate
models = dict()
models['Decision Tree'] = DecisionTreeClassifier()
models['Logistic Regression'] = LogisticRegression()
models['Random Forest'] = RandomForestClassifier()
models['Support Vector Machine'] = SVC(kernel = 'sigmoid', gamma='scale')
models['K Nearest Neighbor'] = KNeighborsClassifier()
models['Naive Bayes'] = GaussianNB()
models['KMeans'] = KMeans(n_clusters=2, n_init=10, random_state=42)
scores2=[]
for model in models:
    models[model].fit(x_train, y_train)
    print(model + " model fitting completed.")
    scores = cross_validate(models[model], x_train, y_train, scoring = ['accuracy', 'precision','recall','f1'], cv = 10)
    # scores2.append(scores)
    df_scores = pd.DataFrame(scores, index = range(1,11))
    #df_scores
    print(df_scores.mean().round(2))
    scores2.append(df_scores.mean().round(2))

Decision Tree model fitting completed.
fit_time          0.03
score_time        0.02
test_accuracy     0.91
test_precision    0.14
test_recall       0.19
test_f1           0.16
dtype: float64
Logistic Regression model fitting completed.
fit_time          0.03
score_time        0.01
test_accuracy     0.95
test_precision    0.10
test_recall       0.01
test_f1           0.01
dtype: float64
Random Forest model fitting completed.
fit_time          1.08
score_time        0.05
test_accuracy     0.95
test_precision    0.12
test_recall       0.02
test_f1           0.04
dtype: float64
Support Vector Machine model fitting completed.
fit_time          0.31
score_time        0.05
test_accuracy     0.92
test_precision    0.13
test_recall       0.08
test_f1           0.10
dtype: float64
K Nearest Neighbor model fitting completed.
fit_time          0.02
score_time        0.07
test_accuracy     0.95
test_precision    0.18
test_recall       0.02
test_f1           0.03
dtype: float64
Naive Bayes model fi

## Below, we will  test each model and generate confusion matrix for each model to see the performance and evaluate how the prediction has worked for each model

In [5]:
print("Classification Report:\n")
conf_matrix=[]
for x in models:
    print('*'*20+x+'*'*20)
    model = models[x]
    y_pred = model.predict(x_test)
    arg_test = {'y_true':y_test, 'y_pred':y_pred}
    conf_matrix.append(confusion_matrix(**arg_test))
    print(classification_report(**arg_test))

Classification Report:

********************Decision Tree********************
              precision    recall  f1-score   support

           0       0.96      0.95      0.96      1218
           1       0.20      0.23      0.21        60

    accuracy                           0.92      1278
   macro avg       0.58      0.59      0.59      1278
weighted avg       0.93      0.92      0.92      1278

********************Logistic Regression********************
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      1218
           1       0.00      0.00      0.00        60

    accuracy                           0.95      1278
   macro avg       0.48      0.50      0.49      1278
weighted avg       0.91      0.95      0.93      1278

********************Random Forest********************
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1218
           1       0.00      0.00      0.00  

## Confusion Matrix for Each Model:

In [6]:
model_names=["Decision Tree Model","Logistic Regression Model","Random Forest Model","Support Vector Machine Model","K Nearest Neighbor Model","Naive Bayes Model","KMeans"]
for i in range(7):
    print("Confusion Matrix for ",model_names[i],":\n", conf_matrix[i])

Confusion Matrix for  Decision Tree Model :
 [[1161   57]
 [  46   14]]
Confusion Matrix for  Logistic Regression Model :
 [[1217    1]
 [  60    0]]
Confusion Matrix for  Random Forest Model :
 [[1215    3]
 [  60    0]]
Confusion Matrix for  Support Vector Machine Model :
 [[1177   41]
 [  58    2]]
Confusion Matrix for  K Nearest Neighbor Model :
 [[1210    8]
 [  59    1]]
Confusion Matrix for  Naive Bayes Model :
 [[1103  115]
 [  43   17]]
Confusion Matrix for  KMeans :
 [[970 248]
 [ 60   0]]


# Calculating four additional evaluation metrics which are not implemented using standard libraries.(Extra Credit Work)
## Calculating F1-Score for each model:

In [7]:
# F1_Score=2*(Precision*Recall)/(Precision+Recall)
for i in range(7):
    F1_Score=2*((scores2[i][3]*scores2[i][4])/(scores2[i][3]+scores2[i][4]))
    print("F1 Score for Model ",model_names[i],"=",F1_Score.round(2))

F1 Score for Model  Decision Tree Model = 0.16
F1 Score for Model  Logistic Regression Model = 0.02
F1 Score for Model  Random Forest Model = 0.03
F1 Score for Model  Support Vector Machine Model = 0.1
F1 Score for Model  K Nearest Neighbor Model = 0.04
F1 Score for Model  Naive Bayes Model = 0.25
F1 Score for Model  KMeans = 0.04


## Calculating Critical Success Index for each model:

In [8]:
# CSI=Hits/(Hits+Misses+FalseAlarms)
for i in range(7):
    CSI=(conf_matrix[i][0][0])/(conf_matrix[i][0][0]+conf_matrix[i][1][0]+conf_matrix[i][0][1])
    print("Critical Success Index for Model ",model_names[i],"= ",CSI.round(2))

Critical Success Index for Model  Decision Tree Model =  0.92
Critical Success Index for Model  Logistic Regression Model =  0.95
Critical Success Index for Model  Random Forest Model =  0.95
Critical Success Index for Model  Support Vector Machine Model =  0.92
Critical Success Index for Model  K Nearest Neighbor Model =  0.95
Critical Success Index for Model  Naive Bayes Model =  0.87
Critical Success Index for Model  KMeans =  0.76


## Calculating False Alarm Ratio for each model:

In [9]:
# FAR=FalseAlarms/(Hits+FalseAlarms)
for i in range(7):
    FAR=(conf_matrix[i][0][1])/(conf_matrix[i][0][0]+conf_matrix[i][0][1])
    print("False Alarm Ratio for Model ",model_names[i],"= ",FAR.round(2))

False Alarm Ratio for Model  Decision Tree Model =  0.05
False Alarm Ratio for Model  Logistic Regression Model =  0.0
False Alarm Ratio for Model  Random Forest Model =  0.0
False Alarm Ratio for Model  Support Vector Machine Model =  0.03
False Alarm Ratio for Model  K Nearest Neighbor Model =  0.01
False Alarm Ratio for Model  Naive Bayes Model =  0.09
False Alarm Ratio for Model  KMeans =  0.2


## BIAS Score Measure Ratio for Each Model:

In [10]:
# BIAS=(Hits+FalseAlarms)/(Hits+Misses)
for i in range(7):
    BIAS=(conf_matrix[i][0][0]+conf_matrix[i][0][1])/(conf_matrix[i][0][0]+conf_matrix[i][1][0])
    print("BIAS Score Measure Ratio for Model ",model_names[i],"= ",BIAS.round(2))

BIAS Score Measure Ratio for Model  Decision Tree Model =  1.01
BIAS Score Measure Ratio for Model  Logistic Regression Model =  0.95
BIAS Score Measure Ratio for Model  Random Forest Model =  0.96
BIAS Score Measure Ratio for Model  Support Vector Machine Model =  0.99
BIAS Score Measure Ratio for Model  K Nearest Neighbor Model =  0.96
BIAS Score Measure Ratio for Model  Naive Bayes Model =  1.06
BIAS Score Measure Ratio for Model  KMeans =  1.18


## Summarizing the Accuracy Scores for Each Model:

In [11]:
print('Summary of Accuracy Score\n\n')
for i in models:
    model = models[i]
    print(i,' Model: ',accuracy_score(y_test, model.predict(x_test)).round(2)*100,"%")

Summary of Accuracy Score


Decision Tree  Model:  92.0 %
Logistic Regression  Model:  95.0 %
Random Forest  Model:  95.0 %
Support Vector Machine  Model:  92.0 %
K Nearest Neighbor  Model:  95.0 %
Naive Bayes  Model:  88.0 %
KMeans  Model:  76.0 %


## Accuracy scores of the models indicate that logistic regression (95%), random forest (95%), and KNN (95%) have the highest accuracy scores. We can choose any of these models as our final model as they have a very good accuracy score. However, for this project we choose Logistic Regression and moving forward.

## Training and Cross Validating the Accuracy of Logistic Regression Model for cross validate value as 10:

In [12]:
from sklearn.model_selection import cross_val_score

lgr=LogisticRegression()
scores = cross_val_score(lgr, x_train, y_train, cv = 10, scoring='accuracy')

print('Cross-validation scores:',scores.round(3)*100)
print("Average cross-validation score:",scores.mean().round(3)*100,"%")

Cross-validation scores: [95.1 95.1 95.3 95.  95.3 95.  95.  95.  95.  95. ]
Average cross-validation score: 95.1 %


## for cross validation and resampling, we use mean cross validation. Cross validation indicates how our model will perform in the wild, and based on the cross validation scores obtained for this, our Logistic Regression model will perform with 95.1% accurately on average

## Performing Hyper-Parameter Optimization(Extra Credits):-

## 1) Using Grid Search

### a) For Logistic Regression:

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
model = LogisticRegression()
# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1)
# define search space
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
space['C'] = [1,2,3,4]
# define search
search = GridSearchCV(model, space, scoring='accuracy', n_jobs=1, cv=cv)
# execute search
result = search.fit(x_train,y_train)
# summarize result
print('Best Accuracy Score: ',result.best_score_.round(2)*100,"%")
print('Best Hyperparameters: ',result.best_params_)

Best Accuracy Score:  95.0 %
Best Hyperparameters:  {'C': 1, 'penalty': 'none', 'solver': 'newton-cg'}


### b) For Support Vector Machine:

In [14]:
parameters = {'kernel':('linear', 'rbf'), 'C':[1,2,3,4]}
svc = SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(x_train,y_train)
print('Best Accuracy Score: ',clf.best_score_.round(2)*100,"%")
print('Best Hyperparameters: ',clf.best_params_)

Best Accuracy Score:  95.0 %
Best Hyperparameters:  {'C': 1, 'kernel': 'linear'}


### c) For K Nearest Neighbor:

In [15]:
rf_params = {
    'n_neighbors': [1,2,3,4],
}
clf = KNeighborsClassifier()
grid = GridSearchCV(clf, rf_params, cv=10, scoring='accuracy')
grid.fit(x_train, y_train)
print('Best Accuracy Score: ',grid.best_score_.round(2)*100,"%")
print('Best Hyperparameters: ',grid.best_params_)

Best Accuracy Score:  95.0 %
Best Hyperparameters:  {'n_neighbors': 4}


In [17]:
!pip install scikit-optimize



You should consider upgrading via the 'C:\Users\khatr\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip' command.


## 2) Using BayesSearch

In [16]:
from skopt import BayesSearchCV 
from skopt.space import Integer
# Define the hyperparameter configuration space
rf_params = {
    'n_estimators': Integer(10,100),
    'max_depth': Integer(5,50),
    "min_samples_split":Integer(2,11),
    "min_samples_leaf":Integer(1,11),
    "criterion":['gini','entropy']
}
clf = RandomForestClassifier(random_state=80)
Bayes = BayesSearchCV(clf, rf_params,cv=10,n_iter=1, n_jobs=1,scoring='accuracy')
#number of iterations is set to 20, you can increase this number if time permits
Bayes.fit(x_train,y_train)
print("Best Estimator: ",Bayes.best_estimator_)
print('Best Accuracy Score: ',Bayes.best_score_.round(2)*100,"%")
print('Best Hyperparameters: ',Bayes.best_params_)

Best Estimator:  RandomForestClassifier(criterion='entropy', max_depth=16, min_samples_leaf=11,
                       n_estimators=47, random_state=80)
Best Accuracy Score:  95.0 %
Best Hyperparameters:  OrderedDict([('criterion', 'entropy'), ('max_depth', 16), ('min_samples_leaf', 11), ('min_samples_split', 2), ('n_estimators', 47)])
