In [1]:
# Code block imports and defines libraries used
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from imblearn.over_sampling import SMOTE

Commented out code below controls whether the models created are baseline or age specific

In [3]:
# Code Block cleans and pre-processes the dataset preparing it for training
df = pd.read_csv('stroke.csv')

df['gender'] = df['gender'].replace({'Male': 0, 'Female': 1, 'Other': -1}).astype(np.uint8)
df['Residence_type'] = df['Residence_type'].replace({'Rural': 0, 'Urban': 1}).astype(np.uint8)
df['work_type'] = df['work_type'].replace({'Private': 0, 'Self-employed': 1, 'Govt_job': 2, 'children': -1, 'Never_worked': -2}).astype(np.uint8)
df = pd.get_dummies(df, drop_first=True)

# Controls whether the models created will be for over 60s, under 60s or baseline
# Comment out both for baseline or 1 for the desired age group
df = df[df.age < 60]
# df = df[df.age >= 60]

X = df.drop('stroke', axis=1)
y = df['stroke']

imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

scaler = StandardScaler()
X = scaler.fit_transform(X)

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

  df['gender'] = df['gender'].replace({'Male': 0, 'Female': 1, 'Other': -1}).astype(np.uint8)
  df['Residence_type'] = df['Residence_type'].replace({'Rural': 0, 'Urban': 1}).astype(np.uint8)
  df['work_type'] = df['work_type'].replace({'Private': 0, 'Self-employed': 1, 'Govt_job': 2, 'children': -1, 'Never_worked': -2}).astype(np.uint8)


In [4]:
# Creates logistic regression model
model = LogisticRegression(max_iter=1000, random_state=42)
cv = StratifiedKFold(n_splits=5)

scores = cross_val_score(model, X_resampled, y_resampled, cv=cv, scoring='accuracy')
y_pred = model.fit(X_resampled, y_resampled).predict(X_resampled)

precision = precision_score(y_resampled, y_pred)
recall = recall_score(y_resampled, y_pred)
f1 = f1_score(y_resampled, y_pred)
roc_auc = roc_auc_score(y_resampled, y_pred)
conf_matrix = confusion_matrix(y_resampled, y_pred)


print("Logistic Regression")
print("LR Mean accuracy score:", np.mean(scores))
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)





Logistic Regression
LR Mean accuracy score: 0.774007054703244
Precision: 0.7472527472527473
Recall: 0.8346972176759411
F1 Score: 0.7885581754928489


In [5]:
# Creates K Nearest Neighbours model
model = KNeighborsClassifier(3)
cv = StratifiedKFold(n_splits=5)


scores = cross_val_score(model, X_resampled, y_resampled, cv=cv, scoring='accuracy')
y_pred = model.fit(X_resampled, y_resampled).predict(X_resampled)


precision = precision_score(y_resampled, y_pred)
recall = recall_score(y_resampled, y_pred)
f1 = f1_score(y_resampled, y_pred)
roc_auc = roc_auc_score(y_resampled, y_pred)
conf_matrix = confusion_matrix(y_resampled, y_pred)


print("KNeighborsClassifier")
print("KNN Mean accuracy score:", np.mean(scores))
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

KNeighborsClassifier
KNN Mean accuracy score: 0.9545826277235145
Precision: 0.9584096259482082
Recall: 0.9994544462629569
F1 Score: 0.9785018026438776


In [6]:
# Creates Random Forest model
model = RandomForestClassifier(max_depth=2, random_state=0)
cv = StratifiedKFold(n_splits=5)

scores = cross_val_score(model, X_resampled, y_resampled, cv=cv, scoring='accuracy')
y_pred = model.fit(X_resampled, y_resampled).predict(X_resampled)


precision = precision_score(y_resampled, y_pred)
recall = recall_score(y_resampled, y_pred)
f1 = f1_score(y_resampled, y_pred)
roc_auc = roc_auc_score(y_resampled, y_pred)
conf_matrix = confusion_matrix(y_resampled, y_pred)

print("Random Forest")
print("RF Mean accuracy score:", np.mean(scores))
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Random Forest
RF Mean accuracy score: 0.8278806782409927
Precision: 0.7821020741085994
Recall: 0.9154391707583197
F1 Score: 0.8435339952243308


In [7]:
# Creates Bernoulli Naive Bayes model
model = BernoulliNB()
cv = StratifiedKFold(n_splits=5)

scores = cross_val_score(model, X_resampled, y_resampled, cv=cv, scoring='accuracy')
y_pred = model.fit(X_resampled, y_resampled).predict(X_resampled)

precision = precision_score(y_resampled, y_pred)
recall = recall_score(y_resampled, y_pred)
f1 = f1_score(y_resampled, y_pred)
roc_auc = roc_auc_score(y_resampled, y_pred)
conf_matrix = confusion_matrix(y_resampled, y_pred)

print("BernoulliNB")
print("BNB Mean accuracy score:", np.mean(scores))
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

BernoulliNB
BNB Mean accuracy score: 0.7092204952799701
Precision: 0.6685058485985433
Recall: 0.8262411347517731
F1 Score: 0.7390508722703428


In [9]:
# Creates Stacked Ensemble Model
knn = KNeighborsClassifier(n_neighbors=3)
rf = RandomForestClassifier(max_depth=2, random_state=0)
nb = BernoulliNB()
meta_model = LogisticRegression(max_iter=1000, random_state=42)

stacking_model = StackingClassifier(
    estimators=[
        ('knn', knn),
        ('rf', rf),
        ('nb', nb)
    ],
    final_estimator=meta_model,
    cv=5
)


cv = StratifiedKFold(n_splits=5)
scores = cross_val_score(stacking_model, X_resampled, y_resampled, cv=cv, scoring='accuracy')
y_pred = stacking_model.fit(X_resampled, y_resampled).predict(X_resampled)


precision = precision_score(y_resampled, y_pred)
recall = recall_score(y_resampled, y_pred)
f1 = f1_score(y_resampled, y_pred)
roc_auc = roc_auc_score(y_resampled, y_pred)
conf_matrix = confusion_matrix(y_resampled, y_pred)


print("Stacked Ensemble")
print("Stacking Mean accuracy score:", np.mean(scores))
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Stacked Ensemble
Stacking Mean accuracy score: 0.9687675472491213
Precision: 0.9948285247686446
Recall: 0.9969994544462629
F1 Score: 0.9959128065395095
