## Import libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from numpy import mean
from numpy import std
import time
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE, SMOTENC

## Read and prepare data

In [2]:
df = pd.read_csv("./stroke-data.csv") # Read CSV
df.drop(['id', 'smoking_status'], axis=1, inplace=True) # Drop uneeded ID and smoking_status feature
df['bmi'].fillna(df['bmi'].mean(), inplace=True) # BMI missing value imputation

In [3]:
print("cases of non-stroke: " + str(len(df[df['stroke'] == 0])))
print("cases of stroke: " + str(len(df[df['stroke'] == 1])))

cases of non-stroke: 4861
cases of stroke: 249


## Encode Catagories

In [4]:
ohe = OneHotEncoder()

catagory_names = ['gender', 'ever_married', 'work_type', 'Residence_type']
encoded_catagories = ohe.fit_transform(df[catagory_names]) # Encode catagories

column_names = ohe.get_feature_names_out(catagory_names) # Get new catagory names ('gender' -> 'gender_Female', 'gender_Male', ...)

temp_df = pd.DataFrame(encoded_catagories.todense(), columns = column_names) # Create new dataframe with new catagories

df = df.drop(catagory_names, axis=1) # Remove old catagories
df = pd.concat([df, temp_df], axis=1) # Add new catagories

## Seperate input and output data

In [5]:
X = df.drop('stroke', axis=1) # Remove target feature from input
y = df['stroke'] # Extract target feature for output

## Add minority cases to dataset

In [6]:
# SMOTENC requires a list of indicies for the catagories within the dataset
catagory_column_indicies = []
for col in column_names:
    catagory_column_indicies.append(X.columns.get_loc(col))

smote_nc = SMOTENC(categorical_features=catagory_column_indicies, random_state=10)
X, y = smote_nc.fit_resample(X, y) # Add new entries to balance dataset

In [7]:
X

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban
0,67.000000,0,1,228.690000,36.600000,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,61.000000,0,0,202.210000,28.893237,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,80.000000,0,1,105.920000,32.500000,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,49.000000,0,0,171.230000,34.400000,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,79.000000,1,0,174.120000,24.000000,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9717,73.087644,0,0,130.769844,28.291236,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9718,77.874242,0,0,103.932679,31.590113,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
9719,78.000000,0,0,115.271659,26.268979,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
9720,57.491837,0,0,188.653439,31.816894,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


## Train/Test data partitioning

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42) # Training set 90%, testing set 10%

## Standardisation

In [9]:
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)

## Create and train SVM

In [10]:
from sklearn.svm import SVC

svm = SVC()
svm_start = time.time()
svm.fit(X_train, y_train)
svm_stop = time.time()

## Create and train Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 100)
rf_start = time.time()
rf.fit(X_train, y_train)
rf_stop = time.time()

## Create and train KNN

In [12]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 7)
knn_start = time.time()
knn.fit(X_train, y_train)
knn_stop = time.time()

## Create and train Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

lr  = LogisticRegression()
lr_start = time.time()
lr.fit(X_train, y_train)
lr_stop = time.time()

# Testing Scores

## Random Forest

In [14]:
y_pred_rf = rf.predict(X_test)

cv = KFold(n_splits=10, random_state=1, shuffle=True)
rf_scores = cross_val_score(rf, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

print("Random Forest Scores:")
print("----------------------------------")
print("Training score: " + str(rf.score(X_train, y_train)))
print("Test score: " + str(rf.score(X_test, y_test)))
print('Cross Validation Accuracy: %.3f (%.3f)' % (mean(rf_scores), std(rf_scores)))
print("Training time: " + str(lr_stop - lr_start))
print("----------------------------------")
print("Classification Report")
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred_rf))

Random Forest Scores:
----------------------------------
Training score: 1.0
Test score: 0.9357326478149101
Cross Validation Accuracy: 0.939 (0.010)
Training time: 0.0174410343170166
----------------------------------
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.92      0.94       975
           1       0.92      0.95      0.94       970

    accuracy                           0.94      1945
   macro avg       0.94      0.94      0.94      1945
weighted avg       0.94      0.94      0.94      1945

Confusion Matrix
[[900  75]
 [ 50 920]]


## SVM

In [15]:
y_pred_svm = svm.predict(X_test)

cv = KFold(n_splits=10, random_state=1, shuffle=True)
scores_svm = cross_val_score(svm, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

print("SVM Scores:")
print("----------------------------------")
print("Training score: " + str(svm.score(X_train, y_train)))
print("Test score: " + str(svm.score(X_test, y_test)))
print('Cross Validation Accuracy: %.3f (%.3f)' % (mean(scores_svm), std(scores_svm)))
print("Training time: " + str(svm_stop - svm_start))
print("----------------------------------")
print("Classification Report")
print(classification_report(y_test, y_pred_svm))
print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred_svm))

SVM Scores:
----------------------------------
Training score: 0.8617718914748618
Test score: 0.8467866323907455
Cross Validation Accuracy: 0.856 (0.015)
Training time: 0.5371448993682861
----------------------------------
Classification Report
              precision    recall  f1-score   support

           0       0.89      0.79      0.84       975
           1       0.81      0.91      0.86       970

    accuracy                           0.85      1945
   macro avg       0.85      0.85      0.85      1945
weighted avg       0.85      0.85      0.85      1945

Confusion Matrix
[[768 207]
 [ 91 879]]


## KNN

In [16]:
y_pred_knn = knn.predict(X_test)

cv = KFold(n_splits=10, random_state=1, shuffle=True)
scores_knn = cross_val_score(knn, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

print("KNN Scores:")
print("----------------------------------")
print("Training score: " + str(knn.score(X_train, y_train)))
print("Test score: " + str(knn.score(X_test, y_test)))
print('Cross Validation Accuracy: %.3f (%.3f)' % (mean(scores_knn), std(scores_knn)))
print("Training time: " + str(knn_stop - knn_start))
print("----------------------------------")
print("Classification Report")
print(classification_report(y_test, y_pred_knn))
print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred_knn))

KNN Scores:
----------------------------------
Training score: 0.9202777420599203
Test score: 0.8956298200514139
Cross Validation Accuracy: 0.899 (0.010)
Training time: 0.0007102489471435547
----------------------------------
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.84      0.89       975
           1       0.85      0.95      0.90       970

    accuracy                           0.90      1945
   macro avg       0.90      0.90      0.90      1945
weighted avg       0.90      0.90      0.90      1945

Confusion Matrix
[[817 158]
 [ 45 925]]


## Logistic Regression

In [17]:
y_pred_lr = knn.predict(X_test)

cv = KFold(n_splits=10, random_state=1, shuffle=True)
scores_lr = cross_val_score(lr, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

print("LR Scores:")
print("----------------------------------")
print("Training score: " + str(lr.score(X_train, y_train)))
print("Test score: " + str(lr.score(X_test, y_test)))
print('Cross Validation Accuracy: %.3f (%.3f)' % (mean(scores_lr), std(scores_lr)))
print("Training time: " + str(lr_stop - lr_start))
print("----------------------------------")
print("Classification Report")
print(classification_report(y_test, y_pred_lr))
print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred_lr))

LR Scores:
----------------------------------
Training score: 0.8168959753118169
Test score: 0.8097686375321337
Cross Validation Accuracy: 0.815 (0.012)
Training time: 0.0174410343170166
----------------------------------
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.84      0.89       975
           1       0.85      0.95      0.90       970

    accuracy                           0.90      1945
   macro avg       0.90      0.90      0.90      1945
weighted avg       0.90      0.90      0.90      1945

Confusion Matrix
[[817 158]
 [ 45 925]]


In [18]:
# df2 = pd.read_csv("./new-data.csv") # Read CSV
# df2.drop('smoking_status', axis=1, inplace=True) # Drop uneeded ID feature
# encoded_catagories = ohe.transform(df2[catagory_names]) # Encode catagories

# column_names = ohe.get_feature_names_out(catagory_names) # Get new catagory names ('gender' -> 'gender_Female', 'gender_Male', ...)

# temp_df = pd.DataFrame(encoded_catagories.todense(), columns = column_names) # Create new dataframe with new catagories

# df2 = df2.drop(catagory_names, axis=1) # Remove old catagories
# df2 = pd.concat([df2, temp_df], axis=1) # Add new catagories

# X = df2.drop('stroke', axis=1) # Remove target feature from input
# y = df2['stroke'] # Extract target feature for output


# X = scalar.transform(X)

# y_pred = model.predict(X)
# # y_pred

# model.score(X_test, y_test)
