In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from numpy import mean
from numpy import std
import time
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTENC
from sklearn.feature_selection import SelectKBest, f_classif

In [2]:
df = pd.read_csv("./stroke-data.csv") # Read CSV
df.drop(['id', 'smoking_status'], axis=1, inplace=True) # Drop uneeded ID and smoking_status feature
# df.drop('id', axis=1, inplace=True) # Drop uneeded ID and smoking_status feature
df['bmi'].fillna(df['bmi'].mean(), inplace=True) # BMI missing value imputation

In [3]:
# --- Encode catagories ---
enc = LabelEncoder()
df['gender'] = enc.fit_transform(df['gender'])
df['work_type'] = enc.fit_transform(df['work_type'])
df['Residence_type'] = enc.fit_transform(df['Residence_type'])
df['ever_married'] = enc.fit_transform(df['ever_married'])

In [4]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,stroke
0,1,67.0,0,1,1,2,1,228.69,36.600000,1
1,0,61.0,0,0,1,3,0,202.21,28.893237,1
2,1,80.0,0,1,1,2,0,105.92,32.500000,1
3,0,49.0,0,0,1,2,1,171.23,34.400000,1
4,0,79.0,1,0,1,3,0,174.12,24.000000,1
...,...,...,...,...,...,...,...,...,...,...
5105,0,80.0,1,0,1,2,1,83.75,28.893237,0
5106,0,81.0,0,0,1,3,1,125.20,40.000000,0
5107,0,35.0,0,0,1,3,0,82.99,30.600000,0
5108,1,51.0,0,0,1,2,0,166.29,25.600000,0


In [5]:
# --- Partition data ---
X = df.drop('stroke', axis=1)
y = df['stroke']

smote_nc = SMOTENC(categorical_features=[0, 2, 3, 4, 5, 6], random_state=10)
X, y = smote_nc.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)

In [6]:
# --- Normalise data ---
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)

In [7]:
# --- Reduce dimensionality ---
# Improves score slightly?
from sklearn.decomposition import PCA
pca = PCA()
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [8]:
# --- Train model ---
model = LogisticRegression(solver='lbfgs', max_iter=2000)
start = time.time()
model.fit(X_train, y_train)
stop = time.time()

In [9]:
y_pred = model.predict(X_test)
cv = KFold(n_splits=10, random_state=1, shuffle=True)
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

print("Training score: " + str(model.score(X_train, y_train)))
print("Test score: " + str(model.score(X_test, y_test)))
print("Training time: " + str(stop - start))


print("Classification Report :")
print (classification_report(y_test, y_pred))
print('Cross Validation Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Training score: 0.7841069821267841
Test score: 0.7969151670951157
Training time: 0.009063959121704102
Classification Report :
              precision    recall  f1-score   support

           0       0.84      0.75      0.79       998
           1       0.76      0.85      0.80       947

    accuracy                           0.80      1945
   macro avg       0.80      0.80      0.80      1945
weighted avg       0.80      0.80      0.80      1945

Cross Validation Accuracy: 0.786 (0.012)


In [10]:
all_stroke_x = X_test[y_test == 1]
all_stroke_y = y_test[y_test == 1]


print("Test score: " + str(model.score(all_stroke_x, all_stroke_y)))

Test score: 0.8479408658922915
