## Import libraries

In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from numpy import mean
from numpy import std
import time
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE, SMOTENC

## Read and prepare data

In [32]:
df = pd.read_csv("./stroke-data.csv") # Read CSV
# df.drop(['id', 'smoking_status'], axis=1, inplace=True) # Drop uneeded ID and smoking_status feature
df.drop('id', axis=1, inplace=True) # Drop uneeded ID and smoking_status feature
df['bmi'].fillna(df['bmi'].mean(), inplace=True) # BMI missing value imputation 

## Encode Catagories

In [33]:
ohe = OneHotEncoder()

catagory_names = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
encoded_catagories = ohe.fit_transform(df[catagory_names]) # Encode catagories

column_names = ohe.get_feature_names_out(catagory_names) # Get new catagory names ('gender' -> 'gender_Female', 'gender_Male', ...)

temp_df = pd.DataFrame(encoded_catagories.todense(), columns = column_names) # Create new dataframe with new catagories

df = df.drop(catagory_names, axis=1) # Remove old catagories
df = pd.concat([df, temp_df], axis=1) # Add new catagories

## Add minority cases to dataset

In [34]:
X = df.drop('stroke', axis=1) # Remove target feature from input
y = df['stroke'] # Extract target feature for output

# SMOTENC requires a list of indicies for the catagories within the dataset
catagory_column_indicies = []
for col in column_names:
    catagory_column_indicies.append(X.columns.get_loc(col))

smote_nc = SMOTENC(categorical_features=catagory_column_indicies, random_state=10)
X, y = smote_nc.fit_resample(X, y) # Add new entries to balance dataset

In [35]:
classifier = SelectKBest(score_func=f_classif,k=5)
fits = classifier.fit(df.drop('stroke',axis=1),df['stroke'])
x=pd.DataFrame(fits.scores_)
columns = pd.DataFrame(df.drop('stroke',axis=1).columns)
fscores = pd.concat([columns,x],axis=1)
fscores.columns = ['Attribute','Score']
fscores.sort_values(by='Score',ascending=False)

Unnamed: 0,Attribute,Score
0,age,326.916568
2,heart_disease,94.698406
3,avg_glucose_level,90.50387
1,hypertension,84.953542
8,ever_married_No,60.66723
9,ever_married_Yes,60.66723
14,work_type_children,36.18447
18,smoking_status_formerly smoked,21.376277
13,work_type_Self-employed,19.818466
17,smoking_status_Unknown,16.006799


## Train/Test data partitioning

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42) # Training set 80%, testing set 20%

## Standardisation

In [37]:
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train) # Explain this
X_test = scalar.transform(X_test)

## Create and train model

In [38]:
model = SVC()
start = time.time()
model.fit(X_train, y_train)
stop = time.time()

## Test model performance

In [39]:
y_pred = model.predict(X_test)

cv = KFold(n_splits=10, random_state=1, shuffle=True)
scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

print("----------------------------------")
print("Training score: " + str(model.score(X_train, y_train)))
print("Test score: " + str(model.score(X_test, y_test)))
print('Cross Validation Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
print("Training time: " + str(stop - start))
print("----------------------------------")
print("Classification Report")
print(classification_report(y_test, y_pred))
print("Confusion Matrix")
print(confusion_matrix(y_test,y_pred))

----------------------------------
Training score: 0.9177060563199178
Test score: 0.9089974293059125
Cross Validation Accuracy: 0.910 (0.013)
Training time: 0.4444441795349121
----------------------------------
Classification Report
              precision    recall  f1-score   support

           0       0.94      0.88      0.91       975
           1       0.88      0.94      0.91       970

    accuracy                           0.91      1945
   macro avg       0.91      0.91      0.91      1945
weighted avg       0.91      0.91      0.91      1945

Confusion Matrix
[[856 119]
 [ 58 912]]


In [43]:
df2 = pd.read_csv("./new-data.csv") # Read CSV
# df2.drop('smoking_status', axis=1, inplace=True) # Drop uneeded ID feature
encoded_catagories = ohe.transform(df2[catagory_names]) # Encode catagories

column_names = ohe.get_feature_names_out(catagory_names) # Get new catagory names ('gender' -> 'gender_Female', 'gender_Male', ...)

temp_df = pd.DataFrame(encoded_catagories.todense(), columns = column_names) # Create new dataframe with new catagories

df2 = df2.drop(catagory_names, axis=1) # Remove old catagories
df2 = pd.concat([df2, temp_df], axis=1) # Add new catagories

X = df2.drop('stroke', axis=1) # Remove target feature from input
y = df2['stroke'] # Extract target feature for output


X = scalar.transform(X)

y_pred = model.predict(X)
y_pred


array([0, 1, 1])