## Patient Stroke Prediction


In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score, f1_score

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
data = pd.read_csv('Stroke Prediction Dataset.csv')
data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [4]:
# check the count of label 
data.stroke.value_counts()

0    4861
1     249
Name: stroke, dtype: int64

after preprocessing the dataset, should to fix imbalance label

## Data Pre-processing

In [5]:
def onehot_encode(df, column):
    df = df.copy()
    
    dummies = pd.get_dummies(df[column], prefix=column)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)
    
    return df

In [6]:
def preprocess_inputs(df):
    df = df.copy()
    
    # Drop id column
    df = df.drop('id', axis=1)
    
    # Binary encoding
    df['ever_married'] = df['ever_married'].replace({'No': 0, 'Yes': 1})
    df['Residence_type'] = df['Residence_type'].replace({'Rural': 0, 'Urban': 1})
    
    # One-hot encoding
    for column in ['gender', 'work_type', 'smoking_status']:
        df = onehot_encode(df, column=column)
    
    # Drop-gender_Other
    df = df.drop('gender_Other', axis=1)
    
    # Split df into X and y
    y = df['stroke']
    X = df.drop('stroke', axis=1)
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True, random_state=1)
    
    
    # KNN imputation of missing values on bmi column
    imputer = KNNImputer()
    imputer.fit(X_train)
    X_train = pd.DataFrame(imputer.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(imputer.transform(X_test), index=X_test.index, columns=X_test.columns)
    
    # Scale X
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    
    
    
    return X_train, X_test, y_train, y_test

In [7]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

## After Pre-processing

In [8]:
X_train

Unnamed: 0,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,gender_Female,gender_Male,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
4152,0.525978,-0.327504,-0.239632,0.722864,-1.016775,-0.708689,-0.123429,-1.179848,1.179848,-0.382179,-0.068333,-1.148952,2.281083,-0.399645,-0.664047,-0.459239,-0.760111,2.347244
4051,0.525978,-0.327504,-0.239632,0.722864,0.983502,2.769942,-0.860040,-1.179848,1.179848,2.616572,-0.068333,-1.148952,-0.438388,-0.399645,-0.664047,-0.459239,1.315597,-0.426032
5076,-0.402645,-0.327504,-0.239632,0.722864,-1.016775,1.516505,-0.769579,0.847567,-0.847567,-0.382179,-0.068333,0.870358,-0.438388,-0.399645,-0.664047,-0.459239,1.315597,-0.426032
874,1.631482,-0.327504,-0.239632,0.722864,-1.016775,-1.074698,-0.291428,-1.179848,1.179848,-0.382179,-0.068333,0.870358,-0.438388,-0.399645,-0.664047,-0.459239,1.315597,-0.426032
3534,0.791299,-0.327504,-0.239632,0.722864,0.983502,0.055339,-1.273576,0.847567,-0.847567,-0.382179,-0.068333,0.870358,-0.438388,-0.399645,-0.664047,-0.459239,1.315597,-0.426032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,0.127997,-0.327504,-0.239632,0.722864,0.983502,0.708548,0.044570,-1.179848,1.179848,-0.382179,-0.068333,0.870358,-0.438388,-0.399645,-0.664047,-0.459239,1.315597,-0.426032
2763,0.083777,-0.327504,-0.239632,0.722864,-1.016775,-0.925910,0.393491,0.847567,-0.847567,-0.382179,-0.068333,0.870358,-0.438388,-0.399645,1.505917,-0.459239,-0.760111,-0.426032
905,-0.535305,-0.327504,-0.239632,0.722864,-1.016775,-0.649306,0.858719,0.847567,-0.847567,-0.382179,-0.068333,0.870358,-0.438388,-0.399645,-0.664047,-0.459239,1.315597,-0.426032
3980,0.083777,-0.327504,-0.239632,0.722864,-1.016775,2.481859,3.365782,0.847567,-0.847567,-0.382179,-0.068333,0.870358,-0.438388,-0.399645,-0.664047,-0.459239,-0.760111,2.347244


## Handling label imbalance wth oversampling


In [9]:
oversampled_data = pd.concat([X_train, y_train], axis=1).copy()

num_samples = y_train.value_counts()[0] - y_train.value_counts()[1]
new_samples = oversampled_data.query("stroke == 1").sample(num_samples, replace=True, random_state=1)

oversampled_data = pd.concat([oversampled_data, new_samples], axis=0).sample(frac=1.0, random_state=1).reset_index(drop=True)

y_train_oversampled = oversampled_data['stroke']
X_train_oversampled = oversampled_data.drop('stroke', axis=1)

## Modeling & Results

In [10]:
models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(),
    "Support Vector Machine (Linear Kernel)": LinearSVC(),
    "   Support Vector Machine (RBF Kernel)": SVC(),
    "                        Neural Network": MLPClassifier(),
    "                         Random Forest": RandomForestClassifier(),
    "                     Gradient Boosting": GradientBoostingClassifier(),
    "                               XGBoost": XGBClassifier(eval_metric='mlogloss'),
    "                              LightGBM": LGBMClassifier(),
    "                              CatBoost": CatBoostClassifier(verbose=0)
}

for name, model in models.items():
    model.fit(X_train_oversampled, y_train_oversampled)
    print(name + " trained.")

                   Logistic Regression trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                        Neural Network trained.
                         Random Forest trained.
                     Gradient Boosting trained.
                               XGBoost trained.
                              LightGBM trained.
                              CatBoost trained.


In [11]:
print("Model Performance\n-----------------")
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(
        "\n" + name + " Accuracy: {:.3f}%\n\t\t\t\t       F1-Score: {:.5f}"\
        .format(accuracy_score(y_test, y_pred) * 100, f1_score(y_test, y_pred))
    )

Model Performance
-----------------

                   Logistic Regression Accuracy: 74.560%
				       F1-Score: 0.28571

                   K-Nearest Neighbors Accuracy: 85.910%
				       F1-Score: 0.15294

                         Decision Tree Accuracy: 90.215%
				       F1-Score: 0.15254

Support Vector Machine (Linear Kernel) Accuracy: 73.777%
				       F1-Score: 0.27957

   Support Vector Machine (RBF Kernel) Accuracy: 77.397%
				       F1-Score: 0.23256

                        Neural Network Accuracy: 81.311%
				       F1-Score: 0.14350

                         Random Forest Accuracy: 93.346%
				       F1-Score: 0.05556

                     Gradient Boosting Accuracy: 80.626%
				       F1-Score: 0.28261

                               XGBoost Accuracy: 92.074%
				       F1-Score: 0.19802

                              LightGBM Accuracy: 90.020%
				       F1-Score: 0.23881

                              CatBoost Accuracy: 90.802%
				       F1-Score: 0.22951
