## Preprocessing

In [4]:
# data manipulation
import pandas as pd
import numpy as np

# data visualization
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

# data preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# model
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn import svm
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, f1_score


import json

# styling
plt.style.use("ggplot")
rcParams['figure.figsize'] = (12, 6)

### 1. Importing Dataset

In [5]:
df = pd.read_csv("../data/healthcare-dataset-stroke-data.csv")

In [6]:
df.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [7]:
# With sklearn
imp = SimpleImputer(strategy='mean')
df['bmi'] = imp.fit_transform(df[['bmi']])

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                5110 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


### Encode categorical features

In [9]:
# Select columns which dtype == 'category'
cat_cols = df.select_dtypes(include=['object']).columns
print(f'Categorical columns: {cat_cols}')

# Loop through each categorical column 
for col in cat_cols:

    # create a list of labels to be encoded in the column
    append_to = list(df[col].unique())
 
    # These labels will be use as column headers
    print(append_to)
 
    # Apply OneHotEncoder()
    df[append_to] = OneHotEncoder().fit_transform(df[[col]]).toarray()
 
    # Drop non-encoded column
    df.drop(col, axis=1, inplace=True)
    
    # Drop redundant data
    df.drop(append_to[0], axis=1, inplace=True)
 

Categorical columns: Index(['gender', 'ever_married', 'work_type', 'Residence_type',
       'smoking_status'],
      dtype='object')
['Male', 'Female', 'Other']
['Yes', 'No']
['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked']
['Urban', 'Rural']
['formerly smoked', 'never smoked', 'smokes', 'Unknown']


In [10]:
df

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,Female,Other,No,Self-employed,Govt_job,children,Never_worked,Rural,never smoked,smokes,Unknown
0,9046,67.0,0,1,228.69,36.600000,1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
1,51676,61.0,0,0,202.21,28.893237,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,31112,80.0,0,1,105.92,32.500000,1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,60182,49.0,0,0,171.23,34.400000,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1665,79.0,1,0,174.12,24.000000,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,80.0,1,0,83.75,28.893237,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
5106,44873,81.0,0,0,125.20,40.000000,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
5107,19723,35.0,0,0,82.99,30.600000,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5108,37544,51.0,0,0,166.29,25.600000,0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


### Scale numeric features

In [11]:
# Select numerical columns
num_cols = ['age', 'avg_glucose_level', 'bmi']

# Apply MinMaxScaler
minmax = MinMaxScaler()
df[num_cols] = minmax.fit_transform(df[num_cols])
df[num_cols]

Unnamed: 0,age,avg_glucose_level,bmi
0,0.816895,0.801265,0.301260
1,0.743652,0.679023,0.212981
2,0.975586,0.234512,0.254296
3,0.597168,0.536008,0.276060
4,0.963379,0.549349,0.156930
...,...,...,...
5105,0.975586,0.132167,0.212981
5106,0.987793,0.323516,0.340206
5107,0.426270,0.128658,0.232532
5108,0.621582,0.513203,0.175258


In [12]:
from sklearn.model_selection import train_test_split

y = df['stroke']
X  = df
X.drop('stroke', axis=1, inplace=True) 
X.drop('id', axis=1, inplace=True) 

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [36]:
X_test

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,Female,Other,No,Self-employed,Govt_job,children,Never_worked,Rural,never smoked,smokes,Unknown
4688,0.377441,0,0,0.044917,0.145475,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4478,0.487305,0,0,0.046949,0.206186,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3849,0.096680,0,0,0.089096,0.139748,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4355,0.963379,1,0,0.099344,0.105384,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3826,0.914551,0,0,0.183039,0.193585,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3605,0.021973,0,0,0.410165,0.212981,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4934,0.015137,0,0,0.239590,0.212981,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4835,0.597168,1,0,0.296279,0.230241,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4105,0.682617,0,0,0.339350,0.167239,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [15]:
oversample = SMOTE() 
X_train_resh, y_train_resh = oversample.fit_resample(X_train, y_train.ravel())

In [16]:
X_train

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,Female,Other,No,Self-employed,Govt_job,children,Never_worked,Rural,never smoked,smokes,Unknown
802,0.963379,0,0,0.265534,0.208477,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3927,0.755859,0,0,0.153264,0.297824,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2337,0.255371,0,0,0.020312,0.268041,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3910,0.377441,0,0,0.048841,0.230241,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1886,0.377441,0,0,0.020820,0.109966,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4426,0.523926,0,0,0.151787,0.232532,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
466,0.743652,1,0,0.530560,0.571592,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3092,0.013184,0,0,0.194627,0.085911,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3772,0.975586,0,0,0.650725,0.237113,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [17]:
X_train_resh

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,Female,Other,No,Self-employed,Govt_job,children,Never_worked,Rural,never smoked,smokes,Unknown
0,0.963379,0,0,0.265534,0.208477,1.0,0.0,1.0,0.0,0.000000,1.000000,0.0,0.000000,1.0,0.0,0.000000
1,0.755859,0,0,0.153264,0.297824,0.0,0.0,1.0,0.0,1.000000,0.000000,0.0,1.000000,0.0,0.0,0.000000
2,0.255371,0,0,0.020312,0.268041,0.0,0.0,0.0,0.0,1.000000,0.000000,0.0,0.000000,0.0,1.0,0.000000
3,0.377441,0,0,0.048841,0.230241,1.0,0.0,1.0,0.0,0.000000,0.000000,0.0,1.000000,1.0,0.0,0.000000
4,0.377441,0,0,0.020820,0.109966,0.0,0.0,0.0,0.0,1.000000,0.000000,0.0,0.000000,0.0,1.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7797,0.910056,1,0,0.074213,0.139897,0.0,0.0,1.0,0.0,1.000000,0.000000,0.0,0.909742,0.0,1.0,0.000000
7798,0.869142,0,0,0.518660,0.202692,1.0,0.0,1.0,0.0,0.669994,0.000000,0.0,1.000000,1.0,0.0,0.000000
7799,0.878571,0,0,0.711039,0.306656,0.0,0.0,1.0,0.0,1.000000,0.000000,0.0,0.506569,0.0,0.0,0.000000
7800,0.764332,0,0,0.636064,0.239342,0.0,0.0,1.0,0.0,0.000000,0.488585,0.0,0.000000,0.0,0.0,1.000000


In [18]:
# Without crossfold validation
rf = RandomForestClassifier(random_state=42).fit(X_train_resh, y_train_resh)
rf.score(X_test, y_test)

0.8806262230919765

## Models

In [19]:
# calculte scores for best model
# return: dict including f1, precision and recall (all macro)
from sklearn.metrics import accuracy_score, precision_score, recall_score


def calculate_scores(y_true, y_pred):
    f1 = f1_score(y_true, y_pred, average='binary')
    precision = precision_score(y_true, y_pred, average='binary')
    recall = recall_score(y_true, y_pred, average='binary')
    accuracy = accuracy_score(y_true, y_pred)

    print("f1_binary: " + str(f1) + ", precision_binary: " + str(precision) + ", recall_binary: " + str(recall) + ", accuracy: " + str(accuracy))
    
    return {
        "f1_binary": f1,
        "precision_binary": precision,
        "recall_binary": recall,
        "accuracy": accuracy
    }

In [20]:
# create stratified holdout of 20% for testing
# test = df.groupby('label', group_keys=False).apply(lambda x:x.sample(frac=.2))
import os

y_test_debug = dict()
test_prediction_debug = dict()


from sklearn.model_selection import GridSearchCV, StratifiedKFold

def return_model_results(
    sklearn_model, 
    params: dict, 
    X_train: pd.DataFrame,
    X_test: pd.DataFrame, 
    y_train: pd.DataFrame, 
    y_test: pd.DataFrame
):

    # obtain cpu cores to speed up optimization
    cpu_cores = os.cpu_count()

    # create folds
    inner_cv = StratifiedKFold(5, shuffle=True, random_state=12)

    # instantiate grid search
    param_search = GridSearchCV(
        estimator=sklearn_model(),
        param_grid=params,
        cv=inner_cv,
        scoring='f1_macro',
        n_jobs=cpu_cores-1,
        verbose = 3
    )

    # run gridsearch and identify best params
    param_search.fit(X_train, y_train)
    print(param_search.best_params_)
    best_params = param_search.best_params_

    # train model with best params and obtain results
    final_model = sklearn_model(**best_params)
    final_model.fit(X_train, y_train)
    test_prediction = final_model.predict(X_test)
    
    # y_test_debug = y_test
    # test_prediction_debug = y

    score_dict = calculate_scores(y_test, test_prediction)

    # return dict including the seach config, best params, etst prediction of best model and results
    return {
        "param_grid": params,
        "best_params": best_params, 
        "predicted_test": test_prediction.tolist(),
        **score_dict
    }

In [33]:
rf_params = {
    "n_estimators": [64,100,128,200],
    "criterion": ["gini", "entropy"],
    "max_depth": [10, 50, 200],
    # "class_weight": ["balanced"],
    # "min_samples_split": [2, 5, 10],
    # "max_features": ["auto", "log2"],
    # "verbose": [2],
    # "n_jobs": [7],
    # "bootstrap": [True,False]
    "random_state": [42]
}

svm_params = {
    "kernel": ["rbf", "poly"],
    "degree": [3, 5],
    # "gamma": ["scale", .1, 4],
    # "C": [1, 10]
}

lr_params = {
    "penalty": ['none','l2'],
    "C": [0.001, 0.01, 0.1, 1, 10, 100],
    # "class_weight": ["balanced"]
}

result_dict = {
    "rf": {}
}

In [35]:

classifiers = [
    [RandomForestClassifier, rf_params, "rf"],
    [LogisticRegression, lr_params, "lr"],
    # [SVC, svm_params, "svm"]
]

for classifier, params, dict_key in classifiers:
    json.dump(
        return_model_results(
            classifier,
            params,
            X_train_resh, 
            X_test,
            y_train_resh,
            y_test
        ), 
        open(f"../data/results-{dict_key}.json", "w")
    )

Fitting 5 folds for each of 24 candidates, totalling 120 fits
{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 50, 'n_estimators': 100, 'random_state': 42}
f1_binary: 0.14084507042253522, precision_binary: 0.125, recall_binary: 0.16129032258064516, accuracy: 0.8806262230919765
Fitting 5 folds for each of 12 candidates, totalling 60 fits
{'C': 1, 'class_weight': 'balanced', 'penalty': 'l2'}
f1_binary: 0.2752808988764045, precision_binary: 0.16666666666666666, recall_binary: 0.7903225806451613, accuracy: 0.7475538160469667


In [214]:

calculate_scores(y_test, np.random.randint(0, 2, y_test.shape))


f1_binary: 0.1310344827586207, precision_binary: 0.07335907335907337, recall_binary: 0.6129032258064516, accuracy: 0.5068493150684932


{'f1_binary': 0.1310344827586207,
 'precision_binary': 0.07335907335907337,
 'recall_binary': 0.6129032258064516,
 'accuracy': 0.5068493150684932}