This file contains the code for the following steps:
- training of standard classifiers
- selection of parameters according to f1 scoring
- collection of metrics for each binarization strategy

In [1]:
# Load the jupyter_black extension so that code cells are auto-formatted
# and the notebook complies with the code style.
%load_ext jupyter_black

### Libraries

In [2]:
import time
import numpy as np
import pandas as pd

# Preprocessing
from sklearn.model_selection import train_test_split

# Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Metrics
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
    classification_report,
    confusion_matrix,
)

In [3]:
# Do not truncate cell contents in displayed frames, so the long
# best_parameters dictionaries in the result tables stay fully readable.
pd.set_option("display.max_colwidth", None)

In [4]:
# Single seed shared by the train/test split and by every classifier below
# that is constructed with a random_state.
SEED = 42

### Dataset import

In [5]:
def _read_strategy(csv_path):
    """Read one binarized dataset, using the first CSV column as the index."""
    return pd.read_csv(csv_path, index_col=0)


# One frame per binarization strategy.
df_bin_strategy1 = _read_strategy("datasets/strategy1.csv")
df_bin_strategy2 = _read_strategy("datasets/strategy2.csv")
df_bin_strategy3 = _read_strategy("datasets/strategy3.csv")
df_bin_strategy4 = _read_strategy("datasets/strategy4.csv")

### Splitting the data into train and test sets

In [6]:
def updated_split(df, target="stroke", test_size=0.2, stratify=False):
    """Split a dataset into train and test features/labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns together with the target column.
    target : str, default "stroke"
        Name of the label column; every other column is used as a feature.
    test_size : float or int, default 0.2
        Forwarded unchanged to ``train_test_split``.
    stratify : bool, default False
        When True, the target column is passed as ``stratify`` so that both
        parts keep the class proportions of ``df``. The default (False)
        reproduces the original, non-stratified split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    y = df[target]
    X = df.drop(columns=[target])
    # random_state is pinned to the notebook-wide SEED so the partition is
    # reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=SEED,
        stratify=y if stratify else None,
    )
    return X_train, X_test, y_train, y_test

In [7]:
# Run the same split routine on each strategy's frame, in strategy order.
(
    (X_train1, X_test1, y_train1, y_test1),
    (X_train2, X_test2, y_train2, y_test2),
    (X_train3, X_test3, y_train3, y_test3),
    (X_train4, X_test4, y_train4, y_test4),
) = [
    updated_split(strategy_frame)
    for strategy_frame in (
        df_bin_strategy1,
        df_bin_strategy2,
        df_bin_strategy3,
        df_bin_strategy4,
    )
]

# Base models

### Base pipeline

In [8]:
# Untuned base estimators, keyed by the display name that is also used to
# look up each model's hyper-parameter grid.
classifiers = {
    "kNN": KNeighborsClassifier(n_jobs=-1),
    "Naive Bayes": GaussianNB(),
    # multi_class is deliberately left at its default: the explicit argument
    # is deprecated since scikit-learn 1.5 (it warns on every fit), and for a
    # binary target the default resolves to the same formulation as "ovr".
    "Logistic Regression": LogisticRegression(
        solver="saga",
        n_jobs=-1,
        random_state=SEED,
    ),
    "Decision Tree": DecisionTreeClassifier(random_state=SEED),
    "Random Forest": RandomForestClassifier(n_jobs=-1, random_state=SEED),
    # allow_writing_files=False / verbose=0 keep CatBoost from creating
    # training artefacts on disk and from printing per-iteration logs.
    "CatBoost": CatBoostClassifier(
        allow_writing_files=False, verbose=0, random_state=SEED
    ),
    "XGBoost": XGBClassifier(n_jobs=-1, random_state=SEED),
}

In [9]:
def _boosting_space():
    """Build a fresh copy of the grid used for both gradient-boosting models."""
    return {
        "learning_rate": [1e-3, 1e-2, 1e-1],
        "n_estimators": [100, 500, 700],
        "max_depth": [3, 5, 7],
    }


# Hyper-parameter search space per classifier, keyed by the classifier's
# display name. An empty grid means the model is fitted with its defaults.
param_grid = {}

param_grid["kNN"] = {
    "n_neighbors": [3, 5, 7],
    "weights": ["uniform", "distance"],
}

param_grid["Naive Bayes"] = {}

param_grid["Logistic Regression"] = {
    "penalty": [None, "l1", "l2"],
    "tol": [1e-4, 1e-3, 1e-2],
    "max_iter": [100, 500, 1000],
}

param_grid["Decision Tree"] = {
    "max_depth": [3, 5, 7],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

param_grid["Random Forest"] = {
    "n_estimators": [50, 100, 500],
    "max_depth": [3, 5, 7],
    "min_samples_split": [2, 5, 7],
    "min_samples_leaf": [1, 2, 4],
}

param_grid["CatBoost"] = _boosting_space()
param_grid["XGBoost"] = _boosting_space()

In [10]:
# Column-oriented accumulator: one independent, initially empty list per
# reported column, in table order.
results = {
    column: []
    for column in (
        "classifier",
        "best_parameters",
        "accuracy",
        "precision",
        "recall",
        "f1_score",
    )
}

In [11]:
def cv(
    name,
    classifier,
    param_grid,
    X_train,
    X_test,
    y_train,
    y_test,
    results,
    n_folds=3,
    scoring="f1",
):
    """Tune one classifier with a grid search and record its test-set metrics.

    The estimator is tuned with ``GridSearchCV`` on the training data, the
    best estimator predicts the test data, and one row of metrics is appended
    to ``results``. A short report is printed along the way.

    Parameters
    ----------
    name : str
        Display name of the classifier; stored in ``results["classifier"]``.
    classifier : estimator
        Unfitted estimator handed to ``GridSearchCV``.
    param_grid : dict
        Hyper-parameter grid for ``classifier``.
    X_train, y_train
        Data used for the cross-validated grid search.
    X_test, y_test
        Held-out data used for the reported metrics.
    results : dict of list
        Accumulator with the keys ``classifier``, ``best_parameters``,
        ``accuracy``, ``precision``, ``recall`` and ``f1_score``. It is
        mutated in place: exactly one value is appended to each list.
    n_folds : int, default 3
        Passed to ``GridSearchCV`` as ``cv``.
    scoring : str, default "f1"
        Passed to ``GridSearchCV`` as ``scoring``.

    Returns
    -------
    None
    """
    print(f"{name} Classifier")

    grid_search = GridSearchCV(
        classifier, param_grid, cv=n_folds, scoring=scoring, n_jobs=-1
    )
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    y_pred = grid_search.best_estimator_.predict(X_test)

    # Compute every value before touching `results`: if the search, the
    # prediction or a metric raises, the accumulator keeps lists of equal
    # length and can still be turned into a DataFrame.
    row = {
        "classifier": name,
        "best_parameters": best_params,
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        # NOTE(review): this F1 is support-weighted over both classes, while
        # precision/recall above and the default grid-search scoring refer to
        # the positive class only - confirm that this mix is intended.
        "f1_score": f1_score(y_test, y_pred, average="weighted"),
    }
    for column, value in row.items():
        results[column].append(value)

    print("Best Parameters:")
    print(best_params)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("----------------\n")

In [12]:
def evaluate(X_train, X_test, y_train, y_test):
    """Tune and score every registered classifier on one train/test split.

    Each entry of the module-level ``classifiers`` mapping is passed to
    ``cv`` together with its grid from ``param_grid``; the metrics that
    ``cv`` collects are returned as a DataFrame with one row per classifier
    and the columns ``classifier``, ``best_parameters``, ``accuracy``,
    ``precision``, ``recall`` and ``f1_score``.
    """
    columns = (
        "classifier",
        "best_parameters",
        "accuracy",
        "precision",
        "recall",
        "f1_score",
    )
    # Fresh accumulator per call, so repeated evaluations never share rows.
    results = {column: [] for column in columns}

    for name in classifiers:
        cv(
            name,
            classifiers[name],
            param_grid[name],
            X_train,
            X_test,
            y_train,
            y_test,
            results,
        )

    return pd.DataFrame(results)

### First strategy: seemingly logical

In [13]:
# Same display option as set near the top of the notebook; keeps the
# best_parameters column untruncated in the tables of this section.
pd.set_option("display.max_colwidth", None)

In [14]:
%%time
# Grid-search and test every classifier on the strategy-1 split.
res1 = evaluate(X_train1, X_test1, y_train1, y_test1)

kNN Classifier
Best Parameters:
{'n_neighbors': 7, 'weights': 'uniform'}
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.78      0.78        95
           1       0.55      0.55      0.55        47

    accuracy                           0.70       142
   macro avg       0.67      0.67      0.67       142
weighted avg       0.70      0.70      0.70       142

----------------

Naive Bayes Classifier
Best Parameters:
{}
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.56      0.72        95
           1       0.53      1.00      0.69        47

    accuracy                           0.70       142
   macro avg       0.76      0.78      0.70       142
weighted avg       0.84      0.70      0.71       142

----------------

Logistic Regression Classifier
Best Parameters:
{'max_iter': 100, 'penalty': 'l1', 'tol': 0.01}
Classification Report:
              precision    re

In [15]:
# Full strategy-1 results, highest test-set f1_score first.
display(res1.sort_values(by="f1_score", ascending=False))

Unnamed: 0,classifier,best_parameters,accuracy,precision,recall,f1_score
4,Random Forest,"{'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}",0.746479,0.641026,0.531915,0.739809
5,CatBoost,"{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}",0.739437,0.625,0.531915,0.733583
6,XGBoost,"{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}",0.732394,0.609756,0.531915,0.727367
2,Logistic Regression,"{'max_iter': 100, 'penalty': 'l1', 'tol': 0.01}",0.732394,0.615385,0.510638,0.725354
3,Decision Tree,"{'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}",0.71831,0.581395,0.531915,0.714954
1,Naive Bayes,{},0.704225,0.52809,1.0,0.707928
0,kNN,"{'n_neighbors': 7, 'weights': 'uniform'}",0.704225,0.553191,0.553191,0.704225


In [16]:
# Same ranking as the full table, restricted to the metric columns.
display(
    res1.sort_values(by="f1_score", ascending=False)[
        ["classifier", "accuracy", "precision", "recall", "f1_score"]
    ]
)

Unnamed: 0,classifier,accuracy,precision,recall,f1_score
4,Random Forest,0.746479,0.641026,0.531915,0.739809
5,CatBoost,0.739437,0.625,0.531915,0.733583
6,XGBoost,0.732394,0.609756,0.531915,0.727367
2,Logistic Regression,0.732394,0.615385,0.510638,0.725354
3,Decision Tree,0.71831,0.581395,0.531915,0.714954
1,Naive Bayes,0.704225,0.52809,1.0,0.707928
0,kNN,0.704225,0.553191,0.553191,0.704225


### Second strategy: inter-ordinal for each numeric

In [17]:
%%time
# Grid-search and test every classifier on the strategy-2 split.
res2 = evaluate(X_train2, X_test2, y_train2, y_test2)

kNN Classifier
Best Parameters:
{'n_neighbors': 7, 'weights': 'uniform'}
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.79      0.79        95
           1       0.57      0.57      0.57        47

    accuracy                           0.72       142
   macro avg       0.68      0.68      0.68       142
weighted avg       0.72      0.72      0.72       142

----------------

Naive Bayes Classifier
Best Parameters:
{}
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.60      0.74        95
           1       0.54      0.96      0.69        47

    accuracy                           0.72       142
   macro avg       0.75      0.78      0.72       142
weighted avg       0.83      0.72      0.72       142

----------------

Logistic Regression Classifier
Best Parameters:
{'max_iter': 100, 'penalty': 'l2', 'tol': 0.01}
Classification Report:
              precision    re

In [18]:
# Full strategy-2 results, highest test-set f1_score first.
display(res2.sort_values(by="f1_score", ascending=False))

Unnamed: 0,classifier,best_parameters,accuracy,precision,recall,f1_score
5,CatBoost,"{'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 100}",0.739437,0.613636,0.574468,0.737167
2,Logistic Regression,"{'max_iter': 100, 'penalty': 'l2', 'tol': 0.01}",0.739437,0.625,0.531915,0.733583
1,Naive Bayes,{},0.71831,0.542169,0.957447,0.724388
0,kNN,"{'n_neighbors': 7, 'weights': 'uniform'}",0.71831,0.574468,0.574468,0.71831
3,Decision Tree,"{'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 2}",0.71831,0.581395,0.531915,0.714954
4,Random Forest,"{'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}",0.71831,0.589744,0.489362,0.710899
6,XGBoost,"{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 700}",0.71831,0.589744,0.489362,0.710899


In [19]:
# Same ranking as the full table, restricted to the metric columns.
display(
    res2.sort_values(by="f1_score", ascending=False)[
        ["classifier", "accuracy", "precision", "recall", "f1_score"]
    ]
)

Unnamed: 0,classifier,accuracy,precision,recall,f1_score
5,CatBoost,0.739437,0.613636,0.574468,0.737167
2,Logistic Regression,0.739437,0.625,0.531915,0.733583
1,Naive Bayes,0.71831,0.542169,0.957447,0.724388
0,kNN,0.71831,0.574468,0.574468,0.71831
3,Decision Tree,0.71831,0.581395,0.531915,0.714954
4,Random Forest,0.71831,0.589744,0.489362,0.710899
6,XGBoost,0.71831,0.589744,0.489362,0.710899


### Third strategy: larger intervals for numeric features

In [20]:
%%time
# Grid-search and test every classifier on the strategy-3 split.
res3 = evaluate(X_train3, X_test3, y_train3, y_test3)

kNN Classifier
Best Parameters:
{'n_neighbors': 5, 'weights': 'uniform'}
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.80      0.78        95
           1       0.56      0.51      0.53        47

    accuracy                           0.70       142
   macro avg       0.66      0.66      0.66       142
weighted avg       0.70      0.70      0.70       142

----------------

Naive Bayes Classifier
Best Parameters:
{}
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.67      0.80        95
           1       0.59      0.96      0.73        47

    accuracy                           0.77       142
   macro avg       0.78      0.82      0.76       142
weighted avg       0.84      0.77      0.77       142

----------------

Logistic Regression Classifier




Best Parameters:
{'max_iter': 100, 'penalty': 'l1', 'tol': 0.0001}
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.84      0.81        95
           1       0.62      0.53      0.57        47

    accuracy                           0.74       142
   macro avg       0.70      0.69      0.69       142
weighted avg       0.73      0.74      0.73       142

----------------

Decision Tree Classifier
Best Parameters:
{'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 2}
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.85      0.81        95
           1       0.61      0.47      0.53        47

    accuracy                           0.73       142
   macro avg       0.69      0.66      0.67       142
weighted avg       0.71      0.73      0.71       142

----------------

Random Forest Classifier
Best Parameters:
{'max_depth': 7, 'min_samples_leaf': 4, 'min_samp

In [21]:
# Full strategy-3 results, highest test-set f1_score first.
display(res3.sort_values(by="f1_score", ascending=False))

Unnamed: 0,classifier,best_parameters,accuracy,precision,recall,f1_score
1,Naive Bayes,{},0.767606,0.592105,0.957447,0.774072
4,Random Forest,"{'max_depth': 7, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}",0.746479,0.634146,0.553191,0.741717
5,CatBoost,"{'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 500}",0.739437,0.613636,0.574468,0.737167
2,Logistic Regression,"{'max_iter': 100, 'penalty': 'l1', 'tol': 0.0001}",0.739437,0.625,0.531915,0.733583
6,XGBoost,"{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}",0.732394,0.609756,0.531915,0.727367
3,Decision Tree,"{'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 2}",0.725352,0.611111,0.468085,0.714668
0,kNN,"{'n_neighbors': 5, 'weights': 'uniform'}",0.704225,0.55814,0.510638,0.700702


In [22]:
# Same ranking as the full table, restricted to the metric columns.
display(
    res3.sort_values(by="f1_score", ascending=False)[
        ["classifier", "accuracy", "precision", "recall", "f1_score"]
    ]
)

Unnamed: 0,classifier,accuracy,precision,recall,f1_score
1,Naive Bayes,0.767606,0.592105,0.957447,0.774072
4,Random Forest,0.746479,0.634146,0.553191,0.741717
5,CatBoost,0.739437,0.613636,0.574468,0.737167
2,Logistic Regression,0.739437,0.625,0.531915,0.733583
6,XGBoost,0.732394,0.609756,0.531915,0.727367
3,Decision Tree,0.725352,0.611111,0.468085,0.714668
0,kNN,0.704225,0.55814,0.510638,0.700702


### Fourth strategy: selecting features

In [23]:
%%time
# Grid-search and test every classifier on the strategy-4 split.
res4 = evaluate(X_train4, X_test4, y_train4, y_test4)

kNN Classifier
Best Parameters:
{'n_neighbors': 7, 'weights': 'uniform'}
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.82      0.80        95
           1       0.60      0.53      0.56        47

    accuracy                           0.73       142
   macro avg       0.69      0.68      0.68       142
weighted avg       0.72      0.73      0.72       142

----------------

Naive Bayes Classifier
Best Parameters:
{}
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.75      0.79        95
           1       0.58      0.70      0.63        47

    accuracy                           0.73       142
   macro avg       0.71      0.72      0.71       142
weighted avg       0.75      0.73      0.74       142

----------------

Logistic Regression Classifier
Best Parameters:
{'max_iter': 100, 'penalty': 'l2', 'tol': 0.0001}
Classification Report:
              precision    

In [24]:
# Full strategy-4 results, highest test-set f1_score first.
display(res4.sort_values(by="f1_score", ascending=False))

Unnamed: 0,classifier,best_parameters,accuracy,precision,recall,f1_score
1,Naive Bayes,{},0.732394,0.578947,0.702128,0.737827
4,Random Forest,"{'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}",0.739437,0.613636,0.574468,0.737167
5,CatBoost,"{'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 500}",0.739437,0.613636,0.574468,0.737167
2,Logistic Regression,"{'max_iter': 100, 'penalty': 'l2', 'tol': 0.0001}",0.732394,0.604651,0.553191,0.729206
0,kNN,"{'n_neighbors': 7, 'weights': 'uniform'}",0.725352,0.595238,0.531915,0.721158
6,XGBoost,"{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}",0.71831,0.581395,0.531915,0.714954
3,Decision Tree,"{'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 2}",0.725352,0.611111,0.468085,0.714668


In [25]:
# Same ranking as the full table, restricted to the metric columns.
display(
    res4.sort_values(by="f1_score", ascending=False)[
        ["classifier", "accuracy", "precision", "recall", "f1_score"]
    ]
)

Unnamed: 0,classifier,accuracy,precision,recall,f1_score
1,Naive Bayes,0.732394,0.578947,0.702128,0.737827
4,Random Forest,0.739437,0.613636,0.574468,0.737167
5,CatBoost,0.739437,0.613636,0.574468,0.737167
2,Logistic Regression,0.732394,0.604651,0.553191,0.729206
0,kNN,0.725352,0.595238,0.531915,0.721158
6,XGBoost,0.71831,0.581395,0.531915,0.714954
3,Decision Tree,0.725352,0.611111,0.468085,0.714668
