In [1]:
!pip install xgboost
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import (
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
    classification_report,
    confusion_matrix,
)

### Dataset import

In [3]:
df_bin1 = pd.read_csv("baseline.csv", index_col=0)
df_bin2 = pd.read_csv("another_baseline.csv", index_col=0)

# Baseline

In [14]:
y = df_bin1['HeartDisease']
X = df_bin1.drop(columns=['HeartDisease'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [15]:
results = {
        "classifier": [],
        "accuracy": [],
        "precision": [],
        "recall": [],
        "f1_score": [],
}

In [16]:
knn = KNeighborsClassifier(n_jobs=-1)
param_knn = {"n_neighbors": [3, 5, 7], "weights": ["uniform", "distance"]}
grid_search = GridSearchCV(knn, param_knn, cv=5, scoring="f1", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(best_params)

best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)

results["classifier"].append('NB')
results["accuracy"].append(accuracy)
results["precision"].append(prec)
results["recall"].append(rec)
results["f1_score"].append(f1)

{'n_neighbors': 7, 'weights': 'uniform'}


In [17]:
nb = GaussianNB()
param_nb = {}
grid_search = GridSearchCV(nb, param_nb, cv=5, scoring="f1", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(best_params)

best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)

results["classifier"].append('NB')
results["accuracy"].append(accuracy)
results["precision"].append(prec)
results["recall"].append(rec)
results["f1_score"].append(f1)

{}


In [18]:
logreg = LogisticRegression(solver="saga", n_jobs=-1, random_state=42)
param_lr = {
        "penalty": [None, "l1", "l2"],
        "tol": [1e-4, 1e-3, 1e-2],
        "max_iter": [100, 500, 1000],
}
grid_search = GridSearchCV(logreg, param_lr, cv=5, scoring="f1", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(best_params)

best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)

results["classifier"].append('LogReg')
results["accuracy"].append(accuracy)
results["precision"].append(prec)
results["recall"].append(rec)
results["f1_score"].append(f1)

{'max_iter': 100, 'penalty': 'l1', 'tol': 0.01}


In [19]:
dt = DecisionTreeClassifier(random_state=42)
param_dt = {
        "max_depth": [3, 5, 7],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
}
grid_search = GridSearchCV(knn, param_knn, cv=5, scoring="f1", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(best_params)

best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)

results["classifier"].append('NB')
results["accuracy"].append(accuracy)
results["precision"].append(prec)
results["recall"].append(rec)
results["f1_score"].append(f1)

{'n_neighbors': 7, 'weights': 'uniform'}


In [20]:
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
param_rf = {
        "n_estimators": [50, 100, 500],
        "max_depth": [3, 5, 7],
        "min_samples_split": [2, 5, 7],
        "min_samples_leaf": [1, 2, 4],
}
grid_search = GridSearchCV(knn, param_knn, cv=5, scoring="f1", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(best_params)

best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)

results["classifier"].append('NB')
results["accuracy"].append(accuracy)
results["precision"].append(prec)
results["recall"].append(rec)
results["f1_score"].append(f1)

{'n_neighbors': 7, 'weights': 'uniform'}


In [21]:
cb = CatBoostClassifier(allow_writing_files=False, verbose=0, random_state=42)
param_cb = {
        "learning_rate": [1e-3, 1e-2, 1e-1],
        "n_estimators": [100, 500, 700],
        "max_depth": [3, 5, 7],
}
grid_search = GridSearchCV(knn, param_knn, cv=5, scoring="f1", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(best_params)

best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)

results["classifier"].append('NB')
results["accuracy"].append(accuracy)
results["precision"].append(prec)
results["recall"].append(rec)
results["f1_score"].append(f1)

{'n_neighbors': 7, 'weights': 'uniform'}


In [22]:
xgb = XGBClassifier(n_jobs=-1, random_state=42)
param_xgb = {
        "learning_rate": [1e-3, 1e-2, 1e-1],
        "n_estimators": [100, 500, 700],
        "max_depth": [3, 5, 7],
}
grid_search = GridSearchCV(knn, param_knn, cv=5, scoring="f1", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(best_params)

best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)

results["classifier"].append('NB')
results["accuracy"].append(accuracy)
results["precision"].append(prec)
results["recall"].append(rec)
results["f1_score"].append(f1)

{'n_neighbors': 7, 'weights': 'uniform'}


In [23]:
results = pd.DataFrame(results)
results.sort_values(by="f1_score", ascending=False)

Unnamed: 0,classifier,accuracy,precision,recall,f1_score
2,LogReg,0.880435,0.917197,0.878049,0.897196
0,NB,0.844203,0.890323,0.841463,0.865204
3,NB,0.844203,0.890323,0.841463,0.865204
4,NB,0.844203,0.890323,0.841463,0.865204
5,NB,0.844203,0.890323,0.841463,0.865204
6,NB,0.844203,0.890323,0.841463,0.865204
1,NB,0.612319,0.87013,0.408537,0.556017


# Another baseline

In [34]:
y = df_bin2['HeartDisease']
X = df_bin2.drop(columns=['HeartDisease'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [35]:
results = {
        "classifier": [],
        "accuracy": [],
        "precision": [],
        "recall": [],
        "f1_score": [],
}

In [36]:
knn = KNeighborsClassifier(n_jobs=-1)
param_knn = {"n_neighbors": [3, 5, 7], "weights": ["uniform", "distance"]}
grid_search = GridSearchCV(knn, param_knn, cv=5, scoring="f1", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(best_params)

best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)

results["classifier"].append('NB')
results["accuracy"].append(accuracy)
results["precision"].append(prec)
results["recall"].append(rec)
results["f1_score"].append(f1)

{'n_neighbors': 7, 'weights': 'uniform'}


In [37]:
nb = GaussianNB()
param_nb = {}
grid_search = GridSearchCV(nb, param_nb, cv=5, scoring="f1", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(best_params)

best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)

results["classifier"].append('NB')
results["accuracy"].append(accuracy)
results["precision"].append(prec)
results["recall"].append(rec)
results["f1_score"].append(f1)

{}


In [38]:
logreg = LogisticRegression(solver="saga", n_jobs=-1, random_state=42)
param_lr = {
        "penalty": [None, "l1", "l2"],
        "tol": [1e-4, 1e-3, 1e-2],
        "max_iter": [100, 500, 1000],
}
grid_search = GridSearchCV(logreg, param_lr, cv=5, scoring="f1", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(best_params)

best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)

results["classifier"].append('LogReg')
results["accuracy"].append(accuracy)
results["precision"].append(prec)
results["recall"].append(rec)
results["f1_score"].append(f1)

{'max_iter': 100, 'penalty': 'l1', 'tol': 0.01}


In [39]:
dt = DecisionTreeClassifier(random_state=42)
param_dt = {
        "max_depth": [3, 5, 7],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
}
grid_search = GridSearchCV(knn, param_knn, cv=5, scoring="f1", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(best_params)

best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)

results["classifier"].append('NB')
results["accuracy"].append(accuracy)
results["precision"].append(prec)
results["recall"].append(rec)
results["f1_score"].append(f1)

{'n_neighbors': 7, 'weights': 'uniform'}


In [40]:
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
param_rf = {
        "n_estimators": [50, 100, 500],
        "max_depth": [3, 5, 7],
        "min_samples_split": [2, 5, 7],
        "min_samples_leaf": [1, 2, 4],
}
grid_search = GridSearchCV(knn, param_knn, cv=5, scoring="f1", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(best_params)

best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)

results["classifier"].append('NB')
results["accuracy"].append(accuracy)
results["precision"].append(prec)
results["recall"].append(rec)
results["f1_score"].append(f1)

{'n_neighbors': 7, 'weights': 'uniform'}


In [41]:
cb = CatBoostClassifier(allow_writing_files=False, verbose=0, random_state=42)
param_cb = {
        "learning_rate": [1e-3, 1e-2, 1e-1],
        "n_estimators": [100, 500, 700],
        "max_depth": [3, 5, 7],
}
grid_search = GridSearchCV(knn, param_knn, cv=5, scoring="f1", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(best_params)

best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)

results["classifier"].append('NB')
results["accuracy"].append(accuracy)
results["precision"].append(prec)
results["recall"].append(rec)
results["f1_score"].append(f1)

{'n_neighbors': 7, 'weights': 'uniform'}


In [42]:
xgb = XGBClassifier(n_jobs=-1, random_state=42)
param_xgb = {
        "learning_rate": [1e-3, 1e-2, 1e-1],
        "n_estimators": [100, 500, 700],
        "max_depth": [3, 5, 7],
}
grid_search = GridSearchCV(knn, param_knn, cv=5, scoring="f1", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(best_params)

best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)

results["classifier"].append('NB')
results["accuracy"].append(accuracy)
results["precision"].append(prec)
results["recall"].append(rec)
results["f1_score"].append(f1)

{'n_neighbors': 7, 'weights': 'uniform'}


In [43]:
results = pd.DataFrame(results)
results.sort_values(by="f1_score", ascending=False)

Unnamed: 0,classifier,accuracy,precision,recall,f1_score
2,LogReg,0.884058,0.923077,0.878049,0.9
0,NB,0.869565,0.905063,0.871951,0.888199
3,NB,0.869565,0.905063,0.871951,0.888199
4,NB,0.869565,0.905063,0.871951,0.888199
5,NB,0.869565,0.905063,0.871951,0.888199
6,NB,0.869565,0.905063,0.871951,0.888199
1,NB,0.721014,0.914286,0.585366,0.713755
