In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb

In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("alexteboul/heart-disease-health-indicators-dataset")

  from .autonotebook import tqdm as notebook_tqdm


Downloading to /home/cloud/.cache/kagglehub/datasets/alexteboul/heart-disease-health-indicators-dataset/3.archive...


100%|██████████| 2.66M/2.66M [00:01<00:00, 1.45MB/s]

Extracting files...





In [4]:
df = pd.read_csv(os.path.join(path,'heart_disease_health_indicators_BRFSS2015.csv'))
columns = df.columns.str.strip()
print(columns)

Index(['HeartDiseaseorAttack', 'HighBP', 'HighChol', 'CholCheck', 'BMI',
       'Smoker', 'Stroke', 'Diabetes', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   HeartDiseaseorAttack  253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   Diabetes              253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [6]:
weights_and_biases = {}
scaler_lr = StandardScaler()
df = pd.read_csv(os.path.join(path,'heart_disease_health_indicators_BRFSS2015.csv'))
Y = df['HeartDiseaseorAttack']
X = df.drop(columns=['HeartDiseaseorAttack'])

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
X_train = scaler_lr.fit_transform(X_train)
X_test = scaler_lr.transform(X_test)

# Logistic Regression Classifier

In [7]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, Y_train)

Y_pred_lr = lr.predict(X_test)
accuracy_lr = accuracy_score(Y_test, Y_pred_lr)
f1_lr = f1_score(Y_test, Y_pred_lr)
precision_lr = precision_score(Y_test, Y_pred_lr)
recall_lr = recall_score(Y_test, Y_pred_lr)
mcc_lr = matthews_corrcoef(Y_test, Y_pred_lr)
auc_lr = roc_auc_score(Y_test, Y_pred_lr)

print(f"Accuracy: {accuracy_lr}")
print(f"F1 Score: {f1_lr}")
print(f"Precision: {precision_lr}")
print(f"Recall: {recall_lr}")
print(f"MCC: {mcc_lr}")
print(f"AUC: {auc_lr}")

Accuracy: 0.90740302743614
F1 Score: 0.20264765784114053
Precision: 0.5227670753064798
Recall: 0.1256842105263158
MCC: 0.22355239867280188
AUC: 0.556916388740738


# Decision Tree Classifier

In [8]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)

Y_pred_dt = lr.predict(X_test)
accuracy_dt = accuracy_score(Y_test, Y_pred_dt)
f1_dt = f1_score(Y_test, Y_pred_dt)
precision_dt = precision_score(Y_test, Y_pred_dt)
recall_dt = recall_score(Y_test, Y_pred_dt)
mcc_dt = matthews_corrcoef(Y_test, Y_pred_dt)
auc_dt = roc_auc_score(Y_test, Y_pred_dt)

print(f"Accuracy: {accuracy_dt}")
print(f"F1 Score: {f1_dt}")
print(f"Precision: {precision_dt}")
print(f"Recall: {recall_dt}")
print(f"MCC: {mcc_dt}")
print(f"AUC: {auc_dt}")

Accuracy: 0.90740302743614
F1 Score: 0.20264765784114053
Precision: 0.5227670753064798
Recall: 0.1256842105263158
MCC: 0.22355239867280188
AUC: 0.556916388740738


# K-Nearest Neighbor Classifier

In [9]:
knn_classifier = KNeighborsClassifier()

knn_classifier.fit(X_train, Y_train)
Y_pred_knn = knn_classifier.predict(X_test)

accuracy_knn = accuracy_score(Y_test, Y_pred_knn)
f1_knn = f1_score(Y_test, Y_pred_knn)
precision_knn = precision_score(Y_test, Y_pred_knn)
recall_knn = recall_score(Y_test, Y_pred_knn)
mcc_knn = matthews_corrcoef(Y_test, Y_pred_knn)
auc_knn = roc_auc_score(Y_test, Y_pred_knn)

print(f"Accuracy: {accuracy_knn}")
print(f"F1 Score: {f1_knn}")
print(f"Precision: {precision_knn}")
print(f"Recall: {recall_knn}")
print(f"MCC: {mcc_knn}")
print(f"AUC: {auc_knn}")

Exception ignored on calling ctypes callback function: <function ThreadpoolController._find_libraries_with_dl_iterate_phdr.<locals>.match_library_callback at 0x7f2dad96c680>
Traceback (most recent call last):
  File "/home/cloud/anaconda3/lib/python3.12/site-packages/threadpoolctl.py", line 1005, in match_library_callback
    self._make_controller_from_path(filepath)
  File "/home/cloud/anaconda3/lib/python3.12/site-packages/threadpoolctl.py", line 1187, in _make_controller_from_path
    lib_controller = controller_class(
                     ^^^^^^^^^^^^^^^^^
  File "/home/cloud/anaconda3/lib/python3.12/site-packages/threadpoolctl.py", line 114, in __init__
    self.dynlib = ctypes.CDLL(filepath, mode=_RTLD_NOLOAD)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cloud/anaconda3/lib/python3.12/ctypes/__init__.py", line 379, in __init__
    self._handle = _dlopen(self._name, mode)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^
OSError: /home/cloud/anaconda3/lib/py

Accuracy: 0.8960895616524756
F1 Score: 0.2095952023988006
Precision: 0.3640625
Recall: 0.1471578947368421
MCC: 0.18411875735448927
AUC: 0.56030316778333


# Naive Bayes Classifier

In [10]:
nb_classifier = GaussianNB()

nb_classifier.fit(X_train, Y_train)
Y_pred_nb = nb_classifier.predict(X_test)

accuracy_nb = accuracy_score(Y_test, Y_pred_nb)
f1_nb = f1_score(Y_test, Y_pred_nb)
precision_nb = precision_score(Y_test, Y_pred_nb)
recall_nb = recall_score(Y_test, Y_pred_nb)
mcc_nb = matthews_corrcoef(Y_test, Y_pred_nb)
auc_nb = roc_auc_score(Y_test, Y_pred_nb)

print(f"Accuracy: {accuracy_nb}")
print(f"F1 Score: {f1_nb}")
print(f"Precision: {precision_nb}")
print(f"Recall: {recall_nb}")
print(f"MCC: {mcc_nb}")
print(f"AUC: {auc_nb}")


Accuracy: 0.8188071586250394
F1 Score: 0.3563677098648743
Precision: 0.26696737648169516
Recall: 0.5357894736842105
MCC: 0.2862328852330302
AUC: 0.6919150908628942


#  Random Forest

In [11]:
rf_classifier = RandomForestClassifier()

rf_classifier.fit(X_train, Y_train)
Y_pred_rf = rf_classifier.predict(X_test)

accuracy_rf = accuracy_score(Y_test, Y_pred_rf)
f1_rf = f1_score(Y_test, Y_pred_rf)
precision_rf = precision_score(Y_test, Y_pred_rf)
recall_rf = recall_score(Y_test, Y_pred_rf)
mcc_rf = matthews_corrcoef(Y_test, Y_pred_rf)
auc_rf = roc_auc_score(Y_test, Y_pred_rf)

print(f"Accuracy: {accuracy_rf}")
print(f"F1 Score: {f1_rf}")
print(f"Precision: {precision_rf}")
print(f"Recall: {recall_rf}")
print(f"MCC: {mcc_rf}")
print(f"AUC: {auc_rf}")

Accuracy: 0.9028697571743929
F1 Score: 0.1778445111778445
Precision: 0.4284565916398714
Recall: 0.11221052631578947
MCC: 0.18223414643916253
AUC: 0.5483746494928663


# XGBoost

In [12]:
xgb_classifier = xgb.XGBClassifier()

xgb_classifier.fit(X_train, Y_train)
Y_pred_xgb = xgb_classifier.predict(X_test)

accuracy_xgb = accuracy_score(Y_test, Y_pred_xgb)
f1_xgb = f1_score(Y_test, Y_pred_xgb)
precision_xgb = precision_score(Y_test, Y_pred_xgb)
recall_xgb = recall_score(Y_test, Y_pred_xgb)
mcc_xgb = matthews_corrcoef(Y_test, Y_pred_xgb)
auc_xgb = roc_auc_score(Y_test, Y_pred_xgb)

print(f"Accuracy: {accuracy_xgb}")
print(f"F1 Score: {f1_xgb}")
print(f"Precision: {precision_xgb}")
print(f"Recall: {recall_xgb}")
print(f"MCC: {mcc_xgb}")
print(f"AUC: {auc_xgb}")

Accuracy: 0.9067131819615263
F1 Score: 0.18635035241533437
Precision: 0.5079662605435802
Recall: 0.11410526315789474
MCC: 0.20847671299953452
AUC: 0.5513443725435887


In [19]:
import joblib

weights_and_biases = {
    "Logistic Regression": lr,
    "Decision Tree": decision_tree,
    "KNN": knn_classifier,
    "Naive Bayes": nb_classifier,
    "Random Forest": rf_classifier,
    "XGBoost": xgb_classifier
}

with open("./all_models.joblib", "wb") as f:
    joblib.dump(weights_and_biases, f, compress=('lzma', 3))

with open("model/logistic_regression.joblib", "wb") as f:
    joblib.dump(lr, f)
with open("model/decision_tree.joblib", "wb") as f:
    joblib.dump(decision_tree, f)
with open("model/knn_classifier.joblib", "wb") as f:
    joblib.dump(knn_classifier, f)
with open("model/rf_classifier.joblib", "wb") as f:
    joblib.dump(rf_classifier, f, compress=('lzma', 3))
with open("model/xgb_classifier.joblib", "wb") as f:
    joblib.dump(xgb_classifier, f)
with open("model/nb_classifier.joblib", "wb") as f:
    joblib.dump(nb_classifier, f)