In [2]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd

# Kaggle coordinates of the data: the dataset handle and the CSV inside it.
dataset_handle = "ketangangal/heart-disease-dataset-uci"
file_path = "HeartDiseaseTrain-Test.csv"

# Load the CSV through kagglehub using its pandas adapter.
data = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, dataset_handle, file_path)

# Wrap the loaded object in a DataFrame and take a first look at the rows
# and at the full list of column names.
df = pd.DataFrame(data)
print(df.head())
print(df.columns.tolist())


Using Colab cache for faster access to the 'heart-disease-dataset-uci' dataset.
   age     sex chest_pain_type  resting_blood_pressure  cholestoral  \
0   52    Male  Typical angina                     125          212   
1   53    Male  Typical angina                     140          203   
2   70    Male  Typical angina                     145          174   
3   61    Male  Typical angina                     148          203   
4   62  Female  Typical angina                     138          294   

      fasting_blood_sugar               rest_ecg  Max_heart_rate  \
0    Lower than 120 mg/ml  ST-T wave abnormality             168   
1  Greater than 120 mg/ml                 Normal             155   
2    Lower than 120 mg/ml  ST-T wave abnormality             125   
3    Lower than 120 mg/ml  ST-T wave abnormality             161   
4  Greater than 120 mg/ml  ST-T wave abnormality             106   

  exercise_induced_angina  oldpeak        slope vessels_colored_by_flourosopy  \
0  

In [3]:
# Show the column index of the loaded frame (names plus the index dtype).
column_index = df.columns
print(column_index)


Index(['age', 'sex', 'chest_pain_type', 'resting_blood_pressure',
       'cholestoral', 'fasting_blood_sugar', 'rest_ecg', 'Max_heart_rate',
       'exercise_induced_angina', 'oldpeak', 'slope',
       'vessels_colored_by_flourosopy', 'thalassemia', 'target'],
      dtype='object')


In [4]:
# (rows, columns) of the loaded frame.
frame_shape = df.shape
print("Shape:", frame_shape)


Shape: (1025, 14)


In [5]:
# Every column except the single label column counts as a feature.
num_features = len(df.columns) - 1
print("Number of features:", num_features)


Number of features: 13


In [6]:
from sklearn.preprocessing import LabelEncoder

# Encode on a copy so the original, human-readable frame `df` is left intact.
df_encoded = df.copy()

# Fitted encoders, keyed by column name. They are kept (instead of being
# thrown away on each iteration) so that the exact string -> integer mapping
# used for training can later be applied to new raw records via
# `label_encoders[col].transform(...)` or reversed via `.inverse_transform(...)`.
label_encoders = {}

# Replace every object-dtype column by integer codes, one encoder per column.
for col in df_encoded.select_dtypes(include='object').columns:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le


In [7]:
# Separate the predictors from the label column.
target_column = "target"
X = df_encoded.drop(columns=[target_column])
y = df_encoded[target_column]


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed seed so it can be repeated.
split_kwargs = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)


In [9]:
# Re-attach the labels to the held-out features (on a new frame, X_test itself
# is not modified) and write the result to disk without the index column.
test_df = X_test.assign(target=y_test.values)
test_df.to_csv("test_data_encoded.csv", index=False)

In [19]:
from google.colab import files
# Hand the exported test set to Colab's download helper.
exported_csv = "test_data_encoded.csv"
files.download(exported_csv)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Candidate classifiers keyed by the display name used in the results table
# and (lower-cased, underscored) in the saved file names.
models = dict([
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("KNN", KNeighborsClassifier()),
    ("Naive Bayes", GaussianNB()),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("XGBoost", XGBClassifier(eval_metric='logloss', random_state=42)),
])

# Fit every candidate on the same training partition.
for name in models:
    model = models[name]
    model.fit(X_train, y_train)


In [11]:
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef

# Metric table: (column name, scoring function, whether the function is fed
# the positive-class probability instead of the hard prediction).
metric_specs = [
    ("Accuracy", accuracy_score, False),
    ("AUC", roc_auc_score, True),
    ("Precision", precision_score, False),
    ("Recall", recall_score, False),
    ("F1", f1_score, False),
    ("MCC", matthews_corrcoef, False),
]

# NOTE(review): the recorded run reports 1.000 on every metric for Random
# Forest and XGBoost. Scores that perfect on a held-out set often mean rows are
# shared between train and test -- check `df.duplicated().sum()` before
# trusting these numbers. TODO confirm.
results = []

for name, model in models.items():
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for the AUC computation.
    y_prob = model.predict_proba(X_test)[:, 1]

    row = {"Model": name}
    for metric_name, metric_fn, uses_probability in metric_specs:
        row[metric_name] = metric_fn(y_test, y_prob if uses_probability else y_pred)
    results.append(row)

import pandas as pd
# One row per model, one column per metric.
results_df = pd.DataFrame(results)
print(results_df)


                 Model  Accuracy       AUC  Precision    Recall        F1  \
0  Logistic Regression  0.843902  0.931333   0.823009  0.885714  0.853211   
1        Decision Tree  0.985366  0.985714   1.000000  0.971429  0.985507   
2                  KNN  0.697561  0.833524   0.712871  0.685714  0.699029   
3          Naive Bayes  0.843902  0.913524   0.828829  0.876190  0.851852   
4        Random Forest  1.000000  1.000000   1.000000  1.000000  1.000000   
5              XGBoost  1.000000  1.000000   1.000000  1.000000  1.000000   

        MCC  
0  0.689136  
1  0.971151  
2  0.395639  
3  0.688357  
4  1.000000  
5  1.000000  


In [12]:
import joblib
import os

# Make sure the output folder exists; re-running the cell must not fail.
os.makedirs("model", exist_ok=True)

# Persist each fitted estimator under model/<display name, lower-cased, with
# spaces turned into underscores>.pkl
for name, model in models.items():
    stem = "_".join(name.lower().split(" "))
    filename = stem + ".pkl"
    joblib.dump(model, f"model/{filename}")


In [13]:
import joblib


In [14]:
import pickle


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Fit one stand-alone estimator per algorithm; these named objects are the
# ones pickled to the working directory further down.

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# random_state is pinned to the same seed as the "Decision Tree" entry of the
# `models` dict. Without it the fitted tree can differ from run to run, so the
# pickled tree would neither be reproducible nor match the tree that was
# evaluated.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

nb = GaussianNB()
nb.fit(X_train, y_train)

# NOTE(review): this forest uses n_estimators=200, whereas the "Random Forest"
# entry of `models` (the one that was scored) keeps the library default, so the
# metrics table does not describe this exact model. Confirm which
# configuration is intended.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

from xgboost import XGBClassifier

xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)

In [16]:
# NOTE(review): this cell repeats the Naive Bayes / Random Forest / XGBoost
# fits of the preceding cell with identical settings; it only rebinds the same
# three names.
nb = GaussianNB().fit(X_train, y_train)

rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)

from xgboost import XGBClassifier
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)


In [17]:
import joblib
import os

# Target file name -> fitted estimator, written to the current directory.
artifacts = {
    "logistic_regression.pkl": lr,
    "decision_tree.pkl": dt,
    "knn.pkl": knn,
    "naive_bayes.pkl": nb,
    "random_forest.pkl": rf,
    "xgboost.pkl": xgb,
}

for pkl_name, estimator in artifacts.items():
    joblib.dump(estimator, pkl_name)

# List the working directory so the written files can be checked by eye.
os.listdir()



['.config',
 'knn.pkl',
 'naive_bayes.pkl',
 'xgboost.pkl',
 'random_forest.pkl',
 'test_data_encoded.csv',
 'logistic_regression.pkl',
 'model',
 'decision_tree.pkl',
 'sample_data']

In [18]:
from google.colab import files

# Pass each pickled model, in turn, to Colab's download helper.
pickle_names = (
    "logistic_regression.pkl",
    "decision_tree.pkl",
    "knn.pkl",
    "naive_bayes.pkl",
    "random_forest.pkl",
    "xgboost.pkl",
)
for pickle_name in pickle_names:
    files.download(pickle_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>