In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Read the reduced (feature-selected) dataset from disk.
df = pd.read_csv("../data/heart_disease_selected_features.csv")

# "num" is the label column (swap in "target" if the CSV uses that name).
# Any value above 0 becomes the positive class (1); everything else is 0.
# All remaining columns are used as features.
y = df["num"].gt(0).astype(int)
X = df.drop("num", axis=1)

# Hold out 20% of the rows for testing, stratified on the binary label,
# with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [2]:
# Hyper-parameter grid for logistic regression: regularisation strength (C)
# crossed with the L1/L2 penalties, using the "liblinear" solver throughout.
param_grid_lr = {
    "C": [0.01, 0.1, 1, 10],
    "penalty": ["l1", "l2"],
    "solver": ["liblinear"]
}

# random_state is pinned (as the tree and forest searches in this notebook do)
# because liblinear shuffles the data internally; without a seed, repeated
# runs are not guaranteed to give the same fit.
grid_lr = GridSearchCV(
    LogisticRegression(max_iter=2000, random_state=42),
    param_grid_lr,
    cv=5,
    scoring="f1",
)
grid_lr.fit(X_train, y_train)

print("Best Logistic Regression params:", grid_lr.best_params_)


Best Logistic Regression params: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}


In [3]:
# Search space for the decision tree: depth cap, minimum samples needed to
# split a node, and the impurity criterion.
param_grid_dt = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    criterion=["gini", "entropy"],
)

# Exhaustive search over the grid with 5-fold CV, ranking candidates by F1.
grid_dt = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid_dt,
    scoring="f1",
    cv=5,
)
grid_dt.fit(X_train, y_train)

print("Best Decision Tree params:", grid_dt.best_params_)


Best Decision Tree params: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 10}


In [4]:
# Candidate values the randomized search samples from for the random forest.
param_dist_rf = dict(
    n_estimators=[100, 200, 300, 500],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Evaluate 20 sampled configurations with 5-fold CV, ranked by F1.
# Both the sampler and the forest are seeded with 42; n_jobs=-1 uses all cores.
random_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist_rf,
    n_iter=20,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_rf.fit(X_train, y_train)

print("Best Random Forest params:", random_rf.best_params_)


Best Random Forest params: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': None, 'bootstrap': False}


In [5]:
# Search space for the support-vector classifier: regularisation strength,
# kernel type and kernel coefficient.
param_grid_svm = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf"],
    "gamma": ["scale", "auto"]
}

# probability=True enables predict_proba, which the evaluation cell uses to
# compute AUC. Those probability estimates rely on an internal, shuffled
# cross-validation, so random_state is pinned to keep them (and the reported
# AUC) repeatable across runs.
grid_svm = GridSearchCV(
    SVC(probability=True, random_state=42),
    param_grid_svm,
    cv=5,
    scoring="f1",
)
grid_svm.fit(X_train, y_train)

print("Best SVM params:", grid_svm.best_params_)


Best SVM params: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}


In [6]:
# Best estimator found by each hyper-parameter search, keyed by display name.
best_models = {
    "Logistic Regression": grid_lr.best_estimator_,
    "Decision Tree": grid_dt.best_estimator_,
    "Random Forest": random_rf.best_estimator_,
    "SVM": grid_svm.best_estimator_,
}

# Score every tuned model on the held-out test split.
results = {}
for name, model in best_models.items():
    y_pred = model.predict(X_test)

    # AUC needs a score for the positive class (column 1 of predict_proba);
    # estimators that do not expose predict_proba get None instead.
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_prob)
    else:
        y_prob = None
        auc = None

    results[name] = dict(
        Accuracy=accuracy_score(y_test, y_pred),
        Precision=precision_score(y_test, y_pred),
        Recall=recall_score(y_test, y_pred),
        F1=f1_score(y_test, y_pred),
        AUC=auc,
    )

# One row per model, one column per metric.
pd.DataFrame(results).T


Unnamed: 0,Accuracy,Precision,Recall,F1,AUC
Logistic Regression,0.836066,0.8,0.857143,0.827586,0.926407
Decision Tree,0.754098,0.69697,0.821429,0.754098,0.85119
Random Forest,0.901639,0.84375,0.964286,0.9,0.957792
SVM,0.852459,0.806452,0.892857,0.847458,0.942641


In [7]:
import joblib
import os

os.makedirs("../models", exist_ok=True)

# Pick the model with the highest F1 score recorded in `results`.
# NOTE(review): `results` holds scores computed on the test split, so the
# winner is chosen on the same data used to report its performance — consider
# ranking by each search's cross-validated score instead.
best_model_name = max(results, key=lambda x: results[x]["F1"])
best_model = best_models[best_model_name]

# Keep the output path in one place so the file written and the message agree.
model_path = "../models/final_model.pkl"
joblib.dump(best_model, model_path)
print(f"Saved best model ({best_model_name}) → {model_path}")


Saved best model (Random Forest) → ../models/final_model.pkl


In [8]:
import os

# Ensure results directory exists
os.makedirs("../results", exist_ok=True)

# Save evaluation metrics: one section per model, one line per metric.
# A metric can be None (the evaluation cell stores AUC as None for estimators
# without predict_proba). None cannot be formatted with ".4f", so such entries
# are written as "N/A" instead of raising TypeError.
with open("../results/evaluation_metrics.txt", "w") as f:
    f.write("Model Evaluation Metrics\n")
    f.write("========================\n\n")
    for model, metrics in results.items():
        f.write(f"{model}:\n")
        for metric, value in metrics.items():
            shown = "N/A" if value is None else f"{value:.4f}"
            f.write(f"  {metric}: {shown}\n")
        f.write("\n")

print("Saved evaluation metrics → ../results/evaluation_metrics.txt")


Saved evaluation metrics → ../results/evaluation_metrics.txt
