In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import joblib

In [28]:
data = pd.read_csv('../data/processed/heart.csv')

In [29]:
X = data.drop('HeartDisease', axis=1)
y = data['HeartDisease']

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

In [31]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [32]:
models = {
    "Logistic Regression": (
        LogisticRegression(max_iter=1000),
        {
            "C": [0.01, 0.1, 1, 10],
            "solver": ["liblinear", "lbfgs"]
        }
    ),
    "KNN": (
        KNeighborsClassifier(),
        {
            "n_neighbors": [3, 5, 7, 9, 11],
            "weights": ["uniform", "distance"],
            "p": [1, 2]   # 1=Manhattan, 2=Euclidean
        }
    ),
    "Naive Bayes": (
        GaussianNB(),
        {
            "var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]
        }
    ),
    "Decision Tree": (
        DecisionTreeClassifier(),
        {
            "max_depth": [None, 5, 10, 20],
            "criterion": ["gini", "entropy"],
            "min_samples_split": [2, 5, 10]
        }
    ),
    "SVM": (
        SVC(probability=True),
        {
            "C": [0.1, 1, 10],
            "kernel": ["linear", "rbf", "poly"],
            "gamma": ["scale", "auto"]
        }
    ),
    "Random Forest": (
        RandomForestClassifier(),
        {
            "n_estimators": [50, 100, 200],
            "max_depth": [None, 5, 10, 20],
            "min_samples_split": [2, 5, 10]
        }
    )
}

In [33]:
results = []

for name, (model, params) in models.items():
    print(f"\n Running GridSearchCV for {name}...")
    
    grid = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring="accuracy",
        cv=5,
        n_jobs=-1,
        verbose=0
    )
    
    grid.fit(X_train_scaled, y_train)
    
    best_model = grid.best_estimator_
    
    y_pred = best_model.predict(X_test_scaled)
    y_proba = best_model.predict_proba(X_test_scaled)[:, 1] if hasattr(best_model, "predict_proba") else None
    
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba) if y_proba is not None else None
    
    results.append({
        "Model": name,
        "Best Params": grid.best_params_,
        "CV Best Score": round(grid.best_score_, 4),
        "Test Accuracy": round(acc, 4),
        "Precision": round(precision, 4),
        "Recall": round(recall, 4),
        "F1 Score": round(f1, 4),
        "ROC-AUC": round(roc_auc, 4) if roc_auc is not None else "N/A"
    })


 Running GridSearchCV for Logistic Regression...

 Running GridSearchCV for KNN...

 Running GridSearchCV for Naive Bayes...

 Running GridSearchCV for Decision Tree...

 Running GridSearchCV for SVM...

 Running GridSearchCV for Random Forest...


In [34]:
results_df = pd.DataFrame(results)
print("\n Final Results with Medical Metrics:")
print(results_df)


 Final Results with Medical Metrics:
                 Model                                        Best Params  \
0  Logistic Regression                     {'C': 0.01, 'solver': 'lbfgs'}   
1                  KNN   {'n_neighbors': 7, 'p': 1, 'weights': 'uniform'}   
2          Naive Bayes                           {'var_smoothing': 1e-09}   
3        Decision Tree  {'criterion': 'entropy', 'max_depth': 5, 'min_...   
4                  SVM         {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}   
5        Random Forest  {'max_depth': 20, 'min_samples_split': 10, 'n_...   

   CV Best Score  Test Accuracy  Precision  Recall  F1 Score  ROC-AUC  
0         0.8597         0.9130     0.9057  0.9412    0.9231   0.9488  
1         0.8570         0.9130     0.9057  0.9412    0.9231   0.9459  
2         0.8488         0.9130     0.9135  0.9314    0.9223   0.9519  
3         0.8352         0.8913     0.8868  0.9216    0.9038   0.9252  
4         0.8584         0.9239     0.9074  0.9608    0.9333  

In [35]:
best_model_row = results_df.loc[results_df["Recall"].idxmax()]
print("\n Best Model (Highest Recall):")
print(best_model_row)


 Best Model (Highest Recall):
Model                                                   SVM
Best Params      {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}
CV Best Score                                        0.8584
Test Accuracy                                        0.9239
Precision                                            0.9074
Recall                                               0.9608
F1 Score                                             0.9333
ROC-AUC                                              0.9472
Name: 4, dtype: object


In [36]:
best_params = {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 50}

from sklearn.ensemble import RandomForestClassifier

final_model = RandomForestClassifier(**best_params, random_state=42)
final_model.fit(X_train_scaled, y_train)

y_pred = final_model.predict(X_test_scaled)
y_proba = final_model.predict_proba(X_test_scaled)[:,1]

print("Final Test Accuracy:", accuracy_score(y_test, y_pred))
print("Final Test Recall:", recall_score(y_test, y_pred))
print("Final Test ROC-AUC:", roc_auc_score(y_test, y_proba))

Final Test Accuracy: 0.907608695652174
Final Test Recall: 0.9313725490196079
Final Test ROC-AUC: 0.9467957914873267


In [37]:
final_model.fit(X_train_scaled, y_train)

0,1,2
,n_estimators,50
,criterion,'gini'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [38]:
joblib.dump({"model": final_model, "scaler": scaler}, "../data/processed/heart_model.pkl")

['../data/processed/heart_model.pkl']