In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import numpy as np


In [2]:
# Load the cleaned Cleveland heart-disease table and separate the feature
# columns from the "target" label column.
df = pd.read_csv("data/heart_disease_cleveland_clean.csv")

X = df.drop("target", axis=1)
y = df["target"]

# Hold out 20% of the rows for the final evaluation. The split is stratified
# on y and seeded so that it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# NOTE(review): joblib.load unpickles arbitrary objects -- only load artifacts
# that come from a trusted source.
# NOTE(review): hard-coded absolute path; consider a path relative to the
# project root, as is already done for the CSV above.
loaded_artifact = joblib.load(r"C:\Users\moham\miniconda3\envs\heart_ml\models\final_pipeline.pkl")

# The save cell at the end of this notebook writes the tuned
# Pipeline([("pre", ...), ("clf", ...)]) back to this very file. On a second
# Restart & Run All the loaded object is therefore that full pipeline, which
# ends in a classifier and cannot serve as an intermediate (transforming)
# pipeline step. In that case, pull the preprocessing step back out so the
# notebook stays re-runnable.
is_tuned_pipeline = (
    hasattr(loaded_artifact, "named_steps")
    and "pre" in loaded_artifact.named_steps
    and hasattr(loaded_artifact, "predict")
)
preprocessor = loaded_artifact.named_steps["pre"] if is_tuned_pipeline else loaded_artifact


In [3]:
from sklearn.pipeline import Pipeline

# Model pipeline: the loaded preprocessor feeds a seeded random forest.
# The step names ("pre", "clf") are the prefixes used by the search spaces.
pipeline_steps = [
    ("pre", preprocessor),
    ("clf", RandomForestClassifier(random_state=42)),
]
pipe = Pipeline(steps=pipeline_steps)


In [4]:
# Exhaustive search space for the "clf" (random forest) step of the pipeline.
# Each keyword is "<step name>__<hyperparameter>".
param_grid = dict(
    clf__n_estimators=[100, 200, 400],
    clf__max_depth=[None, 5, 10, 20],
    clf__min_samples_split=[2, 5, 10],
    clf__min_samples_leaf=[1, 2, 4],
)


In [5]:
# Exhaustive search over param_grid with cv=5 cross-validation on the training
# split, scored by ROC AUC. fit() returns the search object itself, so
# construction and fitting are chained.
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV AUC:", grid.best_score_)


Best params: {'clf__max_depth': None, 'clf__min_samples_leaf': 2, 'clf__min_samples_split': 10, 'clf__n_estimators': 100}
Best CV AUC: 0.9106127206127207


In [6]:
# Wider candidate pools for the randomized search over the "clf" step.
# The numeric ranges are NumPy arrays, so sampled values are NumPy integers.
param_dist = dict(
    clf__n_estimators=np.arange(100, 1000, 100),
    clf__max_depth=[None, 5, 10, 20, 30, 40, 50],
    clf__min_samples_split=np.arange(2, 20, 2),
    clf__min_samples_leaf=np.arange(1, 10),
)

# Randomized search: 20 sampled configurations from param_dist, each scored by
# ROC AUC with cv=5 cross-validation. The sampler is seeded for reproducibility.
random_search = RandomizedSearchCV(
    pipe,
    param_dist,
    n_iter=20,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

print("Best params (Randomized):", random_search.best_params_)
print("Best CV AUC (Randomized):", random_search.best_score_)


Best params (Randomized): {'clf__n_estimators': np.int64(400), 'clf__min_samples_split': np.int64(4), 'clf__min_samples_leaf': np.int64(4), 'clf__max_depth': 30}
Best CV AUC (Randomized): 0.9062777222777223


In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Evaluate the best estimator found by the exhaustive grid search on the
# held-out test split.
best_model = grid.best_estimator_

# Column 1 of predict_proba is used as the positive-class score
# (presumably class label 1 -- confirm against best_model.classes_).
y_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

print("Test AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))


Test AUC: 0.9542410714285714
              precision    recall  f1-score   support

           0       0.81      0.91      0.85        32
           1       0.88      0.75      0.81        28

    accuracy                           0.83        60
   macro avg       0.84      0.83      0.83        60
weighted avg       0.84      0.83      0.83        60



In [8]:
# Persist the tuned pipeline with joblib.
# NOTE(review): this is the same file the notebook loads as `preprocessor`
# near the top, so saving here replaces that input artifact with the full
# tuned pipeline. The printed message shows a relative path while the write
# goes to the absolute path below.
tuned_model_path = r"C:\Users\moham\miniconda3\envs\heart_ml\models\final_pipeline.pkl"
joblib.dump(best_model, tuned_model_path)
print("Saved tuned best model to models/final_pipeline.pkl")


Saved tuned best model to models/final_pipeline.pkl


In [9]:
# Append the grid-search summary to the shared metrics file. Mode "a" keeps
# whatever is already in the file, so each run adds another section.
metrics_path = r"C:\Users\moham\miniconda3\envs\heart_ml\results\evaluation_metrics.txt"
tuning_report = (
    "\n--- Hyperparameter Tuning ---\n"
    f"Best Params: {grid.best_params_}\n"
    f"Best CV AUC: {grid.best_score_:.3f}\n"
)
with open(metrics_path, "a") as f:
    f.write(tuning_report)
