In [1]:
import pandas as pd
import joblib
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.stats import randint

# Load the heart-disease table, drop the `id` and `dataset` columns when they
# are present, and replace `num` with a binary `target` label
# (1 when num > 0, otherwise 0).
df = (
    pd.read_csv("../data/heart_disease_uci.csv")
    .drop(columns=["id", "dataset"], errors="ignore")
    .assign(target=lambda frame: (frame["num"] > 0).astype(int))
    .drop(columns=["num"])
)

# Feature matrix: every column except the label, one-hot encoded with the
# first level of each encoded column dropped; remaining missing values are
# filled with 0.
predictors = df.drop(columns=["target"])
X = pd.get_dummies(predictors, drop_first=True).fillna(0)
y = df["target"]

# 80/20 hold-out split, stratified on the label, with a fixed seed so the
# partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Exhaustive search: every combination in `grid_space` is evaluated with
# 3-fold cross-validation, scored by F1.
grid_space = {
    "n_estimators": [50, 100],
    "max_depth": [None, 5, 10],
}
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=grid_space,
    scoring="f1",
    cv=3,
    n_jobs=2,
)
grid.fit(X_train, y_train)
grid_preds = grid.predict(X_test)

# Randomized search: 5 candidates are drawn from `rand_space` (seeded, so the
# draw is repeatable) and evaluated with the same 3-fold F1 protocol.
rand_space = {
    "n_estimators": randint(50, 150),
    "max_depth": [None, 5, 10, 15],
    "max_features": ["sqrt", "log2"],
}
rand = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=rand_space,
    n_iter=5,
    scoring="f1",
    cv=3,
    random_state=42,
    n_jobs=2,
)
rand.fit(X_train, y_train)
rand_preds = rand.predict(X_test)

def _summarize_search(method, search, y_true, y_pred):
    """Build one results row for a fitted hyper-parameter search.

    Parameters
    ----------
    method : str
        Label stored in the "Method" column.
    search : fitted search object
        Only its ``best_params_`` attribute is read.
    y_true, y_pred : array-like
        True labels and the search's predictions for the same samples.

    Returns
    -------
    dict
        Keys, in order: "Method", "Accuracy", "Precision", "Recall", "F1",
        "Best Params".
    """
    return {
        "Method": method,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
        "F1": f1_score(y_true, y_pred),
        "Best Params": search.best_params_,
    }


# Both searches are scored through the same helper so the two rows are always
# computed the same way.
grid_res = _summarize_search("GridSearchCV", grid, y_test, grid_preds)
rand_res = _summarize_search("RandomizedSearchCV", rand, y_test, rand_preds)
results_df = pd.DataFrame([grid_res, rand_res])

# Output folders are created relative to the notebook's working directory.
results_dir = Path("results")
models_dir = Path("models")
results_dir.mkdir(exist_ok=True)
models_dir.mkdir(exist_ok=True)

# Append this run's rows to the tab-separated metrics log. The header is
# written only when the file is new or empty; appending it unconditionally
# would put a header line in the middle of the data on every re-run.
metrics_path = results_dir / "evaluation_metrics.txt"
write_header = not metrics_path.exists() or metrics_path.stat().st_size == 0
results_df.to_csv(
    metrics_path,
    mode="a",
    index=False,
    sep="\t",
    header=write_header,
)

# Choose the model to persist by cross-validated F1 (`best_score_`) rather than
# by the held-out test F1: picking on the test set would use it for model
# selection and make the reported test metrics optimistic. Ties keep the
# grid-search model.
best = grid if grid.best_score_ >= rand.best_score_ else rand
joblib.dump(best.best_estimator_, models_dir / "final_model.pkl")

results_df

Unnamed: 0,Method,Accuracy,Precision,Recall,F1,Best Params
0,GridSearchCV,0.836957,0.821429,0.901961,0.859813,"{'max_depth': 10, 'n_estimators': 100}"
1,RandomizedSearchCV,0.836957,0.821429,0.901961,0.859813,"{'max_depth': 10, 'max_features': 'sqrt', 'n_e..."
