# 06 - Hyperparameter Tuning & Export Model

In [26]:
# Required Libs
import os
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


# Load selected features dataset.
# The project root defaults to the original author's checkout so existing runs
# are unchanged; set HEART_DISEASE_PROJECT_ROOT to run the notebook elsewhere.
PROJECT_ROOT = Path(os.environ.get(
    "HEART_DISEASE_PROJECT_ROOT",
    r"C:\Users\eyad0\Documents\python\Heart_Disease_Project",
))
DATA_PATH = PROJECT_ROOT / "data" / "heart_disease_selected.csv"

heart_disease_selected = pd.read_csv(DATA_PATH)

# Fail early with a readable message if the target column is missing,
# instead of a KeyError from the drop/compare below.
assert "num" in heart_disease_selected.columns, "expected target column 'num' in dataset"

x = heart_disease_selected.drop("num", axis=1)
y = (heart_disease_selected["num"] > 0).astype(int)   # binary target: any value > 0 becomes 1

# Split train / test (80/20, stratified on the binary target, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

print("Train:", x_train.shape, " Test:", x_test.shape)

Train: (242, 10)  Test: (61, 10)


In [27]:
# Untuned reference estimators; only the seed (and the iteration cap for
# logistic regression) is set, so later cells can measure the effect of tuning.
baseline_models = dict([
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("SVM", SVC(probability=True, random_state=42)),
])

# Fit each estimator on the training split and score it on the held-out split.
# Each entry maps the model label to [accuracy, F1].
baseline_results = {}
for label, estimator in baseline_models.items():
    predictions = estimator.fit(x_train, y_train).predict(x_test)
    baseline_results[label] = [
        accuracy_score(y_test, predictions),
        f1_score(y_test, predictions),
    ]

# One row per model, one column per metric.
baseline_df = pd.DataFrame.from_dict(
    baseline_results,
    orient="index",
    columns=["Accuracy", "F1"],
)
print("=== Baseline Results ===")
print(baseline_df)


=== Baseline Results ===
                     Accuracy        F1
Logistic Regression  0.918033  0.912281
Decision Tree        0.688525  0.641509
Random Forest        0.737705  0.714286
SVM                  0.868852  0.857143


In [28]:
# Logistic Regression (GridSearchCV)
# Each sub-grid lists only the parameters its solver/penalty combination uses:
#   * `l1_ratio` is only used by penalty="elasticnet", so it is not crossed
#     with the other saga penalties;
#   * `C` has no effect when penalty=None, so unpenalized fits appear once
#     per solver instead of once per C value.
# This searches the same set of distinct models (42 candidates instead of 80)
# and avoids the "parameter ignored" warnings from the redundant ones.
c_values = [0.01, 0.1, 1, 10, 100]

param_grid_log = [
    {"solver": ["lbfgs"], "penalty": ["l2"], "C": c_values},
    {"solver": ["lbfgs"], "penalty": [None]},
    {"solver": ["liblinear"], "penalty": ["l1", "l2"], "C": c_values},
    {"solver": ["saga"], "penalty": ["l1", "l2"], "C": c_values},
    {"solver": ["saga"], "penalty": ["elasticnet"], "C": c_values, "l1_ratio": [0.0, 0.5, 1.0]},
    {"solver": ["saga"], "penalty": [None]},
]

# 5-fold cross-validated accuracy on the training split only.
grid_log = GridSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    param_grid_log, cv=5, scoring="accuracy", n_jobs=-1
)
grid_log.fit(x_train, y_train)

print("Best Logistic Regression Params:", grid_log.best_params_)
print("Best Logistic Regression CV Score:", grid_log.best_score_)


Best Logistic Regression Params: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Best Logistic Regression CV Score: 0.8178571428571428


In [29]:
# Decision Tree (GridSearchCV)
# Exhaustive search over tree depth and the minimum split / leaf sizes.
param_grid_dt = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated accuracy on the training split, all cores.
grid_dt = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid_dt,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
).fit(x_train, y_train)

print("Best Decision Tree Params:", grid_dt.best_params_)
print("Best Decision Tree CV Score:", grid_dt.best_score_)


Best Decision Tree Params: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5}
Best Decision Tree CV Score: 0.7933673469387755


In [30]:
# Random Forest (RandomizedSearchCV)
# Candidate values sampled by the randomized search below.
param_dist_rf = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_depth=[None, 10, 20, 30, 40],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# 20 sampled configurations, each scored by 5-fold CV accuracy; both the
# forest and the sampler are seeded so the search is repeatable.
rand_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist_rf,
    n_iter=20,
    scoring="accuracy",
    cv=5,
    random_state=42,
    n_jobs=-1,
).fit(x_train, y_train)

print("Best Random Forest Params:", rand_rf.best_params_)
print("Best Random Forest CV Score:", rand_rf.best_score_)


Best Random Forest Params: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 40, 'bootstrap': False}
Best Random Forest CV Score: 0.8097789115646258


In [31]:
# SVM (RandomizedSearchCV)
# Candidate regularization strengths, kernel coefficients and kernels.
param_dist_svm = dict(
    C=[0.1, 1, 10, 100],
    gamma=["scale", "auto", 0.01, 0.1, 1],
    kernel=["linear", "rbf", "poly"],
)

# 20 sampled configurations, each scored by 5-fold CV accuracy on the
# training split; probability=True keeps predict_proba available.
rand_svm = RandomizedSearchCV(
    estimator=SVC(probability=True, random_state=42),
    param_distributions=param_dist_svm,
    n_iter=20,
    scoring="accuracy",
    cv=5,
    random_state=42,
    n_jobs=-1,
).fit(x_train, y_train)

print("Best SVM Params:", rand_svm.best_params_)
print("Best SVM CV Score:", rand_svm.best_score_)


Best SVM Params: {'kernel': 'linear', 'gamma': 'scale', 'C': 0.1}
Best SVM CV Score: 0.8261054421768707


In [32]:
# Compare Tuned vs Baseline
# Best estimator from each search, keyed by the same labels used for the
# baseline table so the two frames can be joined on their index.
tuned_models = dict(zip(
    ("Logistic Regression", "Decision Tree", "Random Forest", "SVM"),
    (search.best_estimator_ for search in (grid_log, grid_dt, rand_rf, rand_svm)),
))

# Score every tuned estimator on the held-out split: label -> [accuracy, F1].
tuned_results = {}
for label, estimator in tuned_models.items():
    predictions = estimator.predict(x_test)
    tuned_results[label] = [
        accuracy_score(y_test, predictions),
        f1_score(y_test, predictions),
    ]

tuned_df = pd.DataFrame.from_dict(
    tuned_results,
    orient="index",
    columns=["Accuracy", "F1"],
)
print("=== Tuned Results ===")
print(tuned_df)

# Baseline vs tuned: side-by-side metrics, suffixed by origin
comparison_df = baseline_df.join(
    tuned_df,
    lsuffix="_Baseline",
    rsuffix="_Tuned",
)
print("\n=== Baseline vs Tuned Models ===")
print(comparison_df)


=== Tuned Results ===
                     Accuracy        F1
Logistic Regression  0.918033  0.912281
Decision Tree        0.819672  0.775510
Random Forest        0.770492  0.750000
SVM                  0.885246  0.877193

=== Baseline vs Tuned Models ===
                     Accuracy_Baseline  F1_Baseline  Accuracy_Tuned  F1_Tuned
Logistic Regression           0.918033     0.912281        0.918033  0.912281
Decision Tree                 0.688525     0.641509        0.819672  0.775510
Random Forest                 0.737705     0.714286        0.770492  0.750000
SVM                           0.868852     0.857143        0.885246  0.877193


## Step 2.7 – Model Export & Deployment


In [33]:
# =====================================
# Step 2.7 – Model Export & Deployment
# =====================================

# Choose the export candidate by mean cross-validated accuracy rather than
# hard-coding one estimator. CV scores come from the training split only, so
# the held-out test split does not influence model selection.
searches = {
    "Logistic Regression": grid_log,
    "Decision Tree": grid_dt,
    "Random Forest": rand_rf,
    "SVM": rand_svm,
}
best_name = max(searches, key=lambda label: searches[label].best_score_)

# Pipeline.fit fits the final step in place. Clone first so that refitting on
# the full dataset does not overwrite the search's best_estimator_ (which the
# tuned-vs-baseline comparison still evaluates against x_test).
best_model = clone(searches[best_name].best_estimator_)

pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", best_model)
])

# Final fit uses every available row (train + test) for the deployed model.
pipeline.fit(x, y)

# The project root defaults to the original author's checkout; set
# HEART_DISEASE_PROJECT_ROOT to export somewhere else.
project_root = Path(os.environ.get(
    "HEART_DISEASE_PROJECT_ROOT",
    r"C:\Users\eyad0\Documents\python\Heart_Disease_Project",
))
model_path = project_root / "models" / "final_model.pkl"
model_path.parent.mkdir(parents=True, exist_ok=True)  # ensure models/ exists before dumping

joblib.dump(pipeline, model_path)

print(f"Selected model: {best_name} (CV accuracy {searches[best_name].best_score_:.4f})")
print("Final model saved to models/final_model.pkl")


Final model saved to models/final_model.pkl
