In [27]:
# 2.6 Hyperparameter Tuning - Model Optimization
# ===============================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib, os


In [28]:
# --- Step 1: Load Dataset ---
# Reads the cleaned heart-disease table into `df` and shows its shape and
# first rows. A missing file is reported and the error re-raised so the
# notebook stops here instead of failing later with a NameError.
# NOTE(review): the path is absolute and machine-specific; consider moving it
# to a configurable data directory so the notebook runs elsewhere.
try:
    df = pd.read_csv(r"C:\Users\USER\python\heart_data_clean.csv")
except FileNotFoundError:
    print("'heart_data_clean.csv' not found")
    raise
else:
    # This notebook is the hyperparameter-tuning step, so say so (the message
    # previously named a different pipeline stage).
    print(" Data loaded successfully for Hyperparameter Tuning.")

print(f"Data shape: {df.shape}")
df.head()


 Data loaded successfully for Feature Selection.
Data shape: (303, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [29]:
# --- Step 2: Split into Features and Target ---
# The label is the "target" column; every other column is used as a feature.
y = df["target"]
X = df.drop(columns="target")

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Features shape:", X.shape)
print("Target distribution:\n", y.value_counts())


Features shape: (303, 13)
Target distribution:
 target
0    164
1    139
Name: count, dtype: int64


In [30]:
# --- Step 3: Define Models and Hyperparameter Grids ---
# Each entry of `models` is (display name, unfitted estimator, search space).
# NOTE(review): the Step 4 cell below rebinds rf_params, svm_params,
# log_params and models with a reduced search space, so the grids defined
# here are only used if that cell is changed to reuse them.

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf_params = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

# Support Vector Machine
# probability=True enables predict_proba on the fitted estimator.
svm = SVC(probability=True, random_state=42)
svm_params = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf", "poly"],
    "gamma": ["scale", "auto"],
}

# Logistic Regression
logreg = LogisticRegression(max_iter=1000, random_state=42)
# The lbfgs solver does not support the l1 penalty, so a single flat grid that
# crosses both solvers with both penalties contains invalid candidates that
# fail at fit time. Use one sub-grid per solver with only the penalties it
# accepts (GridSearchCV / RandomizedSearchCV accept a list of dicts).
log_c_values = [0.01, 0.1, 1, 10]
log_params = [
    {"C": log_c_values, "penalty": ["l1", "l2"], "solver": ["liblinear"]},
    {"C": log_c_values, "penalty": ["l2"], "solver": ["lbfgs"]},
]

# Model list
models = [
    ("RandomForest", rf, rf_params),
    ("SVM", svm, svm_params),
    ("LogisticRegression", logreg, log_params),
]


In [31]:
# --- Step 4: Randomized Hyperparameter Tuning (Fast Version) ---
from sklearn.model_selection import RandomizedSearchCV

# Running state for the search loop: one result row per model, plus the
# best estimator seen so far together with its name and score.
best_models = []
overall_best_name = ""
overall_best_model = None
overall_best_score = 0

# --- Candidate estimators and their search spaces ---
# Random forest: tree count, depth limit and split/leaf size thresholds.
rf_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
rf_model = RandomForestClassifier(random_state=42)

# SVM: search space trimmed for speed (two kernels, gamma fixed to "scale").
svm_params = dict(
    C=[0.1, 1, 10],
    kernel=["linear", "rbf"],
    gamma=["scale"],
)
svm_model = SVC(probability=True, random_state=42)

# Logistic regression: liblinear is the only solver listed because it
# handles both the l1 and the l2 penalty.
log_params = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=["l1", "l2"],
    solver=["liblinear"],
)
log_model = LogisticRegression(max_iter=1000, random_state=42)

# (display name, unfitted estimator, search space) triples for the loop.
models = [
    ("RandomForest", rf_model, rf_params),
    ("SVM", svm_model, svm_params),
    ("LogisticRegression", log_model, log_params),
]

# --- Loop through models ---
# For every (name, estimator, search space) triple: run a small randomized
# search on the training split, score the refit best estimator on the test
# split, record the metrics, and persist the estimator.

# Every artefact below is written under results/. Create the directory up
# front so joblib.dump / to_csv do not raise FileNotFoundError on a fresh
# checkout.
os.makedirs("results", exist_ok=True)

for name, model, params in models:
    print(f"\n🔍 Fast RandomizedSearchCV for: {name}")

    rand_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=5,               # max 5 random trials
        cv=3,                   # 3-fold CV for speed
        scoring='accuracy',
        random_state=42,
        n_jobs=-1
    )

    rand_search.fit(X_train, y_train)
    best_params = rand_search.best_params_
    best_estimator = rand_search.best_estimator_
    print(f"✅ Best parameters for {name}: {best_params}")

    # predict() delegates to the best estimator refit on the full training split.
    y_pred = rand_search.predict(X_test)

    # --- Evaluate metrics ---
    # Weighted averaging combines per-class scores by class support;
    # zero_division=0 scores a class with no predictions as 0 instead of warning.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    best_models.append({
        'Model': name,
        'Best_Params': best_params,
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1_Score': f1
    })

    # Save each best model
    joblib.dump(best_estimator, f"results/{name}_best_model.pkl")

    # Track overall best
    # Strict ">" means that on a tie the model listed first in `models` wins.
    # NOTE(review): the winner is picked by accuracy on X_test, the same split
    # used to report the metrics, so the winner's reported score is
    # optimistically biased. rand_search.best_score_ (cross-validated accuracy)
    # would be a selection criterion that does not touch the test split.
    if acc > overall_best_score:
        overall_best_score = acc
        overall_best_model = best_estimator
        overall_best_name = name

# --- Save results ---
results_df = pd.DataFrame(best_models)
results_df.to_csv("results/hyperparameter_tuning_results.csv", index=False)

# --- Save final best model ---
# overall_best_model stays None only if no model scored above 0 accuracy.
if overall_best_model is not None:
    joblib.dump(overall_best_model, "results/final_model.pkl")
    print(f"\n🏆 Overall best model: {overall_best_name} with Accuracy={overall_best_score:.4f}")
    print("✅ Final best model saved as final_model.pkl")

print("\n✅ Hyperparameter tuning completed with RandomizedSearchCV.")



🔍 Fast RandomizedSearchCV for: RandomForest
✅ Best parameters for RandomForest: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': None}

🔍 Fast RandomizedSearchCV for: SVM
✅ Best parameters for SVM: {'kernel': 'linear', 'gamma': 'scale', 'C': 0.1}

🔍 Fast RandomizedSearchCV for: LogisticRegression
✅ Best parameters for LogisticRegression: {'solver': 'liblinear', 'penalty': 'l2', 'C': 1}

🏆 Overall best model: RandomForest with Accuracy=0.8689
✅ Final best model saved as final_model.pkl

✅ Hyperparameter tuning completed with RandomizedSearchCV.


In [32]:
# Collect the per-model metrics gathered by the tuning loop into a table,
# export it as CSV, and print it.
# NOTE(review): the CSV path is absolute and machine-specific; consider a
# configurable output directory.
results_df = pd.DataFrame(best_models)
results_df.to_csv("C:/Users/USER/python/hyperparameter_tuning_final.csv", index=False)
print("\n Hyperparameter Tuning Results:")
print(results_df)

# Save final best model
if overall_best_model is not None:
    # Single source of truth for the destination, so the confirmation message
    # cannot drift from the file that is actually written.
    final_model_path = "../models/final_model.pkl"
    os.makedirs(os.path.dirname(final_model_path), exist_ok=True)
    joblib.dump(overall_best_model, final_model_path)
    print(f"\n Best Model: {overall_best_name} with Accuracy={overall_best_score:.4f}")
    print(f"Final best model saved at '{final_model_path}'")


 Hyperparameter Tuning Results:
                Model                                        Best_Params  \
0        RandomForest  {'n_estimators': 100, 'min_samples_split': 5, ...   
1                 SVM   {'kernel': 'linear', 'gamma': 'scale', 'C': 0.1}   
2  LogisticRegression   {'solver': 'liblinear', 'penalty': 'l2', 'C': 1}   

   Accuracy  Precision    Recall  F1_Score  
0  0.868852   0.876625  0.868852  0.868993  
1  0.852459   0.857060  0.852459  0.852697  
2  0.868852   0.876625  0.868852  0.868993  

 Best Model: RandomForest with Accuracy=0.8689
Final best model saved at '../models/best_expected_model.pkl'
