# 06 – Hyperparameter Tuning

Focus: improve the top baseline models using `RandomizedSearchCV` and `GridSearchCV`.
Then evaluate the tuned models on the hold-out test set so they can be compared against the baseline results.

In [None]:
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, classification_report
from scipy.stats import randint, uniform
import joblib

# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(Path('../data/heart_disease.csv'))

# The label column is 'target' when present, otherwise 'num'.
target_col = 'target' if 'target' in df.columns else 'num'
y = df[target_col]
# The searches below score with 'roc_auc' and the evaluation cell uses the
# default (binary) f1_score and predict_proba(...)[:, 1], so the label must be
# two-class. A label with more than two levels is collapsed to 0 vs. >0;
# a label that is already binary is left untouched.
# NOTE(review): presumably 0 means "no disease" and higher values are severity
# grades -- confirm against the dataset's data dictionary.
if y.nunique() > 2:
    y = (y > 0).astype(int)
X = df.drop(columns=[target_col])

# Object-dtype columns are treated as categorical, everything else as numeric.
categorical = [col for col, dtype in X.dtypes.items() if dtype == 'object']
numeric = [col for col in X.columns if col not in categorical]

# Numeric features: median imputation, then standardization.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical features: most-frequent imputation, then one-hot encoding.
# handle_unknown='ignore' keeps transform from failing on categories that
# were not seen during fit (e.g. inside a CV fold).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric),
    ('cat', categorical_transformer, categorical),
])

# Stratified 80/20 split with a fixed seed; the test part is only used for the
# final evaluation of the tuned models.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

## 1. Randomized Search – RandomForest

In [None]:
# Random forest wrapped together with the shared preprocessor so that
# imputation/scaling/encoding are re-fit inside every CV fold.
rf = RandomForestClassifier(random_state=42)
rf_pipe = Pipeline(steps=[('prep', preprocessor), ('rf', rf)])

# Integer distributions sampled by the randomized search; the 'rf__' prefix
# routes each parameter to the 'rf' step of the pipeline.
rf_dist = dict(
    rf__n_estimators=randint(200, 600),
    rf__max_depth=randint(3, 20),
    rf__min_samples_split=randint(2, 15),
    rf__min_samples_leaf=randint(1, 10),
)

# 30 sampled candidates, 5-fold CV each, ranked by ROC AUC, using all cores.
rf_rand = RandomizedSearchCV(
    rf_pipe,
    rf_dist,
    n_iter=30,
    scoring='roc_auc',
    cv=5,
    random_state=42,
    n_jobs=-1,
)
rf_rand.fit(X_train, y_train)

# Notebook cell output: best parameter set and its mean CV ROC AUC.
(rf_rand.best_params_, rf_rand.best_score_)

## 2. Grid Search – Logistic Regression (Refined around penalty/C)

In [None]:
# liblinear supports both the l1 and l2 penalties searched below. It shuffles
# the data internally, so the seed is fixed (matching the 42 used elsewhere in
# this notebook) to make the search results reproducible.
log_reg = LogisticRegression(max_iter=2000, solver='liblinear', random_state=42)
log_pipe = Pipeline([('prep', preprocessor), ('log', log_reg)])

# Exhaustive grid: 2 penalties x 5 regularization strengths = 10 candidates.
# The 'log__' prefix routes each parameter to the 'log' step of the pipeline.
log_grid = {
    'log__penalty': ['l1', 'l2'],
    'log__C': [0.01, 0.1, 1, 5, 10],
}

# 5-fold CV per candidate, ranked by ROC AUC, using all cores.
log_search = GridSearchCV(log_pipe, log_grid, scoring='roc_auc', cv=5, n_jobs=-1)
log_search.fit(X_train, y_train)

# Notebook cell output: best parameter set and its mean CV ROC AUC.
log_search.best_params_, log_search.best_score_

## 3. Evaluate Tuned Models on Test Set

In [None]:
# Best pipelines found by each search (refit on the full training split).
best_rf = rf_rand.best_estimator_
best_log = log_search.best_estimator_

# Score each tuned pipeline on the held-out test split.
tuned_models = {'rf_tuned': best_rf, 'log_tuned': best_log}
for label, estimator in tuned_models.items():
    predictions = estimator.predict(X_test)
    # Column 1 of predict_proba is the probability of the second class.
    positive_proba = estimator.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, positive_proba)
    f1 = f1_score(y_test, predictions)
    print(label, 'ROC AUC:', auc, 'F1:', f1)
    print(classification_report(y_test, predictions))

## 4. Choose Final Model & Persist

In [None]:
# Suppose best_rf wins by ROC AUC
final_model = best_rf

# Create the output directory first: joblib.dump does not create missing parent
# directories and would otherwise fail on a fresh checkout.
model_path = Path('../models/final_model.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)

# The whole pipeline (preprocessing + classifier) is serialized.
joblib.dump(final_model, model_path)
print('Saved final model pipeline.')

## Notes
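- `RandomizedSearchCV` explores a wide parameter space cheaply; follow it with a focused `GridSearchCV` around the best region it finds.
- Evaluate stability: compare the cross-validated score with the test-set score; a large gap suggests overfitting.
- With an integer `cv` and a classifier, scikit-learn already uses stratified K-fold; pass an explicit `StratifiedKFold` only when you need control over shuffling or the seed.
- For imbalanced data, tune with `'roc_auc'` or `'f1'` rather than accuracy.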