In [2]:
# 06_hyperparameter_tuning.ipynb

# 1. Use GridSearchCV & Randomized SearchCV to optimize model hyperparameters.

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC # Support Vector Classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, RocCurveDisplay
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
from scipy.stats import uniform, randint

# Load the selected features (X_selected from 03_feature_selection.ipynb)
input_data_path = '../data/' # Relative path to data folder from notebooks folder
X = pd.read_csv(os.path.join(input_data_path, 'X_selected_features.csv'))
y = pd.read_csv(os.path.join(input_data_path, 'y_processed.csv'))

# Make sure y is a 1D array/Series for scikit-learn models
y = y.iloc[:, 0]

print("Loaded X shape for hyperparameter tuning:", X.shape)
print("Loaded y shape for hyperparameter tuning:", y.shape)

# Split the dataset into training (80%) and testing (20%) sets
# Use stratify=y to maintain the same proportion of target classes in train and test sets
# Use random_state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("\nShape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

# --- Hyperparameter Tuning for SVM (Best performing model from Supervised Learning) ---
print("\n--- Hyperparameter Tuning for SVM ---")

# Define the model to tune
svm = SVC(random_state=42, probability=True) # probability=True is needed for ROC AUC

# 1. GridSearchCV
print("\nApplying GridSearchCV...")
# Define parameter grid for GridSearchCV
# C: Regularization parameter. The strength of the regularization is inversely proportional to C.
# kernel: Specifies the kernel type to be used in the algorithm.
# gamma: Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['rbf', 'linear'],
    'gamma': ['scale', 'auto']
}

grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

print(f"\nBest parameters from GridSearchCV: {grid_search.best_params_}")
print(f"Best ROC AUC score from GridSearchCV: {grid_search.best_score_:.4f}")

best_svm_grid = grid_search.best_estimator_
y_pred_grid = best_svm_grid.predict(X_test)
y_proba_grid = best_svm_grid.predict_proba(X_test)[:, 1]

# Evaluate GridSearchCV best model on test set
accuracy_grid = accuracy_score(y_test, y_pred_grid)
precision_grid = precision_score(y_test, y_pred_grid)
recall_grid = recall_score(y_test, y_pred_grid)
f1_grid = f1_score(y_test, y_pred_grid)
auc_grid = roc_auc_score(y_test, y_proba_grid)

print(f"GridSearchCV Best Model Test Accuracy: {accuracy_grid:.4f}")
print(f"GridSearchCV Best Model Test Precision: {precision_grid:.4f}")
print(f"GridSearchCV Best Model Test Recall: {recall_grid:.4f}")
print(f"GridSearchCV Best Model Test F1-Score: {f1_grid:.4f}")
print(f"GridSearchCV Best Model Test AUC: {auc_grid:.4f}")


# 2. RandomizedSearchCV - CORRECTED PARAMETER DISTRIBUTION
print("\nApplying RandomizedSearchCV (with corrected gamma distribution)...")
# Define parameter distributions for RandomizedSearchCV
param_distributions = {
    'C': uniform(loc=0.1, scale=100), # Continuous uniform distribution
    'kernel': ['rbf', 'linear', 'poly'],
    'gamma': ['scale', 'auto'], # CORRECTED: Only valid string options for gamma
    'degree': randint(2, 5) # For 'poly' kernel, integer values (will only be used if kernel is 'poly')
}

# n_iter: number of parameter settings that are sampled.
# Increase n_iter for more thorough search, but also increases computation time.
random_search = RandomizedSearchCV(estimator=svm, param_distributions=param_distributions,
                                   n_iter=50, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1, random_state=42)
random_search.fit(X_train, y_train)

print(f"\nBest parameters from RandomizedSearchCV: {random_search.best_params_}")
print(f"Best ROC AUC score from RandomizedSearchCV: {random_search.best_score_:.4f}")

best_svm_random = random_search.best_estimator_
y_pred_random = best_svm_random.predict(X_test)
y_proba_random = best_svm_random.predict_proba(X_test)[:, 1]

# Evaluate RandomizedSearchCV best model on test set
accuracy_random = accuracy_score(y_test, y_pred_random)
precision_random = precision_score(y_test, y_pred_random)
recall_random = recall_score(y_test, y_pred_random)
f1_random = f1_score(y_test, y_pred_random)
auc_random = roc_auc_score(y_test, y_proba_random)

print(f"RandomizedSearchCV Best Model Test Accuracy: {accuracy_random:.4f}")
print(f"RandomizedSearchCV Best Model Test Precision: {precision_random:.4f}")
print(f"RandomizedSearchCV Best Model Test Recall: {recall_random:.4f}")
print(f"RandomizedSearchCV Best Model Test F1-Score: {f1_random:.4f}")
print(f"RandomizedSearchCV Best Model Test AUC: {auc_random:.4f}")

# Store the best model and its performance
final_optimized_model = None
optimized_model_performance = {}

# Compare best scores found *during cross-validation* on the training set
if grid_search.best_score_ >= random_search.best_score_:
    final_optimized_model = best_svm_grid
    optimized_model_performance = {
        'Method': 'GridSearchCV',
        'Accuracy': accuracy_grid,
        'Precision': precision_grid,
        'Recall': recall_grid,
        'F1-Score': f1_grid,
        'AUC': auc_grid,
        'Best CV Score': grid_search.best_score_
    }
else:
    final_optimized_model = best_svm_random
    optimized_model_performance = {
        'Method': 'RandomizedSearchCV',
        'Accuracy': accuracy_random,
        'Precision': precision_random,
        'Recall': recall_random,
        'F1-Score': f1_random,
        'AUC': auc_random,
        'Best CV Score': random_search.best_score_
    }

print("\n--- Summary of Optimized Model Performance ---")
print(f"Best Optimization Method: {optimized_model_performance['Method']}")
print(f"Accuracy: {optimized_model_performance['Accuracy']:.4f}")
print(f"Precision: {optimized_model_performance['Precision']:.4f}")
print(f"Recall: {optimized_model_performance['Recall']:.4f}")
print(f"F1-Score: {optimized_model_performance['F1-Score']:.4f}")
print(f"AUC: {optimized_model_performance['AUC']:.4f}")
print(f"Best Cross-Validation Score: {optimized_model_performance['Best CV Score']:.4f}")

Loaded X shape for hyperparameter tuning: (303, 12)
Loaded y shape for hyperparameter tuning: (303,)

Shape of X_train: (242, 12)
Shape of X_test: (61, 12)
Shape of y_train: (242,)
Shape of y_test: (61,)

--- Hyperparameter Tuning for SVM ---

Applying GridSearchCV...
Fitting 5 folds for each of 16 candidates, totalling 80 fits

Best parameters from GridSearchCV: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best ROC AUC score from GridSearchCV: 0.8863
GridSearchCV Best Model Test Accuracy: 0.8033
GridSearchCV Best Model Test Precision: 0.7857
GridSearchCV Best Model Test Recall: 0.7857
GridSearchCV Best Model Test F1-Score: 0.7857
GridSearchCV Best Model Test AUC: 0.9307

Applying RandomizedSearchCV (with corrected gamma distribution)...
Fitting 5 folds for each of 50 candidates, totalling 250 fits

Best parameters from RandomizedSearchCV: {'C': np.float64(1.426496115986653), 'degree': 2, 'gamma': 'auto', 'kernel': 'linear'}
Best ROC AUC score from RandomizedSearchCV: 0.8870
Randomiz

In [3]:
# --- Model Export ---
print("\n--- Model Export ---")

import joblib

# Define the path to save the model
model_save_path = '../models/'
os.makedirs(model_save_path, exist_ok=True) # Create the directory if it doesn't exist

# The final_optimized_model was chosen based on best test AUC
# This will save the best model (which was best_svm_grid in your last run)
model_filename = os.path.join(model_save_path, 'best_svm_heart_disease_model.joblib')

joblib.dump(final_optimized_model, model_filename)

print(f"Final optimized model saved to: {model_filename}")

# This completes the 'Saved model file' deliverable.


--- Model Export ---
Final optimized model saved to: ../models/best_svm_heart_disease_model.joblib
