In [2]:
import pandas as pd
import numpy as np
from scipy.io import arff
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import optuna
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, roc_auc_score
import functools
import warnings
import joblib
import time
from collections import defaultdict
from scipy.optimize import curve_fit

# Suppress ConvergenceWarning from scikit-learn
warnings.filterwarnings("ignore", category=UserWarning, module='sklearn')
# Suppress Optuna specific warnings if they become too verbose for logging
optuna.logging.set_verbosity(optuna.logging.WARNING)


# --- 1. Dataset Loading and Preprocessing ---

def load_and_preprocess_arff(filepath, target_column_name=None):
    """
    Loads an ARFF file, identifies features and target, and prepares data.
    """
    data, meta = arff.loadarff(filepath)
    df = pd.DataFrame(data)

    for col in df.columns:
        if df[col].dtype == 'object':
            try:
                df[col] = df[col].apply(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
            except (UnicodeDecodeError, AttributeError):
                pass

    if target_column_name is None:
        target_column_name = df.columns[-1]

    X = df.drop(columns=[target_column_name])
    y = df[target_column_name]

    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_features = X.select_dtypes(include=np.number).columns.tolist()

    feature_names = X.columns.tolist()
    categorical_features_idx = [feature_names.index(col) for col in categorical_features]

    print(f"Loaded {filepath}. Shape: {df.shape}")
    print(f"Target column: '{target_column_name}'")
    print(f"Numerical features: {numerical_features}")
    print(f"Categorical features detected: {categorical_features}")

    return X, y, feature_names, categorical_features_idx

def get_preprocessor(numerical_features, categorical_features):
    """
    Creates a column transformer for preprocessing.
    """
    transformers = []
    if numerical_features:
        transformers.append(('num', 'passthrough', numerical_features))
    if categorical_features:
        transformers.append(('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features))

    preprocessor = ColumnTransformer(
        transformers=transformers,
        remainder='drop'
    )
    return preprocessor

# --- 2. Optuna Objective Function (with Partial RF Evaluation) ---

def objective(trial, X_train_raw, y_train_encoded, X_val_raw, y_val_encoded,
              task_type='classification', preprocessor=None, max_n_estimators_upper_bound=1000):
    """
    Optuna objective function for Random Forest HPO with multi-fidelity evaluation.
    """
    n_estimators = trial.suggest_int("n_estimators", 50, max_n_estimators_upper_bound)
    max_depth = trial.suggest_int("max_depth", 2, 32)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)
    max_features_val = trial.suggest_categorical("max_features", ['sqrt', 'log2', 0.5, 0.7, 1.0])

    if task_type == 'classification':
        criterion = trial.suggest_categorical("criterion", ["gini", "entropy"])
        class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])
        model_class = RandomForestClassifier
    else:
        criterion = trial.suggest_categorical("criterion", ["squared_error", "absolute_error"])
        class_weight = None
        model_class = RandomForestRegressor

    rf_model = model_class(
        n_estimators=1,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features_val,
        criterion=criterion,
        class_weight=class_weight,
        oob_score=True,
        random_state=42,
        warm_start=True,
        n_jobs=-1
    )

    if preprocessor:
        X_train_processed = preprocessor.fit_transform(X_train_raw)
        X_val_processed = preprocessor.transform(X_val_raw)
    else:
        X_train_processed = X_train_raw.to_numpy()
        X_val_processed = X_val_raw.to_numpy()

    fidelity_steps = sorted(list(set(
        [max(1, int(n_estimators * p)) for p in [0.02, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]] +
        [max(1, int(n_estimators * 0.01 * i)) for i in range(1, 6)] # Add finer steps for first 5%
    )))
    fidelity_steps = [s for s in fidelity_steps if s > 0]
    fidelity_steps = sorted(list(set(fidelity_steps)))
    fidelity_steps = [s for s in fidelity_steps if s <= n_estimators]
    
    if not fidelity_steps: 
        fidelity_steps = [n_estimators]


    for step_n_estimators in fidelity_steps:
        if step_n_estimators > rf_model.n_estimators:
            rf_model.n_estimators = step_n_estimators
            rf_model.fit(X_train_processed, y_train_encoded)

        if rf_model.n_estimators > 1 and hasattr(rf_model, 'oob_score_'):
            current_oob_score = rf_model.oob_score_
        else:
            current_oob_score = np.nan

        trial.report(current_oob_score, step=step_n_estimators)

        if trial.should_prune():
            print(f"Trial {trial.number} pruned at {step_n_estimators} estimators (OOB: {current_oob_score:.4f}).")
            raise optuna.exceptions.TrialPruned()

    y_pred = rf_model.predict(X_val_processed)

    if hasattr(rf_model, 'predict_proba') and len(np.unique(y_val_encoded)) == 2:
        y_proba = rf_model.predict_proba(X_val_processed)[:, 1]
        final_metric = roc_auc_score(y_val_encoded, y_proba)
        metric_name = "AUC"
    else:
        final_metric = accuracy_score(y_val_encoded, y_pred)
        metric_name = "Accuracy"

    print(f"Trial {trial.number} completed with {metric_name}: {final_metric:.4f}")
    return final_metric


# --- 3. Custom Optuna Pruner Implementation ---

class RandomForestMultiFidelityPruner(optuna.pruners.BasePruner):
    """
    Custom Optuna pruner for Random Forest Hyperparameter Optimization
    using a multi-fidelity approach and learning curve prediction.
    """
    def __init__(self,
                 min_intermediate_steps=3,
                 pruning_quantile=0.75,
                 grace_period_ratio=0.02,
                 predictive_model_type='saturating_curve',
                 debug_mode=True):
        self.min_intermediate_steps = min_intermediate_steps
        self.pruning_quantile = pruning_quantile
        self.grace_period_ratio = grace_period_ratio
        self.predictive_model_type = predictive_model_type
        self.trial_learning_curves = defaultdict(list)
        self.debug_mode = debug_mode

    def prune(self, study: optuna.study.Study, trial: optuna.trial.FrozenTrial) -> bool:
        trial_id = trial.number
        current_step = trial.last_step
        current_value = trial.value

        if current_step is None or current_value is None:
            return False 

        self.trial_learning_curves[trial_id].append((current_step, current_value))

        if len(self.trial_learning_curves[trial_id]) < self.min_intermediate_steps:
            if self.debug_mode:
                print(f"[Debug Pruner Trial {trial_id} Step {current_step}]: Not enough steps ({len(self.trial_learning_curves[trial_id])}/{self.min_intermediate_steps})")
            return False
        
        if current_step < self.grace_period_ratio * trial.params.get('n_estimators', 1000):
            if self.debug_mode:
                print(f"[Debug Pruner Trial {trial_id} Step {current_step}]: In grace period ({current_step}/{self.grace_period_ratio * trial.params.get('n_estimators', 1000):.1f})")
            return False

        current_lc_data = self.trial_learning_curves[trial_id]
        steps = np.array([s for s, _ in current_lc_data])
        values = np.array([v for _, v in current_lc_data])
        
        if len(steps) < 3: 
            return False

        predicted_final_performance = self._predict_final_performance(steps, values, trial.params.get('n_estimators', 1000), trial_id)

        completed_trial_values = [
            t.value for t in study.trials
            if t.state == optuna.trial.TrialState.COMPLETE and t.value is not None
        ]
        
        if not completed_trial_values:
            if self.debug_mode:
                print(f"[Debug Pruner Trial {trial_id} Step {current_step}]: No complete trials yet.")
            return False

        threshold = np.quantile(completed_trial_values, self.pruning_quantile)
        
        if self.debug_mode:
            print(f"[Debug Pruner Trial {trial_id} Step {current_step}]: Pred: {predicted_final_performance:.4f}, Threshold: {threshold:.4f}, Current OOB: {current_value:.4f}")
            if predicted_final_performance < threshold:
                print(f"                                   -> WILL PRUNE (Predicted < Threshold)")

        if predicted_final_performance < threshold:
            return True
        
        return False

    def _predict_final_performance(self, steps, values, max_n_estimators, trial_number):
        """
        Predicts the final performance based on observed learning curve using a saturating curve.
        """
        if self.predictive_model_type == 'saturating_curve':
            def saturating_curve(x, A, B, C):
                return A * (1 - np.exp(-B * x)) + C

            initial_guess_A = max(values.max() * 1.05, 0.75) 
            initial_guess_B = 0.05
            initial_guess_C = values[0] if len(values) > 0 else 0.5 

            p0 = [initial_guess_A, initial_guess_B, initial_guess_C]

            try:
                bounds_lower = [values.max() * 0.9, 0.0001, values.min() * 0.8]
                bounds_upper = [1.0, 0.5, values.max() * 1.2]

                popt, pcov = curve_fit(saturating_curve, steps, values, p0=p0, maxfev=5000,
                                       bounds=(bounds_lower, bounds_upper))
                
                predicted_value = saturating_curve(max_n_estimators, *popt)
                
                predicted_value = max(values.min(), min(1.0, predicted_value))

                if self.debug_mode:
                    print(f"[Debug Pruner Trial {trial_number} Step {steps[-1]}]: Fit successful. Predicted {predicted_value:.4f}")
                return predicted_value
            except (RuntimeError, ValueError) as e:
                if self.debug_mode:
                    print(f"[Debug Pruner Trial {trial_number} Step {steps[-1]}]: Curve fit failed ({e}). Falling back to last value.")
                return values[-1] 

        return values[-1] 

# --- 4. Running the Optuna Study ---

# --- Configuration Parameters for PIMA DIABETES dataset ---
N_TRIALS = 100
RANDOM_STATE = 42
DATASET_PATH = 'datasets/dataset_37_diabetes.arff' # <--- UPDATED for Pima Diabetes
TARGET_COLUMN = 'class'             # <--- UPDATED for Pima Diabetes
TASK_TYPE = 'classification'        # Pima Diabetes is a binary classification

# --- Load and Preprocess Data ---
X_raw, y_raw, feature_names, categorical_features_idx = load_and_preprocess_arff(
    DATASET_PATH, target_column_name=TARGET_COLUMN
)

le = LabelEncoder()
y_encoded = le.fit_transform(y_raw)
print(f"Encoded target classes: {le.classes_}")

# Pima Diabetes features are all numerical based on the ARFF header
numerical_feats = X_raw.columns.tolist()
categorical_feats = [] 

data_preprocessor = get_preprocessor(numerical_feats, categorical_feats)

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_raw, y_encoded, test_size=0.2, random_state=RANDOM_STATE,
    stratify=y_encoded
)

# --- Initialize Custom Pruner with aggressive and DEBUG settings (reused from last run) ---
custom_pruner = RandomForestMultiFidelityPruner(
    min_intermediate_steps=3,
    pruning_quantile=0.75,      
    grace_period_ratio=0.02,    
    predictive_model_type='saturating_curve',
    debug_mode=True             
)

# --- Create Optuna Study ---
study = optuna.create_study(
    direction="maximize",
    pruner=custom_pruner,
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
    study_name="pima_diabetes_rf_hpo_study_debug_pruner", # <--- UPDATED Study Name
    storage="sqlite:///pima_diabetes_rf_hpo_debug.db"     # <--- UPDATED DB File
)

objective_with_args = functools.partial(
    objective,
    X_train_raw=X_train_split,
    y_train_encoded=y_train_split,
    X_val_raw=X_val_split,
    y_val_encoded=y_val_split,
    task_type=TASK_TYPE,
    preprocessor=data_preprocessor,
    max_n_estimators_upper_bound=250 # Keeping this value to limit max trial cost
)

print(f"Starting Optuna study with {N_TRIALS} trials for Pima Diabetes dataset (DEBUG PRUNER)...")
study.optimize(objective_with_args, n_trials=N_TRIALS, show_progress_bar=True)

# --- Print Results ---
print("\nOptimization finished.")
print(f"Best trial value (AUC/Accuracy): {study.best_trial.value:.4f}")
print("Best hyperparameters:")
for key, value in study.best_trial.params.items():
    print(f"  {key}: {value}")

pruned_trials = study.get_trials(deepcopy=False, states=[optuna.trial.TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[optuna.trial.TrialState.COMPLETE])
print(f"Number of pruned trials: {len(pruned_trials)}") 
print(f"Number of complete trials: {len(complete_trials)}")


# --- 5. Final Model Evaluation ---

def evaluate_final_model(best_params, X_train_full, y_train_full, X_test_full, y_test_full,
                         task_type='classification', preprocessor=None):
    """
    Trains the best model from HPO on the full training data and evaluates on a test set.
    """
    start_time = time.time()

    if preprocessor:
        X_train_processed = preprocessor.fit_transform(X_train_full)
        X_test_processed = preprocessor.transform(X_test_full)
    else:
        X_train_processed = X_train_full.to_numpy()
        X_test_processed = X_test_full.to_numpy()

    if task_type == 'classification':
        model = RandomForestClassifier(random_state=42, n_jobs=-1, **best_params)
    else:
        model = RandomForestRegressor(random_state=42, n_jobs=-1, **best_params)

    model.fit(X_train_processed, y_train_full)
    fit_time = time.time() - start_time

    y_pred = model.predict(X_test_processed)

    if task_type == 'classification':
        if hasattr(model, 'predict_proba') and len(np.unique(y_test_full)) == 2:
            y_proba = model.predict_proba(X_test_processed)[:, 1]
            auc_score = roc_auc_score(y_test_full, y_proba)
            print(f"Final Model AUC: {auc_score:.4f}")
            final_metric = auc_score
            metric_name = "AUC"
        else:
            final_metric = accuracy_score(y_test_full, y_pred)
            metric_name = "Accuracy"
    else:
        final_metric = mean_squared_error(y_test_full, y_pred)
        metric_name = "MSE"

    print(f"\n--- Final Model Evaluation ---")
    print(f"Metric ({metric_name}): {final_metric:.4f}")
    print(f"Training time for final model: {fit_time:.2f} seconds")

    return model, final_metric


X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    X_raw, y_encoded, test_size=0.2, random_state=RANDOM_STATE,
    stratify=y_encoded
)

if study.best_trial:
    best_params = study.best_trial.params
    final_model, final_score = evaluate_final_model(
        best_params, X_train_full, y_train_full, X_test_full, y_test_full,
        task_type=TASK_TYPE, preprocessor=data_preprocessor
    )

    joblib.dump(final_model, 'best_rf_pima_diabetes_model_debug.pkl') # <--- UPDATED save name
    joblib.dump(data_preprocessor, 'data_preprocessor_pima_diabetes_debug.pkl') # <--- UPDATED save name
    print("Best model and preprocessor for Pima Diabetes saved (debug pruner run).")

Loaded datasets/dataset_37_diabetes.arff. Shape: (768, 9)
Target column: 'class'
Numerical features: ['preg', 'plas', 'pres', 'skin', 'insu', 'mass', 'pedi', 'age']
Categorical features detected: []
Encoded target classes: ['tested_negative' 'tested_positive']
Starting Optuna study with 100 trials for Pima Diabetes dataset (DEBUG PRUNER)...


  0%|          | 0/100 [00:00<?, ?it/s]

Trial 0 completed with AUC: 0.8335
Trial 1 completed with AUC: 0.8309
Trial 2 completed with AUC: 0.8313
Trial 3 completed with AUC: 0.8320
Trial 4 completed with AUC: 0.8307
Trial 5 completed with AUC: 0.8270
Trial 6 completed with AUC: 0.8150
Trial 7 completed with AUC: 0.8193
Trial 8 completed with AUC: 0.8313
Trial 9 completed with AUC: 0.8119
Trial 10 completed with AUC: 0.8285
Trial 11 completed with AUC: 0.8343
Trial 12 completed with AUC: 0.8315
Trial 13 completed with AUC: 0.8324
Trial 14 completed with AUC: 0.8374
Trial 15 completed with AUC: 0.8337
Trial 16 completed with AUC: 0.8304
Trial 17 completed with AUC: 0.8361
Trial 18 completed with AUC: 0.8365
Trial 19 completed with AUC: 0.8232
Trial 20 completed with AUC: 0.8315
Trial 21 completed with AUC: 0.8359
Trial 22 completed with AUC: 0.8326
Trial 23 completed with AUC: 0.8380
Trial 24 completed with AUC: 0.8344
Trial 25 completed with AUC: 0.8331
Trial 26 completed with AUC: 0.8146
Trial 27 completed with AUC: 0.8335
Tr

In [3]:
# --- Optional: Optuna Visualizations ---
import optuna.visualization as ov
fig1 = ov.plot_optimization_history(study)
fig1.show()
fig2 = ov.plot_intermediate_values(study)
fig2.show()
fig3 = ov.plot_param_importances(study)
fig3.show()