In [5]:
import pandas as pd
import numpy as np
from scipy.io import arff
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # Keep LabelEncoder for y if needed for classification, but not for regression directly
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import optuna
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor # Both are needed
from sklearn.metrics import accuracy_score, mean_squared_error, roc_auc_score # MSE for regression
import functools
import warnings
import joblib
import time
from collections import defaultdict
from scipy.optimize import curve_fit

# Suppress ConvergenceWarning from scikit-learn
warnings.filterwarnings("ignore", category=UserWarning, module='sklearn')
# Suppress Optuna specific warnings if they become too verbose for logging
optuna.logging.set_verbosity(optuna.logging.WARNING)


# --- 1. Dataset Loading and Preprocessing ---

def load_and_preprocess_arff(filepath, target_column_name=None):
    """
    Loads an ARFF file, identifies features and target, and prepares data.
    Handles decoding byte strings.
    """
    data, meta = arff.loadarff(filepath)
    df = pd.DataFrame(data)

    for col in df.columns:
        if df[col].dtype == 'object':
            # Decode byte strings to UTF-8
            df[col] = df[col].apply(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
            # Try to convert to numeric if all values look like numbers (e.g., '1', '2')
            # This is important for 'Class_number_of_rings' for regression
            try:
                # Attempt to convert to numeric, coercing errors means non-numeric become NaN
                temp_numeric_col = pd.to_numeric(df[col], errors='coerce')
                # If conversion was successful for all non-null values, make it numeric
                if not temp_numeric_col.isnull().any(): # Check if any value became NaN
                    df[col] = temp_numeric_col
            except:
                pass # Not numeric, keep as object/string

    if target_column_name is None:
        target_column_name = df.columns[-1]

    X = df.drop(columns=[target_column_name])
    y = df[target_column_name]

    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_features = X.select_dtypes(include=np.number).columns.tolist()

    feature_names = X.columns.tolist()
    # categorical_features_idx is not directly used by ColumnTransformer, but useful for info.
    # categorical_features_idx = [feature_names.index(col) for col in categorical_features]

    print(f"Loaded {filepath}. Shape: {df.shape}")
    print(f"Target column: '{target_column_name}' (dtype: {y.dtype})")
    print(f"Numerical features: {numerical_features}")
    print(f"Categorical features detected: {categorical_features}")

    return X, y, feature_names, categorical_features

def get_preprocessor(numerical_features, categorical_features):
    """
    Creates a column transformer for preprocessing.
    """
    transformers = []
    if numerical_features:
        transformers.append(('num', 'passthrough', numerical_features))
    if categorical_features:
        # Use OneHotEncoder for categorical features
        transformers.append(('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features))

    preprocessor = ColumnTransformer(
        transformers=transformers,
        remainder='drop' # Drop any columns not specified
    )
    return preprocessor

# --- 2. Optuna Objective Function (with Partial RF Evaluation) ---

def objective(trial, X_train_raw, y_train_processed, X_val_raw, y_val_processed,
              task_type='classification', preprocessor=None, max_n_estimators_upper_bound=1000):
    """
    Optuna objective function for Random Forest HPO with multi-fidelity evaluation.
    """
    # 1. Hyperparameter Sampling
    n_estimators = trial.suggest_int("n_estimators", 50, max_n_estimators_upper_bound)
    max_depth = trial.suggest_int("max_depth", 2, 32)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)
    max_features_val = trial.suggest_float("max_features", 0.1, 1.0)

    # Initialize a dictionary for common model parameters
    model_params = {
        "n_estimators": 1, # Start with 1 estimator for warm_start
        "max_depth": max_depth,
        "min_samples_leaf": min_samples_leaf,
        "max_features": max_features_val,
        "oob_score": True, # OOB score is available for both
        "random_state": 42,
        "warm_start": True,
        "n_jobs": -1
    }

    if task_type == 'classification':
        criterion = trial.suggest_categorical("criterion", ["gini", "entropy"])
        class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])
        model_class = RandomForestClassifier
        model_params["criterion"] = criterion
        model_params["class_weight"] = class_weight # Only add class_weight for Classifier
    else: # Regression
        criterion = trial.suggest_categorical("criterion", ["squared_error", "absolute_error", "poisson"])
        model_class = RandomForestRegressor
        model_params["criterion"] = criterion # No class_weight for Regressor

    rf_model = model_class(**model_params) # Unpack the dictionary

    if preprocessor:
        X_train_transformed = preprocessor.fit_transform(X_train_raw)
        X_val_transformed = preprocessor.transform(X_val_raw)
    else:
        # Assuming X_train_raw and X_val_raw are already suitable numpy arrays or numeric DFs
        X_train_transformed = X_train_raw.to_numpy()
        X_val_transformed = X_val_raw.to_numpy()


    # 2. Fidelity Schedule and Partial Evaluation
    fidelity_steps = sorted(list(set(
        [max(1, int(n_estimators * p)) for p in [0.02, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]] +
        [max(1, int(n_estimators * 0.01 * i)) for i in range(1, 6)] # Add finer steps for first 5%
    )))
    fidelity_steps = [s for s in fidelity_steps if s > 0]
    fidelity_steps = sorted(list(set(fidelity_steps)))
    fidelity_steps = [s for s in fidelity_steps if s <= n_estimators]
    
    if not fidelity_steps: 
        fidelity_steps = [n_estimators]


    for step_n_estimators in fidelity_steps:
        if step_n_estimators > rf_model.n_estimators:
            rf_model.n_estimators = step_n_estimators
            rf_model.fit(X_train_transformed, y_train_processed)

        # OOB score for regression is R^2. For pruning, we want to maximize it,
        # or minimize negative MSE. Let's stick to maximizing R^2 for consistency.
        current_oob_score = np.nan
        if rf_model.n_estimators > 1 and hasattr(rf_model, 'oob_score_') and not np.isnan(rf_model.oob_score_):
            current_oob_score = rf_model.oob_score_ # This is R^2 for Regressor, which we want to maximize
        elif task_type == 'regression' and rf_model.n_estimators > 1:
            # If oob_score_ is not directly available or reliable for some reason,
            # we could calculate an intermediate MSE on validation, but that's less efficient.
            # Stick to OOB for multi-fidelity.
            pass # Keep as nan, or consider other partial metrics if OOB not suitable

        trial.report(current_oob_score, step=step_n_estimators)

        if trial.should_prune():
            print(f"Trial {trial.number} pruned at {step_n_estimators} estimators (OOB R^2: {current_oob_score:.4f}).")
            raise optuna.exceptions.TrialPruned()

    y_pred = rf_model.predict(X_val_transformed)

    if task_type == 'classification':
        if hasattr(rf_model, 'predict_proba') and len(np.unique(y_val_processed)) == 2:
            y_proba = rf_model.predict_proba(X_val_transformed)[:, 1]
            final_metric = roc_auc_score(y_val_processed, y_proba)
            metric_name = "AUC"
        else:
            final_metric = accuracy_score(y_val_processed, y_pred)
            metric_name = "Accuracy"
    else: # Regression
        final_metric = -mean_squared_error(y_val_processed, y_pred) # Minimize MSE -> Maximize -MSE
        metric_name = "Negative MSE"

    print(f"Trial {trial.number} completed with {metric_name}: {final_metric:.4f}")
    return final_metric


# --- 3. Custom Optuna Pruner Implementation ---

class RandomForestMultiFidelityPruner(optuna.pruners.BasePruner):
    """
    Custom Optuna pruner for Random Forest Hyperparameter Optimization
    using a multi-fidelity approach and learning curve prediction.
    """
    def __init__(self,
                 min_intermediate_steps=3,
                 pruning_quantile=0.75,
                 grace_period_ratio=0.02,
                 predictive_model_type='saturating_curve',
                 debug_mode=True):
        self.min_intermediate_steps = min_intermediate_steps
        self.pruning_quantile = pruning_quantile
        self.grace_period_ratio = grace_period_ratio
        self.predictive_model_type = predictive_model_type
        self.trial_learning_curves = defaultdict(list)
        self.debug_mode = debug_mode

    def prune(self, study: optuna.study.Study, trial: optuna.trial.FrozenTrial) -> bool:
        trial_id = trial.number
        current_step = trial.last_step
        current_value = trial.value

        # Note: For regression, we are maximizing -MSE. So a higher value is better.
        # The pruner's logic remains "maximize" naturally.

        if current_step is None or current_value is None:
            return False 

        # Filter out NaN or very abnormal OOB scores if they occur early
        if np.isnan(current_value) or current_value < -1e6: # Arbitrary large negative threshold for very bad OOB
             if self.debug_mode:
                print(f"[Debug Pruner Trial {trial_id} Step {current_step}]: Current OOB value is NaN or too extreme ({current_value:.4f}). Skipping pruning check.")
             return False # Or could return True if you want to prune on NaN/extreme values

        self.trial_learning_curves[trial_id].append((current_step, current_value))

        if len(self.trial_learning_curves[trial_id]) < self.min_intermediate_steps:
            if self.debug_mode:
                print(f"[Debug Pruner Trial {trial_id} Step {current_step}]: Not enough steps ({len(self.trial_learning_curves[trial_id])}/{self.min_intermediate_steps})")
            return False
        
        # Ensure we have a valid n_estimators in trial.params before calculating grace period
        n_estimators_param = trial.params.get('n_estimators')
        if n_estimators_param is None:
             if self.debug_mode:
                print(f"[Debug Pruner Trial {trial_id} Step {current_step}]: n_estimators not in params. Skipping grace period check.")
             return False

        if current_step < self.grace_period_ratio * n_estimators_param:
            if self.debug_mode:
                print(f"[Debug Pruner Trial {trial_id} Step {current_step}]: In grace period ({current_step}/{self.grace_period_ratio * n_estimators_param:.1f})")
            return False

        current_lc_data = self.trial_learning_curves[trial_id]
        steps = np.array([s for s, _ in current_lc_data])
        values = np.array([v for _, v in current_lc_data])
        
        if len(steps) < 3: 
            return False

        predicted_final_performance = self._predict_final_performance(steps, values, n_estimators_param, trial_id)

        completed_trial_values = [
            t.value for t in study.trials
            if t.state == optuna.trial.TrialState.COMPLETE and t.value is not None
        ]
        
        if not completed_trial_values:
            if self.debug_mode:
                print(f"[Debug Pruner Trial {trial_id} Step {current_step}]: No complete trials yet.")
            return False

        threshold = np.quantile(completed_trial_values, self.pruning_quantile)
        
        if self.debug_mode:
            print(f"[Debug Pruner Trial {trial_id} Step {current_step}]: Pred: {predicted_final_performance:.4f}, Threshold: {threshold:.4f}, Current OOB R^2: {current_value:.4f}")
            if predicted_final_performance < threshold:
                print(f"                                   -> WILL PRUNE (Predicted < Threshold)")

        if predicted_final_performance < threshold:
            return True
        
        return False

    def _predict_final_performance(self, steps, values, max_n_estimators, trial_number):
        """
        Predicts the final performance based on observed learning curve using a saturating curve.
        """
        if self.predictive_model_type == 'saturating_curve':
            # For regression (Negative MSE), we are maximizing. R^2 is also maximized.
            # So the saturating curve that goes upwards towards an asymptote is still appropriate.
            def saturating_curve(x, A, B, C):
                return A * (1 - np.exp(-B * x)) + C

            # Initial guess for parameters (A, B, C)
            # A: asymptote (max performance). Should be towards 0 (for -MSE) or 1 (for R^2)
            # B: positive learning rate
            # C: starting value
            
            # Adjusting initial guess and bounds for regression (R^2 or -MSE)
            # If maximizing R^2, max is 1. If maximizing -MSE, max is 0 (since MSE >= 0)
            target_upper_bound = 1.0 # For R^2
            if study.direction == "minimize": # If we were minimizing MSE directly, not -MSE
                 target_upper_bound = 0.0 # For MSE
            
            initial_guess_A = max(values.max() * 1.05, 0.0) # A should be >= 0 for R^2 / -MSE
            initial_guess_B = 0.05
            initial_guess_C = values[0] if len(values) > 0 else (values.min() if len(values) > 0 else -1.0) # Start from initial value or a reasonable guess

            p0 = [initial_guess_A, initial_guess_B, initial_guess_C]

            try:
                # Bounds for A, B, C
                # A: (values.max(), target_upper_bound) - Asymptote at least max observed, up to 1.0 (for R^2) or 0 (for -MSE)
                # B: (0.0001, 0.5] - Rate should be positive, not too high
                # C: [values.min()*1.2, values.max()*1.2] - Initial value within reasonable range. Can be negative for R^2.
                bounds_lower = [values.max() * 0.8, 0.0001, min(values.min() * 1.2, values.max() * 0.8)]
                bounds_upper = [target_upper_bound, 0.5, max(values.min() * 0.8, values.max() * 1.2)]
                
                # Make sure lower bound for A is not greater than upper bound
                if bounds_lower[0] > bounds_upper[0]:
                    bounds_lower[0] = bounds_upper[0] * 0.9 # Adjust lower bound to be slightly less than upper

                popt, pcov = curve_fit(saturating_curve, steps, values, p0=p0, maxfev=5000,
                                       bounds=(bounds_lower, bounds_upper))
                
                predicted_value = saturating_curve(max_n_estimators, *popt)
                
                # Cap the prediction within realistic bounds (min observed to target_upper_bound)
                predicted_value = max(values.min(), min(target_upper_bound, predicted_value))

                if self.debug_mode:
                    print(f"[Debug Pruner Trial {trial_number} Step {steps[-1]}]: Fit successful. Predicted {predicted_value:.4f}")
                return predicted_value
            except (RuntimeError, ValueError) as e:
                if self.debug_mode:
                    print(f"[Debug Pruner Trial {trial_number} Step {steps[-1]}]: Curve fit failed ({e}). Falling back to last value.")
                return values[-1] 

        return values[-1] 

# --- 4. Running the Optuna Study ---

# --- Configuration Parameters for ABALONE dataset ---
N_TRIALS = 100
RANDOM_STATE = 42
DATASET_PATH = 'datasets/dataset_187_abalone.arff' # <--- UPDATED for Abalone
TARGET_COLUMN = 'Class_number_of_rings' # <--- UPDATED for Abalone
TASK_TYPE = 'regression' # <--- UPDATED to REGRESSION

# --- Load and Preprocess Data ---
# Note: y will be converted to numeric inside load_and_preprocess_arff now for 'Class_number_of_rings'
X_raw, y_raw, feature_names, categorical_features = load_and_preprocess_arff(
    DATASET_PATH, target_column_name=TARGET_COLUMN
)

# Ensure y is numeric for regression (it should be handled by load_and_preprocess_arff now, but double check)
if y_raw.dtype == 'object':
    print(f"Warning: Target column '{TARGET_COLUMN}' is still object type after loading. Attempting final conversion to int.")
    y_processed = y_raw.astype(int)
else:
    y_processed = y_raw

print(f"Final target dtype for regression: {y_processed.dtype}")


# Categorical features are now explicitly passed from load_and_preprocess_arff
numerical_feats = X_raw.select_dtypes(include=np.number).columns.tolist()
# categorical_feats should now contain ['Sex']
# Ensure data_preprocessor is called with the dynamically detected lists
data_preprocessor = get_preprocessor(numerical_feats, categorical_features)

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_raw, y_processed, test_size=0.2, random_state=RANDOM_STATE
    # No stratify for regression
)

# --- Initialize Custom Pruner with aggressive and DEBUG settings (reused) ---
custom_pruner = RandomForestMultiFidelityPruner(
    min_intermediate_steps=3,
    pruning_quantile=0.75,      
    grace_period_ratio=0.02,    
    predictive_model_type='saturating_curve',
    debug_mode=True             
)

# --- Create Optuna Study ---
# Direction is "maximize" because we are maximizing negative MSE (equivalent to minimizing MSE)
study = optuna.create_study(
    direction="maximize", # Maximize -MSE (or R^2 from OOB for pruner)
    pruner=custom_pruner,
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
    study_name="abalone_rf_hpo_study_debug_pruner", # <--- UPDATED Study Name
    storage="sqlite:///abalone_rf_hpo_debug.db",
    load_if_exists=True     # <--- UPDATED DB File
)

objective_with_args = functools.partial(
    objective,
    X_train_raw=X_train_split,
    y_train_processed=y_train_split, # y is now processed for regression
    X_val_raw=X_val_split,
    y_val_processed=y_val_split,     # y is now processed for regression
    task_type=TASK_TYPE,
    preprocessor=data_preprocessor,
    max_n_estimators_upper_bound=250 # Keeping this value
)

print(f"Starting Optuna study with {N_TRIALS} trials for Abalone dataset (DEBUG PRUNER)...")
study.optimize(objective_with_args, n_trials=N_TRIALS, show_progress_bar=True)

# --- Print Results ---
print("\nOptimization finished.")
print(f"Best trial value (Negative MSE): {study.best_trial.value:.4f}")
print("Best hyperparameters:")
for key, value in study.best_trial.params.items():
    print(f"  {key}: {value}")

pruned_trials = study.get_trials(deepcopy=False, states=[optuna.trial.TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[optuna.trial.TrialState.COMPLETE])
print(f"Number of pruned trials: {len(pruned_trials)}") 
print(f"Number of complete trials: {len(complete_trials)}")


# --- 5. Final Model Evaluation ---

def evaluate_final_model(best_params, X_train_full, y_train_full, X_test_full, y_test_full,
                         task_type='classification', preprocessor=None):
    """
    Trains the best model from HPO on the full training data and evaluates on a test set.
    """
    start_time = time.time()

    if preprocessor:
        X_train_transformed = preprocessor.fit_transform(X_train_full)
        X_test_transformed = preprocessor.transform(X_test_full)
    else:
        X_train_transformed = X_train_full.to_numpy()
        X_test_transformed = X_test_full.to_numpy()

    if task_type == 'classification':
        model = RandomForestClassifier(random_state=42, n_jobs=-1, **best_params)
    else: # Regression
        model = RandomForestRegressor(random_state=42, n_jobs=-1, **best_params)

    model.fit(X_train_transformed, y_train_full)
    fit_time = time.time() - start_time

    y_pred = model.predict(X_test_transformed)

    if task_type == 'classification':
        if hasattr(model, 'predict_proba') and len(np.unique(y_test_full)) == 2:
            y_proba = model.predict_proba(X_test_full)[:, 1] # Use transformed X for predict_proba too
            auc_score = roc_auc_score(y_test_full, y_proba)
            print(f"Final Model AUC: {auc_score:.4f}")
            final_metric = auc_score
            metric_name = "AUC"
        else:
            final_metric = accuracy_score(y_test_full, y_pred)
            metric_name = "Accuracy"
    else: # Regression
        final_metric = mean_squared_error(y_test_full, y_pred) # Report actual MSE, not negative
        print(f"Final Model MSE: {final_metric:.4f}")
        metric_name = "MSE"

    print(f"\n--- Final Model Evaluation ---")
    print(f"Metric ({metric_name}): {final_metric:.4f}")
    print(f"Training time for final model: {fit_time:.2f} seconds")

    return model, final_metric


X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    X_raw, y_processed, test_size=0.2, random_state=RANDOM_STATE
    # No stratify for regression
)

if study.best_trial:
    best_params = study.best_trial.params
    final_model, final_score = evaluate_final_model(
        best_params, X_train_full, y_train_full, X_test_full, y_test_full,
        task_type=TASK_TYPE, preprocessor=data_preprocessor
    )

    joblib.dump(final_model, 'best_rf_abalone_model_debug.pkl') # <--- UPDATED save name
    joblib.dump(data_preprocessor, 'data_preprocessor_abalone_debug.pkl') # <--- UPDATED save name
    print("Best model and preprocessor for Abalone saved (debug pruner run).")

Loaded datasets/dataset_187_abalone.arff. Shape: (4177, 9)
Target column: 'Class_number_of_rings' (dtype: int64)
Numerical features: ['Length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight']
Categorical features detected: ['Sex']
Final target dtype for regression: int64
Starting Optuna study with 100 trials for Abalone dataset (DEBUG PRUNER)...


  0%|          | 0/100 [00:00<?, ?it/s]

Trial 1 completed with Negative MSE: -4.8539
Trial 2 completed with Negative MSE: -5.3605
Trial 3 completed with Negative MSE: -4.9107
Trial 4 completed with Negative MSE: -4.8373
Trial 5 completed with Negative MSE: -6.4833
Trial 6 completed with Negative MSE: -4.8702
Trial 7 completed with Negative MSE: -4.8263
Trial 8 completed with Negative MSE: -4.9030
Trial 9 completed with Negative MSE: -4.9999
Trial 10 completed with Negative MSE: -5.0138
Trial 11 completed with Negative MSE: -5.1952
Trial 12 completed with Negative MSE: -4.8412
Trial 13 completed with Negative MSE: -4.9647
Trial 14 completed with Negative MSE: -4.8755
Trial 15 completed with Negative MSE: -4.8445
Trial 16 completed with Negative MSE: -4.9113
Trial 17 completed with Negative MSE: -6.1034
Trial 18 completed with Negative MSE: -4.8202
Trial 19 completed with Negative MSE: -4.8433
Trial 20 completed with Negative MSE: -4.8516
Trial 21 completed with Negative MSE: -4.9608
Trial 22 completed with Negative MSE: -4.81

In [6]:
# --- Optional: Optuna Visualizations ---
import optuna.visualization as ov
fig1 = ov.plot_optimization_history(study)
fig1.show()
fig2 = ov.plot_intermediate_values(study)
fig2.show()
fig3 = ov.plot_param_importances(study)
fig3.show()