In [1]:
import pandas as pd
import numpy as np
from scipy.io import arff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import functools
import warnings
import joblib
import time
from collections import defaultdict
from scipy.optimize import curve_fit
from scipy.stats import t
import json

# Suppress warnings
warnings.filterwarnings("ignore", category=UserWarning, module='sklearn')
optuna.logging.set_verbosity(optuna.logging.WARNING)

# --- 1. RandomForestMultiFidelityPruner Class ---
class RandomForestMultiFidelityPruner(optuna.pruners.BasePruner):
    def __init__(self, 
                 min_intermediate_steps=3, 
                 pruning_quantile=0.5, 
                 grace_period_steps=2, 
                 predictive_model_type='saturating_curve', 
                 debug_mode=False):
        self.min_intermediate_steps = min_intermediate_steps
        self.pruning_quantile = pruning_quantile
        self.grace_period_steps = grace_period_steps
        self.predictive_model_type = predictive_model_type
        self.trial_learning_curves = defaultdict(list)
        self.debug_mode = debug_mode
        self.predictions_log = []

    def prune(self, study, trial):
        trial_id = trial.number
        intermediate_values = trial.intermediate_values
        
        if self.debug_mode:
            print(f"\n=== DEBUGGING TRIAL {trial_id} ===")
            print(f"Intermediate values: {intermediate_values}")
        
        if not intermediate_values:
            return False
        
        current_step = max(intermediate_values.keys())
        current_value = intermediate_values[current_step]
        
        if trial_id not in self.trial_learning_curves:
            self.trial_learning_curves[trial_id] = []
        if not self.trial_learning_curves[trial_id] or \
           self.trial_learning_curves[trial_id][-1][0] != current_step:
            self.trial_learning_curves[trial_id].append((current_step, current_value))
        
        if len(intermediate_values) < self.min_intermediate_steps or current_step < self.grace_period_steps:
            return False
        
        steps = np.array(list(intermediate_values.keys()))
        values = np.array(list(intermediate_values.values()))
        
        try:
            predicted_final_performance = self._predict_final_performance(
                steps, values, trial.params.get('n_estimators', 250), trial_id)
        except Exception:
            return False
        
        completed_trial_values = [
            t.value for t in study.trials 
            if t.state == optuna.trial.TrialState.COMPLETE and t.value is not None
        ]
        
        if not completed_trial_values:
            return False
        
        threshold = np.quantile(completed_trial_values, self.pruning_quantile)
        should_prune = predicted_final_performance < threshold
        
        self.predictions_log.append({
            'trial_id': trial_id,
            'step': current_step,
            'predicted': predicted_final_performance,
            'threshold': threshold,
            'pruned': should_prune,
            'intermediate_values': dict(intermediate_values)
        })
        
        return should_prune

    def _predict_final_performance(self, steps, values, max_n_estimators, trial_id):
        if self.predictive_model_type == 'saturating_curve':
            return self._fit_saturating_curve(steps, values, trial_id)
        elif self.predictive_model_type == 'linear_extrapolation':
            return self._linear_extrapolation(steps, values)
        return values[-1]
    
    def _fit_saturating_curve(self, steps, values, trial_id):
        def saturating_func(x, a, b, c):
            return a - b * np.exp(-c * x)
        
        try:
            p0 = [max(values), max(values) - min(values), 0.1]
            popt, _ = curve_fit(saturating_func, steps, values, p0=p0, maxfev=1000)
            final_step = max(steps) * 3
            return saturating_func(final_step, *popt)
        except Exception:
            return self._linear_extrapolation(steps, values)
    
    def _linear_extrapolation(self, steps, values):
        if len(steps) < 2:
            return values[-1]
        slope = (values[-1] - values[-2]) / (steps[-1] - steps[-2])
        target_step = steps[-1] * 2
        return values[-1] + slope * (target_step - steps[-1])
    
    def get_prediction_analysis(self):
        return self.predictions_log

# --- 2. Dataset Loading and Preprocessing ---
def load_and_preprocess_arff(filepath, target_column_name=None):
    data, meta = arff.loadarff(filepath)
    df = pd.DataFrame(data)
    for col in df.columns:
        if df[col].dtype == 'object':
            try:
                df[col] = df[col].apply(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
            except (UnicodeDecodeError, AttributeError):
                pass
    if target_column_name is None:
        target_column_name = df.columns[-1]
    X = df.drop(columns=[target_column_name])
    y = df[target_column_name]
    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_features = X.select_dtypes(include=np.number).columns.tolist()
    feature_names = X.columns.tolist()
    categorical_features_idx = [feature_names.index(col) for col in categorical_features]
    return X, y, feature_names, categorical_features_idx

def get_preprocessor(numerical_features, categorical_features):
    from sklearn.preprocessing import OneHotEncoder
    transformers = []
    if numerical_features:
        transformers.append(('num', 'passthrough', numerical_features))
    if categorical_features:
        transformers.append(('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features))
    return ColumnTransformer(transformers=transformers, remainder='drop')

# --- 3. Objective Function with Timing ---
def objective(trial, X_train_raw, y_train_encoded, X_val_raw, y_val_encoded,
              task_type='classification', preprocessor=None, max_n_estimators_upper_bound=250,
              debug_mode=False, timing_data=None):
    start_time = time.time()
    
    n_estimators = trial.suggest_int("n_estimators", 50, max_n_estimators_upper_bound)
    max_depth = trial.suggest_int("max_depth", 2, 32)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)
    max_features_val = trial.suggest_categorical("max_features", ['sqrt', 'log2', 0.5, 0.7, 1.0])
    criterion = trial.suggest_categorical("criterion", ["gini", "entropy"])
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])

    rf_model = RandomForestClassifier(
        n_estimators=1,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features_val,
        criterion=criterion,
        class_weight=class_weight,
        oob_score=True,
        random_state=42,
        warm_start=True,
        n_jobs=-1
    )

    if preprocessor:
        X_train_processed = preprocessor.fit_transform(X_train_raw)
        X_val_processed = preprocessor.transform(X_val_raw)
    else:
        X_train_processed = X_train_raw.to_numpy()
        X_val_processed = X_val_raw.to_numpy()

    fidelity_schedule = [0.02, 0.05, 0.1, 0.2, 0.5, 1.0]
    fidelity_steps = sorted(list(set([max(1, int(n_estimators * p)) for p in fidelity_schedule])))

    for step, current_n_estimators in enumerate(fidelity_steps):
        if current_n_estimators > rf_model.n_estimators:
            rf_model.n_estimators = current_n_estimators
            rf_model.fit(X_train_processed, y_train_encoded)

        if rf_model.n_estimators > 1 and hasattr(rf_model, 'oob_score_'):
            current_oob_score = rf_model.oob_score_
        else:
            current_oob_score = np.nan

        trial.report(current_oob_score, step)

        if trial.should_prune():
            break

    if not trial.should_prune():
        y_pred = rf_model.predict(X_val_processed)
        if len(np.unique(y_val_encoded)) == 2:
            y_proba = rf_model.predict_proba(X_val_processed)[:, 1]
            final_metric = roc_auc_score(y_val_encoded, y_proba)
        else:
            final_metric = accuracy_score(y_val_encoded, y_pred)
    else:
        final_metric = current_oob_score

    trial_time = time.time() - start_time
    timing_data.append({
        'trial_id': trial.number,
        'time_seconds': trial_time,
        'pruned': trial.should_prune(),
        'final_metric': final_metric
    })

    return final_metric

# --- 4. Baseline Objective Function ---
def objective_baseline(trial, X_train_raw, y_train_encoded, X_val_raw, y_val_encoded,
                      task_type='classification', preprocessor=None, max_n_estimators_upper_bound=250,
                      debug_mode=False, timing_data=None):
    start_time = time.time()
    
    n_estimators = trial.suggest_int("n_estimators", 50, max_n_estimators_upper_bound)
    max_depth = trial.suggest_int("max_depth", 2, 32)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)
    max_features_val = trial.suggest_categorical("max_features", ['sqrt', 'log2', 0.5, 0.7, 1.0])
    criterion = trial.suggest_categorical("criterion", ["gini", "entropy"])
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])

    rf_model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features_val,
        criterion=criterion,
        class_weight=class_weight,
        random_state=42,
        n_jobs=-1
    )

    if preprocessor:
        X_train_processed = preprocessor.fit_transform(X_train_raw)
        X_val_processed = preprocessor.transform(X_val_raw)
    else:
        X_train_processed = X_train_raw.to_numpy()
        X_val_processed = X_val_raw.to_numpy()

    rf_model.fit(X_train_processed, y_train_encoded)
    y_pred = rf_model.predict(X_val_processed)

    if len(np.unique(y_val_encoded)) == 2:
        y_proba = rf_model.predict_proba(X_val_processed)[:, 1]
        final_metric = roc_auc_score(y_val_encoded, y_proba)
    else:
        final_metric = accuracy_score(y_val_encoded, y_pred)

    trial_time = time.time() - start_time
    timing_data.append({
        'trial_id': trial.number,
        'time_seconds': trial_time,
        'pruned': False,
        'final_metric': final_metric
    })

    return final_metric

# --- 5. Final Model Evaluation ---
def evaluate_final_model(best_params, X_train_full, y_train_full, X_test_full, y_test_full,
                         task_type='classification', preprocessor=None):
    start_time = time.time()
    if preprocessor:
        X_train_processed = preprocessor.fit_transform(X_train_full)
        X_test_processed = preprocessor.transform(X_test_full)
    else:
        X_train_processed = X_train_full.to_numpy()
        X_test_processed = X_test_full.to_numpy()
    model = RandomForestClassifier(random_state=42, n_jobs=-1, **best_params)
    model.fit(X_train_processed, y_train_full)
    fit_time = time.time() - start_time
    y_pred = model.predict(X_test_processed)
    if len(np.unique(y_test_full)) == 2:
        y_proba = model.predict_proba(X_test_processed)[:, 1]
        final_metric = roc_auc_score(y_test_full, y_proba)
    else:
        final_metric = accuracy_score(y_test_full, y_pred)
    return model, final_metric, fit_time

# --- 6. Confidence Interval Calculation ---
def calculate_confidence_interval(data, confidence=0.95):
    n = len(data)
    mean = np.mean(data)
    std_err = np.std(data, ddof=1) / np.sqrt(n)
    t_crit = t.ppf((1 + confidence) / 2, n - 1)
    margin_error = t_crit * std_err
    return mean, margin_error, (mean - margin_error, mean + margin_error)

# --- 7. Main Experiment ---
N_RUNS = 5
N_TRIALS = 100
RANDOM_STATE = 42
DATASET_PATH = 'datasets/php0iVrYT.arff'
TARGET_COLUMN = 'Class'
TASK_TYPE = 'classification'
DEBUG_MODE = False

# Load and preprocess data
X_raw, y_raw, _, _ = load_and_preprocess_arff(DATASET_PATH, target_column_name=TARGET_COLUMN)
le = LabelEncoder()
y_encoded = le.fit_transform(y_raw)
numerical_feats = X_raw.columns.tolist()
categorical_feats = []
data_preprocessor = get_preprocessor(numerical_feats, categorical_feats)
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_raw, y_encoded, test_size=0.2, random_state=RANDOM_STATE, stratify=y_encoded
)
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    X_raw, y_encoded, test_size=0.2, random_state=RANDOM_STATE, stratify=y_encoded
)

# Store results
results_pruned = []
results_baseline = []
timing_data_pruned = []
timing_data_baseline = []

for run in range(N_RUNS):
    seed = RANDOM_STATE + run
    print(f"\nRun {run + 1}/{N_RUNS} (Seed: {seed})")
    
    # Pruned Study
    timing_data_pruned_run = []
    custom_pruner = RandomForestMultiFidelityPruner(
        min_intermediate_steps=2,
        pruning_quantile=0.25,
        grace_period_steps=1,
        predictive_model_type='linear_extrapolation',
        debug_mode=DEBUG_MODE
    )
    study_pruned = optuna.create_study(
        direction="maximize",
        pruner=custom_pruner,
        sampler=optuna.samplers.TPESampler(seed=seed),
        study_name=f"blood_transfusion_rf_hpo_pruned_run_{run}",
        storage=f"sqlite:///blood_transfusion_rf_hpo_pruned_run_{run}.db",
        load_if_exists=True
    )
    objective_with_args = functools.partial(
        objective,
        X_train_raw=X_train_split,
        y_train_encoded=y_train_split,
        X_val_raw=X_val_split,
        y_val_encoded=y_val_split,
        task_type=TASK_TYPE,
        preprocessor=data_preprocessor,
        max_n_estimators_upper_bound=250,
        debug_mode=DEBUG_MODE,
        timing_data=timing_data_pruned_run
    )
    start_time = time.time()
    study_pruned.optimize(objective_with_args, n_trials=N_TRIALS, show_progress_bar=True)
    total_time_pruned = time.time() - start_time
    pruned_trials = len(study_pruned.get_trials(states=[optuna.trial.TrialState.PRUNED]))
    complete_trials = len(study_pruned.get_trials(states=[optuna.trial.TrialState.COMPLETE]))
    model, final_score, _ = evaluate_final_model(
        study_pruned.best_trial.params, X_train_full, y_train_full, X_test_full, y_test_full,
        task_type=TASK_TYPE, preprocessor=data_preprocessor
    )
    timing_data_pruned.append({
        'run': run,
        'total_time': total_time_pruned,
        'pruned_trials': pruned_trials,
        'complete_trials': complete_trials
    })
    results_pruned.append(final_score)
    
    # Baseline Study
    timing_data_baseline_run = []
    study_baseline = optuna.create_study(
        direction="maximize",
        pruner=optuna.pruners.NopPruner(),
        sampler=optuna.samplers.TPESampler(seed=seed),
        study_name=f"blood_transfusion_rf_hpo_baseline_run_{run}",
        storage=f"sqlite:///blood_transfusion_rf_hpo_baseline_run_{run}.db",
        load_if_exists=True
    )
    objective_baseline_with_args = functools.partial(
        objective_baseline,
        X_train_raw=X_train_split,
        y_train_encoded=y_train_split,
        X_val_raw=X_val_split,
        y_val_encoded=y_val_split,
        task_type=TASK_TYPE,
        preprocessor=data_preprocessor,
        max_n_estimators_upper_bound=250,
        debug_mode=DEBUG_MODE,
        timing_data=timing_data_baseline_run
    )
    start_time = time.time()
    study_baseline.optimize(objective_baseline_with_args, n_trials=N_TRIALS, show_progress_bar=True)
    total_time_baseline = time.time() - start_time
    model, final_score, _ = evaluate_final_model(
        study_baseline.best_trial.params, X_train_full, y_train_full, X_test_full, y_test_full,
        task_type=TASK_TYPE, preprocessor=data_preprocessor
    )
    timing_data_baseline.append({
        'run': run,
        'total_time': total_time_baseline,
        'pruned_trials': 0,
        'complete_trials': N_TRIALS
    })
    results_baseline.append(final_score)

# --- 8. Analyze Results ---
mean_pruned, margin_pruned, ci_pruned = calculate_confidence_interval(results_pruned)
mean_baseline, margin_baseline, ci_baseline = calculate_confidence_interval(results_baseline)
mean_time_pruned = np.mean([run['total_time'] for run in timing_data_pruned])
mean_time_baseline = np.mean([run['total_time'] for run in timing_data_baseline])
speedup_factor = mean_time_baseline / mean_time_pruned
mean_pruned_trials = np.mean([run['pruned_trials'] for run in timing_data_pruned])
mean_complete_trials = np.mean([run['complete_trials'] for run in timing_data_pruned])

print("\n=== Results Summary ===")
print(f"Pruned Runs (AUC): Mean: {mean_pruned:.4f}, 95% CI: [{ci_pruned[0]:.4f}, {ci_pruned[1]:.4f}]")
print(f"Baseline Runs (AUC): Mean: {mean_baseline:.4f}, 95% CI: [{ci_baseline[0]:.4f}, {ci_baseline[1]:.4f}]")
print(f"Mean Total Time (Pruned): {mean_time_pruned:.2f} seconds")
print(f"Mean Total Time (Baseline): {mean_time_baseline:.2f} seconds")
print(f"Speedup Factor: {speedup_factor:.2f}x")
print(f"Pruning Rate: {(mean_pruned_trials / N_TRIALS * 100):.1f}%")

# Save results
results_summary = {
    'pruned': {'auc_scores': results_pruned, 'mean_auc': mean_pruned, 'ci_95': ci_pruned},
    'baseline': {'auc_scores': results_baseline, 'mean_auc': mean_baseline, 'ci_95': ci_baseline},
    'speedup_factor': speedup_factor,
    'pruning_rate': mean_pruned_trials / N_TRIALS
}
with open('blood_transfusion_results.json', 'w') as f:
    json.dump(results_summary, f, indent=2)



Run 1/5 (Seed: 42)


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]


Run 2/5 (Seed: 43)


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]


Run 3/5 (Seed: 44)


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]


Run 4/5 (Seed: 45)


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]


Run 5/5 (Seed: 46)


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]


=== Results Summary ===
Pruned Runs (AUC): Mean: 0.7972, 95% CI: [0.7941, 0.8004]
Baseline Runs (AUC): Mean: 0.8005, 95% CI: [0.7990, 0.8020]
Mean Total Time (Pruned): 38.44 seconds
Mean Total Time (Baseline): 29.19 seconds
Speedup Factor: 0.76x
Pruning Rate: 0.0%


In [None]:
# --- Visualize Results ---
# Create a bar chart comparing AUC scores across runs
print("\nCreating comparison chart for AUC scores across runs...")
# --- 9. Chart ---
# Placeholder data (replace with actual results_pruned and results_baseline after running)
```chartjs
{
  "type": "bar",
  "data": {
    "labels": ["Run 1", "Run 2", "Run 3", "Run 4", "Run 5"],
    "datasets": [
      {
        "label": "Pruned (AUC)",
        "data": [0.0, 0.0, 0.0, 0.0, 0.0],
        "backgroundColor": "rgba(54, 162, 235, 0.6)",
        "borderColor": "rgba(54, 162, 235, 1)",
        "borderWidth": 1
      },
      {
        "label": "Baseline (AUC)",
        "data": [0.0, 0.0, 0.0, 0.0, 0.0],
        "backgroundColor": "rgba(255, 99, 132, 0.6)",
        "borderColor": "rgba(255, 99, 132, 1)",
        "borderWidth": 1
      }
    ]
  },
  "options": {
    "scales": {
      "y": {"beginAtZero": false, "title": {"display": true, "text": "AUC Score"}},
      "x": {"title": {"display": true, "text": "Run"}}
    },
    "plugins": {
      "legend": {"display": true},
      "title": {"display": true, "text": "AUC Scores: Pruned vs Baseline"}
    }
  }
}

In [3]:
# --- Optional: Optuna Visualizations ---
# To use these, you need to install plotly and kaleido: pip install plotly kaleido
import optuna.visualization as ov
fig1 = ov.plot_optimization_history(study)
fig1.show()
fig2 = ov.plot_intermediate_values(study)
fig2.show()
fig3 = ov.plot_param_importances(study)
fig3.show()