## 4. Ensemble Methods: Aggregating Models for Superior Performance

**Techniques**:
- **Voting Regressor**: Averages predictions from multiple models (reduces variance)
- **Stacking Regressor**: Uses a meta-learner to combine base model predictions (captures complex patterns)

In [None]:
# --- 4.1 Voting Regressor (Averaging) ---
# Builds a VotingRegressor from the three base models with the highest test R²,
# fits it behind the shared preprocessor, evaluates it on the test split and
# registers the outcome in `model_results` / `best_estimators`.

# Ensure numeric types for comparison
comparison_df['R2_Test'] = pd.to_numeric(comparison_df['R2_Test'])

# Select top 3 performing *base* models for the ensemble. Candidates are limited
# to names present in `base_models` so that rows without a base configuration
# (e.g. ensemble rows if the comparison table is rebuilt and this cell re-run)
# cannot be picked and trigger a KeyError below.
base_candidates = comparison_df.loc[comparison_df.index.isin(list(base_models))]
top_models = base_candidates.nlargest(3, 'R2_Test').index.tolist()
print(f"Building Voting Ensemble with: {top_models}")

# Build fresh, unfitted copies of the selected models configured with their
# tuned hyper-parameters (VotingRegressor refits every estimator itself).
param_prefix = 'model__'
voting_estimators = []
for name in top_models:
    config = base_models[name]
    best_params = model_results[name]['Best_Params']

    # Strip only the *leading* pipeline-step prefix; nested keys such as
    # 'model__sub_model__alpha' must keep their inner 'sub_model__' part intact.
    model_params = {
        (k[len(param_prefix):] if k.startswith(param_prefix) else k): v
        for k, v in best_params.items()
    }

    # Re-create the estimator from its shallow constructor params, then apply
    # the tuned values through set_params, which also understands nested
    # 'a__b' keys that a constructor call would reject.
    model = type(config['model'])(**config['model'].get_params(deep=False))
    model.set_params(**model_params)

    # Fixed seed for reproducibility on estimators that expose one.
    if 'random_state' in model.get_params():
        model.set_params(random_state=42)
    voting_estimators.append((name, model))

# Create Voting Regressor Pipeline
# NOTE: `preprocessor` is the shared object, not a copy; fitting this pipeline
# (re)fits it in place.
voting_pipeline = Pipeline([
    ('prep', preprocessor),
    ('voting', VotingRegressor(estimators=voting_estimators))
])

print("\nTraining Voting Regressor...")
voting_pipeline.fit(X_train, y_train)

# Evaluate Voting Regressor
# R² is computed on the target as modelled; MAE/RMSE are computed after
# np.expm1, i.e. on the back-transformed scale (presumably y was log1p-
# transformed upstream — confirm against the data-preparation cell).
y_pred_voting = voting_pipeline.predict(X_test)
r2_voting = r2_score(y_test, y_pred_voting)
mae_voting = mean_absolute_error(np.expm1(y_test), np.expm1(y_pred_voting))
rmse_voting = np.sqrt(mean_squared_error(np.expm1(y_test), np.expm1(y_pred_voting)))

print(f"\nVoting Regressor Results:")
print(f"  R² Score: {r2_voting:.4f}")
print(f"  MAE: {mae_voting:,.0f} TND")
print(f"  RMSE: {rmse_voting:,.0f} TND")

# Store results (cross-validation diagnostics are not computed for the
# ensemble, hence NaN placeholders to keep the same keys as the base models)
model_results['Voting_Ensemble'] = {
    'R2_Test': r2_voting,
    'MAE': mae_voting,
    'RMSE': rmse_voting,
    'CV_Train_R2': np.nan,
    'CV_Test_R2': np.nan,
    'CV_Std': np.nan,
    'Bias_Indicator': np.nan,
    'Variance_Indicator': np.nan,
    'Best_Params': {'estimators': top_models}
}
best_estimators['Voting_Ensemble'] = voting_pipeline

In [None]:
# --- 4.2 Stacking Regressor (Meta-Learning) ---
# Builds a StackingRegressor from three tuned base models with a Ridge
# meta-learner, fits it behind the shared preprocessor, evaluates it on the
# test split and registers the outcome in `model_results` / `best_estimators`.
print("Building Stacking Regressor with Ridge meta-learner...")

# Base estimators for the stack; Ridge is used as the meta-learner.
# Defined once so the list recorded in the results cannot drift from the
# estimators that were actually trained.
stacking_base_names = ['Ridge', 'RandomForest', 'GradientBoosting']

param_prefix = 'model__'
stacking_estimators = []
for name in stacking_base_names:
    config = base_models[name]
    best_params = model_results[name]['Best_Params']

    # Strip only the *leading* pipeline-step prefix; nested keys such as
    # 'model__sub_model__alpha' must keep their inner 'sub_model__' part intact.
    model_params = {
        (k[len(param_prefix):] if k.startswith(param_prefix) else k): v
        for k, v in best_params.items()
    }

    # Fresh, unfitted copy from the shallow constructor params (a deep
    # get_params() would include nested 'a__b' keys the constructor rejects),
    # then apply the tuned values.
    model = type(config['model'])(**config['model'].get_params(deep=False))
    model.set_params(**model_params)

    # Fixed seed for reproducibility on estimators that expose one
    # (same check as the voting cell).
    if 'random_state' in model.get_params():
        model.set_params(random_state=42)
    stacking_estimators.append((name, model))

# Create Stacking Regressor with Ridge as final estimator; the meta-learner is
# trained on 5-fold cross-validated predictions of the base estimators.
stacking_reg = StackingRegressor(
    estimators=stacking_estimators,
    final_estimator=Ridge(alpha=1.0),
    cv=5,
    n_jobs=-1
)

# NOTE: `preprocessor` is the shared object, not a copy; fitting this pipeline
# (re)fits it in place.
stacking_pipeline = Pipeline([
    ('prep', preprocessor),
    ('stacking', stacking_reg)
])

print("Training Stacking Regressor (this may take a moment)...")
stacking_pipeline.fit(X_train, y_train)

# Evaluate Stacking Regressor
# R² is computed on the target as modelled; MAE/RMSE are computed after
# np.expm1, i.e. on the back-transformed scale (presumably y was log1p-
# transformed upstream — confirm against the data-preparation cell).
y_pred_stacking = stacking_pipeline.predict(X_test)
r2_stacking = r2_score(y_test, y_pred_stacking)
mae_stacking = mean_absolute_error(np.expm1(y_test), np.expm1(y_pred_stacking))
rmse_stacking = np.sqrt(mean_squared_error(np.expm1(y_test), np.expm1(y_pred_stacking)))

print(f"\nStacking Regressor Results:")
print(f"  R² Score: {r2_stacking:.4f}")
print(f"  MAE: {mae_stacking:,.0f} TND")
print(f"  RMSE: {rmse_stacking:,.0f} TND")

# Store results (cross-validation diagnostics are not computed for the
# ensemble, hence NaN placeholders to keep the same keys as the base models)
model_results['Stacking_Ensemble'] = {
    'R2_Test': r2_stacking,
    'MAE': mae_stacking,
    'RMSE': rmse_stacking,
    'CV_Train_R2': np.nan,
    'CV_Test_R2': np.nan,
    'CV_Std': np.nan,
    'Bias_Indicator': np.nan,
    'Variance_Indicator': np.nan,
    'Best_Params': {'base_estimators': list(stacking_base_names), 'meta': 'Ridge'}
}
best_estimators['Stacking_Ensemble'] = stacking_pipeline