In [None]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class EnsembleAggregator(BaseEstimator, TransformerMixin):
    """
    Ensemble aggregator for combining multiple models.

    Parameters
    ----------
    method : str
        Name of the ensemble method. One of:
        'VotingClassifierAggregator', 'AveragingRegressor',
        'StackingAggregator', 'BlendingAggregator'.
    config : dict
        Configuration dictionary for the chosen method.
        Must contain at least 'target_column' to identify the target in the dataset.
        Additional parameters depend on the method:

        For VotingClassifierAggregator:
            - voting : str, default='hard' ('hard' or 'soft')
            - weights : list or None, default=None

        For AveragingRegressor:
            - weights : list or None, default=None

        For StackingAggregator:
            - final_estimator : estimator, default=None (passed through to
              scikit-learn, whose defaults are LogisticRegression for
              StackingClassifier and RidgeCV for StackingRegressor)
            - cv : int, default=5
            - stack_method : str, default='auto' (classification only)
            - passthrough : bool, default=False
            - task : str, required ('classification' or 'regression');
              any value other than 'classification' selects the regressor

        For BlendingAggregator:
            - val_size : float, default=0.2
            - random_state : int, default=42
            - meta_model : estimator, default=None (LogisticRegression for classification, Ridge for regression)
            - task : str, required only when 'meta_model' is not given
              ('classification' or 'regression'); any value other than
              'classification' selects Ridge

    Raises
    ------
    ValueError
        If `config` is None or does not contain 'target_column'.

    Notes
    -----
    The dataset passed to `transform` must include the target column specified in config.
    The method expects a list of pre-instantiated models as the `models` argument.
    For BlendingAggregator the supplied base models (and a supplied
    'meta_model') are fitted in place; they are not cloned by this class.
    """

    def __init__(self, method, config):
        self.method = method
        self.config = config
        self.ensemble_model_ = None
        # A missing config is reported the same way as a config without the key,
        # instead of failing with an AttributeError on `None.get`.
        self.target_column_ = None if config is None else config.get('target_column')
        if self.target_column_ is None:
            raise ValueError("Config must contain 'target_column'.")

    def fit(self, X, y, models):
        """
        Fit the ensemble model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Target values.
        models : iterable of estimators
            Base models (already instantiated). The iterable is materialised
            into a list once, so generators are accepted.

        Returns
        -------
        self : EnsembleAggregator
            The fitted aggregator, to allow call chaining.

        Raises
        ------
        ValueError
            If `self.method` is unknown, if `models` is empty, or if a
            required 'task' entry is missing from the config.
        """
        method = self.method
        if method == 'VotingClassifierAggregator':
            fitter = self._fit_voting_classifier
        elif method == 'AveragingRegressor':
            fitter = self._fit_averaging_regressor
        elif method == 'StackingAggregator':
            fitter = self._fit_stacking
        elif method == 'BlendingAggregator':
            fitter = self._fit_blending
        else:
            raise ValueError(f"Unknown ensemble method: {self.method}")

        # Snapshot the models: blending iterates them more than once and keeps
        # a reference, so a generator or a later caller-side mutation of the
        # original list must not affect the fitted ensemble.
        base_models = list(models)
        if not base_models:
            raise ValueError("'models' must contain at least one estimator.")

        # Assign only after a successful fit so a failed fit does not leave a
        # half-built ensemble behind.
        self.ensemble_model_ = fitter(X, y, base_models)
        return self

    @staticmethod
    def _named_estimators(models):
        """Return (name, estimator) pairs named 'model_0', 'model_1', ..."""
        return [(f'model_{i}', m) for i, m in enumerate(models)]

    @staticmethod
    def _stack_predictions(models, X):
        """Return one column of `predict(X)` output per model."""
        return np.column_stack([model.predict(X) for model in models])

    def _require_task(self):
        """Return config['task'], raising ValueError when it is missing."""
        task = self.config.get('task')
        if task is None:
            raise ValueError(
                f"For {self.method}, 'task' must be specified in config "
                "('classification' or 'regression')."
            )
        return task

    def _fit_voting_classifier(self, X, y, models):
        """Build and fit a scikit-learn VotingClassifier over `models`."""
        from sklearn.ensemble import VotingClassifier
        ensemble = VotingClassifier(
            estimators=self._named_estimators(models),
            voting=self.config.get('voting', 'hard'),
            weights=self.config.get('weights', None),
        )
        ensemble.fit(X, y)
        return ensemble

    def _fit_averaging_regressor(self, X, y, models):
        """Build and fit a scikit-learn VotingRegressor over `models`."""
        from sklearn.ensemble import VotingRegressor
        ensemble = VotingRegressor(
            estimators=self._named_estimators(models),
            weights=self.config.get('weights', None),
        )
        ensemble.fit(X, y)
        return ensemble

    def _fit_stacking(self, X, y, models):
        """Build and fit a StackingClassifier or StackingRegressor per config['task']."""
        from sklearn.ensemble import StackingClassifier, StackingRegressor
        estimators = self._named_estimators(models)
        final_estimator = self.config.get('final_estimator', None)
        cv = self.config.get('cv', 5)
        passthrough = self.config.get('passthrough', False)
        task = self._require_task()
        if task == 'classification':
            ensemble = StackingClassifier(
                estimators=estimators, final_estimator=final_estimator,
                cv=cv, stack_method=self.config.get('stack_method', 'auto'),
                passthrough=passthrough,
            )
        else:
            # Any task other than 'classification' is treated as regression.
            ensemble = StackingRegressor(
                estimators=estimators, final_estimator=final_estimator,
                cv=cv, passthrough=passthrough,
            )
        ensemble.fit(X, y)
        return ensemble

    def _fit_blending(self, X, y, models):
        """Fit base models on a train split and a meta-model on their hold-out predictions."""
        from sklearn.model_selection import train_test_split

        # Resolve the meta-model first so a missing 'task' fails before any
        # base model has been trained.
        meta_model = self.config.get('meta_model', None)
        if meta_model is None:
            task = self._require_task()
            if task == 'classification':
                from sklearn.linear_model import LogisticRegression
                meta_model = LogisticRegression()
            else:
                from sklearn.linear_model import Ridge
                meta_model = Ridge()

        X_train, X_val, y_train, y_val = train_test_split(
            X, y,
            test_size=self.config.get('val_size', 0.2),
            random_state=self.config.get('random_state', 42),
        )
        # Base models learn from the training split only ...
        for model in models:
            model.fit(X_train, y_train)
        # ... and the meta-model learns from their predictions on the
        # held-out validation split.
        meta_model.fit(self._stack_predictions(models, X_val), y_val)
        return {
            'base_models': models,
            'meta_model': meta_model
        }

    def predict(self, X):
        """
        Generate predictions using the fitted ensemble.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Features.

        Returns
        -------
        y_pred : array-like of shape (n_samples,)
            Predictions.

        Raises
        ------
        RuntimeError
            If the aggregator has not been fitted.
        ValueError
            If `self.method` is not one of the supported methods.
        """
        ensemble = getattr(self, 'ensemble_model_', None)
        if ensemble is None:
            raise RuntimeError("Model must be fitted before calling predict.")
        if self.method in ('VotingClassifierAggregator', 'AveragingRegressor', 'StackingAggregator'):
            # These are plain scikit-learn ensembles; delegate directly.
            return ensemble.predict(X)
        elif self.method == 'BlendingAggregator':
            # Blending stores a dict: base-model predictions feed the meta-model.
            base_preds = self._stack_predictions(ensemble['base_models'], X)
            return ensemble['meta_model'].predict(base_preds)
        else:
            raise ValueError(f"Unknown method: {self.method}")

    def transform(self, dataset, models):
        """
        Fit the ensemble on a dataset and return the dataset with predictions added.

        Parameters
        ----------
        dataset : pandas.DataFrame
            Input dataset containing features and the target column.
        models : list of estimators
            List of base models (already instantiated).

        Returns
        -------
        result : pandas.DataFrame
            Copy of the dataset with an additional column 'ensemble_pred'
            containing the ensemble predictions. The input is not modified.

        Raises
        ------
        TypeError
            If `dataset` is not a pandas DataFrame.
        ValueError
            If the configured target column is not present in `dataset`.

        Notes
        -----
        The ensemble is fitted on, and then predicts for, the same rows, so
        'ensemble_pred' holds in-sample predictions.
        """
        # Ensure dataset is a DataFrame
        if not isinstance(dataset, pd.DataFrame):
            raise TypeError("dataset must be a pandas DataFrame")
        if self.target_column_ not in dataset.columns:
            raise ValueError(f"Target column '{self.target_column_}' not found in dataset.")
        X = dataset.drop(columns=[self.target_column_])
        y = dataset[self.target_column_]
        self.fit(X, y, models)
        preds = self.predict(X)
        result = dataset.copy()
        result['ensemble_pred'] = preds
        return result