In [None]:

# EVO-STACK-KDE - Pipeline completo em uma única célula para execução no Colab
import json
import random
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Callable, Iterable, List, Optional, Sequence, Tuple

import joblib
import numpy as np
import pandas as pd
from deap import base, creator, tools
from matplotlib import pyplot as plt
from scipy.special import logsumexp
from sklearn.model_selection import KFold, train_test_split
from sklearn.neighbors import KernelDensity
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm


# ---------------------------------------------------------------------------
# Utilidades gerais e preparação
# ---------------------------------------------------------------------------

def log_step(message: str) -> None:
    """Print *message* on stdout behind the ``[EVO-STACK-KDE]`` tag.

    The stream is flushed on every call so progress lines appear immediately.
    """
    tagged = f"[EVO-STACK-KDE] {message}"
    print(tagged, flush=True)


def seed_everything(seed: int) -> None:
    """Seed the global random number generators with *seed*.

    Python's ``random`` module and NumPy's global generator are always seeded.
    PyTorch is seeded as well when it can be imported; any failure while
    importing or seeding it is ignored on purpose, because torch is optional.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    try:
        # Imported lazily so the module does not require torch to be installed.
        from torch import manual_seed  # type: ignore

        manual_seed(seed)
    except Exception:
        # Best effort only: a missing or broken torch must not abort the run.
        return


# ---------------------------------------------------------------------------
# Manipulação dos dados
# ---------------------------------------------------------------------------

@dataclass
class DataSplits:
    """Bundle of train/validation/test feature matrices plus their scaler."""

    X_train: np.ndarray
    X_val: np.ndarray
    X_test: np.ndarray
    scaler: StandardScaler

    def shapes(self) -> Tuple[int, int, int]:
        """Return the row counts of the train, validation and test matrices."""
        n_train, n_val, n_test = (
            part.shape[0] for part in (self.X_train, self.X_val, self.X_test)
        )
        return n_train, n_val, n_test

    def dimension(self) -> int:
        """Return the number of columns of ``X_train``."""
        n_columns = self.X_train.shape[1]
        return n_columns


def load_wine_red(csv_path: str | Path) -> pd.DataFrame:
    csv_path = Path(csv_path)
    if not csv_path.exists():
        raise FileNotFoundError(f"CSV não encontrado: {csv_path}")
    try:
        return pd.read_csv(csv_path, sep=None, engine="python")
    except Exception:
        return pd.read_csv(csv_path)


def split_and_scale(
    df: pd.DataFrame,
    test_size: float = 0.15,
    val_size: float = 0.15,
    seed: int = 42,
) -> DataSplits:
    """Split *df* into train/validation/test matrices and standardize them.

    A ``quality`` column, when present, is dropped before the split. The data
    is first split into train vs. hold-out (``test_size + val_size`` of the
    rows), then the hold-out part is split into validation and test. The
    ``StandardScaler`` is fitted on the training rows only and then applied to
    the other two parts.

    Returns:
        A ``DataSplits`` holding the three scaled matrices and the fitted scaler.
    """
    features = df.drop(columns=["quality"]) if "quality" in df.columns else df
    matrix = features.values.astype(float)

    holdout_frac = test_size + val_size
    train_part, holdout_part = train_test_split(
        matrix,
        test_size=holdout_frac,
        random_state=seed,
        shuffle=True,
    )

    # Share of the hold-out rows that should go to validation.
    val_share = val_size / holdout_frac
    val_part, test_part = train_test_split(
        holdout_part,
        test_size=1 - val_share,
        random_state=seed,
        shuffle=True,
    )

    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_part)
    val_scaled, test_scaled = (
        scaler.transform(part) for part in (val_part, test_part)
    )
    return DataSplits(train_scaled, val_scaled, test_scaled, scaler)


def persist_splits(splits: DataSplits, out_dir: str | Path) -> None:
    """Write the split matrices and a small metadata file under *out_dir*.

    Creates *out_dir* if needed, then writes ``splits.npz`` (arrays ``X_train``,
    ``X_val``, ``X_test``) and ``meta.json`` (row counts, dimension and the
    scaler's ``mean_`` / ``scale_`` vectors).
    """
    target = Path(out_dir)
    target.mkdir(parents=True, exist_ok=True)

    arrays = {
        "X_train": splits.X_train,
        "X_val": splits.X_val,
        "X_test": splits.X_test,
    }
    np.savez(target / "splits.npz", **arrays)

    n_train, n_val, n_test = (int(arr.shape[0]) for arr in arrays.values())
    fitted_scaler = splits.scaler
    meta = {
        "n_train": n_train,
        "n_val": n_val,
        "n_test": n_test,
        "dimension": int(splits.dimension()),
        "scaler_mean": fitted_scaler.mean_.tolist(),
        "scaler_scale": fitted_scaler.scale_.tolist(),
    }
    with (target / "meta.json").open("w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)


# ---------------------------------------------------------------------------
# Modelos KDE
# ---------------------------------------------------------------------------

def scott_bandwidth(X: np.ndarray) -> float:
    """Return the factor ``n ** (-1 / (d + 4))`` for an ``(n, d)`` array.

    Raises:
        ValueError: if *X* is not 2-D, or if either dimension is empty.
    """
    if X.ndim != 2:
        raise ValueError("Array deve ser 2D")
    n_samples, n_dims = X.shape
    if min(n_samples, n_dims) <= 0:
        raise ValueError("Dimensões inválidas para KDE")
    exponent = -1.0 / (n_dims + 4)
    return np.power(n_samples, exponent)


@dataclass
class KDEExpert:
    """Gaussian KDE fitted on a subset of the feature columns.

    ``feature_mask`` selects the columns (``None`` keeps all of them) and
    ``alpha`` multiplies the bandwidth returned by ``scott_bandwidth``.
    """

    feature_mask: Optional[np.ndarray]
    alpha: float = 1.0

    def __post_init__(self) -> None:
        """Normalize ``feature_mask`` to a 1-D boolean array and reset the model.

        Raises:
            ValueError: if the mask is not 1-D or selects no feature.
        """
        self.model: Optional[KernelDensity] = None
        if self.feature_mask is None:
            return
        selector = np.asarray(self.feature_mask, dtype=bool)
        if selector.ndim != 1:
            raise ValueError("feature_mask deve ser 1D")
        if not selector.any():
            raise ValueError("feature_mask deve selecionar ao menos uma feature")
        self.feature_mask = selector

    def _apply_mask(self, X: np.ndarray) -> np.ndarray:
        """Return the columns of *X* selected by the mask (or *X* itself)."""
        return X if self.feature_mask is None else X[:, self.feature_mask]

    def fit(self, X: np.ndarray) -> "KDEExpert":
        """Fit the kernel density on the masked columns of *X*; return ``self``."""
        projected = self._apply_mask(X)
        scaled_bandwidth = scott_bandwidth(projected) * float(self.alpha)
        # Floor keeps the bandwidth strictly positive.
        self.model = KernelDensity(
            kernel="gaussian",
            bandwidth=max(scaled_bandwidth, 1e-6),
        )
        self.model.fit(projected)
        return self

    def logpdf(self, X: np.ndarray) -> np.ndarray:
        """Return ``score_samples`` of the fitted model on the masked columns.

        Raises:
            RuntimeError: if ``fit`` has not been called yet.
        """
        estimator = self.model
        if estimator is None:
            raise RuntimeError("KDEExpert precisa ser treinado antes de logpdf")
        return estimator.score_samples(self._apply_mask(X))


@dataclass
class KDEEnsemble:
    """Mixture of ``KDEExpert`` models combined with softmax weights.

    ``weight_logits`` holds one unnormalized logit per expert; the mixture
    weights are their softmax.
    """

    experts: Sequence[KDEExpert]
    weight_logits: np.ndarray

    def __post_init__(self) -> None:
        """Validate the inputs and cache the softmax weights.

        Raises:
            ValueError: if there is no expert, or if ``weight_logits`` does not
                have exactly one entry per expert.
        """
        if len(self.experts) == 0:
            raise ValueError("KDEEnsemble requer pelo menos um expert")
        self.weight_logits = np.asarray(self.weight_logits, dtype=float)
        if self.weight_logits.shape != (len(self.experts),):
            raise ValueError("weight_logits deve ter o mesmo tamanho de experts")
        self._weights = self._softmax(self.weight_logits)

    @staticmethod
    def _softmax(z: np.ndarray) -> np.ndarray:
        """Return the softmax of *z*, shifted by its maximum for stability."""
        z = np.asarray(z, dtype=float)
        z = z - np.max(z)
        exp_z = np.exp(z)
        return exp_z / np.sum(exp_z)

    @property
    def weights(self) -> np.ndarray:
        """Mixture weights (softmax of ``weight_logits``) cached at construction."""
        return self._weights

    def fit(self, X: np.ndarray) -> "KDEEnsemble":
        """Fit every expert on *X* and return ``self``."""
        for expert in self.experts:
            expert.fit(X)
        return self

    def logpdf(self, X: np.ndarray) -> np.ndarray:
        """Return the log-density of the weighted mixture for each row of *X*."""
        logps = np.column_stack([expert.logpdf(X) for expert in self.experts])
        # Log-softmax computed directly from the logits. Taking np.log of the
        # softmax weights would give -inf (and a divide-by-zero warning) as
        # soon as a weight underflows to exactly 0.
        log_weights = self.weight_logits - logsumexp(self.weight_logits)
        return logsumexp(logps + log_weights, axis=1)


# ---------------------------------------------------------------------------
# Genoma evolutivo (configurações do ensemble)
# ---------------------------------------------------------------------------

@dataclass
class ExpertConfig:
    """Genome fragment for one expert: bandwidth multiplier and feature mask."""

    alpha: float
    feature_mask: np.ndarray

    def to_jsonable(self) -> dict:
        """Return a JSON-friendly dict (``alpha`` as float, mask as 0/1 list)."""
        alpha_value = float(self.alpha)
        mask_bits = self.feature_mask.astype(int).tolist()
        return dict(alpha=alpha_value, feature_mask=mask_bits)


@dataclass
class ModelConfig:
    """Genome of a whole ensemble: mixture logits plus one config per expert."""

    weight_logits: np.ndarray
    experts: List[ExpertConfig] = field(default_factory=list)

    def to_jsonable(self) -> dict:
        """Return a JSON-friendly dict with ``weight_logits`` and ``experts``."""
        payload = {"weight_logits": self.weight_logits.tolist()}
        payload["experts"] = [item.to_jsonable() for item in self.experts]
        return payload


def random_mask(
    d: int,
    rng: np.random.Generator,
    keep_frac_range: Sequence[float] = (0.6, 0.8),
) -> np.ndarray:
    """Draw a random boolean mask of length *d* with at least one ``True``.

    The kept fraction is sampled uniformly from *keep_frac_range*; the matching
    number of positions (rounded, minimum 1) is then chosen without replacement.

    Raises:
        ValueError: if *d* is not positive.
    """
    if d <= 0:
        raise ValueError("d deve ser positivo")
    frac_lo, frac_hi = keep_frac_range
    fraction = rng.uniform(frac_lo, frac_hi)
    n_selected = max(1, int(round(fraction * d)))
    chosen = rng.choice(d, size=n_selected, replace=False)
    return np.isin(np.arange(d), chosen)


def random_model_config(d: int, rng: np.random.Generator, m: int = 5) -> ModelConfig:
    """Sample a random ensemble genome with *m* experts over *d* features.

    For each expert, ``alpha`` is drawn uniformly from ``[0.5, 1.5)`` and then a
    mask is drawn with ``random_mask``; the mixture logits are standard normal.
    """
    # Keyword arguments are evaluated left to right, so alpha is drawn before
    # the mask for every expert, and the logits are drawn last.
    sampled_experts = [
        ExpertConfig(
            alpha=float(rng.uniform(0.5, 1.5)),
            feature_mask=random_mask(d, rng),
        )
        for _ in range(m)
    ]
    logits = rng.normal(0.0, 1.0, size=m)
    return ModelConfig(weight_logits=logits, experts=sampled_experts)


def decode_to_model(config: ModelConfig) -> Callable[[], KDEEnsemble]:
    """Return a zero-argument factory that builds a fresh, unfitted ensemble.

    Every call of the factory creates new ``KDEExpert`` objects (with copied
    masks) and a new logits array, so the built models share no state.
    """

    def build_ensemble() -> KDEEnsemble:
        fresh_experts = []
        for expert_cfg in config.experts:
            fresh_experts.append(
                KDEExpert(
                    feature_mask=expert_cfg.feature_mask.copy(),
                    alpha=expert_cfg.alpha,
                )
            )
        logits = np.array(config.weight_logits, dtype=float)
        return KDEEnsemble(experts=fresh_experts, weight_logits=logits)

    return build_ensemble


def model_config_from_jsonable(data: dict) -> ModelConfig:
    """Rebuild a ``ModelConfig`` from a dict with ``experts`` and ``weight_logits``.

    Each entry of ``data["experts"]`` must provide ``alpha`` and
    ``feature_mask``; masks are converted to boolean arrays.
    """
    restored = []
    for entry in data["experts"]:
        mask = np.array(entry["feature_mask"], dtype=bool)
        restored.append(ExpertConfig(alpha=float(entry["alpha"]), feature_mask=mask))
    logits = np.array(data["weight_logits"], dtype=float)
    return ModelConfig(weight_logits=logits, experts=restored)


# ---------------------------------------------------------------------------
# Métricas de avaliação
# ---------------------------------------------------------------------------

def _ensure_2d(X: np.ndarray) -> np.ndarray:
    """Return *X* unchanged when it is 2-D; raise ``ValueError`` otherwise."""
    if X.ndim == 2:
        return X
    raise ValueError("Esperado array 2D")


def nll_kfold(
    model_factory: Callable[[], KDEEnsemble],
    X: np.ndarray,
    k: int = 3,
    seed: int = 42,
) -> float:
    """Return the mean held-out negative log-likelihood over *k* shuffled folds.

    A new model is built by *model_factory* for each fold, fitted on the fold's
    training rows and scored on its held-out rows.
    """
    data = _ensure_2d(np.asarray(X, dtype=float))
    splitter = KFold(n_splits=k, shuffle=True, random_state=seed)

    def fold_nll(fit_rows: np.ndarray, held_out_rows: np.ndarray) -> float:
        ensemble = model_factory()
        ensemble.fit(data[fit_rows])
        return float(-np.mean(ensemble.logpdf(data[held_out_rows])))

    per_fold = [fold_nll(fit_rows, held_out) for fit_rows, held_out in splitter.split(data)]
    return float(np.mean(per_fold))


def stability_bootstrap(
    model_factory: Callable[[], KDEEnsemble],
    X: np.ndarray,
    B: int = 10,
    k: int = 3,
    seed: int = 42,
) -> float:
    """Return the standard deviation of ``nll_kfold`` over *B* bootstrap resamples.

    Each resample draws ``n`` row indices with replacement from *X* and is
    scored with ``nll_kfold`` using the same *k* and *seed*.
    """
    data = _ensure_2d(np.asarray(X, dtype=float))
    rng = np.random.default_rng(seed)
    n_rows = data.shape[0]
    # NOTE(review): rows are drawn with replacement, so copies of one original
    # row may fall on both sides of a fold inside nll_kfold.
    resampled_scores = [
        nll_kfold(
            model_factory,
            data[rng.integers(0, n_rows, size=n_rows)],
            k=k,
            seed=seed,
        )
        for _ in range(B)
    ]
    return float(np.std(resampled_scores))


def _softmax(z: Sequence[float]) -> np.ndarray:
    """Return the softmax of *z*, shifted by its maximum before exponentiation."""
    logits = np.asarray(z, dtype=float)
    unnormalized = np.exp(logits - np.max(logits))
    return unnormalized / np.sum(unnormalized)


def complexity(
    config: ModelConfig,
    n: int,
    d: int,
    lam_entropy: float = 0.0,
) -> float:
    """Return the complexity score of *config*.

    The base score is the mean, over experts, of the fraction of selected
    features scaled by ``min(1, n / 2000)``. When ``lam_entropy > 0`` the
    normalized entropy of the softmax mixture weights, multiplied by
    ``lam_entropy``, is added.

    Args:
        config: ensemble genome to score.
        n: number of training rows.
        d: total number of features.
        lam_entropy: weight of the entropy term (0 disables it).

    Raises:
        ValueError: if *d* is not positive, if *config* has no expert, or if
            an expert's mask selects no feature.
    """
    if d <= 0:
        raise ValueError("d deve ser positivo")
    if len(config.experts) == 0:
        # The mean of an empty list would silently produce NaN.
        raise ValueError("config deve conter ao menos um expert")
    base_factor = min(1.0, n / 2000.0)
    complexities = []
    for expert in config.experts:
        mask = np.asarray(expert.feature_mask, dtype=bool)
        if not mask.any():
            raise ValueError("Máscara do expert deve selecionar ao menos uma feature")
        complexities.append(mask.sum() / d * base_factor)
    mean_complexity = float(np.mean(complexities))
    if lam_entropy > 0:
        weights = _softmax(config.weight_logits)
        n_weights = len(weights)
        # With a single weight the normalizer log(1) is 0 and the division
        # would yield -inf; one weight has zero entropy, so no term is added.
        if n_weights > 1:
            entropy = -np.sum(weights * np.log(weights + 1e-12))
            norm_entropy = entropy / np.log(n_weights)
            mean_complexity += lam_entropy * float(norm_entropy)
    return mean_complexity


# ---------------------------------------------------------------------------
# Rotinas de plotagem
# ---------------------------------------------------------------------------

def plot_pareto_2d(pareto_fits: Iterable[Iterable[float]], out_png: str | Path) -> None:
    data = np.array(list(pareto_fits), dtype=float)
    if data.ndim != 2 or data.shape[1] < 3:
        raise ValueError("pareto_fits deve fornecer triplas (f1, f2, f3)")
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    axes[0].scatter(data[:, 0], data[:, 1], c="tab:blue", alpha=0.7)
    axes[0].set_xlabel("f1: NLL")
    axes[0].set_ylabel("f2: Estabilidade")
    axes[0].set_title("Pareto (f1 vs f2)")
    axes[1].scatter(data[:, 0], data[:, 2], c="tab:orange", alpha=0.7)
    axes[1].set_xlabel("f1: NLL")
    axes[1].set_ylabel("f3: Complexidade")
    axes[1].set_title("Pareto (f1 vs f3)")
    fig.tight_layout()
    out_png = Path(out_png)
    out_png.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_png)
    plt.close(fig)


def plot_hist_neglogp_test(neg_logp: Iterable[float], out_png: str | Path) -> None:
    """Save a 20-bin histogram of the ``-log p(x)`` values to *out_png*.

    The parent directory of *out_png* is created when missing.
    """
    samples = np.array(list(neg_logp), dtype=float)
    destination = Path(out_png)

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(samples, bins=20, color="tab:green", alpha=0.8)
    ax.set_xlabel("-log p(x)")
    ax.set_ylabel("Frequência")
    ax.set_title("Distribuição no conjunto de teste")
    fig.tight_layout()

    destination.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(destination)
    plt.close(fig)


# ---------------------------------------------------------------------------
# Núcleo NSGA-II
# ---------------------------------------------------------------------------

def _ensure_creators() -> None:
    """Register the DEAP ``FitnessMin3`` and ``Individual`` classes once.

    ``FitnessMin3`` minimizes three objectives; ``Individual`` is a
    ``ModelConfig`` subclass carrying that fitness. Classes that are already
    present in the ``creator`` module are left untouched.
    """
    registered = vars(creator)  # live view of the creator module namespace
    if "FitnessMin3" not in registered:
        creator.create("FitnessMin3", base.Fitness, weights=(-1.0, -1.0, -1.0))
    if "Individual" not in registered:
        creator.create("Individual", ModelConfig, fitness=creator.FitnessMin3)


def _clone_expert(expert: ExpertConfig) -> ExpertConfig:
    """Return a new ``ExpertConfig`` with the same alpha and a copied mask."""
    mask_copy = expert.feature_mask.copy()
    return ExpertConfig(alpha=expert.alpha, feature_mask=mask_copy)


def _clone_config(config: ModelConfig) -> ModelConfig:
    """Return a plain ``ModelConfig`` copy of *config* (logits and experts copied)."""
    logits_copy = config.weight_logits.copy()
    experts_copy = list(map(_clone_expert, config.experts))
    return ModelConfig(weight_logits=logits_copy, experts=experts_copy)


def _make_individual(config: ModelConfig) -> ModelConfig:
    """Wrap a copy of *config* in a DEAP ``creator.Individual``."""
    logits_copy = config.weight_logits.copy()
    experts_copy = list(map(_clone_expert, config.experts))
    return creator.Individual(weight_logits=logits_copy, experts=experts_copy)


def _crossover(
    config_a: ModelConfig,
    config_b: ModelConfig,
    rng: np.random.Generator,
) -> Tuple[ModelConfig, ModelConfig]:
    """Recombine two genomes into two children.

    Expert lists are exchanged at a single random cut point (taken from the
    length of ``config_a.experts``; 0 when there is only one expert). The
    logits of the children are complementary blends of the parents' logits,
    with a blend factor drawn uniformly from ``[0.25, 0.75)``.
    """
    n_experts = len(config_a.experts)
    cut = rng.integers(1, n_experts) if n_experts > 1 else 0

    head_a, tail_a = config_a.experts[:cut], config_a.experts[cut:]
    head_b, tail_b = config_b.experts[:cut], config_b.experts[cut:]
    child1_experts = [_clone_expert(item) for item in head_a + tail_b]
    child2_experts = [_clone_expert(item) for item in head_b + tail_a]

    mix = rng.uniform(0.25, 0.75)
    logits_a, logits_b = config_a.weight_logits, config_b.weight_logits
    child1_logits = mix * logits_a + (1.0 - mix) * logits_b
    child2_logits = mix * logits_b + (1.0 - mix) * logits_a

    first = ModelConfig(weight_logits=child1_logits.copy(), experts=child1_experts)
    second = ModelConfig(weight_logits=child2_logits.copy(), experts=child2_experts)
    return first, second


def _mutate(config: ModelConfig, rng: np.random.Generator) -> ModelConfig:
    """Return a mutated copy of *config*; the input is left untouched.

    For every expert, ``alpha`` receives Gaussian noise in log space and is
    clipped to ``[0.1, 5.0]``, and each mask bit flips with probability
    ``1 / mask.size`` (one random bit is switched on if the mask ends up
    empty). Gaussian noise is finally added to the mixture logits.
    """
    child = _clone_config(config)
    for expert in child.experts:
        jittered_log_alpha = np.log(expert.alpha) + rng.normal(0.0, 0.15)
        expert.alpha = float(np.clip(np.exp(jittered_log_alpha), 0.1, 5.0))

        n_bits = expert.feature_mask.size
        toggles = rng.random(n_bits) < 1.0 / n_bits
        new_mask = np.logical_xor(expert.feature_mask, toggles)
        if not new_mask.any():
            # Every selected feature was switched off: re-enable one at random.
            new_mask[rng.integers(0, n_bits)] = True
        expert.feature_mask = new_mask

    noise = rng.normal(0.0, 0.3, size=child.weight_logits.shape)
    child.weight_logits = child.weight_logits + noise
    return child


def run_nsga(
    X: np.ndarray,
    pop_size: int = 60,
    n_gen: int = 40,
    seed: int = 42,
    kfold: int = 3,
    bootstraps: int = 10,
    outdir: str | Path = "outputs",
) -> Tuple[List[dict], Path]:
    """Run the NSGA-II search over ensemble genomes and save the final front.

    Each individual is scored on three minimized objectives: k-fold NLL
    (``nll_kfold``), bootstrap spread of that NLL (``stability_bootstrap``)
    and ``complexity``.

    Args:
        X: 2-D training matrix used for every evaluation.
        pop_size: population size; must be a multiple of 4 when ``n_gen > 0``.
        n_gen: number of generations.
        seed: seed of the genome RNG and of the evaluation splits.
        kfold: number of folds used by the NLL objectives.
        bootstraps: number of bootstrap resamples for the stability objective.
        outdir: base directory; results go to ``<outdir>/runs/<UTC timestamp>``.

    Returns:
        ``(pareto_data, run_dir)``: one ``{"config", "fitness"}`` dict per
        member of the first non-dominated front, and the directory where
        ``pareto.json`` was written.

    Raises:
        ValueError: if ``n_gen > 0`` and *pop_size* is not a multiple of 4.
    """
    # Function-scope import: ``timezone`` is not among the module-level imports.
    from datetime import timezone

    crossover_prob = 0.9
    mutation_prob = 0.4
    n_experts = 5

    # selTournamentDCD is called with k == len(pop) and only works when that
    # length is a multiple of 4. Checking here fails before the expensive
    # evaluation of the initial population instead of after it.
    if n_gen > 0 and pop_size % 4 != 0:
        raise ValueError(
            "pop_size deve ser múltiplo de 4 (exigência de selTournamentDCD)"
        )

    _ensure_creators()
    rng = np.random.default_rng(seed)
    toolbox = base.Toolbox()
    d = X.shape[1]

    def init_individual() -> ModelConfig:
        config = random_model_config(d, rng, m=n_experts)
        return _make_individual(config)

    toolbox.register("individual", init_individual)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    def evaluate(individual: ModelConfig) -> Tuple[float, float, float]:
        factory = decode_to_model(individual)
        f1 = nll_kfold(factory, X, k=kfold, seed=seed)
        f2 = stability_bootstrap(factory, X, B=bootstraps, k=kfold, seed=seed)
        f3 = complexity(individual, n=X.shape[0], d=d)
        return f1, f2, f3

    toolbox.register("evaluate", evaluate)
    toolbox.register("select", tools.selNSGA2)

    pop = toolbox.population(n=pop_size)

    for ind in tqdm(pop, desc="Avaliando população inicial", unit="ind"):
        ind.fitness.values = toolbox.evaluate(ind)

    # No real selection happens here (k == len(pop)); the call is what assigns
    # the crowding distances that selTournamentDCD relies on.
    pop = toolbox.select(pop, len(pop))

    for gen in tqdm(range(1, n_gen + 1), desc="Evoluindo gerações", unit="ger"):
        offspring = tools.selTournamentDCD(pop, len(pop))
        # Work on plain copies so the parents in ``pop`` are never modified.
        offspring = [_clone_config(ind) for ind in offspring]
        for i in range(1, len(offspring), 2):
            if rng.random() < crossover_prob:
                offspring[i - 1], offspring[i] = _crossover(
                    offspring[i - 1], offspring[i], rng
                )
        for i, candidate in enumerate(offspring):
            if rng.random() < mutation_prob:
                offspring[i] = _mutate(candidate, rng)
        # Re-wrap as DEAP individuals so each one carries a fitness attribute.
        offspring = [_make_individual(cfg) for cfg in offspring]
        for ind in tqdm(
            offspring,
            desc=f"Geração {gen}: avaliando descendentes",
            unit="ind",
            leave=False,
        ):
            ind.fitness.values = toolbox.evaluate(ind)
        pop = toolbox.select(pop + offspring, pop_size)

    pareto_front = tools.sortNondominated(pop, k=len(pop), first_front_only=True)[0]
    # Aware UTC "now"; produces the same string as the deprecated utcnow().
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    run_dir = Path(outdir) / "runs" / timestamp
    run_dir.mkdir(parents=True, exist_ok=True)
    pareto_data: List[dict] = [
        {"config": ind.to_jsonable(), "fitness": list(ind.fitness.values)}
        for ind in pareto_front
    ]
    with open(run_dir / "pareto.json", "w", encoding="utf-8") as f:
        json.dump(pareto_data, f, indent=2)
    return pareto_data, run_dir


# ---------------------------------------------------------------------------
# Pipeline completo
# ---------------------------------------------------------------------------

def select_knee(pareto: List[dict]) -> dict:
    """Return the front entry closest to the ideal point after normalization.

    Every objective is min-max scaled over the front (an objective that is
    constant over the front contributes 0). The entry with the smallest
    Euclidean norm in that scaled space is returned; ties go to the first one.

    Args:
        pareto: entries carrying a ``"fitness"`` sequence of equal length.

    Raises:
        ValueError: if *pareto* is empty.
    """
    if len(pareto) == 0:
        # Explicit message instead of numpy's zero-size reduction error.
        raise ValueError("Frente de Pareto vazia: nenhuma solução para selecionar")
    fitness = np.array([entry["fitness"] for entry in pareto], dtype=float)
    mins = fitness.min(axis=0)
    spans = fitness.max(axis=0) - mins
    # Avoid 0/0 for objectives that take a single value over the whole front.
    spans = np.where(spans == 0, 1.0, spans)
    distances = np.linalg.norm((fitness - mins) / spans, axis=1)
    return pareto[int(np.argmin(distances))]


def run_pipeline(
    csv_path: str | Path,
    seed: int = 42,
    outdir: str | Path = "outputs",
    pop: int = 60,
    gens: int = 40,
    kfold: int = 3,
    bootstraps: int = 10,
) -> dict:
    """Run the whole workflow: load, split, evolve, refit, evaluate and save.

    Steps, in order: seed the RNGs, load the CSV, split and standardize,
    persist the splits, run NSGA-II on the training part, pick the knee
    solution, refit it on train+validation, score it on the test part, and
    write figures, the pickled model and ``metrics.json`` under *outdir*.

    Returns:
        The metrics dict that is also written to ``<outdir>/metrics.json``
        (keys ``test_nll``, ``knee_fitness``, ``knee_config``, ``pareto_run_dir``).
    """
    log_step("Iniciando pipeline EVO-STACK-KDE")
    log_step(f"Seed global definida como {seed}")
    seed_everything(seed)

    log_step(f"Carregando dados a partir de '{csv_path}'")
    raw_frame = load_wine_red(csv_path)
    log_step("Dividindo dataset em treino/validação/teste e aplicando padronização")
    splits = split_and_scale(raw_frame, seed=seed)
    log_step("Persistindo divisões processadas em 'data/processed'")
    # NOTE(review): this path is relative to the working directory and does
    # not depend on ``outdir``.
    persist_splits(splits, Path("data/processed"))

    log_step(
        "Iniciando busca evolutiva NSGA-II "
        f"(população={pop}, gerações={gens}, kfold={kfold}, bootstraps={bootstraps})"
    )
    search_kwargs = dict(
        pop_size=pop,
        n_gen=gens,
        seed=seed,
        kfold=kfold,
        bootstraps=bootstraps,
        outdir=outdir,
    )
    pareto, run_dir = run_nsga(splits.X_train, **search_kwargs)
    log_step(f"Busca evolutiva concluída. Resultados em '{run_dir}'")

    log_step("Selecionando solução joelho da frente de Pareto")
    knee_entry = select_knee(pareto)
    best_config = model_config_from_jsonable(knee_entry["config"])

    log_step("Treinando modelo final com treino+validação")
    refit_rows = np.vstack([splits.X_train, splits.X_val])
    model = decode_to_model(best_config)()
    model.fit(refit_rows)

    log_step("Avaliando modelo no conjunto de teste")
    test_logp = model.logpdf(splits.X_test)
    test_nll = float(-np.mean(test_logp))

    log_step(f"Criando diretórios de saída em '{outdir}'")
    outdir = Path(outdir)
    figures_dir = outdir / "figures"
    models_dir = outdir / "models"
    for folder in (outdir, figures_dir, models_dir):
        folder.mkdir(parents=True, exist_ok=True)

    log_step("Gerando visualizações e salvando artefatos")
    front_fitness = [entry["fitness"] for entry in pareto]
    plot_pareto_2d(front_fitness, figures_dir / "pareto_2d.png")
    plot_hist_neglogp_test(-test_logp, figures_dir / "hist_test_logp.png")
    bundle = {
        "model": model,
        "scaler": splits.scaler,
        "config": best_config.to_jsonable(),
    }
    joblib.dump(bundle, models_dir / "best_model.pkl")

    metrics = {
        "test_nll": test_nll,
        "knee_fitness": knee_entry["fitness"],
        "knee_config": knee_entry["config"],
        "pareto_run_dir": str(run_dir),
    }

    metrics_path = outdir / "metrics.json"
    with metrics_path.open("w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)
    log_step(f"Pipeline concluído. Métricas salvas em '{metrics_path}'")
    return metrics


# ---------------------------------------------------------------------------
# Exemplo de uso (descomente as linhas abaixo para executar no Colab)
# ---------------------------------------------------------------------------

# csv_no_colab = "/content/winequality-red.csv"  # ajuste o caminho conforme necessário
# resultados = run_pipeline(
#     csv_path=csv_no_colab,
#     seed=42,
#     outdir="/content/outputs",
#     pop=60,
#     gens=40,
#     kfold=3,
#     bootstraps=10,
# )
# resultados
