# TPE-Optimized Ensemble Submission

This notebook generates final predictions using **Tree-structured Parzen Estimator (TPE)** to find optimal weights for:
1. **Per-model fold blending**: Combine each model's 5 folds optimally
2. **Cross-model ensemble**: Blend predictions from 6 different transformer models

Models included:
- DeBERTa-v3-base
- ModernBERT
- ELECTRA
- XLNet
- Llama-3.2-1B
- Qwen3-0.6B

In [None]:
# Import required libraries
import gc
import html
import json
import os
import warnings
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from scipy.stats import spearmanr
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
from transformers import AutoConfig, AutoModel, AutoTokenizer, DebertaV2TokenizerFast

# Suppress every Python warning for the rest of the session (keeps notebook output short,
# but also hides deprecation and numerical warnings).
warnings.filterwarnings('ignore')
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

In [None]:
# Target columns.
# The order is significant: it selects the label matrix from the training frame, names the
# prediction columns written to the submission, and the position of each name is the column
# index that QuestModel's group index lists refer to (0-20 question_*, 21-29 answer_*).
TARGET_COLS = [
    "question_asker_intent_understanding", "question_body_critical", "question_conversational",
    "question_expect_short_answer", "question_fact_seeking", "question_has_commonly_accepted_answer",
    "question_interestingness_others", "question_interestingness_self", "question_multi_intent",
    "question_not_really_a_question", "question_opinion_seeking", "question_type_choice",
    "question_type_compare", "question_type_consequence", "question_type_definition",
    "question_type_entity", "question_type_instructions", "question_type_procedure",
    "question_type_reason_explanation", "question_type_spelling", "question_well_written",
    "answer_helpful", "answer_level_of_information", "answer_plausible", "answer_relevance",
    "answer_satisfaction", "answer_type_instructions", "answer_type_procedure",
    "answer_type_reason_explanation", "answer_well_written",
]

print(f"Total targets: {len(TARGET_COLS)}")

## Configuration & Model Definitions

In [None]:
# ============================================================================
# CONFIGURATION - Edit paths here for local/Kaggle environments
# ============================================================================

# Auto-detect environment (local vs Kaggle): start from files in the working directory and
# switch to the Kaggle dataset paths when no local test.csv exists.
TEST_PATH = 'test.csv'
TRAIN_PATH = 'train.csv'
# NOTE: this directory is the same Kaggle path in both branches, so a local run must edit it.
MODEL_WEIGHTS_DIR = '/kaggle/input/deberta-finetuned/pytorch/arch1/23'  # Main directory containing all .pth files
# When set to an existing tpe_weights.json, the notebook reuses those weights and skips
# inference on the training set.
PRE_CALCULATED_WEIGHTS_PATH = None  # Path to pre-calculated tpe_weights.json (if available)

if not os.path.exists(TEST_PATH):
    # No local test.csv: assume we are running on Kaggle.
    TEST_PATH = '/kaggle/input/google-quest-challenge/test.csv'
    TRAIN_PATH = '/kaggle/input/google-quest-challenge/train.csv'
    MODEL_WEIGHTS_DIR = '/kaggle/input/deberta-finetuned/pytorch/arch1/23'  # Update this path on Kaggle

# Echo the resolved paths so a wrong environment guess is visible in the cell output.
print(f"Environment detected:")
print(f"  Train: {TRAIN_PATH}")
print(f"  Test: {TEST_PATH}")
print(f"  Model weights: {MODEL_WEIGHTS_DIR}")
if PRE_CALCULATED_WEIGHTS_PATH and os.path.exists(PRE_CALCULATED_WEIGHTS_PATH):
    print(f"  Pre-calculated weights: {PRE_CALCULATED_WEIGHTS_PATH}")

# ============================================================================
# Model & Tokenizer Paths (for Kaggle, use /kaggle/input/... paths)
# ============================================================================

# Backbone and tokenizer locations per model (local HF cache or Kaggle dataset paths).
# 'model' is the directory the backbone config/weights are loaded from; 'tokenizer' is the
# directory the tokenizer is loaded from. Setting 'tokenizer' to None makes build_loader
# fall back to the 'model' path.
MODEL_PATHS = {
    'deberta_v3': {
        'model': '/kaggle/input/deberta-tokenizer',  # alternative: a '/kaggle/input/deberta-v3-base/...' dataset
        'tokenizer': '/kaggle/input/deberta-tokenizer',  # same directory as the model
    },
    'modernbert': {
        'model': '/kaggle/input/modernbert/transformers/base/2',
        'tokenizer': '/kaggle/input/modernbert/transformers/base/2',  # same directory as the model
    },
    'electra': {
        'model': '/kaggle/input/electra',
        'tokenizer': '/kaggle/input/electra',
    },
    'xlnet': {
        'model': '/kaggle/input/xlnet0',
        'tokenizer': '/kaggle/input/xlnet0',
    },
    'llama': {
        'model': '/kaggle/input/llama-3.2/transformers/1b/1',  # alternative: a '/kaggle/input/llama-3-2-1b/...' dataset
        'tokenizer': '/kaggle/input/llama-3.2/transformers/1b/1',
    },
    'qwen': {
        'model': '/kaggle/input/qwen-3/transformers/0.6b/1',
        'tokenizer': '/kaggle/input/qwen-3/transformers/0.6b/1',
    },
}

# Checkpoint filename prefix for each model: "<model key>_". Only the .pth files in
# MODEL_WEIGHTS_DIR whose names start with the prefix are treated as that model's folds.
WEIGHT_PATTERNS = {
    model_key: f"{model_key}_"
    for model_key in ('deberta_v3', 'modernbert', 'electra', 'xlnet', 'llama', 'qwen')
}

# ============================================================================

@dataclass
class ModelSpec:
    """Everything needed to run one ensemble member: backbone, tokenizer, checkpoints, batching."""
    # Short model key; used in log output, as the key for saved weights, and to pick the
    # DeBERTa-specific tokenizer class when it starts with "deberta".
    name: str
    # Directory searched for this model's .pth fold checkpoints.
    model_dir: str
    # Path/identifier handed to AutoConfig/AutoModel for the backbone.
    base_model: str
    # Pooling/head layout passed to QuestModel.
    pooling_strategy: str = "arch1_6groups"
    # Tokenizer location; when None (or empty) base_model is used instead.
    tokenizer_path: Optional[str] = None
    # Sequences are padded/truncated to this many tokens.
    max_len: int = 512
    # Inference batch size for this model's DataLoader.
    batch_size: int = 8
    # When False, token_type_ids produced by the tokenizer are not forwarded to the model.
    pass_token_type_ids: bool = True
    # Forwarded to the Hugging Face from_pretrained calls.
    trust_remote_code: bool = False
    # Only checkpoint files whose names start with this prefix are used; None keeps every .pth file.
    weight_prefix: Optional[str] = None

@dataclass
class Config:
    """Global run configuration; the defaults are captured from the module-level path constants."""
    train_path: str = TRAIN_PATH
    test_path: str = TEST_PATH
    model_base_dir: str = MODEL_WEIGHTS_DIR
    sample_frac: float = 1.0  # Fraction of training rows used for weight optimization (1.0 = all rows)
    num_workers: int = 4  # DataLoader worker processes
    tpe_trials: int = 50  # TPE iterations for global model weights (only if no pre-calculated weights)
    seed: int = 42  # Random state used when subsampling the training data
    optimize_fold_weights: bool = False  # Set True to also optimize per-fold weights (otherwise folds are averaged)
    use_voter_postprocessing: bool = False  # Apply VotersRounder to final preds
    voter_dev_threshold: float = 0.01  # Std deviation guardrail for voter snapping
    pre_calculated_weights_path: Optional[str] = PRE_CALCULATED_WEIGHTS_PATH  # Path to tpe_weights.json

# Instantiate the run configuration and echo the settings that drive the ensemble.
cfg = Config()
print("\nConfig loaded:")
print("\n".join(
    f"  {attr}: {getattr(cfg, attr)}"
    for attr in (
        "sample_frac",
        "tpe_trials",
        "optimize_fold_weights",
        "use_voter_postprocessing",
        "voter_dev_threshold",
    )
))
# The weights path is only reported when one has been configured.
if cfg.pre_calculated_weights_path:
    print(f"  pre_calculated_weights_path: {cfg.pre_calculated_weights_path}")

In [None]:
# Build model specs from configuration.
# The six specs differ only in their name-derived paths and in the two settings below,
# so they are generated from one table: model key -> (batch_size, pass_token_type_ids).
_PER_MODEL_SETTINGS = {
    "deberta_v3": (8, True),
    "modernbert": (8, False),
    "electra": (8, True),
    "xlnet": (8, True),
    "llama": (4, False),
    "qwen": (4, False),
}

MODEL_SPECS = [
    ModelSpec(
        name=model_key,
        model_dir=cfg.model_base_dir,
        base_model=MODEL_PATHS[model_key]['model'],
        pooling_strategy="arch1_6groups",
        tokenizer_path=MODEL_PATHS[model_key]['tokenizer'],
        batch_size=batch_size,
        pass_token_type_ids=use_token_types,
        weight_prefix=WEIGHT_PATTERNS[model_key],
    )
    for model_key, (batch_size, use_token_types) in _PER_MODEL_SETTINGS.items()
]

# Summarize what will be loaded for each ensemble member.
print(f"\nConfigured {len(MODEL_SPECS)} models:")
for spec in MODEL_SPECS:
    # Same fallback build_loader applies: no tokenizer path means "use the backbone path".
    tok_path = spec.tokenizer_path or spec.base_model
    print(f"  • {spec.name:12s}: {spec.base_model}")
    print(f"    Tokenizer: {tok_path}")
    print(f"    Weights: {spec.weight_prefix}*.pth")

## Data Processing & Model Architecture

In [None]:
def modern_preprocess(text: str) -> str:
    """Return *text* with HTML entities decoded and whitespace runs collapsed.

    Values that ``pd.isna`` reports as missing (None, NaN, ...) become an empty
    string; anything else is converted with ``str`` first. Leading and trailing
    whitespace is dropped and every internal run of whitespace becomes one space.
    """
    if pd.isna(text):
        return ""
    decoded = html.unescape(str(text))
    tokens = decoded.split()
    return " ".join(tokens)


class QuestDataset(Dataset):
    """Tokenized (question, answer) pairs built from a QUEST-style data frame.

    The question text is the cleaned title and the cleaned body joined by one
    space; the answer text is the cleaned ``answer`` column. Cleaning happens
    once in the constructor, tokenization happens lazily per item.
    """

    def __init__(self, df: pd.DataFrame, tokenizer, max_len: int = 512):
        self.tokenizer = tokenizer
        self.max_len = max_len

        titles = df["question_title"].values
        bodies = df["question_body"].values
        self.questions = []
        for title, body in zip(titles, bodies):
            self.questions.append(f"{modern_preprocess(title)} {modern_preprocess(body)}")

        self.answers = list(map(modern_preprocess, df["answer"].values))

    def __len__(self) -> int:
        return len(self.questions)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """Tokenize pair *idx*, padded/truncated to ``max_len``, and return long tensors.

        ``token_type_ids`` is included only when the tokenizer produces it.
        """
        encoded = self.tokenizer(
            self.questions[idx],
            self.answers[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors=None,
        )
        wanted = ["input_ids", "attention_mask"]
        if "token_type_ids" in encoded:
            wanted.append("token_type_ids")
        return {key: torch.tensor(encoded[key], dtype=torch.long) for key in wanted}

print("Dataset class defined")

In [None]:
class QuestModel(nn.Module):
    """Transformer backbone with six task-group regression heads.

    Each head predicts a subset of the 30 output columns. Heads g1-g4 (columns
    0-20, the question_* targets in TARGET_COLS order) read question-side
    features; heads g5-g6 (columns 21-29, the answer_* targets) read answer-side
    features. ``forward`` returns raw head outputs with no activation applied.

    NOTE(review): ``num_targets`` is accepted but never used - the output width
    is hard-coded to 30 in ``forward``. The heads are only created when
    ``pooling_strategy == "arch1_6groups"``, yet ``forward`` uses them
    unconditionally, so any other strategy fails at the first forward pass.
    Attribute names (``backbone``, ``head_g1`` ... ``head_g6``) determine the
    state_dict keys the saved checkpoints are loaded against.
    """
    def __init__(self, model_name: str, num_targets: int, pooling_strategy: str = "arch1_6groups",
                 dropout_rate: float = 0.1, trust_remote_code: bool = False):
        super().__init__()
        self.pooling_strategy = pooling_strategy
        self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
        # "cls_all" additionally requests all hidden states from the backbone.
        if pooling_strategy == "cls_all":
            self.config.update({"output_hidden_states": True})
        self.backbone = AutoModel.from_pretrained(model_name, config=self.config, trust_remote_code=trust_remote_code)
        hidden_size = self.config.hidden_size

        # 6-group task indices: output-column positions written by each head.
        self.idx_g1 = [3, 4, 5, 16, 17]
        self.idx_g2 = [0, 1, 6, 7, 20]
        self.idx_g3 = [2, 10]
        self.idx_g4 = [8, 9, 11, 12, 13, 14, 15, 18, 19]
        self.idx_g5 = [26, 27]
        self.idx_g6 = [21, 22, 23, 24, 25, 28, 29]

        if self.pooling_strategy == "arch1_6groups":
            # Every head consumes three concatenated hidden_size-wide vectors (see forward).
            self.head_g1 = self._make_head(hidden_size * 3, len(self.idx_g1), dropout_rate)
            self.head_g2 = self._make_head(hidden_size * 3, len(self.idx_g2), dropout_rate)
            self.head_g3 = self._make_head(hidden_size * 3, len(self.idx_g3), dropout_rate)
            self.head_g4 = self._make_head(hidden_size * 3, len(self.idx_g4), dropout_rate)
            self.head_g5 = self._make_head(hidden_size * 3, len(self.idx_g5), dropout_rate)
            self.head_g6 = self._make_head(hidden_size * 3, len(self.idx_g6), dropout_rate)

    def _make_head(self, input_dim: int, output_dim: int, dropout_rate: float):
        """Build one head: Linear(input_dim -> hidden_size), Tanh, Dropout, Linear(hidden_size -> output_dim)."""
        return nn.Sequential(
            nn.Linear(input_dim, self.config.hidden_size),
            nn.Tanh(),
            nn.Dropout(dropout_rate),
            nn.Linear(self.config.hidden_size, output_dim),
        )

    def _masked_mean_pooling(self, hidden_state, attention_mask):
        """Average ``hidden_state`` over dim 1, counting only positions where the mask is non-zero."""
        mask = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float()
        # The clamp avoids division by zero when a mask selects no positions.
        return torch.sum(hidden_state * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)

    def _get_pooling_features(self, last_hidden_state, attention_mask, token_type_ids):
        """Return (first-token vector, global mean, question mean, answer mean).

        Without ``token_type_ids`` the question and answer means both fall back
        to the global mean.
        """
        cls_token = last_hidden_state[:, 0, :]
        global_avg = self._masked_mean_pooling(last_hidden_state, attention_mask)
        if token_type_ids is None:
            return cls_token, global_avg, global_avg, global_avg
        # NOTE(review): assumes token_type_ids is 0 for question tokens and 1 for answer
        # tokens; tokenizers that emit other segment ids would skew these masks - confirm.
        q_mask = attention_mask * (1 - token_type_ids)
        a_mask = attention_mask * token_type_ids
        q_avg = self._masked_mean_pooling(last_hidden_state, q_mask)
        a_avg = self._masked_mean_pooling(last_hidden_state, a_mask)
        return cls_token, global_avg, q_avg, a_avg

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Run the backbone and all six heads; return a (batch, 30) tensor of raw outputs."""
        # Only pass token_type_ids when the backbone supports/needs it.
        backbone_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask}
        if token_type_ids is not None:
            backbone_kwargs["token_type_ids"] = token_type_ids
        outputs = self.backbone(**backbone_kwargs)

        cls_token, global_avg, q_avg, a_avg = self._get_pooling_features(
            outputs.last_hidden_state, attention_mask, token_type_ids
        )
        # Question-side and answer-side feature vectors share the first-token and global parts.
        feat_q = torch.cat([cls_token, global_avg, q_avg], dim=1)
        feat_a = torch.cat([cls_token, global_avg, a_avg], dim=1)
        
        out_g1 = self.head_g1(feat_q)
        out_g2 = self.head_g2(feat_q)
        out_g3 = self.head_g3(feat_q)
        out_g4 = self.head_g4(feat_q)
        out_g5 = self.head_g5(feat_a)
        out_g6 = self.head_g6(feat_a)

        # Scatter each head's predictions into its columns of the 30-wide output.
        output = torch.zeros(input_ids.size(0), 30, dtype=out_g1.dtype, device=input_ids.device)
        output[:, self.idx_g1] = out_g1
        output[:, self.idx_g2] = out_g2
        output[:, self.idx_g3] = out_g3
        output[:, self.idx_g4] = out_g4
        output[:, self.idx_g5] = out_g5
        output[:, self.idx_g6] = out_g6
        return output

print("Model architecture defined")

## Utility Functions

In [None]:
def mean_spearman(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the column-wise Spearman correlation averaged over all targets.

    Both arrays are indexed as (rows, targets). A target whose correlation is
    undefined (NaN, e.g. a constant column) contributes 0.0 to the mean.
    """

    def _column_score(col: int) -> float:
        rho, _pvalue = spearmanr(y_true[:, col], y_pred[:, col])
        return 0.0 if np.isnan(rho) else rho

    per_target = [_column_score(col) for col in range(y_true.shape[1])]
    return float(np.mean(per_target))


def get_weight_paths(model_dir: str, prefix: Optional[str]) -> List[str]:
    """Return the sorted full paths of the ``.pth`` files in *model_dir*.

    When *prefix* is given (non-empty), only files whose names start with it
    are kept.

    Raises:
        FileNotFoundError: if no matching checkpoint file exists.
    """
    # An empty prefix matches every filename, which covers the "no prefix" case.
    wanted_prefix = prefix or ""
    matches = sorted(
        os.path.join(model_dir, entry)
        for entry in os.listdir(model_dir)
        if entry.endswith(".pth") and entry.startswith(wanted_prefix)
    )
    if not matches:
        raise FileNotFoundError(f"No .pth files found in {model_dir} with prefix '{prefix or '*'}'")
    return matches


def build_loader(df: pd.DataFrame, spec: ModelSpec) -> Tuple[DataLoader, Any]:
    """Build a non-shuffling inference DataLoader for *df* using *spec*'s tokenizer.

    Args:
        df: Frame with ``question_title``, ``question_body`` and ``answer`` columns.
        spec: Model specification supplying the tokenizer location, max length and batch size.

    Returns:
        ``(loader, tokenizer)`` - the DataLoader and the tokenizer instance it uses.
    """
    tokenizer_name = spec.tokenizer_path or spec.base_model

    # Explicitly use the fast DeBERTa tokenizer so the local SentencePiece vocab is picked up.
    tokenizer_cls = DebertaV2TokenizerFast if spec.name.startswith("deberta") else AutoTokenizer
    tokenizer = tokenizer_cls.from_pretrained(
        tokenizer_name,
        trust_remote_code=spec.trust_remote_code,
    )

    # Ensure tokenizer has a padding token (required for models like Llama, Qwen):
    # reuse EOS when there is one, otherwise register a dedicated [PAD] token.
    if tokenizer.pad_token is None:
        if tokenizer.eos_token is not None:
            tokenizer.pad_token = tokenizer.eos_token
        else:
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    dataset = QuestDataset(df, tokenizer, max_len=spec.max_len)
    loader = DataLoader(
        dataset,
        batch_size=spec.batch_size,
        shuffle=False,  # prediction order must match the row order of df
        num_workers=cfg.num_workers,
        # Pinned host memory only helps host-to-GPU copies; skip it on CPU-only runs.
        pin_memory=device.type == "cuda",
    )
    return loader, tokenizer


class VotersRounder:
    """Snap predictions to nearest observed training values with a deviation guard."""

    def __init__(self, train_vals: np.ndarray, dev_threshold: float = 0.01):
        """Collect the distinct non-NaN training values to snap to.

        Args:
            train_vals: Observed training values for one target.
            dev_threshold: Minimum standard deviation the snapped predictions
                must keep; below it ``predict`` returns the unsnapped values.
        """
        train_vals = np.asarray(train_vals)
        # np.unique already returns its result sorted ascending.
        self.unique_vals = np.unique(train_vals[~np.isnan(train_vals)])
        self.dev_threshold = dev_threshold

    def predict(self, preds: np.ndarray) -> np.ndarray:
        """Return *preds* snapped to the closest training value.

        NaN predictions are replaced by 0.5 first. On an exact tie the smaller
        training value wins. The unsnapped (NaN-filled) predictions are returned
        instead when there are no training values to snap to, or when snapping
        would collapse the standard deviation below ``dev_threshold``.
        """
        preds = np.nan_to_num(preds, nan=0.5)
        # Nothing to snap to (e.g. an all-NaN training column): argmin over an
        # empty axis would raise, so keep the raw predictions.
        if self.unique_vals.size == 0:
            return preds
        idx = np.abs(preds[:, None] - self.unique_vals[None, :]).argmin(axis=1)
        snapped = self.unique_vals[idx]
        # If snapping collapses variance, fall back to raw predictions
        if np.std(snapped) < self.dev_threshold:
            return preds
        return snapped


def apply_voter_postprocessing(preds: np.ndarray, train_df: pd.DataFrame, dev_threshold: float) -> np.ndarray:
    """Snap every prediction column to the values seen in the matching training column.

    Column ``i`` of *preds* is paired with ``TARGET_COLS[i]`` in *train_df*.
    The input array is left untouched; a snapped copy is returned.
    """
    rounders = [
        VotersRounder(train_df[target].values, dev_threshold=dev_threshold)
        for target in TARGET_COLS
    ]
    rounded = preds.copy()
    for col_idx, rounder in enumerate(rounders):
        rounded[:, col_idx] = rounder.predict(preds[:, col_idx])
    return rounded

print("Utility functions defined")

## Inference & Ensemble Functions

In [None]:
def inference_single_model(loader: DataLoader, weight_path: str, spec: ModelSpec) -> np.ndarray:
    """Run inference with a single model checkpoint.

    Args:
        loader: DataLoader yielding ``input_ids``, ``attention_mask`` and, optionally,
            ``token_type_ids`` batches.
        weight_path: Path to the ``.pth`` state dict to load into a fresh QuestModel.
        spec: Model specification (backbone path, pooling strategy, token-type handling).

    Returns:
        Sigmoid-activated predictions for every row of the loader, concatenated
        in loader order.
    """
    model = QuestModel(
        model_name=spec.base_model,
        num_targets=len(TARGET_COLS),
        pooling_strategy=spec.pooling_strategy,
        trust_remote_code=spec.trust_remote_code,
    )
    # Deserialize onto the CPU and drop the dict right after copying it into the model;
    # mapping it straight to the GPU would keep a second full copy of the weights on the
    # device next to the model for the whole inference loop.
    state = torch.load(weight_path, map_location="cpu")
    model.load_state_dict(state)
    del state
    model.to(device)
    model.eval()

    preds = []
    try:
        with torch.no_grad():
            for batch in tqdm(loader, desc=f"{spec.name}:{os.path.basename(weight_path)}", leave=False):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                # Forward token_type_ids only when the tokenizer produced them AND the
                # spec says this backbone should receive them.
                token_type_ids = batch.get("token_type_ids")
                if token_type_ids is not None and spec.pass_token_type_ids:
                    token_type_ids = token_type_ids.to(device)
                else:
                    token_type_ids = None
                outputs = model(input_ids, attention_mask, token_type_ids)
                preds.append(outputs.sigmoid().cpu().numpy())
    finally:
        # Release the model even if inference fails; collect Python garbage first so the
        # CUDA allocator can actually return the freed blocks.
        del model
        gc.collect()
        torch.cuda.empty_cache()
    return np.concatenate(preds)


def collect_fold_predictions(df: pd.DataFrame, test_df: pd.DataFrame, spec: ModelSpec, skip_train: bool = False) -> Tuple[Optional[np.ndarray], np.ndarray]:
    """Run every fold checkpoint of one model over the train and test frames.

    Returns ``(train_stack, test_stack)`` where each stack holds one prediction
    array per checkpoint along axis 0. With ``skip_train=True`` no forward pass
    is made over *df* and ``train_stack`` is ``None``.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Processing: {spec.name}")
    print(banner)

    want_train = not skip_train

    # Loaders are built once and reused for every checkpoint.
    train_loader = build_loader(df, spec)[0] if want_train else None
    test_loader = build_loader(test_df, spec)[0]
    checkpoints = get_weight_paths(spec.model_dir, spec.weight_prefix)

    print(f"Found {len(checkpoints)} fold checkpoint(s)")

    train_outputs = []
    test_outputs = []
    for checkpoint in checkpoints:
        print(f"  Loading {os.path.basename(checkpoint)}...")
        if want_train:
            train_outputs.append(inference_single_model(train_loader, checkpoint, spec))
        test_outputs.append(inference_single_model(test_loader, checkpoint, spec))

    train_stack = np.stack(train_outputs) if want_train else None
    return train_stack, np.stack(test_outputs)

print("Inference functions defined")

In [None]:
def tpe_weight_search(preds: np.ndarray, y_true: np.ndarray, max_evals: int, label: str = "model") -> np.ndarray:
    """Search blend weights over axis 0 of *preds* that maximize mean Spearman.

    Each candidate is drawn from ``uniform(0, 1)`` per source, floored at 1e-6
    and normalized to sum to one. Returns the best normalized weights as
    float32; a single source short-circuits to ``[1.0]`` without searching.
    """
    n_sources = preds.shape[0]
    if n_sources == 1:
        return np.ones(1, dtype=np.float32)

    keys = [f"w_{i}" for i in range(n_sources)]

    def _normalized(params: Dict[str, float]) -> np.ndarray:
        # Floor at a tiny positive value so the sum can never be zero.
        vec = np.array([params[key] for key in keys], dtype=np.float64)
        vec = np.clip(vec, 1e-6, None)
        return vec / vec.sum()

    def objective(params: Dict[str, float]) -> Dict[str, float]:
        blended = np.tensordot(_normalized(params), preds, axes=((0), (0)))
        # hyperopt minimizes, so the score is negated.
        return {"loss": -mean_spearman(y_true, blended), "status": STATUS_OK}

    best_params = fmin(
        fn=objective,
        space={key: hp.uniform(key, 0.0, 1.0) for key in keys},
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=Trials(),
        verbose=0,
    )

    weights = _normalized(best_params)

    print(f"\n{label.upper()} WEIGHTS:")
    for position, weight in enumerate(weights):
        print(f"  {label}_{position}: {weight:.4f}")

    return weights.astype(np.float32)


def blend_preds(preds: np.ndarray, weights: np.ndarray) -> np.ndarray:
    """Return the weighted sum of *preds* along its first axis.

    ``weights[i]`` scales ``preds[i]``; the result has the shape of one entry
    of *preds*.
    """
    blended = np.tensordot(weights, preds, axes=(0, 0))
    return blended

print("TPE optimization functions defined")

## Load Data

In [None]:
# Read both frames, optionally subsample the training rows, and extract the label matrix.
print("Loading datasets...")
train_df = pd.read_csv(cfg.train_path)
test_df = pd.read_csv(cfg.test_path)

use_subsample = cfg.sample_frac < 1.0
if use_subsample:
    # Seeded so repeated runs pick the same rows.
    train_df = train_df.sample(frac=cfg.sample_frac, random_state=cfg.seed).reset_index(drop=True)
train_msg = (
    f"Using {cfg.sample_frac:.1%} of training data: {len(train_df)} samples"
    if use_subsample
    else f"Using full training data: {len(train_df)} samples"
)
print(train_msg)
print(f"Test data: {len(test_df)} samples")

# Ground-truth targets in TARGET_COLS order, aligned with the (possibly subsampled) train rows.
y_true = train_df[TARGET_COLS].values
print(f"Target shape: {y_true.shape}")

## Step 1: Collect All Model Predictions

In [None]:
# Per-model blended predictions (one entry per model, in MODEL_SPECS order) and the fold
# weights that produced them.
model_train_preds = []
model_test_preds = []
fold_weights_all = {}

# Try to load pre-calculated weights
pre_calculated_weights = None
if cfg.pre_calculated_weights_path and os.path.exists(cfg.pre_calculated_weights_path):
    print(f"\nLoading pre-calculated weights from: {cfg.pre_calculated_weights_path}")
    with open(cfg.pre_calculated_weights_path, "r", encoding="utf-8") as f:
        pre_calculated_weights = json.load(f)
    # The saved score may be absent or null (a test-only run saves null), so it is only
    # formatted as a number when it actually is one.
    saved_score = pre_calculated_weights.get("final_score")
    score_text = f"{saved_score:.4f}" if isinstance(saved_score, (int, float)) else "N/A"
    print(f"✓ Loaded weights with score: {score_text}")

# With saved weights there is nothing to optimize, so the train forward passes are skipped.
test_only = pre_calculated_weights is not None
if test_only:
    print("Test-only mode: skipping train inference because weights are provided.")

saved_fold_weights = (pre_calculated_weights or {}).get("fold_weights") or {}

for spec in MODEL_SPECS:
    # Get predictions from all available folds (train optional)
    train_folds, test_folds = collect_fold_predictions(train_df, test_df, spec, skip_train=test_only)
    n_folds = test_folds.shape[0]

    # Determine fold weights
    if spec.name in saved_fold_weights:
        # Use pre-calculated fold weights
        fold_weights = np.array(saved_fold_weights[spec.name], dtype=np.float32)
        if fold_weights.shape[0] != n_folds:
            # Fail with a readable message instead of a shape error inside the blend.
            raise ValueError(
                f"Pre-calculated fold weights for {spec.name} have {fold_weights.shape[0]} "
                f"entries but {n_folds} fold checkpoint(s) were found"
            )
        print(f"Using pre-calculated fold weights for {spec.name}")
    elif cfg.optimize_fold_weights and train_folds is not None:
        # Use TPE to optimize fold weights
        print(f"Optimizing fold weights for {spec.name} using TPE...")
        fold_weights = tpe_weight_search(
            train_folds, y_true,
            max_evals=min(cfg.tpe_trials, 30),  # Fewer trials for fold optimization
            label=f"{spec.name}_fold"
        )
    else:
        # Use equal weights across all folds
        fold_weights = np.ones(n_folds, dtype=np.float32) / n_folds
        print(f"Using equal weights over {n_folds} fold(s)")

    fold_weights_all[spec.name] = fold_weights.tolist()

    # Blend folds
    blended_train = blend_preds(train_folds, fold_weights) if train_folds is not None else None
    blended_test = blend_preds(test_folds, fold_weights)

    # Evaluate on the training rows when they were scored, and keep the blend for the
    # cross-model weight search.
    if blended_train is not None:
        score = mean_spearman(y_true, blended_train)
        print(f"{spec.name} blended score: {score:.4f}")
        model_train_preds.append(blended_train)
    else:
        print(f"{spec.name}: train inference skipped (test-only mode)")
    model_test_preds.append(blended_test)

model_train_preds_np = np.stack(model_train_preds) if model_train_preds else None
model_test_preds_np = np.stack(model_test_preds)

print(f"\nCollected predictions from {len(MODEL_SPECS)} models")
if model_train_preds_np is not None:
    print(f"Train predictions shape: {model_train_preds_np.shape}")
print(f"Test predictions shape: {model_test_preds_np.shape}")

## Step 2: TPE Optimization Across Models

In [None]:
# Choose the cross-model blend weights: reuse saved ones when available, otherwise
# search for them with TPE on the training predictions.
has_saved_model_weights = bool(pre_calculated_weights) and "model_weights" in pre_calculated_weights

if not has_saved_model_weights:
    if model_train_preds_np is None:
        raise ValueError("No training predictions available to run TPE. Provide pre-calculated weights or enable train inference.")
    # Run TPE optimization
    print(f"Running TPE optimization across {len(MODEL_SPECS)} models...")
    print(f"TPE trials: {cfg.tpe_trials}")
    model_weights = tpe_weight_search(
        model_train_preds_np,
        y_true,
        max_evals=cfg.tpe_trials,
        label="model",
    )
else:
    print("\nUsing pre-calculated model weights...")
    saved_model_weights = pre_calculated_weights["model_weights"]
    # A model missing from the saved file gets the uniform share 1/len(MODEL_SPECS).
    uniform_share = 1.0 / len(MODEL_SPECS)
    model_weights = np.array(
        [saved_model_weights.get(spec.name, uniform_share) for spec in MODEL_SPECS],
        dtype=np.float32,
    )
    # Re-normalize so the weights sum to one after any uniform fill-ins.
    model_weights = model_weights / model_weights.sum()

rule = "=" * 60
print("\n" + rule)
print("FINAL MODEL WEIGHTS:")
print(rule)
for spec, weight in zip(MODEL_SPECS, model_weights):
    print(f"{spec.name:20s}: {weight:.4f}")
print(rule)

## Step 3: Generate Final Predictions

In [None]:
# Blend the per-model predictions with the chosen model weights.
final_train = None
if model_train_preds_np is not None:
    final_train = blend_preds(model_train_preds_np, model_weights)
final_test = blend_preds(model_test_preds_np, model_weights)

# Optional snapping of predictions to values observed in the training targets.
if cfg.use_voter_postprocessing:
    print("Applying VotersRounder post-processing...")
    if final_train is not None:
        final_train = apply_voter_postprocessing(final_train, train_df, dev_threshold=cfg.voter_dev_threshold)
    final_test = apply_voter_postprocessing(final_test, train_df, dev_threshold=cfg.voter_dev_threshold)

# The score is only available when the training rows were scored.
final_score = mean_spearman(y_true, final_train) if final_train is not None else None

rule = "=" * 60
print("\n" + rule)
print("FINAL RESULTS")
print(rule)
if final_score is None:
    print("Training Spearman Score: skipped (test-only mode)")
else:
    print(f"Training Spearman Score: {final_score:.4f}")
print(f"Final test predictions shape: {final_test.shape}")
print(rule)

## Step 4: Create Submission File

In [None]:
# Build the submission from the test frame itself: final_test rows follow test_df's row
# order, so taking qa_id from test_df guarantees the ids line up with the predictions and
# avoids depending on a sample-submission file at a path that only exists locally.
expected_shape = (len(test_df), len(TARGET_COLS))
if final_test.shape != expected_shape:
    raise ValueError(
        f"Final predictions have shape {final_test.shape}, expected {expected_shape}"
    )

# Create submission: qa_id followed by one column per target, in TARGET_COLS order.
submission = test_df[['qa_id']].copy()
submission[TARGET_COLS] = final_test

# Save submission
submission.to_csv("submission.csv", index=False)

print("✓ Submission file created: submission.csv")
print(f"  Shape: {submission.shape}")
print(f"\nFirst few rows:")
print(submission.head())

## Save Weights & OOF Predictions

In [None]:
# Save the blended predictions on the training rows (only if available).
# NOTE(review): every fold checkpoint is run over all training rows, so these are not
# out-of-fold predictions in the strict sense unless the checkpoints never saw those
# rows - confirm before using the file (or the reported score) for validation.
if final_train is not None:
    np.save("tpe_oof_predictions.npy", final_train)
    print("✓ Saved OOF predictions: tpe_oof_predictions.npy")
else:
    print("Skipping OOF save (test-only mode)")

# Save weights in the same layout the pre-calculated-weights loader reads back:
# "model_weights" keyed by model name and "fold_weights" as a list per model.
weight_payload = {
    # Stored as null when the training rows were not scored (test-only mode).
    "final_score": None if final_score is None else float(final_score),
    "model_weights": {spec.name: float(w) for spec, w in zip(MODEL_SPECS, model_weights)},
    "fold_weights": fold_weights_all,
    "config": {
        "tpe_trials": cfg.tpe_trials,
        "optimize_fold_weights": cfg.optimize_fold_weights,
        "sample_frac": cfg.sample_frac,
        "test_only": test_only,
    }
}

with open("tpe_weights.json", "w", encoding="utf-8") as f:
    json.dump(weight_payload, f, indent=2)

print("✓ Saved weights: tpe_weights.json")
print("\nWeight summary:")
print(json.dumps(weight_payload["model_weights"], indent=2))

## Summary

**TPE-Optimized Ensemble Complete! 🎉**

This notebook:
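1. ✅ Loaded predictions from **6 transformer models** (DeBERTa-v3, ModernBERT, ELECTRA, XLNet, Llama-3.2-1B, Qwen3-0.6B)
2. ✅ Blended the fold checkpoints of each model (equal-weight mean by default, or TPE-weighted when `optimize_fold_weights` is enabled)
3. ✅ Found optimal model weights using **Tree-structured Parzen Estimator (TPE)**, or reused pre-calculated weights when provided
4. ✅ Generated final predictions on the test set
5. ✅ Created the submission file: `submission.csv`
6. ✅ Saved the weights (`tpe_weights.json`) and, when train inference was run, the blended train-set predictions for analysis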

The final ensemble achieves better performance than any single model by intelligently weighting each model's strengths!