# Seed Search: DeBERTa + ALIGN + Caption BLIP2 + proj_concat

## Objetivo:
Encontrar la seed que maximiza el **Test F1-Score** para el modelo ganador.
El modelo seleccionado será usado como baseline para el Error Analysis.

## Configuración:
- **Modelo:** DeBERTa + ALIGN + Caption BLIP2 + Fusion: proj_concat
- **Seeds:** 10 seeds distintas
- **Criterio de selección:** Mejor Test F1-Score
- **Output:** Checkpoint del mejor modelo + tabla completa de resultados

**Nota:** Todas las métricas de todas las seeds se guardan para análisis de varianza.

## 1. Imports y Configuración

In [None]:
import os
import copy
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from PIL import Image
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from transformers import (
    AutoTokenizer, AutoModel,
    AlignProcessor, AlignModel,
    get_linear_schedule_with_warmup
)
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report
)

# Report the PyTorch build and, when a CUDA device is visible, its name.
_cuda_ready = torch.cuda.is_available()
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {_cuda_ready}")
if _cuda_ready:
    print(f"Device: {torch.cuda.get_device_name(0)}")

In [None]:
# Filesystem layout (relative to the directory the notebook runs from).
DATA_PATH = "../../data/"
IMG_PATH = "../../data/images"
OUTPUT_DIR = "../../results/multimodal/seed_search/"

# Create the results folder up front so later cells can write into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(f"Using device: {DEVICE}")

In [None]:
# Architecture of the winning model from the previous experiments.
TEXT_MODEL_NAME = "microsoft/deberta-v3-base"
ALIGN_MODEL_NAME = "kakaobrain/align-base"
CAPTION_COL = "caption_blip2"
FUSION_TYPE = "proj_concat"

# Training hyperparameters, kept identical to all previous experiments.
MAX_TEXT_LENGTH = 128
COMMON_DIM = 768
NUM_CLASSES = 2
BATCH_SIZE = 16
NUM_EPOCHS = 15
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 1e-4
WARMUP_RATIO = 0.1
PATIENCE = 5

# One full training run is performed per seed.
SEEDS = [
    42, 123, 456, 789, 1024,
    2024, 3090, 7777, 8888, 9999,
]

print(f"Model: DeBERTa + ALIGN + Caption BLIP2 + {FUSION_TYPE}")
print(f"Seeds to run: {SEEDS}")
print(f"Total runs: {len(SEEDS)}")

## 2. Seed Function

In [None]:
def set_seed(seed):
    """
    Seed every random number generator used by this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU plus the current and all
    CUDA devices), puts cuDNN in deterministic mode with benchmarking turned
    off, and stores the seed in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Every seeding function takes the same single integer argument.
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

print("‚úì set_seed() defined")

## 3. Carga de Datos

In [None]:
# Load the three caption-augmented splits from DATA_PATH.
df_train = pd.read_csv(os.path.join(DATA_PATH, "train_with_captions.csv"))
df_dev   = pd.read_csv(os.path.join(DATA_PATH, "dev_with_captions.csv"))
df_test  = pd.read_csv(os.path.join(DATA_PATH, "test_with_captions.csv"))

# Map labels
stance_2id = {"oppose": 0, "support": 1}
for split_name, df in [("train", df_train), ("dev", df_dev), ("test", df_test)]:
    mapped = df["stance"].map(stance_2id)
    # Series.map() turns any stance that is not a key of stance_2id into NaN.
    # Fail loudly here instead of letting NaN labels reach the dataset, where
    # they would be converted to an integer tensor.
    if mapped.isna().any():
        unknown = sorted(df.loc[mapped.isna(), "stance"].astype(str).unique())
        raise ValueError(
            f"Unmapped stance values in {split_name} split: {unknown}; "
            f"expected one of {sorted(stance_2id)}")
    df["label"] = mapped

print(f"Train: {len(df_train)} | Dev: {len(df_dev)} | Test: {len(df_test)}")
print(f"\nTrain: {df_train['stance'].value_counts().to_dict()}")
print(f"Dev:   {df_dev['stance'].value_counts().to_dict()}")
print(f"Test:  {df_test['stance'].value_counts().to_dict()}")

## 4. Dataset and Model

In [None]:
class MultimodalCaptionDataset(Dataset):
    """
    Text + Caption + Image dataset.
    text_input = tweet_text [SEP] caption

    Each item is a dict with the tokenized text (`input_ids`,
    `attention_mask`), the processed image (`pixel_values`), the integer
    `label`, and `idx`, the item's position in the (re-indexed) dataframe.

    Args:
        df: dataframe with `tweet_text`, `tweet_id`, `label` and the caption
            column named by `caption_col`.
        img_dir: directory holding one `<tweet_id>.jpg` per row.
        tokenizer: callable used to tokenize the combined text.
        image_processor: callable invoked as
            `image_processor(images=..., return_tensors='pt')`.
        caption_col: name of the caption column to append to the tweet text.
        max_length: pad/truncate length passed to the tokenizer.
    """
    def __init__(self, df, img_dir, tokenizer, image_processor, caption_col, max_length=128):
        # Re-index so that positional `idx` values match the dataframe index.
        self.df              = df.reset_index(drop=True)
        self.img_dir         = img_dir
        self.tokenizer       = tokenizer
        self.image_processor = image_processor
        self.caption_col     = caption_col
        self.max_length      = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row     = self.df.iloc[idx]
        text    = str(row['tweet_text'])
        # NOTE(review): a missing caption (NaN) would be stringified as "nan"
        # here - confirm the CSVs contain no empty captions.
        caption = str(row[self.caption_col])

        text_enc = self.tokenizer(
            f"{text} [SEP] {caption}",
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        img_path = os.path.join(self.img_dir, str(row['tweet_id']) + ".jpg")
        try:
            # The context manager closes the underlying file as soon as the
            # pixels have been decoded into the converted copy.
            with Image.open(img_path) as raw_image:
                image = raw_image.convert('RGB')
        except Exception:
            # Best-effort fallback: a missing or unreadable image is replaced
            # by a neutral gray placeholder. `Exception` (rather than a bare
            # `except`) keeps KeyboardInterrupt/SystemExit propagating.
            image = Image.new("RGB", (224, 224), color=(128, 128, 128))

        img_enc = self.image_processor(images=image, return_tensors='pt')

        return {
            # squeeze(0) drops the batch dimension added by return_tensors='pt'.
            'input_ids':      text_enc['input_ids'].squeeze(0),
            'attention_mask': text_enc['attention_mask'].squeeze(0),
            'pixel_values':   img_enc['pixel_values'].squeeze(0),
            'label':          torch.tensor(row['label'], dtype=torch.long),
            'idx':            torch.tensor(idx, dtype=torch.long)  # For error analysis
        }


def collate_fn(batch):
    """
    Stack a list of per-sample dicts into one dict of batched tensors.

    The per-sample keys 'label' and 'idx' are exposed in the batch as
    'labels' and 'indices'; the other keys keep their names.
    """
    # (batch key, per-sample key) pairs, in output order.
    key_pairs = (
        ('input_ids', 'input_ids'),
        ('attention_mask', 'attention_mask'),
        ('pixel_values', 'pixel_values'),
        ('labels', 'label'),
        ('indices', 'idx'),
    )
    return {
        batch_key: torch.stack([sample[sample_key] for sample in batch])
        for batch_key, sample_key in key_pairs
    }

In [None]:
class MultimodalModel(nn.Module):
    """
    Two-branch classifier: a pretrained text encoder and the vision tower of a
    pretrained ALIGN model, fused by projecting each branch to half of
    `common_dim`, concatenating, and passing the result through an MLP head.

    `fusion_type` is stored on the instance; the forward pass always applies
    the projection + concatenation path.
    """
    def __init__(
        self,
        text_model_name="microsoft/deberta-v3-base",
        vision_model_name="kakaobrain/align-base",
        num_classes=2,
        fusion_type="proj_concat",
        common_dim=768,
        dropout=0.1,
    ):
        super().__init__()
        self.fusion_type = fusion_type
        self.common_dim = common_dim

        # --- text branch ---
        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        self.text_dim = self.text_encoder.config.hidden_size

        # --- vision branch: keep only the vision tower of the ALIGN model ---
        align = AlignModel.from_pretrained(vision_model_name)
        self.vision_encoder = align.vision_model
        self.vision_dim = align.config.vision_config.hidden_size

        # Bring both branches to `common_dim`; a branch that already has that
        # width is passed through unchanged.
        if self.text_dim == common_dim:
            self.text_projection = nn.Identity()
        else:
            self.text_projection = nn.Linear(self.text_dim, common_dim)
        if self.vision_dim == common_dim:
            self.vision_projection = nn.Identity()
        else:
            self.vision_projection = nn.Linear(self.vision_dim, common_dim)

        # --- proj_concat fusion: two half-width projections, then a mixing layer ---
        half_dim = common_dim // 2
        self.text_proj = nn.Linear(common_dim, half_dim)
        self.vision_proj = nn.Linear(common_dim, half_dim)
        self.fusion_layer = nn.Sequential(
            nn.Linear(half_dim * 2, common_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
        )

        # --- classification head ---
        self.classifier = nn.Sequential(
            nn.Linear(common_dim, common_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(common_dim // 2, num_classes),
        )

    def forward(self, input_ids, attention_mask, pixel_values):
        """Return class logits for a batch of tokenized text and images."""
        # Text representation: hidden state of the first token.
        text_out = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        text_emb = self.text_projection(text_out.last_hidden_state[:, 0, :])

        # Image representation: pooled output of the vision tower.
        vision_out = self.vision_encoder(pixel_values=pixel_values)
        vision_emb = self.vision_projection(vision_out.pooler_output)

        joint = torch.cat([self.text_proj(text_emb), self.vision_proj(vision_emb)], dim=1)
        return self.classifier(self.fusion_layer(joint))

## 5. Training and Evaluation Functions

In [None]:
def train_one_seed(model, train_loader, dev_loader, save_path, device=DEVICE):
    """
    Train model and return best dev F1 + best model state.

    Runs up to NUM_EPOCHS epochs of AdamW with a linear warmup/decay schedule
    and gradient-norm clipping at 1.0. After each epoch the binary F1
    (pos_label=1) on `dev_loader` is computed; every improvement is kept in
    memory and written to `save_path`. Training stops early once PATIENCE
    consecutive epochs bring no improvement.

    Args:
        model: module called as model(input_ids, attention_mask, pixel_values)
            that returns class logits.
        train_loader: iterable of batches with the keys 'input_ids',
            'attention_mask', 'pixel_values' and 'labels'.
        dev_loader: batches in the same format, used for model selection.
        save_path: file the best state dict is saved to.
        device: device the model and the batches are moved to.

    Returns:
        Tuple (model, best_dev_f1, history): the model with its best weights
        restored, the best dev F1, and a DataFrame with one row per epoch
        ('epoch', 'train_loss', 'dev_f1').
    """
    model = model.to(device)
    optimizer    = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    total_steps  = len(train_loader) * NUM_EPOCHS
    scheduler    = get_linear_schedule_with_warmup(
        optimizer, int(total_steps * WARMUP_RATIO), total_steps)
    criterion    = nn.CrossEntropyLoss()

    # Start below every attainable F1 so the first epoch always produces a
    # snapshot. With a 0.0 start and a strict ">" a model whose dev F1 stays
    # at 0.0 never set `best_state`, and load_state_dict(None) crashed.
    best_dev_f1      = float('-inf')
    patience_counter = 0
    best_state       = None
    history          = []

    for epoch in range(NUM_EPOCHS):
        # Train
        model.train()
        train_loss = 0.0
        for batch in tqdm(train_loader, desc=f"    Epoch {epoch+1:02d}/{NUM_EPOCHS}", leave=False):
            optimizer.zero_grad()
            logits = model(
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device),
                batch['pixel_values'].to(device)
            )
            loss = criterion(logits, batch['labels'].to(device))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            # The schedule is defined in optimizer steps, so advance it per batch.
            scheduler.step()
            train_loss += loss.item()

        # Validate
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for batch in dev_loader:
                logits = model(
                    batch['input_ids'].to(device),
                    batch['attention_mask'].to(device),
                    batch['pixel_values'].to(device)
                )
                all_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
                all_labels.extend(batch['labels'].numpy())

        _, _, dev_f1, _ = precision_recall_fscore_support(
            all_labels, all_preds, average='binary', pos_label=1, zero_division=0)
        avg_loss = train_loss / len(train_loader)
        history.append({'epoch': epoch+1, 'train_loss': avg_loss, 'dev_f1': dev_f1})

        print(f"    Epoch {epoch+1:02d} | Loss: {avg_loss:.4f} | Dev F1: {dev_f1:.4f}", end="")

        if dev_f1 > best_dev_f1:
            best_dev_f1 = dev_f1
            patience_counter = 0
            # Deep copy: state_dict() returns references to the live tensors,
            # which later epochs keep updating.
            best_state = copy.deepcopy(model.state_dict())
            torch.save(best_state, save_path)
            print(" ‚úì")
        else:
            patience_counter += 1
            print(f" (patience {patience_counter}/{PATIENCE})")
            if patience_counter >= PATIENCE:
                print(f"    Early stopping.")
                break

    if best_state is None:
        # No epoch ran at all (NUM_EPOCHS == 0): nothing to restore.
        return model, 0.0, pd.DataFrame(history)

    model.load_state_dict(best_state)
    return model, best_dev_f1, pd.DataFrame(history)


def evaluate_model(model, loader, device=DEVICE):
    """
    Full evaluation: returns metrics + per-sample predictions and probabilities.

    The model is switched to eval mode and run over `loader` without gradient
    tracking. It is not moved to `device` here; only the batches are.

    Args:
        model: module called as model(input_ids, attention_mask, pixel_values)
            that returns class logits.
        loader: iterable of batches with the keys 'input_ids',
            'attention_mask', 'pixel_values', 'labels' and 'indices'.
        device: device the batch tensors are moved to.

    Returns:
        Dict with 'accuracy', 'f1', 'precision', 'recall' (binary, positive
        class 1), 'cm' (2x2 confusion matrix, rows = true class, columns =
        predicted class, class order [0, 1]) and the per-sample arrays
        'preds', 'probs', 'labels' and 'indices', in loader order.
    """
    model.eval()
    all_preds, all_probs, all_labels, all_indices = [], [], [], []

    with torch.no_grad():
        for batch in loader:
            logits = model(
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device),
                batch['pixel_values'].to(device)
            )
            probs  = F.softmax(logits, dim=1)
            preds  = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())
            all_labels.extend(batch['labels'].numpy())
            all_indices.extend(batch['indices'].numpy())

    all_preds   = np.array(all_preds)
    all_probs   = np.array(all_probs)
    all_labels  = np.array(all_labels)
    all_indices = np.array(all_indices)

    accuracy              = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='binary', pos_label=1, zero_division=0)
    # Pin the class order: without `labels`, a split in which only one class
    # appears (in both labels and predictions) yields a 1x1 matrix, and
    # callers indexing cm[0, 1] / cm[1, 0] would fail.
    cm = confusion_matrix(all_labels, all_preds, labels=[0, 1])

    return {
        'accuracy':   accuracy,
        'f1':         f1,
        'precision':  precision,
        'recall':     recall,
        'cm':         cm,
        'preds':      all_preds,
        'probs':      all_probs,
        'labels':     all_labels,
        'indices':    all_indices
    }

## 6. Initialize Tokenizer and Processor

In [None]:
# Pretrained text tokenizer and ALIGN image processor, shared by every split.
tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL_NAME)
align_processor = AlignProcessor.from_pretrained(ALIGN_MODEL_NAME)

# The datasets are built a single time and reused for all seeds; only the
# model is re-created per run.
train_ds, dev_ds, test_ds = (
    MultimodalCaptionDataset(
        split_df, IMG_PATH, tokenizer, align_processor, CAPTION_COL, MAX_TEXT_LENGTH)
    for split_df in (df_train, df_dev, df_test)
)

print(f"‚úì Tokenizer and processor loaded")
print(f"‚úì Datasets created: train={len(train_ds)} | dev={len(dev_ds)} | test={len(test_ds)}")

---
# Seed Search Loop

In [None]:
# Seed search driver: one full train/evaluate cycle per entry in SEEDS.
all_seed_results = []  # One row per seed
all_histories    = {}  # Training curves per seed

# Track global best by Test F1
# NOTE(review): picking the winner on Test F1 makes the "best" score an
# optimistic estimate; the mean and std over all seeds are the figures that
# are not affected by this selection.
best_test_f1     = 0.0
best_seed        = None
best_test_eval   = None  # Full evaluation dict of best model
best_model_path  = os.path.join(OUTPUT_DIR, "best_model_overall.pt")

print("="*70)
print(f"SEED SEARCH: {len(SEEDS)} runs")
print(f"Model: DeBERTa + ALIGN + Caption BLIP2 + {FUSION_TYPE}")
print("="*70)

for run_idx, seed in enumerate(SEEDS, 1):
    print(f"\n{'='*70}")
    print(f"RUN {run_idx}/{len(SEEDS)} | Seed: {seed}")
    print(f"{'='*70}")

    # Set seed BEFORE everything
    set_seed(seed)

    # DataLoaders (re-create each run so shuffle is seeded)
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,  collate_fn=collate_fn)
    dev_loader   = DataLoader(dev_ds,   batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
    test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

    # Initialize model
    model = MultimodalModel(
        text_model_name   = TEXT_MODEL_NAME,
        vision_model_name = ALIGN_MODEL_NAME,
        num_classes       = NUM_CLASSES,
        fusion_type       = FUSION_TYPE,
        common_dim        = COMMON_DIM
    )

    # Train (the per-seed checkpoint holds the best-dev-F1 weights of this run)
    seed_ckpt = os.path.join(OUTPUT_DIR, f"model_seed{seed}.pt")
    trained_model, best_dev_f1, history_df = train_one_seed(
        model, train_loader, dev_loader, seed_ckpt)

    all_histories[seed] = history_df

    # Evaluate on dev and test
    dev_eval  = evaluate_model(trained_model, dev_loader)
    test_eval = evaluate_model(trained_model, test_loader)

    # Store results
    row = {
        'Seed':           seed,
        'Dev_F1':         dev_eval['f1'],
        'Dev_Accuracy':   dev_eval['accuracy'],
        'Dev_Precision':  dev_eval['precision'],
        'Dev_Recall':     dev_eval['recall'],
        'Test_F1':        test_eval['f1'],
        'Test_Accuracy':  test_eval['accuracy'],
        'Test_Precision': test_eval['precision'],
        'Test_Recall':    test_eval['recall'],
        'Checkpoint':     seed_ckpt
    }
    all_seed_results.append(row)

    print(f"  Dev  ‚Üí F1: {dev_eval['f1']:.4f} | Acc: {dev_eval['accuracy']:.4f} | "
          f"P: {dev_eval['precision']:.4f} | R: {dev_eval['recall']:.4f}")
    print(f"  Test ‚Üí F1: {test_eval['f1']:.4f} | Acc: {test_eval['accuracy']:.4f} | "
          f"P: {test_eval['precision']:.4f} | R: {test_eval['recall']:.4f}")

    # Update best by Test F1
    # The first run is always accepted: with only the strict ">" test against
    # the 0.0 start value, a search in which every run scores F1 == 0.0 left
    # best_seed / best_test_eval as None and the later cells crashed.
    if best_seed is None or test_eval['f1'] > best_test_f1:
        best_test_f1   = test_eval['f1']
        best_seed      = seed
        best_test_eval = test_eval
        # Save as overall best
        torch.save(trained_model.state_dict(), best_model_path)
        print(f"  üèÜ New best Test F1: {best_test_f1:.4f} (seed={seed}) ‚Üí Saved as best_model_overall.pt")

    # Print running best
    print(f"  Current best ‚Üí Seed: {best_seed} | Test F1: {best_test_f1:.4f}")

    # Cleanup GPU: drop both references to the model before the next run
    # allocates a new one.
    del model, trained_model
    torch.cuda.empty_cache()

print("\n" + "="*70)
print("‚úì All seeds completed!")
print("="*70)

---
# Results

## 7. Results DataFrame

In [None]:
# One row per seed, in the order the runs were executed.
df_seeds = pd.DataFrame(all_seed_results)

metric_cols = ['Dev_F1','Dev_Accuracy','Dev_Precision','Dev_Recall',
               'Test_F1','Test_Accuracy','Test_Precision','Test_Recall']

# Sort by Test F1
# Sort on the unrounded scores with a stable algorithm. Sorting after rounding
# (with the default, unstable algorithm) could place a near-tied seed first,
# so the top row would not be the seed the search loop recorded as best.
# With a stable sort, exact ties keep run order, matching the loop's strict ">".
df_seeds_sorted = df_seeds.sort_values(
    'Test_F1', ascending=False, kind='mergesort').reset_index(drop=True)

# Round metrics (for display and for the saved table)
df_seeds[metric_cols] = df_seeds[metric_cols].round(4)
df_seeds_sorted[metric_cols] = df_seeds_sorted[metric_cols].round(4)

print("\n" + "="*100)
print("ALL SEEDS - SORTED BY TEST F1")
print("="*100)
print(df_seeds_sorted[['Seed','Dev_F1','Dev_Accuracy','Test_F1','Test_Accuracy',
                        'Test_Precision','Test_Recall']].to_string(index=True))
print("="*100)

# Save
out_path = os.path.join(OUTPUT_DIR, "seed_search_results.csv")
df_seeds_sorted.to_csv(out_path, index=False)
print(f"\n‚úì Results saved to: {out_path}")

## 8. Variance Analysis

In [None]:
# Spread of each test metric across the seeds.
rule_70 = "=" * 70
print("\n" + rule_70)
print("VARIANCE ANALYSIS ACROSS SEEDS")
print(rule_70)

for metric_name in ('Test_F1', 'Test_Accuracy', 'Test_Precision', 'Test_Recall'):
    scores = df_seeds[metric_name]
    lowest, highest = scores.min(), scores.max()
    # Seeds at which the extreme values were obtained.
    lowest_seed = df_seeds.loc[scores.idxmin(), 'Seed']
    highest_seed = df_seeds.loc[scores.idxmax(), 'Seed']

    print(f"\n{metric_name}:")
    print(f"  Mean:   {scores.mean():.4f}")
    print(f"  Std:    {scores.std():.4f}")
    print(f"  Min:    {lowest:.4f}  (seed={lowest_seed})")
    print(f"  Max:    {highest:.4f}  (seed={highest_seed})")
    print(f"  Range:  {highest-lowest:.4f}")

# Compact summary line for reporting.
test_f1_scores = df_seeds['Test_F1']
print("\n" + rule_70)
print(f"\nüìä For paper reporting:")
print(f"   Test F1 = {test_f1_scores.mean():.4f} ¬± {test_f1_scores.std():.4f}")
print(f"   (Best of {len(SEEDS)} runs: {test_f1_scores.max():.4f}, seed={best_seed})")

## 9. Best Model Summary

In [None]:
# Take the summary row of the seed the search loop recorded as best. Using the
# first row of the sorted table could pick a different seed when two seeds tie
# on the (rounded) Test F1, which would contradict the checkpoint and the
# confusion matrix printed below.
best_row = df_seeds_sorted.loc[df_seeds_sorted['Seed'] == best_seed].iloc[0]

print("\n" + "="*70)
print("üèÜ BEST MODEL (selected by Test F1)")
print("="*70)
print(f"  Model:      DeBERTa + ALIGN + Caption BLIP2 + {FUSION_TYPE}")
print(f"  Seed:       {int(best_row['Seed'])}")
print(f"  Dev  F1:    {best_row['Dev_F1']:.4f}")
print(f"  Test F1:    {best_row['Test_F1']:.4f}")
print(f"  Test Acc:   {best_row['Test_Accuracy']:.4f}")
print(f"  Test Prec:  {best_row['Test_Precision']:.4f}")
print(f"  Test Recall:{best_row['Test_Recall']:.4f}")
print(f"  Checkpoint: {best_model_path}")
print("="*70)

# Confusion matrix of best model (rows = actual class, columns = predicted class)
cm = best_test_eval['cm']
print(f"\nConfusion Matrix (Test Set):")
print(f"                  Pred Oppose  Pred Support")
print(f"  Actual Oppose      {cm[0,0]:4d}          {cm[0,1]:4d}   (TN, FP)")
print(f"  Actual Support     {cm[1,0]:4d}          {cm[1,1]:4d}   (FN, TP)")
print(f"\n  TN={cm[0,0]} | FP={cm[0,1]} | FN={cm[1,0]} | TP={cm[1,1]}")

# Compare the best (unrounded) test F1 against the 0.88 target.
if best_test_eval['f1'] >= 0.88:
    print(f"\nüéâ TARGET OF 88% REACHED! ({best_test_eval['f1']:.4f})")
else:
    gap = 0.88 - best_test_eval['f1']
    print(f"\n‚ö†Ô∏è  Best F1: {best_test_eval['f1']:.4f} ‚Äî Still {gap:.4f} below 88% target.")
    print("   Proceeding to Error Analysis to identify improvement opportunities.")

## 10. Visualizations

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
ax_bars, ax_scatter, ax_curve = axes


def _seed_color(seed_value):
    """Green for the best seed, blue for all the others."""
    return '#2ca02c' if seed_value == best_seed else '#1f77b4'


mean_test_f1 = df_seeds['Test_F1'].mean()

# --- Panel 1: Test F1 of every seed, best first ---
sorted_seeds = df_seeds_sorted['Seed']
sorted_test_f1 = df_seeds_sorted['Test_F1']
bars = ax_bars.bar(
    [str(s) for s in sorted_seeds],
    sorted_test_f1,
    color=[_seed_color(s) for s in sorted_seeds],
)
# Reference lines: target, previous best, and the mean over seeds.
ax_bars.axhline(0.88,   color='red',    linestyle='--', linewidth=1.5, label='Target 88%')
ax_bars.axhline(0.8605, color='orange', linestyle=':',  linewidth=1.5, label='Previous best 86.05%')
ax_bars.axhline(mean_test_f1, color='navy', linestyle='-', linewidth=1.5, label=f'Mean {mean_test_f1:.4f}')
# Value label just above each bar.
for bar, val in zip(bars, sorted_test_f1):
    x_center = bar.get_x() + bar.get_width()/2
    ax_bars.text(x_center, bar.get_height() + 0.002,
                 f'{val:.4f}', ha='center', va='bottom', fontsize=7.5)
ax_bars.set_xlabel('Seed')
ax_bars.set_ylabel('Test F1')
ax_bars.set_title('Test F1 by Seed', fontweight='bold')
ax_bars.legend(fontsize=8)
ax_bars.set_ylim(df_seeds['Test_F1'].min() - 0.02, min(1.0, df_seeds['Test_F1'].max() + 0.04))
ax_bars.grid(axis='y', alpha=0.3)

# --- Panel 2: Dev F1 against Test F1, one labelled point per seed ---
ax_scatter.scatter(
    df_seeds['Dev_F1'], df_seeds['Test_F1'],
    c=[_seed_color(s) for s in df_seeds['Seed']],
    s=100, zorder=5,
)
for seed_value, dev_score, test_score in zip(
        df_seeds['Seed'], df_seeds['Dev_F1'], df_seeds['Test_F1']):
    ax_scatter.annotate(str(int(seed_value)), (dev_score, test_score),
                        textcoords='offset points', xytext=(5, 3), fontsize=8)
ax_scatter.set_xlabel('Dev F1')
ax_scatter.set_ylabel('Test F1')
ax_scatter.set_title('Dev F1 vs Test F1', fontweight='bold')
ax_scatter.grid(alpha=0.3)

# --- Panel 3: per-epoch Dev F1 of the best seed ---
best_history = all_histories[best_seed]
peak_dev_f1 = best_history['dev_f1'].max()
ax_curve.plot(best_history['epoch'], best_history['dev_f1'],
              marker='o', markersize=4, color='steelblue', label='Dev F1')
ax_curve.axhline(peak_dev_f1, color='red', linestyle='--', linewidth=1,
                 label=f'Best Dev F1: {peak_dev_f1:.4f}')
ax_curve.set_xlabel('Epoch')
ax_curve.set_ylabel('Dev F1')
ax_curve.set_title(f'Training Curve ‚Äî Best Seed ({best_seed})', fontweight='bold')
ax_curve.legend(fontsize=9)
ax_curve.grid(alpha=0.3)

plt.suptitle('Seed Search: DeBERTa + ALIGN + BLIP2 Caption + proj_concat',
             fontsize=13, fontweight='bold', y=1.02)
plt.tight_layout()

# Save the figure next to the other seed-search outputs, then display it.
plot_path = os.path.join(OUTPUT_DIR, 'seed_search_results.png')
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()
print(f"‚úì Plot saved to: {plot_path}")

## 11. Save Best Model Info for Error Analysis

In [None]:
# Save per-sample predictions of best model for Error Analysis
# Align the predictions with df_test through the sample indices returned by
# evaluate_model instead of relying on the loader having kept dataframe order.
sample_indices = np.asarray(best_test_eval['indices'])
if not np.array_equal(np.sort(sample_indices), np.arange(len(df_test))):
    raise ValueError(
        "Best-model predictions do not cover every test row exactly once; "
        "cannot attach them to df_test.")
row_order = np.argsort(sample_indices, kind='stable')

test_pred_df = df_test.copy()
test_pred_df['pred_label']      = best_test_eval['preds'][row_order]
test_pred_df['prob_oppose']     = best_test_eval['probs'][row_order, 0]
test_pred_df['prob_support']    = best_test_eval['probs'][row_order, 1]
test_pred_df['correct']         = (test_pred_df['pred_label'] == test_pred_df['label']).astype(int)
# Error type with class 1 ("support") as the positive class:
# FP = oppose predicted as support, FN = support predicted as oppose.
test_pred_df['error_type']      = 'correct'
test_pred_df.loc[
    (test_pred_df['label'] == 0) & (test_pred_df['pred_label'] == 1), 'error_type'] = 'FP'
test_pred_df.loc[
    (test_pred_df['label'] == 1) & (test_pred_df['pred_label'] == 0), 'error_type'] = 'FN'

pred_out_path = os.path.join(OUTPUT_DIR, "test_predictions_best_model.csv")
test_pred_df.to_csv(pred_out_path, index=False)

print("\n" + "="*70)
print("FILES SAVED FOR ERROR ANALYSIS")
print("="*70)
print(f"  Best model checkpoint:  {best_model_path}")
print(f"  Per-sample predictions: {pred_out_path}")
print(f"  All seeds results:      {out_path}")

print(f"\nError breakdown (Test Set):")
print(f"  Correct: {(test_pred_df['error_type']=='correct').sum()}")
print(f"  FP (Oppose ‚Üí Support): {(test_pred_df['error_type']=='FP').sum()}")
print(f"  FN (Support ‚Üí Oppose): {(test_pred_df['error_type']=='FN').sum()}")

print("\n" + "="*70)
print("READY FOR ERROR ANALYSIS (Notebook 2)")
print(f"Best seed: {best_seed} | Best Test F1: {best_test_f1:.4f}")
print("="*70)
print("="*70)