# Final Evaluation: TEST Set Only

**⚠️ IMPORTANT**: This notebook evaluates on the **TEST set**, which was held out at the beginning and never used for training or model selection.

**Steps:**
1. Load best model/classifier combination (based on Dev set results)
2. Extract features for TEST set (if not already done)
3. Evaluate on TEST set
4. Generate final reports and plots

**Note:** TEST set is ONLY used here, never before!


In [None]:
# Setup (run ALL previous notebooks first)
# Imports the project modules, builds the StorageManager and loads the held-out
# TEST split. The names defined here (BASE_PATH, DATA_PATH, storage, test_ds)
# are used by the cells below.
import sys
from pathlib import Path
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

# NOTE(review): the /content/... locations look like a Google Colab runtime with
# Drive mounted — confirm before running in another environment.
BASE_PATH = Path('/content/semeval-context-tree-modular')
DATA_PATH = Path('/content/drive/MyDrive/semeval_data')
# Must run before the `src.*` imports below: it puts the repository root on the
# module search path so that the `src` package can be resolved.
sys.path.insert(0, str(BASE_PATH))

from src.storage.manager import StorageManager
from src.features.extraction import featurize_hf_dataset_in_batches_v2
from src.models.classifiers import get_classifier_dict
from src.evaluation.metrics import compute_all_metrics, print_classification_report
from src.evaluation.tables import print_results_table
from src.evaluation.visualizer import visualize_all_evaluation

# The repository root is passed as both `base_path` and `github_path`;
# DATA_PATH is passed as `data_path`.
storage = StorageManager(
    base_path=str(BASE_PATH),
    data_path=str(DATA_PATH),
    github_path=str(BASE_PATH)
)

# Load TEST split (ONLY used here!)
test_ds = storage.load_split('test')

print("✅ Setup complete!")
print(f"⚠️  TEST set: {len(test_ds)} samples (ONLY used for final evaluation!)")


In [None]:
# Evaluation configuration
# TODO: Narrow this down to the best model/classifier chosen on the Dev set.
# Until then, every combination is evaluated for comparison.

MODELS = [
    'bert',
    'roberta',
    'deberta',
    'xlnet',
]
TASKS = [
    'clarity',
    'evasion',
]

# Class names for each task
CLARITY_LABELS = [
    'Clear Reply',
    'Ambiguous',
    'Clear Non-Reply',
]
EVASION_LABELS = [
    'Direct Answer',
    'Partial Answer',
    'Implicit Answer',
    'Uncertainty',
    'Refusal',
    'Clarification',
    'Question',
    'Topic Shift',
    'Other',
]

# Classifier name -> classifier instance, built with a fixed random_state
classifiers = get_classifier_dict(random_state=42)

# Use the GPU when CUDA is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print("✅ Will evaluate on TEST set for all model/classifier combinations")


In [None]:
# Extract TEST features if not already done
# (This should have been done in 02_feature_extraction_separate.ipynb, but check first)
#
# The extraction call below does not take the task as an input, so the encoder
# is loaded and run at most ONCE per model; the resulting matrix is then saved
# under every task whose TEST features were missing.

MODEL_CONFIGS = {
    'bert': 'bert-base-uncased',
    'roberta': 'roberta-base',
    'deberta': 'microsoft/deberta-v3-base',
    'xlnet': 'xlnet-base-cased'
}

for model_key, model_name in MODEL_CONFIGS.items():
    # Work out which tasks still lack cached TEST features for this model.
    missing_tasks = []
    for task in TASKS:
        try:
            # Only used as an existence check; the loaded matrix is discarded.
            storage.load_features(model_key, task, 'test')
        except FileNotFoundError:
            print(f"⚠️  TEST features not found for {model_key}_{task}, extracting...")
            missing_tasks.append(task)
        else:
            print(f"✅ TEST features already exist: {model_key}_{task}")

    if not missing_tasks:
        # Everything is cached: skip loading the encoder entirely.
        continue

    # Load model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.to(device)
    model.eval()

    try:
        # Extract features once for this model
        X_test, feature_names, _ = featurize_hf_dataset_in_batches_v2(
            test_ds,
            tokenizer,
            model,
            device,
            batch_size=8,
            max_sequence_length=256,
            question_key='question',
            answer_key='answer',
            show_progress=True
        )

        # Save the same feature matrix for every task that was missing it
        for task in missing_tasks:
            storage.save_features(
                X_test, model_key, task, 'test', feature_names
            )
            print(f"    ✅ Saved: {X_test.shape[0]} samples, {X_test.shape[1]} features")
    finally:
        # Free memory even when extraction or saving fails, so the next
        # encoder does not have to share the device with this one.
        del model, tokenizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

print("\n✅ TEST feature extraction complete!")


In [None]:
# Final evaluation on TEST set
# For every model/task pair, each classifier is refit on train+dev and then
# scored on TEST; predictions, probabilities, plots and metrics are saved.
# NOTE(review): every model/classifier combination is scored on TEST here. The
# combination that gets reported should already be fixed from Dev-set results.

PLOTS_DIR = str(DATA_PATH / 'plots' / 'final_evaluation')


def _collect_labels(ds, label_key):
    """Return the `label_key` value of every row of `ds` as a NumPy array.

    Rows are read by integer index and then by field name.
    """
    return np.array([ds[i][label_key] for i in range(len(ds))])


# The train/dev splits and all label vectors do not depend on the embedding
# model, so they are loaded once here instead of once per model/task pair.
train_ds = storage.load_split('train')
dev_ds = storage.load_split('dev')

task_data = {}
for task in TASKS:
    if task == 'clarity':
        label_list, label_key = CLARITY_LABELS, 'clarity_label'
    else:  # evasion
        label_list, label_key = EVASION_LABELS, 'evasion_label'
    task_data[task] = (
        label_list,
        _collect_labels(test_ds, label_key),
        _collect_labels(train_ds, label_key),
        _collect_labels(dev_ds, label_key),
    )

final_results = {}

for model in MODELS:
    print(f"\n{'='*80}")
    print(f"MODEL: {model.upper()} - FINAL EVALUATION ON TEST SET")
    print(f"{'='*80}")

    final_results[model] = {}

    for task in TASKS:
        print(f"\n{'='*60}")
        print(f"TASK: {task.upper()}")
        print(f"{'='*60}")

        # Label names and label vectors prepared above for this task
        label_list, y_test, y_train, y_dev = task_data[task]

        # Load cached features for the three splits
        X_test = storage.load_features(model, task, 'test')
        X_train = storage.load_features(model, task, 'train')
        X_dev = storage.load_features(model, task, 'dev')

        # Combine train + dev for final training
        X_train_full = np.vstack([X_train, X_dev])
        y_train_full = np.concatenate([y_train, y_dev])

        # Cached features that no longer match the current splits would
        # otherwise surface later as a confusing error inside fit/metrics.
        if X_train_full.shape[0] != len(y_train_full) or X_test.shape[0] != len(y_test):
            raise ValueError(
                f"Feature/label row mismatch for {model}/{task}: "
                f"train+dev {X_train_full.shape[0]} vs {len(y_train_full)}, "
                f"test {X_test.shape[0]} vs {len(y_test)}"
            )

        print(f"  Training on: {X_train_full.shape[0]} samples (train+dev)")
        print(f"  Testing on: {X_test.shape[0]} samples (TEST)")

        # Train all classifiers on full train+dev
        task_results = {}

        # NOTE(review): the same classifier instances are refit for every
        # model/task pair — this assumes fit() discards any earlier fit.
        for classifier_name, clf in classifiers.items():
            print(f"\n  Training {classifier_name}...")

            # Train on full train+dev
            clf.fit(X_train_full, y_train_full)

            # Predict on TEST
            y_test_pred = clf.predict(X_test)

            # Probabilities are optional: classifiers without predict_proba
            # are still evaluated, but get no plots or saved probabilities.
            try:
                y_test_proba = clf.predict_proba(X_test)
            except AttributeError:
                y_test_proba = None

            # Compute metrics
            metrics = compute_all_metrics(y_test, y_test_pred, label_list,
                                         task_name=f"TEST_{model}_{task}_{classifier_name}")

            # Print classification report
            print_classification_report(
                y_test, y_test_pred, label_list,
                task_name=f"TEST - {model} - {task} - {classifier_name}"
            )

            # Visualize
            if y_test_proba is not None:
                visualize_all_evaluation(
                    y_test, y_test_pred, y_test_proba, label_list,
                    task_name=f"TEST_{model}_{task}",
                    classifier_name=classifier_name,
                    save_dir=PLOTS_DIR
                )

            task_results[classifier_name] = {
                'metrics': metrics,
                'predictions': y_test_pred,
                'probabilities': y_test_proba
            }

            # Save TEST predictions and probabilities
            storage.save_predictions(
                y_test_pred, model, classifier_name, task, 'test'
            )
            if y_test_proba is not None:
                storage.save_probabilities(
                    y_test_proba, model, classifier_name, task, 'test'
                )

        # Metrics-only view shared by the printed table and the saved record
        metrics_by_classifier = {
            name: {'metrics': res['metrics']}
            for name, res in task_results.items()
        }

        # Print results table
        print_results_table(
            metrics_by_classifier,
            task_name=f"TEST - {model} - {task}",
            sort_by="Macro F1"
        )

        final_results[model][task] = task_results

        # Save final results
        experiment_id = f"FINAL_TEST_{model}_{task}"
        storage.save_results({
            'split': 'test',
            'model': model,
            'task': task,
            'n_test': len(y_test),
            'results': metrics_by_classifier
        }, experiment_id)

print(f"\n{'='*80}")
print("✅ FINAL EVALUATION ON TEST SET COMPLETE!")
print(f"{'='*80}")
