# Contextual AI Reranker Evaluation Notebook

## Overview
This notebook demonstrates how to evaluate the Contextual AI reranker using datasets from Hugging Face, with proper metrics calculation including NDCG@10, MAP, and Recall.

### Key Features:
- 🎯 Evaluation on Hugging Face datasets
- 📊 Comprehensive metrics (NDCG@10, MAP, Recall@10)
- ⚡ Fast performance benchmarking
- 🔧 Robust evaluation framework with pytrec_eval


## 1. Setup and Installation


In [None]:
%pip install datasets pytrec_eval contextual-client numpy -q

In [None]:
import pytrec_eval
import numpy as np
from typing import List
from datasets import load_dataset
from contextual import ContextualAI
import time
import os

In [None]:
# Set your API keys here
# Both credentials are read from environment variables so they never have to
# be written into the notebook. os.getenv returns None when a variable is unset.

# Get Hugging Face token (read from the `hf_key` environment variable)
HF_TOKEN = os.getenv("hf_key")

# Get Contextual AI API key
CONTEXTUAL_API_KEY = os.getenv("CONTEXTUAL_API_KEY")

# Initialize Contextual AI client
# The import is repeated here so this cell can be run on its own.
from contextual import ContextualAI
client = ContextualAI(api_key=CONTEXTUAL_API_KEY)

## 2. Select and Load Dataset

The following datasets, adapted for reranking evaluation, are available on Hugging Face:


In [None]:
# Available datasets for evaluation
# Maps a short dataset name to the Hugging Face repository id passed to
# load_dataset below.
AVAILABLE_DATASETS = {
    "touche2020": "ContextualAI/touche2020",
    "msmarco": "ContextualAI/msmarco",
    "treccovid": "ContextualAI/treccovid",
    "nq": "ContextualAI/nq",
    "hotpotqa": "ContextualAI/hotpotqa",
    "fiqa2018": "ContextualAI/fiqa2018"
}

# Select which dataset to use
# Must be one of the keys of AVAILABLE_DATASETS; any other value raises
# KeyError on the lookups below.
DATASET_NAME = "touche2020"  # Change this to use a different dataset

print(f"Selected dataset: {AVAILABLE_DATASETS[DATASET_NAME]}")

# Load the dataset
# HF_TOKEN is forwarded as the access token; later cells read the 'test' split
# of the returned object.
dataset = load_dataset(AVAILABLE_DATASETS[DATASET_NAME], token=HF_TOKEN)
print(f"✅ Loaded {len(dataset['test'])} test examples")

# Show example
# Only the first 100 characters of the query are printed to keep output short.
example = dataset['test'][0]
print(f"\nExample query: {example['query'][:100]}...")
print(f"Number of candidates: {len(example['candidate_docs'])}")


## 3. Define Evaluation Framework


In [None]:
def _fallback_rank_metrics(qrels, results, k=10):
    """Compute NDCG@k, MAP and Recall@k in pure Python from already-scored results.

    Used only when pytrec_eval cannot be run. Gains are the raw relevance
    grades and the discount is log2(rank + 1). Ties are kept in candidate
    order, so values may differ slightly from trec_eval when scores tie.

    Args:
        qrels: {query_id: {doc_id: int relevance}} ground-truth judgments.
        results: {query_id: {doc_id: float score}} reranker scores.
        k: Rank cutoff for NDCG and Recall.

    Returns:
        Dict with keys avg_ndcg_cut_<k>, avg_map and avg_recall_<k>, or an
        empty dict when there are no queries.
    """
    ndcg_values, ap_values, recall_values = [], [], []

    for qid, judgments in qrels.items():
        # Stable sort: highest score first, ties keep candidate order.
        ranked = sorted(results.get(qid, {}).items(), key=lambda item: item[1], reverse=True)
        gains = [max(judgments.get(doc_id, 0), 0) for doc_id, _ in ranked]
        positive_rels = sorted((rel for rel in judgments.values() if rel > 0), reverse=True)
        num_relevant = len(positive_rels)

        if num_relevant == 0:
            # Nothing relevant is known for this query: every metric is 0.
            ndcg_values.append(0.0)
            ap_values.append(0.0)
            recall_values.append(0.0)
            continue

        # Average precision over the full ranking.
        hits = 0
        precision_sum = 0.0
        for rank, gain in enumerate(gains, start=1):
            if gain > 0:
                hits += 1
                precision_sum += hits / rank
        ap_values.append(precision_sum / num_relevant)

        # Recall@k: share of relevant documents found in the top k.
        recall_values.append(sum(1 for gain in gains[:k] if gain > 0) / num_relevant)

        # NDCG@k: DCG of the ranking divided by the DCG of the ideal ranking.
        dcg = sum(gain / np.log2(rank + 1) for rank, gain in enumerate(gains[:k], start=1))
        ideal_dcg = sum(rel / np.log2(rank + 1) for rank, rel in enumerate(positive_rels[:k], start=1))
        ndcg_values.append(dcg / ideal_dcg)

    if not ndcg_values:
        return {}

    return {
        f"avg_ndcg_cut_{k}": np.mean(ndcg_values),
        "avg_map": np.mean(ap_values),
        f"avg_recall_{k}": np.mean(recall_values),
    }


def evaluate_reranker_robust(dataset, reranker_func, eval_strings=None):
    """
    Score every query in a dataset with a reranker and average the ranking metrics.

    Args:
        dataset: Iterable of samples; each must provide "_id", "query",
            "candidate_docs", "candidate_ids", "gt_ids" and "gt_qrels".
        reranker_func: Callable(query, candidate_docs, candidate_ids) returning
            one score per candidate, in candidate order.
        eval_strings: Set of pytrec_eval measure names to request. Defaults to
            NDCG@10, MAP and Recall@10.

    Returns:
        Dict mapping "avg_<metric>" to the mean of that metric over all
        evaluated queries; empty dict when nothing was evaluated. If pytrec_eval
        fails, the same three default metrics are computed in pure Python from
        the scores already collected (the reranker is not called again).
    """
    if eval_strings is None:
        # Cutoffs are requested in dotted form ("recall.10"), matching
        # "ndcg_cut.10"; pytrec_eval reports them back as "recall_10".
        eval_strings = {"ndcg_cut.10", "map", "recall.10"}

    qrels, results = {}, {}

    for sample in dataset:
        qid = str(sample["_id"])
        candidate_ids = sample["candidate_ids"]

        # Get scores from reranker
        candidate_scores = reranker_func(sample["query"], sample["candidate_docs"], candidate_ids)

        # Ground-truth relevance judgments. A query without any judgment gets
        # a placeholder entry so the qrels for that query are never empty.
        judgments = {
            str(t_id): int(_qrel)
            for t_id, _qrel in zip(sample["gt_ids"], sample["gt_qrels"])
        }
        qrels[qid] = judgments or {"dummy_id_for_pytrec_eval": 1}

        # Candidate scores keyed by document id
        results[qid] = {
            str(cid): float(score)
            for cid, score in zip(candidate_ids, candidate_scores)
        }

    # Only the pytrec_eval calls are guarded; this is a deliberate best-effort
    # boundary, so any failure switches to the pure-Python metrics.
    try:
        evaluator = pytrec_eval.RelevanceEvaluator(qrels, eval_strings)
        scores = evaluator.evaluate(results)
    except Exception as e:
        print(f"Error with pytrec_eval: {e}")
        print("Falling back to simple evaluation...")
        return _fallback_rank_metrics(qrels, results)

    if not scores:
        print("No scores returned from pytrec_eval")
        return {}

    # Use the metric names pytrec_eval actually reported for the first query.
    actual_metrics = list(next(iter(scores.values())))
    print(f"Successfully computed metrics: {actual_metrics}")

    return {
        f"avg_{metric}": np.mean([per_query[metric] for per_query in scores.values()])
        for metric in actual_metrics
    }



## 4. Contextual AI Reranker Function


In [None]:
def contextual_ai_reranker(query: str, candidate_docs: List[str], candidate_ids: List[str]) -> List[float]:
    """
    Score candidate documents for a query through the Contextual AI rerank endpoint.

    Args:
        query: The search query
        candidate_docs: Candidate document texts, sent to the API as-is
        candidate_ids: Candidate document IDs (not used for scoring; kept so
            all reranker functions share one signature)

    Returns:
        One score per candidate document, in the same order as candidate_docs.
        Documents missing from the response score 0.0. If the response has no
        'results' entry or the call raises, every document scores 1.0.
    """
    # Optional free-text instruction for the reranker (empty = none)
    instruction = ""

    # Full model by default; switch to the commented line for the mini model
    model = "ctxl-rerank-v2-instruct-multilingual"
    # model = "ctxl-rerank-v2-instruct-multilingual-mini"  # Mini model (faster)

    try:
        api_response = client.rerank.create(
            query=query,
            instruction=instruction,
            documents=candidate_docs,
            model=model
        )
        payload = api_response.to_dict()

        # Unexpected response shape: hand back uniform scores
        if 'results' not in payload:
            return [1.0] * len(candidate_docs)

        # Each result carries the input position ('index') and its
        # 'relevance_score'; collect them so scores can be re-ordered.
        score_by_position = {}
        for entry in payload['results']:
            score_by_position[entry.get('index', 0)] = entry.get('relevance_score', 0.0)

        # Emit scores in the original document order
        return [score_by_position.get(position, 0.0) for position in range(len(candidate_docs))]

    except Exception as e:
        print(f"Error calling Contextual AI reranker: {e}")
        # Best-effort: uniform scores when the API call fails
        return [1.0] * len(candidate_docs)


print("Note: Using 'relevance_score' field for proper score extraction")


## 5. Define Baseline Reranker (for comparison)


In [None]:
# Baseline reranker function
def simple_baseline_reranker_with_scores(query: str, candidate_docs: List[str], candidate_ids: List[str]) -> List[float]:
    """No-op baseline: give every candidate the same score of 1.0.

    The query and document texts are ignored; the output length follows
    candidate_ids.
    """
    uniform_scores = [1.0 for _ in range(len(candidate_ids))]
    return uniform_scores

## 6. Dataset Analysis


In [None]:
def analyze_dataset_speed(dataset):
    """Print size statistics that indicate how much work an evaluation run involves.

    Reports the number of examples, candidates per query (mean/min/max), and
    the average document and query lengths in characters. Nothing is timed
    here; the numbers only describe the input.

    Args:
        dataset: Sized iterable of dict-like examples. 'candidate_docs' and
            'query' are read when present; a missing or empty
            'candidate_docs' counts as zero candidates.

    Returns:
        None. All output goes to stdout.
    """
    print("Dataset Analysis for Speed Verification")
    print("=" * 50)

    total_examples = len(dataset)
    print(f"Total examples: {total_examples}")

    # min()/max() below would raise on an empty dataset, so stop early.
    if total_examples == 0:
        print("\nDataset is empty; no statistics to report.")
        return

    candidate_counts = []
    doc_lengths = []
    query_lengths = []

    for example in dataset:
        docs = example.get('candidate_docs') or []
        candidate_counts.append(len(docs))
        doc_lengths.extend(len(doc) for doc in docs)

        if 'query' in example:
            query_lengths.append(len(example['query']))

    # Report 0 instead of nan when no documents or queries were seen.
    mean_doc_length = np.mean(doc_lengths) if doc_lengths else 0.0
    mean_query_length = np.mean(query_lengths) if query_lengths else 0.0

    print("\nDataset Statistics:")
    print(f"Average candidates per query: {np.mean(candidate_counts):.1f}")
    print(f"Min candidates: {min(candidate_counts)}")
    print(f"Max candidates: {max(candidate_counts)}")
    print(f"Average document length: {mean_doc_length:.0f} characters")
    print(f"Average query length: {mean_query_length:.0f} characters")

# Run analysis
# Uses the 'test' split of the dataset loaded earlier in the notebook.
analyze_dataset_speed(dataset['test'])

## 7. Run Baseline Evaluation


In [None]:
# Test baseline reranker
# The baseline assigns the same score to every candidate, so these numbers
# serve as the reference point for the comparison cell.
print("Testing baseline reranker...")
baseline_robust_results = evaluate_reranker_robust(dataset['test'], simple_baseline_reranker_with_scores)

# Print each averaged metric with four decimal places
print("\nBaseline Results:")
for metric, value in baseline_robust_results.items():
    print(f"  {metric}: {value:.4f}")

## 8. Run Contextual AI Reranker Evaluation


In [None]:
# Test Contextual AI reranker
# The timer wraps the whole evaluation, so it includes one reranker call per
# test example plus the metric computation.
print("Testing Contextual AI reranker...")
start_time = time.time()

contextual_ai_results = evaluate_reranker_robust(dataset['test'], contextual_ai_reranker)

elapsed_time = time.time() - start_time

# Print each averaged metric with four decimal places
print("\nContextual AI Results:")
for metric, value in contextual_ai_results.items():
    print(f"  {metric}: {value:.4f}")

# Wall-clock totals; the per-example figure divides by the number of test examples
print(f"\nProcessing time: {elapsed_time:.1f} seconds ({elapsed_time/60:.1f} minutes)")
print(f"Per example: {elapsed_time/len(dataset['test']):.2f} seconds")

## 9. Results Comparison


In [None]:
# Side-by-side report of the baseline and Contextual AI metrics computed in
# the two evaluation cells above.
print("\n" + "="*50)
print("Comparison:")
print("Baseline Results:")
for metric, value in baseline_robust_results.items():
    print(f"  {metric}: {value:.4f}")

print("\nContextual AI Results:")
for metric, value in contextual_ai_results.items():
    print(f"  {metric}: {value:.4f}")

# Calculate improvement
# Relative change in percent, only for metrics present in both result sets.
print("\nImprovement over baseline:")
for metric, baseline_val in baseline_robust_results.items():
    if metric not in contextual_ai_results:
        continue
    contextual_val = contextual_ai_results[metric]
    if baseline_val > 0:
        improvement = ((contextual_val - baseline_val) / baseline_val) * 100
        print(f"  {metric}: {improvement:+.1f}%")
    else:
        # A relative change is undefined for a zero baseline; printing
        # "+0.0%" would wrongly suggest the two rerankers performed the same.
        print(f"  {metric}: n/a (baseline is 0)")


## 10. Test on Single Example (Debugging)


In [None]:
# Test on a single example to see how the reranker works
# Uses the second test example (index 1), so the split needs at least two rows.
example = dataset['test'][1]

print(f"Query: {example['query']}")
print(f"Number of candidates: {len(example['candidate_docs'])}")

# Get scores from Contextual AI
# Returned scores are in the same order as example['candidate_docs'].
scores = contextual_ai_reranker(
    example['query'],
    example['candidate_docs'],
    example['candidate_ids']
)

# Check if we're getting non-zero scores
# contextual_ai_reranker fills in 0.0 for documents missing from the response
# and 1.0 for every document when the call fails, so an all-zero or all-one
# result points at a response-parsing or API problem.
non_zero_scores = [s for s in scores if s != 0.0]
print(f"\nNon-zero scores: {len(non_zero_scores)} out of {len(scores)}")
print(f"Score range: {min(scores):.4f} to {max(scores):.4f}")

# Show top 5 documents by score
# Pair each id with its score and text, then sort by score, highest first.
doc_scores = list(zip(example['candidate_ids'], scores, example['candidate_docs']))
doc_scores.sort(key=lambda x: x[1], reverse=True)

# Only the first 200 characters of each document are printed
print("\nTop 5 documents by relevance score:")
for i, (doc_id, score, text) in enumerate(doc_scores[:5]):
    print(f"\n{i+1}. Score: {score:.4f}")
    print(f"   ID: {doc_id}")
    print(f"   Text: {text[:200]}...")
