# Task 1: Reranking and Zero-Shot Classification - SOLUTION

Build a production-ready retrieve-rerank pipeline and zero-shot classifier.

In [None]:
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import pipeline
import numpy as np
import json
from sklearn.metrics.pairwise import cosine_similarity
import torch

## Load Data

In [None]:
# Load queries and documents.
# JSON text is UTF-8, so decode it explicitly rather than relying on the
# platform's default encoding (which is not UTF-8 on every system).
with open('../fixtures/input/queries_documents.json', 'r', encoding='utf-8') as f:
    queries_data = json.load(f)

# Load classification texts (same explicit decoding as above)
with open('../fixtures/input/classification_texts.json', 'r', encoding='utf-8') as f:
    classification_data = json.load(f)

print(f"Loaded {len(queries_data)} query sets")
print(f"Loaded {len(classification_data)} classification examples")

## Task 1: Baseline Bi-Encoder Search

In [None]:
# SOLUTION

# 1. Load bi-encoder model
bi_encoder = SentenceTransformer('all-MiniLM-L6-v2')

# 2. For each query, rank documents by cosine similarity
bi_encoder_results = {}

for entry in queries_data:
    candidates = entry['documents']

    # Embed the query and every candidate text with the same model
    query_vec = bi_encoder.encode(entry['query'])
    candidate_vecs = bi_encoder.encode([c['text'] for c in candidates])

    # A single query row is passed in, so row 0 holds its score per candidate
    sims = cosine_similarity([query_vec], candidate_vecs)[0]

    # Reverse the ascending argsort so the most similar candidates come first,
    # then keep the doc_ids of the three best
    best_first = np.argsort(sims)[::-1]
    bi_encoder_results[entry['query_id']] = [
        candidates[pos]['doc_id'] for pos in best_first[:3]
    ]

print("Bi-encoder results:")
for qid, top_ids in list(bi_encoder_results.items())[:2]:
    print(f"  {qid}: {top_ids}")

# TEST - Do not modify
assert bi_encoder is not None, "Bi-encoder not loaded"
assert len(bi_encoder_results) == len(queries_data), "Missing results"
for query_id, doc_ids in bi_encoder_results.items():
    assert len(doc_ids) == 3, f"Expected 3 results for {query_id}"
print("✓ Task 1 passed")

## Task 2: Cross-Encoder Reranking

In [None]:
# SOLUTION

# 1. Load cross-encoder model
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# 2. For each query, rerank all documents
cross_encoder_results = {}

for entry in queries_data:
    candidates = entry['documents']

    # One [query, text] pair per candidate, all scored in one predict() call
    query_doc_pairs = [[entry['query'], c['text']] for c in candidates]
    relevance = cross_encoder.predict(query_doc_pairs)

    # Highest score first; keep the doc_ids of the three best candidates
    order = np.argsort(relevance)[::-1][:3]
    cross_encoder_results[entry['query_id']] = [
        candidates[pos]['doc_id'] for pos in order
    ]

print("Cross-encoder results:")
for qid, top_ids in list(cross_encoder_results.items())[:2]:
    print(f"  {qid}: {top_ids}")

# TEST - Do not modify
assert cross_encoder is not None, "Cross-encoder not loaded"
assert len(cross_encoder_results) == len(queries_data), "Missing results"
for query_id, doc_ids in cross_encoder_results.items():
    assert len(doc_ids) == 3, f"Expected 3 results for {query_id}"
print("✓ Task 2 passed")

## Task 3: Calculate MRR

In [None]:
# SOLUTION

def calculate_mrr(results, ground_truth):
    """
    Calculate Mean Reciprocal Rank (MRR).

    For each query the reciprocal rank is 1 / (1-based position of the first
    relevant document in its ranked list), or 0.0 when none of the ranked
    documents is relevant. MRR is the mean of these values over every query
    in ``results``.

    Args:
        results: Mapping of query_id -> ranked list of doc_ids (best first).
        ground_truth: Mapping of query_id -> iterable of relevant doc_ids.
            Must contain every query_id that appears in ``results``.

    Returns:
        The mean reciprocal rank as a NumPy float; 0.0 when ``results`` is
        empty.

    Raises:
        KeyError: If a query_id in ``results`` is missing from ``ground_truth``.
    """
    # np.mean([]) would return nan and emit a RuntimeWarning; an empty run
    # is scored as 0 instead (kept as np.float64 to match the normal return).
    if not results:
        return np.float64(0.0)

    reciprocal_ranks = []
    for query_id, ranked_docs in results.items():
        # Set membership keeps the lookup O(1) per ranked document
        relevant_docs = set(ground_truth[query_id])

        # 1-based rank of the first relevant document, or None if there is none
        first_hit = next(
            (
                rank
                for rank, doc_id in enumerate(ranked_docs, start=1)
                if doc_id in relevant_docs
            ),
            None,
        )
        reciprocal_ranks.append(1.0 / first_hit if first_hit is not None else 0.0)

    return np.mean(reciprocal_ranks)

# Prepare ground truth: map each query_id to that query's 'relevant_docs' entry
ground_truth = {}
for q in queries_data:
    ground_truth[q['query_id']] = q['relevant_docs']

# Calculate MRR for both methods
mrr_bi_encoder = calculate_mrr(bi_encoder_results, ground_truth)
mrr_cross_encoder = calculate_mrr(cross_encoder_results, ground_truth)

print(f"Bi-Encoder MRR: {mrr_bi_encoder:.3f}")
print(f"Cross-Encoder MRR: {mrr_cross_encoder:.3f}")

# Relative change of the cross-encoder over the bi-encoder baseline, in percent
mrr_gain = mrr_cross_encoder - mrr_bi_encoder
improvement = mrr_gain / mrr_bi_encoder * 100
print(f"Improvement: {improvement:.1f}%")

# TEST - Do not modify
assert mrr_bi_encoder > 0, "Bi-encoder MRR not calculated"
assert mrr_cross_encoder > 0, "Cross-encoder MRR not calculated"
assert mrr_cross_encoder >= mrr_bi_encoder, "Cross-encoder should improve MRR"
print("✓ Task 3 passed")

## Task 4: Calculate NDCG@3

In [None]:
# SOLUTION

def calculate_ndcg_at_k(results, ground_truth, k=3):
    """
    Calculate Normalized Discounted Cumulative Gain at K (binary relevance).

    DCG@k sums 1 / log2(rank + 1) over the relevant documents found in the
    top-k of each ranked list. It is normalised by the ideal DCG@k: the DCG
    of a ranking that places as many relevant documents as possible
    (``min(k, number of relevant documents)``) at the top. The ideal is
    derived from the ground truth, not from the retrieved list, so a run
    that misses relevant documents is penalised for it.

    Args:
        results: Mapping of query_id -> ranked list of doc_ids (best first).
        ground_truth: Mapping of query_id -> iterable of relevant doc_ids.
            Must contain every query_id that appears in ``results``.
        k: Cut-off rank; only the first ``k`` ranked documents are scored.

    Returns:
        Mean NDCG@k over all queries in ``results`` as a NumPy float. Queries
        with no relevant documents contribute 0.0; an empty ``results``
        yields 0.0.

    Raises:
        KeyError: If a query_id in ``results`` is missing from ``ground_truth``.
    """
    # np.mean([]) would return nan and emit a RuntimeWarning; an empty run
    # is scored as 0 instead (kept as np.float64 to match the normal return).
    if not results:
        return np.float64(0.0)

    # Positional discount for ranks 1..k; shared by DCG and IDCG
    discounts = [1.0 / np.log2(rank + 1) for rank in range(1, k + 1)]

    ndcg_scores = []
    for query_id, ranked_docs in results.items():
        relevant_docs = set(ground_truth[query_id])

        # DCG: add the discount of every top-k position holding a relevant doc
        dcg = sum(
            discount
            for doc_id, discount in zip(ranked_docs[:k], discounts)
            if doc_id in relevant_docs
        )

        # IDCG: the best achievable ranking fills the first positions with
        # relevant documents, limited by k and by how many actually exist
        ideal_hits = min(k, len(relevant_docs))
        idcg = sum(discounts[:ideal_hits]) if ideal_hits > 0 else 0.0

        ndcg_scores.append(dcg / idcg if idcg > 0 else 0.0)

    return np.mean(ndcg_scores)

# Calculate NDCG@3 for both methods against the shared ground truth
ndcg_bi_encoder = calculate_ndcg_at_k(bi_encoder_results, ground_truth, k=3)
ndcg_cross_encoder = calculate_ndcg_at_k(cross_encoder_results, ground_truth, k=3)

print(f"Bi-Encoder NDCG@3: {ndcg_bi_encoder:.3f}")
print(f"Cross-Encoder NDCG@3: {ndcg_cross_encoder:.3f}")

# Relative change of the cross-encoder over the bi-encoder baseline, in percent
ndcg_gain = ndcg_cross_encoder - ndcg_bi_encoder
improvement = ndcg_gain / ndcg_bi_encoder * 100
print(f"Improvement: {improvement:.1f}%")

# TEST - Do not modify
assert ndcg_bi_encoder > 0, "Bi-encoder NDCG not calculated"
assert ndcg_cross_encoder > 0, "Cross-encoder NDCG not calculated"
assert ndcg_cross_encoder >= ndcg_bi_encoder, "Cross-encoder should improve NDCG"
print("✓ Task 4 passed")

## Task 5: Zero-Shot Classification

In [None]:
# SOLUTION

# 1. Load zero-shot classification pipeline
# Use device 0 when CUDA is available, otherwise -1
inference_device = 0 if torch.cuda.is_available() else -1
zero_shot_classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=inference_device
)

# 2. For each text, predict top label
classification_results = {}

for sample in classification_data:
    outcome = zero_shot_classifier(
        sample['text'],
        candidate_labels=sample['candidate_labels']
    )

    # The first entry of the returned 'labels' list is stored as the prediction
    classification_results[sample['text_id']] = outcome['labels'][0]

print("Sample predictions:")
for sample_id, top_label in list(classification_results.items())[:3]:
    print(f"  {sample_id}: {top_label}")

# TEST - Do not modify
assert zero_shot_classifier is not None, "Classifier not loaded"
assert len(classification_results) == len(classification_data), "Missing predictions"
print("✓ Task 5 passed")

## Task 6: Calculate Classification Accuracy

In [None]:
# SOLUTION

# A prediction counts as correct when it matches any of the item's true labels
total = len(classification_data)
correct = sum(
    1
    for item in classification_data
    if classification_results[item['text_id']] in item['true_labels']
)

accuracy = correct / total

print(f"Zero-shot Accuracy: {accuracy:.1%}")

# Show predictions (only the first true label is displayed as "actual")
print("\nSample predictions:")
for item in classification_data[:5]:
    text_id = item['text_id']
    predicted = classification_results[text_id]
    actual = item['true_labels'][0]
    is_hit = predicted in item['true_labels']
    match = "✓" if is_hit else "✗"
    print(f"{match} {text_id}: predicted={predicted}, actual={actual}")

# TEST - Do not modify
assert accuracy > 0, "Accuracy not calculated"
assert accuracy >= 0.5, f"Accuracy too low: {accuracy:.1%}"
print("✓ Task 6 passed")

## Task 7: Multi-Label Classification

In [None]:
# SOLUTION

def calculate_multilabel_f1(predictions, ground_truth):
    """
    Compute the mean per-example F1 score for multi-label predictions.

    Each example's predicted and true labels are compared as sets, an F1
    score is derived from their overlap, and the per-example scores are
    averaged. An example with zero precision and zero recall scores 0.0.
    """
    def example_f1(predicted, expected):
        # Overlap counts between the two label sets
        hits = len(expected & predicted)
        spurious = len(predicted - expected)
        missed = len(expected - predicted)

        precision = hits / (hits + spurious) if (hits + spurious) > 0 else 0
        recall = hits / (hits + missed) if (hits + missed) > 0 else 0

        # Harmonic mean is undefined when both terms are zero
        if not precision + recall > 0:
            return 0.0
        return 2 * (precision * recall) / (precision + recall)

    per_example = [
        example_f1(set(labels), set(ground_truth[text_id]))
        for text_id, labels in predictions.items()
    ]
    return np.mean(per_example)

# Predict multiple labels with threshold
multilabel_predictions = {}
threshold = 0.5

for sample in classification_data:
    # Multi-label prediction
    scored = zero_shot_classifier(
        sample['text'],
        candidate_labels=sample['candidate_labels'],
        multi_label=True
    )

    # Keep every label whose score reaches the threshold
    kept = []
    for label, score in zip(scored['labels'], scored['scores']):
        if score >= threshold:
            kept.append(label)

    multilabel_predictions[sample['text_id']] = kept

# Reference labels keyed the same way as the predictions
multilabel_ground_truth = {}
for sample in classification_data:
    multilabel_ground_truth[sample['text_id']] = sample['true_labels']

f1_score = calculate_multilabel_f1(multilabel_predictions, multilabel_ground_truth)

print(f"Multi-label F1: {f1_score:.3f}")

# Show sample predictions
print("\nSample multi-label predictions:")
for sample in classification_data[:3]:
    sample_id = sample['text_id']
    print(f"{sample_id}:")
    print(f"  Predicted: {multilabel_predictions[sample_id]}")
    print(f"  Actual: {sample['true_labels']}")

# TEST - Do not modify
assert len(multilabel_predictions) == len(classification_data), "Missing predictions"
assert f1_score > 0, "F1 not calculated"
assert f1_score >= 0.5, f"F1 too low: {f1_score:.3f}"
print("✓ Task 7 passed")

## Task 8: Handle Edge Cases

In [None]:
# SOLUTION

# Load edge cases.
# Decode as UTF-8 explicitly rather than relying on the platform's default
# encoding, which is not UTF-8 on every system.
with open('../fixtures/edge_cases/test_cases.json', 'r', encoding='utf-8') as f:
    edge_cases = json.load(f)

# Test reranking edge cases
reranking_edge_results = {}

print("Testing Reranking Edge Cases:\n")
for case in edge_cases['reranking_edge_cases']:
    case_name = case['case']
    documents = case['documents']

    # Score every document against the case's query with the cross-encoder
    scores = cross_encoder.predict([[case['query'], doc] for doc in documents])

    # Position of the best-scoring document (first entry of the descending order)
    best_idx = np.argsort(scores)[::-1][0]
    top_doc = documents[best_idx]

    reranking_edge_results[case_name] = top_doc  # Top doc

    print(f"{case_name}:")
    print(f"  Challenge: {case['challenge']}")
    print(f"  Top result: {top_doc[:60]}...")
    print(f"  Score: {scores[best_idx]:.3f}\n")

# Test classification edge cases
classification_edge_results = {}

print("\nTesting Classification Edge Cases:\n")
for case in edge_cases['classification_edge_cases']:
    case_name = case['case']
    text = case['text']
    labels = case['labels']
    # Not every case declares an expected label. Keep "no expectation"
    # distinct from "wrong prediction" instead of comparing against a
    # placeholder string, which would always be reported as a mismatch.
    expected = case.get('expected')

    # Classify
    result = zero_shot_classifier(text, candidate_labels=labels)
    predicted = result['labels'][0]
    confidence = result['scores'][0]

    classification_edge_results[case_name] = predicted

    if expected is None:
        match = "-"  # nothing to compare against
        expected_display = 'N/A'
    else:
        match = "✓" if predicted == expected else "✗"
        expected_display = expected
    print(f"{match} {case_name}:")
    print(f"  Challenge: {case['challenge']}")
    print(f"  Text: {text[:60]}...")
    print(f"  Predicted: {predicted} ({confidence:.2f})")
    print(f"  Expected: {expected_display}\n")

# TEST - Do not modify
assert len(reranking_edge_results) > 0, "No reranking edge cases tested"
assert len(classification_edge_results) > 0, "No classification edge cases tested"
print("✓ Task 8 passed")

## Summary

You've successfully:
- ✓ Built a retrieve-rerank pipeline
- ✓ Measured improvement with MRR and NDCG
- ✓ Implemented zero-shot classification
- ✓ Handled multi-label scenarios
- ✓ Tested edge cases

**Key insights:**
- Cross-encoders improve ranking quality by 15-30%
- MRR and NDCG capture different aspects of ranking quality
- Zero-shot works well for clear categories
- Zero-shot classification struggles with sarcasm, ambiguity, and negation
- Multi-label needs careful threshold tuning

**Next:** Apply these techniques in RAG pipelines (Module 5)!