# Information Retrieval Evaluation Pipeline
This notebook provides a template for evaluating query reformulation techniques using PyTerrier.
Pipeline stages: Dataset Loading → Preprocessing → Query Reformulation → Retrieval → Evaluation



## 1. Dataset Loading
This can be switched with another dataset, possibly requiring conversion to this format.


In [8]:
import pandas as pd
import pyterrier as pt
from pathlib import Path

class DatasetComponents:
    """Container for dataset components that must be provided"""
    def __init__(self, corpus_iter, queries_df, qrels_df):
        self.corpus_iter = corpus_iter  # Iterator yielding {'docno': str, 'text': str}
        self.queries_df = queries_df    # DataFrame with columns ['qid', 'query']
        self.qrels_df = qrels_df        # DataFrame with columns ['qid', 'docno', 'label']

def load_pt_dataset() -> DatasetComponents:
    """Load dataset using PyTerrier's built-in loader"""
    dataset = pt.get_dataset("irds:antique/test/non-offensive")

    return DatasetComponents(
        corpus_iter=dataset.get_corpus_iter(),
        queries_df=dataset.get_topics(),
        qrels_df=dataset.get_qrels()
    )

# Load the dataset
data = load_pt_dataset()


antique/test/non-offensive documents:   0%|          | 0/403666 [00:00<?, ?it/s][A

## 2. Preprocessing Pipeline
Currently, this does no preprocessing.

In [9]:
def preprocess_text(text: str) -> str:
    """Placeholder for text preprocessing logic"""
    return text

def preprocess_corpus(corpus_iter):
    """Generator that applies preprocessing to each document"""
    for doc in corpus_iter:
        yield {
            'docno': doc['docno'],
            'text': preprocess_text(doc['text'])
        }

def preprocess_queries(queries_df):
    """Apply preprocessing to queries dataframe"""
    queries_df = queries_df.copy()
    queries_df['query'] = queries_df['query'].apply(preprocess_text)
    return queries_df

# Apply preprocessing while maintaining iterator
preprocessed_corpus = preprocess_corpus(data.corpus_iter)
preprocessed_queries = preprocess_queries(data.queries_df)

antique/test/non-offensive documents:   0%|          | 0/403666 [16:42<?, ?it/s]


## 3. Query Reformulation

In [10]:
def reformulate_queries(queries_df):
    """Placeholder for query reformulation techniques"""
    modified_queries = queries_df.copy()
    # Placeholder for actual reformulation
    return modified_queries

reformulated_queries = reformulate_queries(preprocessed_queries)

## 4. Indexing Pipeline

In [11]:
index_path = Path.cwd() / "index"
index_ref = None

# Check if valid index exists
if (index_path / "data.properties").exists():
    try:
        index_ref = pt.IndexFactory.of(str(index_path))
        print(f"Loaded existing index from {index_path}")

        # Verify index contains documents
        if index_ref.getCollectionStatistics().getNumberOfDocuments() == 0:
            raise ValueError("Empty index - will rebuild")

    except Exception as e:
        print(f"Index loading failed ({str(e)}), rebuilding...")
        import shutil
        shutil.rmtree(index_path)
        index_ref = None

# Build new index if needed
if index_ref is None:
    print("Building new index...")
    index_ref = pt.index.IterDictIndexer(
        str(index_path),
        meta={"docno": 32, "text": 131072},
        type=pt.index.IndexingType.CLASSIC
    ).index(preprocessed_corpus)
    print(f"Built new index at {index_path}")

# Verify index
stats = index_ref.getCollectionStatistics()
print(f"Index contains {stats.getNumberOfDocuments()} documents")

Loaded existing index from C:\Users\thein\OneDrive\Documents\InformationRetrieval\llm-query-rewriting\index
Index contains 403666 documents


## 5. Retrieval Setup
Currently, using BM25 for retrieval.

In [12]:
bm25 = pt.BatchRetrieve(
    index_ref,
    wmodel="BM25",
    metadata=["docno", "text"],
    properties={"termpipelines": ""},
    controls={"qe": "off"}
)

  bm25 = pt.BatchRetrieve(


## 6. Evaluation Pipeline

In [13]:
# Run evaluation experiment
# metrics: https://pyterrier.readthedocs.io/en/latest/experiments.html#available-evaluation-measures
eval_metrics = ["map", "ndcg_cut_10", "P_10", "recall_100", "recip_rank"]

results = pt.Experiment(
    [bm25],
    reformulated_queries,
    data.qrels_df,
    eval_metrics,
    names=["BM25 Baseline"],
    baseline=0
)

results

Unnamed: 0,name,map,recip_rank,P_10,recall_100,ndcg_cut_10,map +,map -,map p-value,recip_rank +,...,recip_rank p-value,P_10 +,P_10 -,P_10 p-value,recall_100 +,recall_100 -,recall_100 p-value,ndcg_cut_10 +,ndcg_cut_10 -,ndcg_cut_10 p-value
0,BM25 Baseline,0.457005,0.938873,0.753977,0.657961,0.516219,,,,,...,,,,,,,,,,
