# Information Retrieval Evaluation Pipeline
This notebook provides a template for evaluating query reformulation techniques using PyTerrier.
Pipeline stages: Dataset Loading → Preprocessing → Query Reformulation → Retrieval → Evaluation



## 1. Dataset Loading
This can be switched with another dataset, possibly requiring conversion to this format.


In [2]:
import pandas as pd
import pyterrier as pt
from pathlib import Path
from datasets import load_dataset

class DatasetComponents:
    """Container for dataset components that must be provided"""
    def __init__(self, corpus_iter, queries_df, qrels_df):
        self.corpus_iter = corpus_iter  # Iterator yielding {'docno': str, 'text': str}
        self.queries_df = queries_df    # DataFrame with columns ['qid', 'query']
        self.qrels_df = qrels_df        # DataFrame with columns ['qid', 'docno', 'label']

def load_pt_dataset():
    """Load codec dataset"""
    docs = load_dataset("macavaney/codec")["train"]
    qrels = load_dataset('irds/codec', 'qrels', trust_remote_code=True)
    queries = load_dataset('irds/codec', 'queries', trust_remote_code=True)

    # Convert dataset to correct format
    corpus_iter = ({'docno': str(doc['id']), 'text': doc['contents']} for doc in docs)

    queries_df = pd.DataFrame(queries)[['query_id', 'query']]
    queries_df.columns = ['qid', 'query']

    qrels_df = pd.DataFrame(qrels)[['query_id', 'doc_id', 'relevance']]
    qrels_df.columns = ['qid', 'docno', 'label']

    return DatasetComponents(corpus_iter, queries_df, qrels_df)

# Load the dataset
data = load_pt_dataset()

[INFO] [starting] https://raw.githubusercontent.com/grill-lab/CODEC/main/raw_judgments/raw_document_judgments.txt
                                                        
[A                                                                                                                             [INFO] [finished] https://raw.githubusercontent.com/grill-lab/CODEC/main/raw_judgments/raw_document_judgments.txt: [00:00] [307kB] [11.2MB/s]
Generating qrels split: 6186 examples [00:00, 7490.68 examples/s]
[INFO] [starting] https://raw.githubusercontent.com/grill-lab/CODEC/main/topics/topics.json
                                                          
[A                                                                                                        [INFO] [finished] https://raw.githubusercontent.com/grill-lab/CODEC/main/topics/topics.json: [00:00] [47.2kB] [18.8MB/s]
Generating queries split: 42 examples [00:00, 127.26 examples/s]


## 2. Preprocessing Pipeline
Currently, this does no preprocessing.

In [3]:
if not pt.java.started():
    pt.java.init()

tokeniser = pt.java.autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser()
def strip_markup(text):
    return " ".join(tokeniser.getTokens(text))


def preprocess_text(text: str) -> str:
    """Placeholder for text preprocessing logic"""
    return text

def preprocess_corpus(corpus_iter):
    """Generator that applies preprocessing to each document"""
    for doc in corpus_iter:
        yield {
            'docno': doc['docno'],
            'text': preprocess_text(doc['text'])
        }

def preprocess_queries(queries_df):
    """Apply preprocessing to queries dataframe"""
    queries_df = queries_df.copy()
    queries_df['query'] = queries_df['query'].apply(strip_markup)
    return queries_df

# Apply preprocessing while maintaining iterator
preprocessed_corpus = preprocess_corpus(data.corpus_iter)
preprocessed_queries = preprocess_queries(data.queries_df)

Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


## 3. Indexing Pipeline

In [4]:
index_path = Path.cwd() / "index"
index_ref = None

# Check if valid index exists
if (index_path / "data.properties").exists():
    try:
        index_ref = pt.IndexFactory.of(str(index_path))
        print(f"Loaded existing index from {index_path}")

        # Verify index contains documents
        if index_ref.getCollectionStatistics().getNumberOfDocuments() == 0:
            raise ValueError("Empty index - will rebuild")

    except Exception as e:
        print(f"Index loading failed ({str(e)}), rebuilding...")
        import shutil
        shutil.rmtree(index_path)
        index_ref = None

# Build new index if needed
if index_ref is None:
    print("Building new index...")
    index_ref = pt.index.IterDictIndexer(
        str(index_path),
        meta={"docno": 32, "text": 131072},
        type=pt.index.IndexingType.CLASSIC
    ).index(preprocessed_corpus)
    print(f"Built new index at {index_path}")

    print(index_ref.getCollectionStatistics())
    


Building new index...
21:45:23.045 [main] ERROR org.terrier.structures.indexing.Indexer -- Could not finish MetaIndexBuilder: 
java.io.IOException: Key 282f6a4a77e8a8c989e9d72038177201 is not unique: 507572,477277
For MetaIndex, to suppress, set metaindex.compressed.reverse.allow.duplicates=true
	at org.terrier.structures.collections.FSOrderedMapFile$MultiFSOMapWriter.mergeTwo(FSOrderedMapFile.java:1374)
	at org.terrier.structures.collections.FSOrderedMapFile$MultiFSOMapWriter.close(FSOrderedMapFile.java:1308)
	at org.terrier.structures.indexing.BaseMetaIndexBuilder.close(BaseMetaIndexBuilder.java:321)
	at org.terrier.structures.indexing.classical.BasicIndexer.indexDocuments(BasicIndexer.java:270)
	at org.terrier.structures.indexing.classical.BasicIndexer.createDirectIndex(BasicIndexer.java:388)
	at org.terrier.structures.indexing.Indexer.index(Indexer.java:377)
21:46:47.032 [main] WARN org.terrier.structures.indexing.Indexer -- Indexed 1 empty documents
Built new index at c:\Users\ale

AttributeError: 'org.terrier.querying.IndexRef' object has no attribute 'getCollectionStatistics'

## 4. Query Reformulation

In [33]:
import importlib
import classic_rewriting
importlib.reload(classic_rewriting)

def reformulate_queries_rm3(queries_df, index_ref):
    """Reformulate queries using RM3"""
    modified_queries = queries_df.copy()

    # Use RM3 for query expansion
    modified_queries = classic_rewriting.rewrite_queries_RM3(modified_queries, index_ref)
    
    # If you want to replace the original queries with expanded ones
    if 'expanded_query' in modified_queries.columns:
        modified_queries['query'] = modified_queries['expanded_query']
        modified_queries = modified_queries.drop(columns=['expanded_query'])

    return modified_queries

def reformulate_queries_bo1(queries_df, index_ref):
    """Reformulate queries using BO1"""
    modified_queries = queries_df.copy()

    # Use BO1 for query expansion
    modified_queries = classic_rewriting.rewrite_queries_BO1(modified_queries, index_ref)
    
    # If you want to replace the original queries with expanded ones
    if 'expanded_query' in modified_queries.columns:
        modified_queries['query'] = modified_queries['expanded_query']
        modified_queries = modified_queries.drop(columns=['expanded_query'])

    return modified_queries

reformulated_queries_rm3 = reformulate_queries_rm3(preprocessed_queries, index_ref)
reformulated_queries_bo1 = reformulate_queries_bo1(preprocessed_queries, index_ref)

22:30:47.937 [main] WARN org.terrier.structures.BaseCompressingMetaIndex -- Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 1.4 GiB of memory would be required.
22:30:48.186 [main] WARN org.terrier.structures.BaseCompressingMetaIndex -- Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 1.4 GiB of memory would be required.
22:30:58.133 [main] WARN org.terrier.structures.BaseCompressingMetaIndex -- Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 1.4 GiB of memory would be required.
22:30:58.358 [main] WARN org.terrier.structures.BaseCompressingMetaIndex -- Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 1.4 GiB of memory would be required.


Number of documents: 729824
Number of terms: 941881
Number of postings: 159765825
Number of fields: 0
Number of tokens: 273318564
Field names: []
Positions:   false



In [1]:
# Function to delete the index in case it should be recreated
import shutil
from pathlib import Path

def delete_index(index_path):
    """Deletes the index at the specified path."""
    if index_path.exists():
        shutil.rmtree(index_path)
        print(f"Deleted index at {index_path}")
    else:
        print("No index found to delete.")

Deleted index at C:\Users\thein\OneDrive\Documents\InformationRetrieval\llm-query-rewriting\index


## 5. Retrieval Setup
Currently, using BM25 for retrieval.

In [27]:
bm25 = pt.BatchRetrieve(
    index_ref,
    wmodel="BM25",
    metadata=["docno", "text"],
    properties={"termpipelines": ""},
    controls={"qe": "off"}
)

  bm25 = pt.BatchRetrieve(


22:15:34.237 [main] WARN org.terrier.structures.BaseCompressingMetaIndex -- Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 1.4 GiB of memory would be required.


## 6. Evaluation Pipeline

In [None]:
# Run evaluation experiment
# metrics: https://pyterrier.readthedocs.io/en/latest/experiments.html#available-evaluation-measures
import os
import json

# Alternative import approaches
try:
    # Try the standard import first
    from IPython.display import display, HTML
except ImportError:
    # Fallback options
    try:
        from IPython import display
        HTML = display.HTML
    except ImportError:
        # If IPython display isn't working, define a simple fallback
        def display(obj):
            print(obj)
        
        class HTML:
            def __init__(self, html_str):
                self.html = html_str
            
            def _repr_html_(self):
                return self.html


# Directory containing JSON files
results_dir = "./results"

# Evaluation metrics
eval_metrics = ["map", "ndcg_cut_10", "P_10", "recall_100", "recip_rank"]

results = pt.Experiment(
    [bm25],
    preprocessed_queries,
    data.qrels_df,
    eval_metrics,
    names=["BM25 Baseline"],
    baseline=0
)

# Display the results
print("Results for no query reformulation:")
display(HTML(results.to_html(index=False)))

# Display the results for reformulated queries
print("Results for reformulated queries using RM3:")
results_reformulated = pt.Experiment(
    [bm25],
    reformulated_queries_rm3,
    data.qrels_df,
    eval_metrics,
    names=["BM25 with RM3"],
    baseline=0
)
display(HTML(results_reformulated.to_html(index=False)))

# Display the results for reformulated queries
print("Results for reformulated queries using BO1:")
results_reformulated = pt.Experiment(
    [bm25],
    reformulated_queries_bo1,
    data.qrels_df,
    eval_metrics,
    names=["BM25 with BO1"],
    baseline=0
)
display(HTML(results_reformulated.to_html(index=False)))


# Loop through JSON files in the directory
for filename in os.listdir(results_dir):
    if filename.endswith(".json"):
        file_path = os.path.join(results_dir, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data_json = json.load(f)

            # Convert JSON data to DataFrame
            queries_df = pd.DataFrame(data_json)[["query_id", "query_rewrite"]]
            queries_df.columns = ["qid", "query"]
            queries_df['query'] = queries_df['query'].apply(strip_markup)

            print(f"Processing file: {filename}")

            results = pt.Experiment(
                [bm25],
                queries_df,
                data.qrels_df,
                eval_metrics,
                names=["BM25 Baseline"],
                baseline=0
            )

            # Display the results
            display(HTML(results.to_html(index=False)))

        except Exception as e:
            print(f"Error processing {filename}: {e}")


Results for no query reformulation:


name,map,recip_rank,P_10,recall_100,ndcg_cut_10,map +,map -,map p-value,recip_rank +,recip_rank -,recip_rank p-value,P_10 +,P_10 -,P_10 p-value,recall_100 +,recall_100 -,recall_100 p-value,ndcg_cut_10 +,ndcg_cut_10 -,ndcg_cut_10 p-value
BM25 Baseline,0.284473,0.800519,0.633333,0.3696,0.401966,,,,,,,,,,,,,,,


Results for reformulated queries using RM3:


name,map,recip_rank,P_10,recall_100,ndcg_cut_10,map +,map -,map p-value,recip_rank +,recip_rank -,recip_rank p-value,P_10 +,P_10 -,P_10 p-value,recall_100 +,recall_100 -,recall_100 p-value,ndcg_cut_10 +,ndcg_cut_10 -,ndcg_cut_10 p-value
BM25 with RM3,0.30453,0.73106,0.585714,0.379398,0.374728,,,,,,,,,,,,,,,


Results for reformulated queries using BO1:


name,map,recip_rank,P_10,recall_100,ndcg_cut_10,map +,map -,map p-value,recip_rank +,recip_rank -,recip_rank p-value,P_10 +,P_10 -,P_10 p-value,recall_100 +,recall_100 -,recall_100 p-value,ndcg_cut_10 +,ndcg_cut_10 -,ndcg_cut_10 p-value
BM25 with BO1,0.322746,0.822587,0.661905,0.402822,0.429287,,,,,,,,,,,,,,,
