In [5]:
import os

import pandas as pd
from tqdm import tqdm

from pyserini.search import get_topics
from pyserini.search.lucene import LuceneSearcher
from pyserini.output_writer import get_output_writer, OutputFormat
from pyserini.query_iterator import get_query_iterator, TopicsFormat

from tools.scripts.msmarco.msmarco_doc_eval import compute_metrics_from_files

In [6]:
# Function to run the queries
def run_queries(output_path, searcher, topics_name='msmarco-doc-dev', num_docs=100,
                batch_size=64, threads=16, output_format='msmarco'):
    """Run every query of a topic set through ``searcher`` and write the hits to a run file.

    Queries are sent to ``searcher.batch_search`` in groups of ``batch_size`` and the
    hits of each group are written in the order the queries were iterated.

    Parameters
    ----------
    output_path : str or None
        Path handed to ``get_output_writer``. When it is not None, the path without
        its extension is used as the run tag; otherwise the tag is ``'Anserini'``.
    searcher : object
        Must provide ``batch_search(queries, qids, k, threads, query_generator=..., fields=...)``
        returning a mapping from query id to hits.
    topics_name : str, optional
        Name passed to ``get_query_iterator``.
    num_docs : int, optional
        Number of hits requested per query and ``max_hits`` of the output writer.
    batch_size : int, optional
        Number of queries per ``batch_search`` call; must be at least 1.
    threads : int, optional
        Thread count passed to ``batch_search``.
    output_format : str, optional
        Value used to build the ``OutputFormat`` given to ``get_output_writer``.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    # Run tag: the output file name without its extension (whatever its length)
    tag = 'Anserini' if output_path is None else os.path.splitext(output_path)[0]

    query_iterator = get_query_iterator(topics_name, TopicsFormat(TopicsFormat.DEFAULT.value))
    topics = query_iterator.topics

    # Output writer; the max-passage options are disabled, exactly as before
    output_writer = get_output_writer(output_path, OutputFormat(output_format), 'w', max_hits=num_docs, tag=tag,
                                      topics=topics, use_max_passage=False, max_passage_delimiter='#',
                                      max_passage_hits=100)

    with output_writer:

        # Queries accumulated for the next batch_search call
        batch_topic_ids = []
        batch_topics = []

        def flush_batch():
            """Search the pending queries, write their hits in order, and reset the batch."""
            if not batch_topic_ids:
                return
            results = searcher.batch_search(batch_topics, batch_topic_ids, num_docs, threads,
                                            query_generator=None, fields=dict())
            for pending_id in batch_topic_ids:
                output_writer.write(pending_id, results[pending_id])
            batch_topic_ids.clear()
            batch_topics.clear()

        for topic_id, text in tqdm(query_iterator, total=len(topics)):
            batch_topic_ids.append(str(topic_id))
            batch_topics.append(text)

            if len(batch_topic_ids) >= batch_size:
                flush_batch()

        # Flush whatever is left after the loop instead of comparing the loop index with
        # len(topics): the last partial batch is then written even if the iterator yields
        # fewer queries than there are topics.
        flush_batch()

In [7]:
# Parameters for the grid search.
# Every (k1, b) combination of these two lists is passed to set_bm25 in the grid-search
# cell and is also interpolated into the run file name, so the int/float spelling of each
# literal (e.g. 2 vs 2.0) determines the file name that is written.
k1_list =  [2, 2.5, 3, 3.5, 4, 4.5, 5, 5.5]
b_list =  [0.25, 0.5, 0.75, 1]

In [8]:
# Grid search over the BM25 parameters: one run file and one result row per (k1, b) pair.

# Loop-invariant inputs
path_to_reference = './pyserini/tools/topics-and-qrels/qrels.msmarco-doc.dev.txt'

# Make sure the directory for the run files exists before the first run is written
os.makedirs('./runs', exist_ok=True)

# Open the index once; only the BM25 parameters change between runs
lucene_searcher = LuceneSearcher('./indexes/lucene-index-msmarco-doc/')

# One record per (k1, b) pair; the dataframe is built once after the loop instead of
# being re-concatenated on every iteration
grid_records = []

for k1 in k1_list:
    for b in b_list:

        # Set the BM25 parameters for this run
        lucene_searcher.set_bm25(k1=k1, b=b)

        # Set the output path
        path_to_candidate = f'./runs/bestMatch.k{k1}.b{b}.txt'

        # Run the queries
        run_queries(path_to_candidate, lucene_searcher)

        # Evaluate the results for mrr@100
        exclude_qids = set()
        metrics = compute_metrics_from_files(path_to_reference, path_to_candidate, exclude_qids)

        # Keep the result of this run
        grid_records.append({'k1': k1, 'b': b, 'mrr@100': metrics['MRR @100']})

# Dataframe with the results (columns are inferred as numeric dtypes)
df = pd.DataFrame(grid_records, columns=['k1', 'b', 'mrr@100'])

# Save the results in a tex file
df.to_latex('BestMatch25.tex', index=False)

Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [04:24<00:00, 19.60it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [04:14<00:00, 20.42it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [03:52<00:00, 22.34it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [03:38<00:00, 23.77it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [04:17<00:00, 20.20it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [03:35<00:00, 24.11it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [03:26<00:00, 25.20it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [03:14<00:00, 26.64it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [03:59<00:00, 21.68it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [03:40<00:00, 23.55it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [03:34<00:00, 24.21it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [03:21<00:00, 25.76it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [02:36<00:00, 33.27it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [01:58<00:00, 43.78it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [01:52<00:00, 46.29it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [01:44<00:00, 49.48it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [02:10<00:00, 39.88it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [02:03<00:00, 42.15it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [01:58<00:00, 43.90it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [01:49<00:00, 47.48it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [02:13<00:00, 38.87it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [02:06<00:00, 41.11it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [01:57<00:00, 44.19it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [01:33<00:00, 55.61it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [01:52<00:00, 46.29it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [01:46<00:00, 48.86it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [01:42<00:00, 50.69it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [01:35<00:00, 54.47it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [01:55<00:00, 44.88it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [02:08<00:00, 40.54it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [01:57<00:00, 44.10it/s]


Quantity of Documents ranked for each query is as expected. Evaluating
Using pre-defined topic order for msmarco-doc-dev


100%|██████████| 5193/5193 [01:38<00:00, 52.91it/s]


Quantity of Documents ranked for each query is as expected. Evaluating


  df.to_latex('BestMatch25.tex', index=False)
