# This notebook contains the code to reproduce the experiment in the `rank_eval` paper.

In [1]:
import random

import numpy as np
import pytrec_eval
from tabulate import tabulate

from rank_eval import evaluate, Qrels, Run

In [2]:
# Wrapper for pytrec_eval
def run_trec_metrics(qrels, run, metrics):
    """Evaluate `run` against `qrels` with pytrec_eval and average each metric.

    Args:
        qrels: Relevance judgments, passed straight to
            `pytrec_eval.RelevanceEvaluator`.
        run: Retrieval results, passed straight to the evaluator's `evaluate`.
        metrics: Iterable of pytrec_eval measure names (e.g. `{"map"}`).

    Returns:
        A dict mapping every requested measure name to the mean of its
        per-query values.
    """
    per_query = pytrec_eval.RelevanceEvaluator(qrels, metrics).evaluate(run)

    averages = {}
    for measure in metrics:
        # Each value of `per_query` holds one query's scores, keyed by measure.
        query_scores = [scores[measure] for scores in per_query.values()]
        averages[measure] = np.mean(query_scores)
    return averages


def run_single_trec_metric(qrels, run, metric):
    """Compute one pytrec_eval measure and return its mean over all queries.

    Convenience wrapper around `run_trec_metrics` for a single measure name.
    """
    averaged = run_trec_metrics(qrels, run, {metric})
    return averaged[metric]

In [3]:
def generate_qrels(query_count, max_relevant_per_query):
    """Generate synthetic relevance judgments.

    Args:
        query_count: Number of queries to generate; ids are "q0", "q1", ...
        max_relevant_per_query: Inclusive upper bound on the number of judged
            documents per query. Every query gets between 1 and this many.

    Returns:
        A dict mapping each query id to a dict of document id ("d0", "d1", ...)
        to an integer relevance grade drawn from 1..5.

    Raises:
        ValueError: From `random.randint` if at least one query is generated
            and `max_relevant_per_query` is smaller than 1.
    """
    qrels = {}
    for i in range(query_count):
        # randint is inclusive on both ends, so the configured maximum can
        # actually be drawn and a maximum of 1 is valid (the previous
        # range(1, max) excluded the maximum and was empty for max == 1).
        relevant_count = random.randint(1, max_relevant_per_query)
        qrels[f"q{i}"] = {
            f"d{j}": random.choice([1, 2, 3, 4, 5]) for j in range(relevant_count)
        }

    return qrels


def generate_run(query_count, result_count):
    """Generate a synthetic run with random scores.

    Args:
        query_count: Number of queries; ids are "q0", "q1", ...
        result_count: Number of scored documents per query; ids are
            "d0", "d1", ...

    Returns:
        A dict mapping each query id to a dict of document id to a score
        drawn with `random.uniform(0.0, 1.0)`.
    """
    # Queries are the outer loop and documents the inner one, so scores are
    # drawn from the RNG query by query, document by document.
    return {
        f"q{query_idx}": {
            f"d{doc_idx}": random.uniform(0.0, 1.0) for doc_idx in range(result_count)
        }
        for query_idx in range(query_count)
    }

In [4]:
# Seed both RNGs so the synthetic qrels/run are the same on every fresh run.
# `random.seed` must be *called*: assigning to it (`random.seed = 42`) would
# replace the function with an int and leave the generator unseeded.
random.seed(42)
np.random.seed(42)

In [5]:
results = []
result_count = 100
max_relevant_per_query = 10

for query_count in [1_000_000]:
# for query_count in [1_000, 10_000, 100_000]:
    print(f"Queries: {query_count}")
    # Generate Qrels and Run
    trec_qrels = generate_qrels(query_count, max_relevant_per_query)
    trec_run = generate_run(query_count, result_count)

    re_qrels = Qrels.from_dict(trec_qrels).to_typed_list()
    re_run = Run.from_dict(trec_run).to_typed_list()

    x = %timeit -o -q run_single_trec_metric(trec_qrels, trec_run, "map")
    map_avg_time = round(x.average, 3) * 1000
    results.append([query_count, "pytrec_eval", 1, "map", map_avg_time, 1.0])

    x = %timeit -o -q run_single_trec_metric(trec_qrels, trec_run, "recip_rank")
    mrr_avg_time = round(x.average, 3) * 1000
    results.append([query_count, "pytrec_eval", 1, "mrr", mrr_avg_time, 1.0])

    x = %timeit -o -q run_single_trec_metric(trec_qrels, trec_run, "ndcg")
    ndcg_avg_time = round(x.average, 3) * 1000
    results.append([query_count, "pytrec_eval", 1, "ndcg", ndcg_avg_time, 1.0])

    for threads in [1, 2, 4, 8]:
        # Run metrics once to ensure they have been compiled
        evaluate(re_qrels, re_run, [f"map", f"mrr", f"ndcg"], threads=threads)

        x = %timeit -o -q evaluate(re_qrels, re_run, f"map", threads=threads)
        avg_time = max(round(x.average, 3) * 1000, 1)
        results.append([query_count, "rank_eval", threads, "map", avg_time, round(map_avg_time / avg_time, 1)])

        x = %timeit -o -q evaluate(re_qrels, re_run, f"mrr", threads=threads)
        avg_time = max(round(x.average, 3) * 1000, 1)
        results.append([query_count, "rank_eval", threads, "mrr", avg_time, round(mrr_avg_time / avg_time, 1)])

        x = %timeit -o -q evaluate(re_qrels, re_run, f"ndcg", threads=threads)
        avg_time = max(round(x.average, 3) * 1000, 1)
        results.append([query_count, "rank_eval", threads, "ndcg", avg_time, round(ndcg_avg_time / avg_time, 1)])

Queries: 1000000


In [6]:
# Render the collected benchmark rows as a plain-text table.
print(
    tabulate(
        results,
        headers=[
            "Query count",
            "Approach",
            "Threads",
            "Metric",
            "Avg. (ms)",
            "Speed-Up",
        ],
    )
)

  Query count  Approach       Threads  Metric      Avg. (ms)    Speed-Up
-------------  -----------  ---------  --------  -----------  ----------
      1000000  pytrec_eval          1  map             29452         1
      1000000  pytrec_eval          1  mrr             29227         1
      1000000  pytrec_eval          1  ndcg            29659         1
      1000000  rank_eval            1  map              2019        14.6
      1000000  rank_eval            1  mrr               752        38.9
      1000000  rank_eval            1  ndcg             3357         8.8
      1000000  rank_eval            2  map              1226        24
      1000000  rank_eval            2  mrr               562        52
      1000000  rank_eval            2  ndcg             2299        12.9
      1000000  rank_eval            4  map               875        33.7
      1000000  rank_eval            4  mrr               442        66.1
      1000000  rank_eval            4  ndcg             1759 