First of all, we need to install [ranx](https://github.com/AmenRa/ranx).

Note that the first time you run any of ranx's functions, they may take a while because they must be compiled first.

In [None]:
!pip install -U ranx

Download the data we need

In [1]:
import os
import requests

# Fetch the sample qrels and run files from the ranx repository into notebooks/data.
data_dir = "notebooks/data"
os.makedirs(data_dir, exist_ok=True)  # Loop-invariant: create the folder once

for file in ["qrels", "run_1", "run_2", "run_3", "run_4", "run_5"]:
    master = f"https://raw.githubusercontent.com/AmenRa/ranx/master/notebooks/data/{file}.trec"

    # A timeout keeps the cell from hanging forever on a stalled connection
    response = requests.get(master, timeout=30)
    # Fail loudly on HTTP errors instead of saving an error page as a .trec file
    response.raise_for_status()

    # Open the file only after a successful download, so a failure leaves no empty file behind
    with open(f"notebooks/data/{file}.trec", "w") as f:
        f.write(response.text)

Import

In [None]:
from ranx import Qrels, Run, evaluate, compare

Create Qrels and Run

In [None]:
# Qrels and Run are usually built from plain Python dictionaries:
# the outer keys are query ids, the inner keys are document ids.
qrels_dict = {
    "q_1": {"d_12": 5, "d_25": 3},
    "q_2": {"d_11": 6, "d_22": 1},
}

run_dict = {
    "q_1": {
        "d_12": 0.9,
        "d_23": 0.8,
        "d_25": 0.7,
        "d_36": 0.6,
        "d_32": 0.5,
        "d_35": 0.4,
    },
    "q_2": {
        "d_12": 0.9,
        "d_11": 0.8,
        "d_25": 0.7,
        "d_36": 0.6,
        "d_22": 0.5,
        "d_35": 0.4,
    },
}

# Wrap the dictionaries in the ranx containers used by evaluate
qrels = Qrels(qrels_dict)
run = Run(run_dict)

Evaluation

In [None]:
# NDCG@5 only: the metric is given as a single string
score = evaluate(qrels, run, "ndcg@5")
print(score)

# NDCG@3, MAP@5, and MRR together: the metrics are given as a list
metrics = ["ndcg@3", "map@5", "mrr"]
score_dict = evaluate(qrels, run, metrics)
print(score_dict)

Comparison

In [None]:
# Read the downloaded TREC-format files: first the qrels, then the five runs to compare
qrels = Qrels.from_file("notebooks/data/qrels.trec", kind="trec")

run_paths = [
    "notebooks/data/run_1.trec",
    "notebooks/data/run_2.trec",
    "notebooks/data/run_3.trec",
    "notebooks/data/run_4.trec",
    "notebooks/data/run_5.trec",
]

# Runs are loaded in list order and bound to the same names used by the comparison cell
run_1, run_2, run_3, run_4, run_5 = [
    Run.from_file(path, kind="trec") for path in run_paths
]

In [None]:
# Compare the runs on each metric and test the differences for statistical
# significance with Fisher's Randomization test
report = compare(
    qrels,
    runs=[run_1, run_2, run_3, run_4, run_5],
    metrics=["map@100", "mrr@100", "ndcg@10"],
    stat_test="fisher",  # Named explicitly so the test used does not depend on the library default
    max_p=0.01,  # P-value threshold
)

# The comparison results are saved in a Report instance,
# which provides handy functionalities such as tabular formatting
# (superscripts denote statistically significant differences)
report

In [None]:
# Export the Report as a LaTeX table (e.g. for a scientific publication) and print it
latex_table = report.to_latex()
print(latex_table)