The profiles written by this notebook can be viewed with `tuna` by running the following on the Linux command line:
```
tuna path/to/rerank_ff.prof --port=8000
```

In [1]:
# SETTINGS
# Input data: directory holding the index file and the ranking file.
path_to_dir = "../../ff-data/"
h5_filename = "ff_msmarco-v1-passage.tct_colbert.h5"
ranking_filename = "msmarco-passage-test2019-sparse10000.txt"

# Run configuration; these values also determine the profile output directory.
device_type = "cpu"  # "cpu" or "gpu"
in_memory = False  # when True, the index is converted with to_memory() after loading
k_s = 1000  # cut-off applied to the ranking (r.cut(k_s)) before re-ranking

In [2]:
from pathlib import Path

# Join with the `/` operator instead of string concatenation so the resulting
# paths are correct whether or not `path_to_dir` ends with a path separator.
path_to_h5_file = Path(path_to_dir) / h5_filename
path_to_ranking_file = Path(path_to_dir) / ranking_filename

In [3]:
import os
import logging
import pyterrier as pt

# NOTE(review): machine-specific JVM location; adjust for your environment.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
logging.basicConfig(level=logging.INFO)

# Translate the user-facing device setting into the device string handed to the
# query encoder. Fail fast on anything else: previously a typo such as "GPU" or
# "cuda" silently ran on the CPU while the profile directory was still named
# after the mistyped value, producing mislabelled profiles.
_device_names = {"cpu": "cpu", "gpu": "cuda"}
if device_type not in _device_names:
    raise ValueError(
        f'device_type must be "cpu" or "gpu", got {device_type!r}'
    )
device_name = _device_names[device_type]

if not pt.started():
    pt.init(tqdm="notebook")

# Create profile directory (one per index file / device / cut-off / storage mode).
mem = "mem" if in_memory else "disk"
profile_dir = f"profiles/{h5_filename}/{device_type}_k{k_s}_{mem}/"
# exist_ok avoids the separate existence check and makes re-running the cell safe.
os.makedirs(profile_dir, exist_ok=True)

PyTerrier 0.10.1 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [4]:
from fast_forward import OnDiskIndex, Mode, Ranking
from fast_forward.encoder import TCTColBERTQueryEncoder
import cProfile
import pstats

# Profile the index set-up: building the query encoder, loading the on-disk
# index and, if requested via `in_memory`, the to_memory() conversion.
with cProfile.Profile() as profile:
    q_encoder = TCTColBERTQueryEncoder(
        "castorini/tct_colbert-msmarco", device=device_name
    )
    ff_index = OnDiskIndex.load(
        path_to_h5_file, query_encoder=q_encoder, mode=Mode.MAXP
    )

    if in_memory:
        ff_index = ff_index.to_memory()

# Write the collected statistics next to the other profiles of this run.
stats = pstats.Stats(profile).sort_stats(pstats.SortKey.TIME)
stats.dump_stats(profile_dir + "index.prof")

100%|██████████| 8841823/8841823 [00:37<00:00, 233191.75it/s]


In [5]:
import ir_datasets
dataset = ir_datasets.load("msmarco-passage/trec-dl-2019/judged")

# Map every query ID of the dataset to its text; Ranking.from_file receives
# this mapping together with the ranking file.
query_texts = {query.query_id: query.text for query in dataset.queries_iter()}
r = Ranking.from_file(path_to_ranking_file, query_texts)

  df = pd.read_csv(


In [6]:
# Standard re-ranking of the ranking cut at k_s (probably takes a few minutes).
# The cut itself happens inside the profiled region.
with cProfile.Profile() as profile:
    ff_out = ff_index(r.cut(k_s))

# Write the collected statistics to the profile directory of this run.
stats = pstats.Stats(profile).sort_stats(pstats.SortKey.TIME)
stats.dump_stats(profile_dir + "rerank_ff.prof")

INFO:fast_forward.index:computed scores in 108.27997793000031 seconds


In [7]:
# Re-ranking with early stopping (also takes a few minutes).
# NOTE(review): the second interval (5000) is larger than the default
# k_s = 1000 used for the cut — confirm this is intended.
early_stopping_options = {
    "early_stopping": 10,
    "early_stopping_alpha": 0.2,
    "early_stopping_intervals": (800, 5000),
}

with cProfile.Profile() as profile:
    ff_out_es = ff_index(r.cut(k_s), **early_stopping_options)

# Write the collected statistics to the profile directory of this run.
stats = pstats.Stats(profile).sort_stats(pstats.SortKey.TIME)
stats.dump_stats(profile_dir + "rerank_ff_es.prof")

INFO:fast_forward.index:depth 800: 16 queries left
INFO:fast_forward.index:depth 5000: 16 queries left
INFO:fast_forward.index:computed scores in 186.87780905199998 seconds


In [8]:
from ir_measures import calc_aggregate, AP, RR
from fast_forward.util import to_ir_measures

# Measures reported for the full rankings: AP@1000 and RR@10, both with rel=2.
full_measures = [AP(rel=2) @ 1000, RR(rel=2) @ 10]
# Weight passed to r.interpolate() when combining with the re-ranking output.
interpolation_alpha = 0.2

lexical_metrics = calc_aggregate(
    full_measures, dataset.qrels_iter(), to_ir_measures(r)
)
reranked_metrics = calc_aggregate(
    full_measures,
    dataset.qrels_iter(),
    to_ir_measures(r.interpolate(ff_out, interpolation_alpha)),
)
# Only RR@10 is reported for the early-stopping run.
# NOTE(review): presumably because early stopping was run with
# early_stopping=10 — confirm.
early_stopping_metrics = calc_aggregate(
    [RR(rel=2) @ 10],
    dataset.qrels_iter(),
    to_ir_measures(r.interpolate(ff_out_es, interpolation_alpha)),
)

print(
    "Lexical retrieval without re-ranking:\n",
    lexical_metrics,
    "\n\n... with fast-forward re-ranking:\n",
    reranked_metrics,
    "\n\n... with fast-forward re-ranking AND early stopping:\n",
    early_stopping_metrics,
)

Lexical retrieval without re-ranking:
 {AP(rel=2)@1000: 0.30128706043561426, RR(rel=2)@10: 0.7024178663713547} 

... with fast-forward re-ranking:
 {AP(rel=2)@1000: 0.43803324500109636, RR(rel=2)@10: 0.8941860465116279} 

... with fast-forward re-ranking AND early stopping:
 {RR(rel=2)@10: 0.8941860465116279}
