The profiles can be viewed by running the following command on the Linux command line:
```
tuna path/to/rerank_ff.prof --port=8000
```

In [1]:
# SETTINGS
# Device used for the query encoder: "gpu" selects the "cuda" device, "cpu" the CPU.
# The value is also embedded in the profile output directory name.
device_type = "cpu" # "cpu" or "gpu"
# Depth at which the first-stage ranking is cut before it is passed to the index for re-ranking.
k_s = 1000
# If True, the loaded on-disk index is converted with to_memory() before use.
in_memory = False
# File name of the index, resolved relative to ../../ and also used in the profile directory name.
h5_filename = "ff_msmarco-v1-passage.tct_colbert.h5"

In [3]:
import os
import logging
import pyterrier as pt
import datetime

# JAVA_HOME is set before PyTerrier is initialised below; the path is
# machine-specific and has to match the local Java installation.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
logging.basicConfig(level=logging.INFO)

if not pt.started():
    pt.init(tqdm="notebook")

# Fail fast on a mistyped device setting: otherwise any value other than
# "gpu" silently runs on the CPU while the profile directory is still
# labelled with the mistyped value, which makes the stored profiles misleading.
if device_type not in ("cpu", "gpu"):
    raise ValueError(f'device_type must be "cpu" or "gpu", got {device_type!r}')

# Create profile directory (one per index file / device / cut-off / storage mode).
# exist_ok=True replaces the separate existence check and makes re-running
# the cell safe without a check-then-create race.
mem = "mem" if in_memory else "disk"
profile_dir = f"profiles/{h5_filename}/{device_type}_k{k_s}_{mem}/"
os.makedirs(profile_dir, exist_ok=True)

device_name = "cuda" if device_type == "gpu" else "cpu"

INFO:numexpr.utils:Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [5]:
from pathlib import Path
from fast_forward import OnDiskIndex, Mode, Ranking
from fast_forward.encoder import TCTColBERTQueryEncoder
import cProfile
import pstats

# Profile the index set-up: constructing the query encoder, loading the
# on-disk index and, if in_memory is set, converting it with to_memory().
with cProfile.Profile() as profile:
    q_encoder = TCTColBERTQueryEncoder("castorini/tct_colbert-msmarco", device=device_name)
    ff_index = OnDiskIndex.load(
        Path(f"../../{h5_filename}"), query_encoder=q_encoder, mode=Mode.MAXP
    )
    if in_memory:
        ff_index = ff_index.to_memory()

# Write the collected profile data into the profile directory.
stats = pstats.Stats(profile).sort_stats(pstats.SortKey.TIME)
stats.dump_stats(f"{profile_dir}index.prof")

100%|██████████| 57638/57638 [00:00<00:00, 183651.77it/s]


In [6]:
import ir_datasets
dataset = ir_datasets.load("msmarco-passage/trec-dl-2019/judged")

# Query texts keyed by query ID; handed to the ranking together with the run file.
query_texts = dict((q.query_id, q.text) for q in dataset.queries_iter())
r = Ranking.from_file(
    Path("../../msmarco-passage-test2019-sparse10000.txt"), query_texts
)

beir/fiqa documents:   0%|          | 0/57638 [00:00<?, ?it/s]

In [7]:
# standard re-ranking, probably takes a few min
# The profiler is switched on only around the re-ranking call and is always
# switched off again, even if the call raises.
profile = cProfile.Profile()
profile.enable()
try:
    ff_out = ff_index(r.cut(k_s))
finally:
    profile.disable()

# Write the collected profile data into the profile directory.
stats = pstats.Stats(profile)
stats.sort_stats(pstats.SortKey.TIME)
stats.dump_stats(f"{profile_dir}rerank_ff.prof")

AttributeError: 'DataFrame' object has no attribute 'has_queries'

In [None]:
# re-ranking with early stopping, also takes a few min
# NOTE(review): the interval depths are fixed here while the ranking is cut
# at k_s; confirm they still make sense whenever k_s is changed.
early_stopping_options = {
    "early_stopping": 10,
    "early_stopping_alpha": 0.2,
    "early_stopping_intervals": (800, 5000),
}

with cProfile.Profile() as profile:
    ff_out_es = ff_index(r.cut(k_s), **early_stopping_options)

# Write the collected profile data into the profile directory.
stats = pstats.Stats(profile).sort_stats(pstats.SortKey.TIME)
stats.dump_stats(f"{profile_dir}rerank_ff_es.prof")

INFO:fast_forward.index:depth 800: 16 queries left
INFO:fast_forward.index:depth 5000: 16 queries left
INFO:fast_forward.index:computed scores in 167.9565338829998 seconds


In [None]:
from ir_measures import calc_aggregate, AP, RR
from fast_forward.util import to_ir_measures

# Score the lexical ranking on its own, then interpolated (weight 0.2) with
# the re-ranking output, and finally with the early-stopping output, for
# which only RR@10 is computed.
lexical_scores = calc_aggregate(
    [AP(rel=2) @ 1000, RR(rel=2) @ 10],
    dataset.qrels_iter(),
    to_ir_measures(r),
)
reranked_scores = calc_aggregate(
    [AP(rel=2) @ 1000, RR(rel=2) @ 10],
    dataset.qrels_iter(),
    to_ir_measures(r.interpolate(ff_out, 0.2)),
)
early_stopping_scores = calc_aggregate(
    [RR(rel=2) @ 10],
    dataset.qrels_iter(),
    to_ir_measures(r.interpolate(ff_out_es, 0.2)),
)

print(
    "Lexical retrieval without re-ranking:\n",
    lexical_scores,
    "\n\n... with fast-forward re-ranking:\n",
    reranked_scores,
    "\n\n... with fast-forward re-ranking AND early stopping:\n",
    early_stopping_scores,
)

Lexical retrieval without re-ranking:
 {RR(rel=2)@10: 0.7024178663713547, AP(rel=2)@1000: 0.30128706043561426} 

... with fast-forward re-ranking:
 {RR(rel=2)@10: 0.8941860465116279, AP(rel=2)@1000: 0.43803324500109636} 

... with fast-forward re-ranking AND early stopping:
 {RR(rel=2)@10: 0.8941860465116279}
