In [None]:
%pip install bm25s PyStemmer

Collecting bm25s
  Downloading bm25s-0.3.0-py3-none-any.whl.metadata (27 kB)
Collecting PyStemmer
  Downloading pystemmer-3.0.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (4.0 kB)
Downloading bm25s-0.3.0-py3-none-any.whl (69 kB)
Downloading pystemmer-3.0.0-cp313-cp313-macosx_11_0_arm64.whl (240 kB)
Installing collected packages: PyStemmer, bm25s
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [bm25s]
[1A[2KSuccessfully installed PyStemmer-3.0.0 bm25s-0.3.0


In [1]:
import sys, json
from pathlib import Path
from collections import defaultdict

sys.path.insert(0, "scripts/")
from scripts.build_bm25 import build_bm25, load_bm25, bm25_search

# ── 1. Build BM25 indexes for all collections ────────────────────────────────
# A collection is indexed only when its chunk file is already on disk;
# anything missing is reported and left out of `retrievers`.
COLLECTIONS = ["A", "B", "C", "D"]
retrievers  = {}

for col in COLLECTIONS:
    chunk_file = Path(f"data/processed/{col}_chunks.jsonl")
    if chunk_file.exists():
        # build_bm25 is given the collection name only and returns the
        # retriever that the search step below queries.
        retrievers[col] = build_bm25(col)
        print()
    else:
        print(f"[skip] {col}_chunks.jsonl not found")


# ── 2. Sparse-only runner test ───────────────────────────────────────────────
# Each query is run against every built collection index. The per-collection
# hits are pooled, ranked by score, and the collection of the best hit is
# compared with the collection the query is expected to route to.
# NOTE(review): every collection has its own index, so scores coming from
# different collections may not be strictly comparable — treat the pooled
# ranking as a smoke test rather than a calibrated metric.
TEST_QUERIES = [
    # (query_string, expected_collection)
    ("Pittsburgh steel industry Carnegie history",          "A"),
    ("Andrew Mellon banker philanthropy art",               "A"),
    ("Carnegie Mellon University robotics computer science","A"),
    ("Pittsburgh Symphony Orchestra concert season",        "D"),
    ("Little Italy Days food festival Bloomfield",          "D"),
    ("Pittsburgh Steelers Super Bowl football",             "D"),
    ("Pennsylvania budget regulations 2024",                "B"),
    ("campus events CMU spring 2026",                       "C"),
]

print("=" * 70)
print("Sparse-only (BM25) retrieval test")
print("=" * 70)

TOP_K = 5          # hits requested per collection and rows shown per query
results_log = []   # one record per query; consumed by the summary step

for query, expected_col in TEST_QUERIES:
    print(f"\nQuery : {query!r}")
    print(f"  Expected collection: {expected_col}")

    all_hits = []
    for col, ret in retrievers.items():
        hits = bm25_search(query, retriever=ret, top_k=TOP_K)
        # Tag a shallow copy of each hit with its collection instead of writing
        # into the dicts returned by bm25_search: if those are shared with the
        # retriever's stored corpus, an in-place "_col" key would leak into
        # every later search.
        all_hits.extend({**h, "_col": col} for h in hits)

    # Sort all hits across collections by BM25 score (highest first)
    all_hits.sort(key=lambda h: -h["score"])

    print(f"  Top {min(TOP_K, len(all_hits))} cross-collection results:")
    for i, h in enumerate(all_hits[:TOP_K]):
        col_tag = h["_col"]
        mark    = "✓" if col_tag == expected_col else " "
        # Fall back from "section" to "md_title" to an empty label.
        section = h.get("section") or h.get("md_title") or ""
        print(f"    [{i+1}] {mark} col={col_tag}  score={h['score']:.4f}  "
              f"chunk={h['chunk_id']}  sec={section[:40]}")

    # With no retrievers built there are no hits; record None so the query
    # is counted as a miss instead of raising.
    top_col = all_hits[0]["_col"] if all_hits else None
    results_log.append({
        "query": query,
        "expected": expected_col,
        "top_hit_col": top_col,
        "correct": top_col == expected_col,
    })


# ── 3. Summary ───────────────────────────────────────────────────────────────
# Count the queries whose best-scoring hit came from the expected collection,
# then print one line per query with the expected and actual collection.
print("\n" + "=" * 70)
print("Summary")
print("=" * 70)
correct = len([entry for entry in results_log if entry["correct"]])
print(f"  Collection routing accuracy: {correct}/{len(results_log)}")
for entry in results_log:
    if entry["correct"]:
        mark = "✓"
    else:
        mark = "✗"
    # Queries are cut to 55 characters to keep the table narrow.
    shown_query = entry["query"][:55]
    print(f"  {mark}  expected={entry['expected']}  got={entry['top_hit_col']}  "
          f"| {shown_query}")


# ── 4. Index file sizes ───────────────────────────────────────────────────────
# Add up the sizes of the entries directly inside each saved index directory.
# Collections that have no index directory are left out of the listing.
print("\nIndex sizes on disk:")
for col in COLLECTIONS:
    bm25_dir = Path(f"data/processed/{col}_bm25")
    if not bm25_dir.exists():
        continue
    total_bytes = 0
    for entry in bm25_dir.iterdir():
        total_bytes += entry.stat().st_size
    kb = total_bytes / 1024
    print(f"  {col}_bm25/   {kb:.0f} KB")

  from .autonotebook import tqdm as notebook_tqdm


Building BM25 index — Collection A
  Loaded 592 chunks
  Tokenizing...
  Tokenized in 0.0s  (vocab size: 11,070)
  Indexing...
  Indexed in 0.0s


Finding newlines for mmindex: 100%|██████████| 217k/217k [00:00<00:00, 90.3MB/s]



  → Saved to data/processed/A_bm25  (862 KB total)
      corpus.jsonl                               212.4 KB
      corpus.mmindex.json                          4.3 KB
      data.csc.index.npy                         187.3 KB
      indices.csc.index.npy                      187.3 KB
      indptr.csc.index.npy                        86.6 KB
      params.index.json                            0.2 KB
      vocab.index.json                           183.9 KB

Building BM25 index — Collection B
  Loaded 1450 chunks
  Tokenizing...
  Tokenized in 0.0s  (vocab size: 3,948)
  Indexing...
  Indexed in 0.0s


Finding newlines for mmindex: 100%|██████████| 634k/634k [00:00<00:00, 174MB/s]


  → Saved to data/processed/B_bm25  (1292 KB total)
      corpus.jsonl                               619.4 KB
      corpus.mmindex.json                         11.1 KB
      data.csc.index.npy                         285.3 KB
      indices.csc.index.npy                      285.3 KB
      indptr.csc.index.npy                        31.0 KB
      params.index.json                            0.2 KB
      vocab.index.json                            60.0 KB

Building BM25 index — Collection C
  Loaded 962 chunks
  Tokenizing...
  Tokenized in 0.0s  (vocab size: 4,984)
  Indexing...





  Indexed in 0.0s


Finding newlines for mmindex: 100%|██████████| 317k/317k [00:00<00:00, 97.6MB/s]



  → Saved to data/processed/C_bm25  (679 KB total)
      corpus.jsonl                               309.9 KB
      corpus.mmindex.json                          7.2 KB
      data.csc.index.npy                         121.7 KB
      indices.csc.index.npy                      121.7 KB
      indptr.csc.index.npy                        39.1 KB
      params.index.json                            0.2 KB
      vocab.index.json                            79.4 KB

Building BM25 index — Collection D
  Loaded 1247 chunks
  Tokenizing...
  Tokenized in 0.1s  (vocab size: 273,928)
  Indexing...
  Indexed in 0.1s


Finding newlines for mmindex: 100%|██████████| 414k/414k [00:00<00:00, 97.9MB/s]


  → Saved to data/processed/D_bm25  (16832 KB total)
      corpus.jsonl                               404.7 KB
      corpus.mmindex.json                          9.4 KB
      data.csc.index.npy                        1480.8 KB
      indices.csc.index.npy                     1480.8 KB
      indptr.csc.index.npy                      2140.2 KB
      params.index.json                            0.2 KB
      vocab.index.json                         11316.3 KB

Sparse-only (BM25) retrieval test

Query : 'Pittsburgh steel industry Carnegie history'
  Expected collection: A
  Top 5 cross-collection results:
    [1] ✓ col=A  score=5.6650  chunk=A_wiki_Andrew Carnegie - Wikipedia__0000  sec=
    [2] ✓ col=A  score=5.5864  chunk=A_wiki_Andrew Carnegie - Wikipedia__0011  sec=Biography
    [3] ✓ col=A  score=5.3836  chunk=A_wiki_Andrew Carnegie - Wikipedia__0008  sec=Biography
    [4] ✓ col=A  score=5.3491  chunk=A_wiki_Andrew Carnegie - Wikipedia__0025  sec=Controversies
    [5] ✓ col=A  score=5.


