In [1]:
# Install dependencies (cpu-friendly)
# The %pip magic (rather than !pip) installs into the environment of the running
# kernel. faiss-cpu is the CPU-only FAISS build, so no CUDA toolchain is needed.
# NOTE(review): versions are unpinned, so a fresh install may resolve to different
# releases than the ones this notebook was last run with — consider pinning.
%pip install -q sentence-transformers faiss-cpu torch numpy python-dotenv huggingface_hub

Note: you may need to restart the kernel to use updated packages.


In [2]:
import platform
import torch

# Environment report: platform string, torch version, and which accelerator
# backends (CUDA, Apple MPS) this torch build reports as available/built.
# Each value is looked up lazily, immediately before its line is printed, so a
# failing lookup still leaves the earlier lines on screen.
env_probes = [
    ("Platform", lambda: platform.platform()),
    ("PyTorch version", lambda: torch.__version__),
    ("CUDA available", lambda: torch.cuda.is_available()),
    ("MPS available", lambda: torch.backends.mps.is_available()),
    ("MPS built", lambda: torch.backends.mps.is_built()),
]

for label, probe in env_probes:
    # Labels are left-aligned in a 16-character column so the colons line up.
    print(f"{label:<16}: {probe()}")

Platform        : macOS-15.7.2-arm64-arm-64bit-Mach-O
PyTorch version : 2.10.0
CUDA available  : False
MPS available   : True
MPS built       : True


In [3]:
# Build FAISS indexes for all collections (A–D)
from pathlib import Path
import shutil
from scripts.build_index import build_index, PROCESSED_DIR

collections = ["A", "B", "C", "D"]
built = {}    # collection name -> {"index": ..., "chunks": ...} as returned by build_index
missing = []  # (collection name, root-level chunk path) for collections with no chunk file

for col in collections:
    # Chunk files may live either directly in PROCESSED_DIR or in a per-collection
    # sub-directory; the root-level path is the one staged for build_index.
    chunk_name = f"{col}_chunks.jsonl"
    root_chunks = PROCESSED_DIR / chunk_name
    nested_chunks = PROCESSED_DIR / col / chunk_name

    if not root_chunks.exists():
        if not nested_chunks.exists():
            # Neither layout has the file. Keep going so the remaining collections
            # are still built; the failure is reported once, after the loop.
            missing.append((col, root_chunks))
            continue
        # Only the nested file exists: copy it to the root location expected by
        # build_index.
        root_chunks.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(nested_chunks, root_chunks)
        print(f"Copied {nested_chunks} -> {root_chunks} for build_index")

    index, chunks = build_index(col)
    built[col] = {"index": index, "chunks": chunks}
    print(f"[{col}] index built — ntotal={index.ntotal}")

if missing:
    # Report both locations that were searched. The nested path is rebuilt from the
    # recorded root path so `missing` keeps its simple (col, path) shape.
    msg = "\n".join(
        f"  [{col}] missing chunks: {path} (also checked {path.parent / col / path.name})"
        for col, path in missing
    )
    raise FileNotFoundError(
        "Chunk files missing. Run the chunking pipeline first for these collections.\n" + msg
    )

  from .autonotebook import tqdm as notebook_tqdm


Building FAISS index — Collection A
Model : BAAI/bge-m3
  Loaded 592 chunks
  Loading model : BAAI/bge-m3
  Device        : MPS


Loading weights: 100%|██████████| 391/391 [00:00<00:00, 2094.50it/s, Materializing param=pooler.dense.weight]                               


  Model loaded in 5.8s
  [truncate] 17/592 chunks truncated to 2000 chars
  Encoding 592 chunks on MPS (batch_size=8)...


Batches: 100%|██████████| 74/74 [00:38<00:00,  1.94it/s]


  Done in 38.3s  (15.5 chunks/s)
  Embedding shape : (592, 1024)  dtype: float32
  Vector norms    : mean=1.0000  min=1.0000  max=1.0000
  FAISS index     : IndexFlatIP  dim=1024  ntotal=592

  → Index : data/processed/A_index.faiss  (2368 KB)
  → Meta  : data/processed/A_index_meta.jsonl  (212 KB)
[A] index built — ntotal=592
Building FAISS index — Collection B
Model : BAAI/bge-m3
  Loaded 1450 chunks
  Loading model : BAAI/bge-m3
  Device        : MPS


Loading weights: 100%|██████████| 391/391 [00:00<00:00, 1787.05it/s, Materializing param=pooler.dense.weight]                               


  Model loaded in 6.8s
  [truncate] 3/1450 chunks truncated to 2000 chars
  Encoding 1450 chunks on MPS (batch_size=8)...


Batches: 100%|██████████| 182/182 [00:58<00:00,  3.12it/s]


  Done in 58.3s  (24.9 chunks/s)
  Embedding shape : (1450, 1024)  dtype: float32
  Vector norms    : mean=1.0000  min=1.0000  max=1.0000
  FAISS index     : IndexFlatIP  dim=1024  ntotal=1450

  → Index : data/processed/B_index.faiss  (5800 KB)
  → Meta  : data/processed/B_index_meta.jsonl  (619 KB)
[B] index built — ntotal=1450
Building FAISS index — Collection C
Model : BAAI/bge-m3
  Loaded 962 chunks
  Loading model : BAAI/bge-m3
  Device        : MPS


Loading weights: 100%|██████████| 391/391 [00:00<00:00, 1927.92it/s, Materializing param=pooler.dense.weight]                               


  Model loaded in 6.8s
  [truncate] 10/962 chunks truncated to 2000 chars
  Encoding 962 chunks on MPS (batch_size=8)...


Batches: 100%|██████████| 121/121 [00:20<00:00,  5.97it/s]


  Done in 20.3s  (47.4 chunks/s)
  Embedding shape : (962, 1024)  dtype: float32
  Vector norms    : mean=1.0000  min=1.0000  max=1.0000
  FAISS index     : IndexFlatIP  dim=1024  ntotal=962

  → Index : data/processed/C_index.faiss  (3848 KB)
  → Meta  : data/processed/C_index_meta.jsonl  (310 KB)
[C] index built — ntotal=962
Building FAISS index — Collection D
Model : BAAI/bge-m3
  Loaded 1434 chunks
  Loading model : BAAI/bge-m3
  Device        : MPS


Loading weights: 100%|██████████| 391/391 [00:00<00:00, 2066.87it/s, Materializing param=pooler.dense.weight]                               


  Model loaded in 6.9s
  [truncate] 171/1434 chunks truncated to 2000 chars
  Encoding 1434 chunks on MPS (batch_size=8)...


Batches: 100%|██████████| 180/180 [02:30<00:00,  1.19it/s]


  Done in 150.8s  (9.5 chunks/s)
  Embedding shape : (1434, 1024)  dtype: float32
  Vector norms    : mean=1.0000  min=1.0000  max=1.0000
  FAISS index     : IndexFlatIP  dim=1024  ntotal=1434

  → Index : data/processed/D_index.faiss  (5736 KB)
  → Meta  : data/processed/D_index_meta.jsonl  (460 KB)
[D] index built — ntotal=1434


In [4]:
# Load saved indexes and model for querying
from pathlib import Path
from sentence_transformers import SentenceTransformer
from scripts.build_index import load_index, search, MODEL_NAME, PROCESSED_DIR

# Embedding model used to encode queries; MODEL_NAME comes from scripts.build_index.
model = SentenceTransformer(MODEL_NAME)

# indexes: collection name -> {"index": <loaded index>, "meta": <loaded metadata>}
# missing: (collection name, expected index path) for every index file not on disk
indexes, missing = {}, []

for name in ("A", "B", "C", "D"):
    faiss_path = PROCESSED_DIR / f"{name}_index.faiss"
    if faiss_path.exists():
        loaded_index, meta_rows = load_index(name)
        indexes[name] = {"index": loaded_index, "meta": meta_rows}
        # Print vector count next to metadata row count for a quick visual check.
        print(f"[{name}] ntotal={loaded_index.ntotal}  meta={len(meta_rows)}")
    else:
        # Keep loading the other collections; the error is raised once, below.
        missing.append((name, faiss_path))

if missing:
    details = "\n".join(
        f"  [{name}] missing index: {path}" for name, path in missing
    )
    raise FileNotFoundError(
        "Indexes missing. Run the build step for these collections first.\n" + details
    )

Loading weights: 100%|██████████| 391/391 [00:00<00:00, 2115.39it/s, Materializing param=pooler.dense.weight]                               


  Loaded index: dim=1024  ntotal=592
[A] ntotal=592  meta=592
  Loaded index: dim=1024  ntotal=1450
[B] ntotal=1450  meta=1450
  Loaded index: dim=1024  ntotal=962
[C] ntotal=962  meta=962
  Loaded index: dim=1024  ntotal=1434
[D] ntotal=1434  meta=1434


In [5]:
# Example multi-collection search
TEST_QUERIES = [
    "What is the signature dish of Pamela's Diner?",
    "When was Carnegie Mellon University founded?",
    "What is the Pittsburgh city budget for 2025?",
    "Upcoming CMU events in May 2026",
    "Best brunch spots in Pittsburgh",
]
TOP_K = 5

# Run every sample query against each loaded collection and print, per query,
# the top hits as: score (4 decimals), chunk id, section (None when absent).
rule = "=" * 50
for name, store in indexes.items():
    print(f"\n{rule}  [{name}]")
    for question in TEST_QUERIES:
        hits = search(question, store["index"], store["meta"], model, top_k=TOP_K)
        print(f"Query: {question}")
        for hit in hits:
            score, chunk_id, section = hit["score"], hit["chunk_id"], hit.get("section")
            print(f"  {score:.4f}  {chunk_id}  {section}")


Query: What is the signature dish of Pamela's Diner?
  0.3638  A_wiki_Meadowcroft Rockshelter - Wikipedia__0008  Tourism and historical designations
  0.3586  A_wiki_List of Carnegie Mellon University people - Wikipedia__0023  Notable students and alumni
  0.3517  A_wiki_Carnegie Mellon University traditions - Wikipedia__0002  Alma Mater
  0.3489  A_wiki_Scotch'n'Soda Theatre - Wikipedia__0012  Notable alumni
  0.3486  A_wiki_List of Carnegie Mellon University people - Wikipedia__0057  Fictional alumni
Query: When was Carnegie Mellon University founded?
  0.6423  A_wiki_Carnegie Mellon University - Wikipedia__0000  
  0.5996  A_wiki_Mellon College of Science - Wikipedia__0001  History
  0.5858  A_wiki_Carnegie Mellon University traditions - Wikipedia__0002  Alma Mater
  0.5820  A_wiki_Carnegie Mellon School of Art - Wikipedia__0001  History
  0.5792  A_wiki_List of Carnegie Mellon University fraternities and sororities - Wikipedia__0000  
Query: What is the Pittsburgh city budget for 