In [None]:
# faiss-cpu is the correct package for macOS (faiss-gpu requires CUDA)
!pip install sentence-transformers faiss-cpu numpy torch

In [None]:
pip install python-dotenv huggingface_hub

In [1]:
import platform

import torch

# Environment report: host platform, installed PyTorch version, and whether
# the MPS backend is both compiled into this PyTorch build and usable on
# this machine.
mps_backend = torch.backends.mps
report_lines = [
    f"Platform : {platform.platform()}",
    f"PyTorch  : {torch.__version__}",
    f"MPS available : {mps_backend.is_available()}",
    f"MPS built     : {mps_backend.is_built()}",
]
print("\n".join(report_lines))

Platform : macOS-15.7.2-arm64-arm-64bit-Mach-O
PyTorch  : 2.10.0
MPS available : True
MPS built     : True


In [None]:
# List the contents of the default Hugging Face hub cache directory
# (presumably to check which model snapshots are already downloaded).
!ls ~/.cache/huggingface/hub

In [1]:
from scripts.build_index import build_index

# Collection to (re)build. Change this value and re-run the cell to build
# another collection; the collections used in this notebook are "A", "B",
# "C" and "D".
COLLECTION = "D"

# build_index encodes the collection's chunks and then writes:
#   data/processed/<COLLECTION>_index.faiss
#   data/processed/<COLLECTION>_index_meta.jsonl
# It returns the in-memory index and the chunks that were indexed.
# NOTE: this is the slow step (encoding collection D took ~2 minutes on MPS).
index, chunks = build_index(COLLECTION)

  from .autonotebook import tqdm as notebook_tqdm


Building FAISS index — Collection D
Model : BAAI/bge-m3
  Loaded 1434 chunks
  Loading model : BAAI/bge-m3
  Device        : MPS


Loading weights: 100%|██████████| 391/391 [00:00<00:00, 1635.89it/s, Materializing param=pooler.dense.weight]                               


  Model loaded in 5.1s
  [truncate] 171/1434 chunks truncated to 2000 chars
  Encoding 1434 chunks on MPS (batch_size=8)...


Batches: 100%|██████████| 180/180 [01:52<00:00,  1.60it/s]

  Done in 112.9s  (12.7 chunks/s)
  Embedding shape : (1434, 1024)  dtype: float32
  Vector norms    : mean=1.0000  min=1.0000  max=1.0000
  FAISS index     : IndexFlatIP  dim=1024  ntotal=1434

  → Index : data/processed/D_index.faiss  (5736 KB)
  → Meta  : data/processed/D_index_meta.jsonl  (460 KB)





In [5]:
from scripts.build_index import build_index


In [2]:
# Cell 1 — load the embedding model and every collection's index
from scripts.build_index import load_index, search
from sentence_transformers import SentenceTransformer

# Same model name that build_index reported when the indexes were built;
# query embeddings must come from that model to be comparable.
MODEL_NAME = "BAAI/bge-m3"
COLLECTIONS = ["A", "B", "C", "D"]

model = SentenceTransformer(MODEL_NAME)

# indexes[col] -> {"index": <loaded index>, "meta": <loaded metadata>}
indexes = {}
for col in COLLECTIONS:
    idx, meta = load_index(col)
    # The index and its metadata are passed to search() as a pair, so they
    # must describe the same number of vectors (presumably one metadata
    # record per vector). Fail loudly instead of relying on the printout.
    assert idx.ntotal == len(meta), (
        f"[{col}] index/meta mismatch: ntotal={idx.ntotal}, meta={len(meta)}"
    )
    indexes[col] = {"index": idx, "meta": meta}
    print(f"[{col}] ntotal={idx.ntotal}  meta={len(meta)}")

Loading weights: 100%|██████████| 391/391 [00:00<00:00, 1932.94it/s, Materializing param=pooler.dense.weight]                               


  Loaded index: dim=1024  ntotal=592
[A] ntotal=592  meta=592
  Loaded index: dim=1024  ntotal=1450
[B] ntotal=1450  meta=1450
  Loaded index: dim=1024  ntotal=962
[C] ntotal=962  meta=962
  Loaded index: dim=1024  ntotal=1434
[D] ntotal=1434  meta=1434


In [4]:
# Cell 2 — smoke-test search (replace with the queries you actually care about)
TEST_QUERIES = ["What is the signature dish of Pamela's Diner?"]
TOP_K = 5

# Run every test query against every loaded collection and print, per hit,
# the similarity score, the chunk id and the section (None when absent).
for col in indexes:
    entry = indexes[col]
    print(f"\n{'=' * 50}  [{col}]")
    for query in TEST_QUERIES:
        results = search(query, entry["index"], entry["meta"], model, top_k=TOP_K)
        for hit in results:
            print(f"  {hit['score']:.4f}  {hit['chunk_id']}  {hit.get('section')}")


  0.3638  A_wiki_Meadowcroft Rockshelter - Wikipedia__0008  Tourism and historical designations
  0.3586  A_wiki_List of Carnegie Mellon University people - Wikipedia__0023  Notable students and alumni
  0.3517  A_wiki_Carnegie Mellon University traditions - Wikipedia__0002  Alma Mater
  0.3489  A_wiki_Scotch'n'Soda Theatre - Wikipedia__0012  Notable alumni
  0.3486  A_wiki_List of Carnegie Mellon University people - Wikipedia__0057  Fictional alumni

  0.3839  B_2025_operating_budget__0813  **Pension Funding**
  0.3831  B_2025_operating_budget__1015  **Pension Funding**
  0.3828  B_2025_operating_budget__1014  **Pension Funding**
  0.3789  B_2025_operating_budget__1012  **Pension Funding**
  0.3769  B_2025_operating_budget__1011  **Pension Funding**

  0.4369  C_recurring_events_pittsburgh__0360  Theater
  0.4112  C_recurring_events_pittsburgh__0257  Other Stuff
  0.4108  C_recurring_events_pittsburgh__0292  Trivia
  0.3843  C_recurring_events_pittsburgh__0293  DJs
  0.3832  C_pgh_ev

In [11]:
# Single-query check against collection A only: print score, chunk id and
# section for the top 5 hits.
collection_a = indexes["A"]
results = search(
    "When was Carnegie Mellon University founded?",
    collection_a["index"],
    collection_a["meta"],
    model,
    top_k=5,
)

for hit in results:
    print(f"{hit['score']:.4f}  {hit['chunk_id']}  {hit.get('section')}")

0.6423  A_wiki_Carnegie Mellon University - Wikipedia__0000  
0.5996  A_wiki_Mellon College of Science - Wikipedia__0001  History
0.5858  A_wiki_Carnegie Mellon University traditions - Wikipedia__0002  Alma Mater
0.5820  A_wiki_Carnegie Mellon School of Art - Wikipedia__0001  History
0.5792  A_wiki_List of Carnegie Mellon University fraternities and sororities - Wikipedia__0000  
