# Notebook 03 — Embeddings (E5) & FAISS index
Goal: build one main chunk per QA (plus small formula/limits micro-chunks), embed every chunk with E5 (L2-normalized), and store the vectors in FAISS alongside Parquet metadata.

## Setup & load sidecar

Purpose: load qa_meta.jsonl, create stable row ids, and prep passage strings for E5.

In [16]:
# --- 0) Setup & load ---
from pathlib import Path
import pandas as pd
import json

# Project layout is resolved relative to the directory the notebook runs from.
NOTEBOOK_DIR = Path.cwd()
ROOT = NOTEBOOK_DIR.parent

PROCESSED_DIR = ROOT.joinpath("data", "processed")
INDEX_DIR = ROOT.joinpath("data", "index")
INDEX_DIR.mkdir(parents=True, exist_ok=True)

# The QA sidecar must already exist on disk; stop early if it does not.
SIDEcar = PROCESSED_DIR.joinpath("jsonl", "qa_meta.jsonl")
assert SIDEcar.exists(), f"Missing {SIDEcar}"

# Load one JSON record per line, then expose the 0..n-1 row position as an
# explicit "rid" column (first column of the frame).
df = (
    pd.read_json(SIDEcar, lines=True)
    .reset_index(drop=True)
    .rename_axis("rid")
    .reset_index()
)

# E5-style passage string: prefix, stripped question, newline, stripped answer.
question_text = df["question"].astype(str).str.strip()
answer_text = df["answer"].astype(str).str.strip()
df["passage"] = "passage: " + question_text + "\n" + answer_text

# Quick look at what was loaded.
preview_columns = ["rid","qa_id","competition","topic","page_start","page_end"]
print("Rows:", len(df))
print("Columns:", list(df.columns)[:10], "…")
print(df[preview_columns].head(3))

Rows: 102
Columns: ['rid', 'qa_id', 'competition', 'topic', 'stage', 'page_start', 'page_end', 'question', 'answer', 'dates'] …
   rid        qa_id competition        topic  page_start  page_end
0    0  ADRES-QG001       ADRES  eligibility          19        19
1    1  ADRES-QG002       ADRES  eligibility          19        19
2    2  ADRES-QG003       ADRES  eligibility          19        19


## Build chunks from sidecar (QA + micro-chunks)

Purpose: create meta with content strings ready for embedding (no model yet).

In [17]:
# --- 1) Build chunks (1 main chunk per QA + tiny micro-chunks) ---
import pandas as pd

def _as_list(value):
    """Return ``value`` as a list, treating missing values as empty.

    Lists, tuples and sets are copied into a list. Float NaN (which pandas
    stores when a JSON key is absent from some records), ``None`` and other
    falsy scalars become ``[]``. Any other scalar is wrapped in a one-element
    list so a lone value is not iterated character by character.
    """
    if isinstance(value, (list, tuple, set)):
        return list(value)
    if isinstance(value, float) and value != value:  # NaN is the only float != itself
        return []
    return [value] if value else []


def _number_token(item):
    """Render one entry of ``numbers`` as a single string token.

    Strings pass through unchanged, list/tuple entries are space-joined, and
    any other scalar is converted with ``str``.
    """
    if isinstance(item, str):
        return item
    if isinstance(item, (list, tuple)):
        return " ".join(map(str, item))
    return str(item)


def _chunk_record(row, rid, chunk_type, content, topic=None):
    """Build one metadata record for a chunk derived from QA row ``row``.

    ``topic`` overrides the row's own topic when given. Key order fixes the
    column order of the resulting DataFrame.
    """
    return {
        "rid": rid, "qa_id": row.qa_id, "competition": row.competition,
        "topic": row.topic if topic is None else topic,
        "stage": row.stage, "page_start": row.page_start, "page_end": row.page_end,
        "chunk_type": chunk_type, "content": content,
    }


records = []

for row in df.itertuples(index=False):
    # rid is always the position of the record in `records`, so it stays
    # aligned with the order in which chunks are later embedded.

    # main chunk: atomic Q&A
    content_main = f"Q: {str(row.question).strip()}\nA: {str(row.answer).strip()}"
    records.append(_chunk_record(row, len(records), "qa", content_main))

    # micro-chunks: formulas (BSP/equations) — in this dataset they are plain strings
    for formula in _as_list(getattr(row, "formulas", None)):
        formula_text = (formula or "").strip()
        if formula_text:
            records.append(_chunk_record(
                row, len(records), "formula", f"Formula: {formula_text}", topic="scoring"
            ))

    # micro-chunks: limits/dates (compact retrieval signals), de-duplicated and sorted
    number_tokens = sorted({_number_token(n) for n in _as_list(getattr(row, "numbers", None))})

    # dates are dicts like {"raw": "...", "iso": "..."}; keep raw for retrieval
    date_tokens = set()
    for d in _as_list(getattr(row, "dates", None)):
        if isinstance(d, dict) and "raw" in d:
            date_tokens.add(d["raw"])
        elif isinstance(d, str):
            date_tokens.add(d)

    # "numbers | dates", omitting whichever side is empty
    parts = ["; ".join(tokens) for tokens in (number_tokens, sorted(date_tokens)) if tokens]
    if parts:
        records.append(_chunk_record(row, len(records), "limits", " | ".join(parts)))

# next free rid (kept as a module-level name, equal to the number of chunks)
rid = len(records)

meta = pd.DataFrame.from_records(records)
print(len(meta), meta["chunk_type"].value_counts())

137 chunk_type
qa         102
limits      26
formula      9
Name: count, dtype: int64


## Embedding config & deps

Purpose: set model id, batch size, device; import libs; define output paths.

In [19]:
# --- 2) Embedding config & deps ---
import numpy as np
import torch, faiss
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

EMBED_MODEL_ID = "intfloat/multilingual-e5-base"  # good TR/EN quality
BATCH_SIZE = 24                                    # safe for M2/8GB; raise if it feels comfy

# Pick the fastest available backend: CUDA GPU, then Apple MPS, then CPU.
# On Apple Silicon this still resolves to "mps", as before.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"

# Output artifacts written by the build/save step.
META_PATH  = INDEX_DIR / "meta.parquet"
FAISS_PATH = INDEX_DIR / "faiss_e5.index"

print("Device:", DEVICE)
print("Model:", EMBED_MODEL_ID)
print("Batch size:", BATCH_SIZE)
print("Meta path:", META_PATH)
print("FAISS path:", FAISS_PATH)

Device: mps
Model: intfloat/multilingual-e5-base
Batch size: 24
Meta path: /Users/macbook/T3/rag-tekno/data/index/meta.parquet
FAISS path: /Users/macbook/T3/rag-tekno/data/index/faiss_e5.index


## Load E5 model + tiny probe

Purpose: make sure the model runs on your device.

In [21]:
# --- 3) Load the E5 model (multilingual), prefer Apple MPS if available ---
from sentence_transformers import SentenceTransformer

print("Device:", DEVICE)
model = SentenceTransformer(EMBED_MODEL_ID, device=DEVICE)

# Smoke test: encode two short passages and report the embedding width.
probe_texts = ["passage: test", "passage: deneme"]
_probe = model.encode(probe_texts, normalize_embeddings=True)
_, probe_dim = _probe.shape
print("Probe embeddings OK, dim:", probe_dim)

Device: mps
Probe embeddings OK, dim: 768


## Embed all chunks (with E5 prefixes)

Purpose: produce L2-normalized vectors.

In [22]:
# --- 4) Embed all chunks with required E5 prefix and L2 normalization ---
from tqdm import tqdm
import numpy as np

# Every chunk gets the "passage: " prefix before encoding.
chunk_contents = meta["content"].tolist()
texts = ["passage: " + content for content in chunk_contents]

# Encode in fixed-size slices; each call returns normalized vectors for one slice.
embeddings = []
batch_starts = range(0, len(texts), BATCH_SIZE)
for start in tqdm(batch_starts):
    stop = start + BATCH_SIZE
    batch_vectors = model.encode(
        texts[start:stop],
        normalize_embeddings=True,
        show_progress_bar=False,
    )
    embeddings.append(batch_vectors)

# Stack the per-batch arrays row-wise into one float32 matrix.
emb = np.vstack(embeddings).astype("float32")
print("Emb shape:", emb.shape)

100%|██████████| 6/6 [00:11<00:00,  1.86s/it]

Emb shape: (137, 768)





### Shape check

Purpose: sanity-check that we have exactly one vector per chunk.

In [23]:
# CHECK: one embedding row per metadata row
n_vectors = emb.shape[0]
n_chunks = len(meta)
assert n_vectors == n_chunks, "Mismatch: embeddings count != meta rows"
print("Embeddings OK:", emb.shape, "rows in meta:", n_chunks)

Embeddings OK: (137, 768) rows in meta: 137


## Build + save FAISS and metadata

Purpose: store index & metadata for retrieval.

In [24]:
# --- 5) Build FAISS index (IP on normalized vectors ≈ cosine) and persist ---
import faiss

# Flat inner-product index sized to the embedding width; the vectors were
# L2-normalized at encode time, so inner product ranks like cosine similarity.
index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb)

# Row i of the index must describe row i of `meta`; refuse to persist a
# misaligned pair.
assert index.ntotal == len(meta), "FAISS index size mismatch"

# FAISS_PATH and META_PATH are taken from the embedding config cell rather
# than re-hard-coded here, so editing the config cannot be silently overridden.
faiss.write_index(index, str(FAISS_PATH))
meta.to_parquet(META_PATH, index=False)

print("Saved index →", FAISS_PATH)
print("Saved metadata →", META_PATH)

Saved index → /Users/macbook/T3/rag-tekno/data/index/faiss_e5.index
Saved metadata → /Users/macbook/T3/rag-tekno/data/index/meta.parquet


## Reload + toy query

Purpose: verify search round-trip and metadata alignment.

In [25]:
# CHECK: round-trip the saved index and search it with an E5 'query:' string
index2 = faiss.read_index(str(FAISS_PATH))

# First line of the first chunk's content (the "Q: ..." line, prefix included).
first_content = meta.iloc[0]["content"]
sample_q = first_content.partition("\n")[0]

query_text = "query: " + sample_q
qvec = model.encode([query_text], normalize_embeddings=True).astype("float32")

# D holds the scores returned by the index (inner products for the IndexFlatIP
# built above, so higher is closer); I holds the matching row positions.
D, I = index2.search(qvec, k=5)
top_scores = D[0]
top_rows = I[0]

print("Top-5 distances:", top_scores)
print("Top-5 rids:", top_rows)
result_columns = ["qa_id","chunk_type","competition","topic"]
print(meta.iloc[top_rows][result_columns])

Top-5 distances: [0.88475597 0.85862666 0.8555093  0.83911455 0.8373387 ]
Top-5 rids: [ 0 68 99 27 90]
          qa_id chunk_type competition        topic
0   ADRES-QG001         qa       ADRES  eligibility
68     HSS-Q001         qa         HSS  eligibility
99     HSS-Q023         qa         HSS         team
27  ADRES-QG024         qa       ADRES  eligibility
90     HSS-Q018         qa         HSS         team


In [28]:
# The reloaded index must hold exactly one vector per metadata row.
n_indexed = index2.ntotal
assert n_indexed == len(meta), "FAISS index size mismatch"