### Embedding and Vector indexing 

In [2]:
# Load news_clean.csv and sanity-check

from pathlib import Path
import pandas as pd

print("=== Embeddings · Step A: Load & sanity-check ===")

DATA_DIR = Path("../data").resolve()
news_path = DATA_DIR / "news_clean.csv"

print(f"Looking for: {news_path}")
if not news_path.exists():
    raise FileNotFoundError(f"Could not find {news_path}. Did you run the news cleaning notebook?")

df = pd.read_csv(news_path)

print(f"Loaded rows: {len(df):,}")
print("Columns:", list(df.columns))

required = ["date", "title", "source", "doc_id"]
missing = [c for c in required if c not in df.columns]
if missing:
    raise AssertionError(f"Missing required columns: {missing}")

# Light cleanup (no mutation yet)
null_title = df["title"].isna().sum()
null_source = df["source"].isna().sum()
dupe_ids = df["doc_id"].duplicated().sum()

print(f"Null titles:  {null_title:,}")
print(f"Null sources: {null_source:,}")
print(f"Duplicate doc_id values: {dupe_ids:,}")

print("\nPreview (5 rows):")
display(df.head(5)[["date", "title", "source", "doc_id"]])


=== Embeddings · Step A: Load & sanity-check ===
Looking for: /Users/valentinreateguirangel/Documents/MSc Machine Learning/Finance_RAG_why_move/finance-rag-why-move/data/news_clean.csv
Loaded rows: 887,221
Columns: ['date', 'title', 'source', 'doc_id']
Null titles:  0
Null sources: 0
Duplicate doc_id values: 0

Preview (5 rows):


Unnamed: 0,date,title,source,doc_id
0,2020-06-01,Agilent Technologies Announces Pricing of $5……...,GuruFocus,news_0
1,2020-05-18,Agilent (A) Gears Up for Q2 Earnings: What's i...,Zacks,news_1
2,2020-05-15,J.P. Morgan Asset Management Announces Liquida...,GuruFocus,news_2
3,2020-05-15,"Pershing Square Capital Management, L.P. Buys ...",GuruFocus,news_3
4,2020-05-12,Agilent Awards Trilogy Sciences with a Golden ...,GuruFocus,news_4


In [5]:
# Load embedding model and prepare Chroma index directory

from pathlib import Path
import torch
from sentence_transformers import SentenceTransformer

print("=== Embeddings · Step B: model + paths ===")

#  Choose device
if torch.cuda.is_available():
    device = "cuda"
elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
    device = "mps"  # Apple Silicon GPU
else:
    device = "cpu"
print(f"Device selected: {device}")

#  Load model
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
print(f"Loading model: {MODEL_NAME} ...")
model = SentenceTransformer(MODEL_NAME, device=device)
print("Model loaded.")

# quick embedding smoke test (tiny)
test_vec = model.encode(["hello world"], normalize_embeddings=True)
assert test_vec.shape == (1, 384), f"Unexpected embedding shape: {test_vec.shape}"
print(f"Embedding dim: {test_vec.shape[1]} (expected 384)")

# Prepare Chroma output directory
INDEX_DIR = Path("../data/chroma_index/why-move-v1").resolve()
INDEX_DIR.mkdir(parents=True, exist_ok=True)
print(f"Chroma index directory: {INDEX_DIR}")

=== Embeddings · Step B: model + paths ===
Device selected: mps
Loading model: sentence-transformers/all-MiniLM-L6-v2 ...
Model loaded.
Embedding dim: 384 (expected 384)
Chroma index directory: /Users/valentinreateguirangel/Documents/MSc Machine Learning/Finance_RAG_why_move/finance-rag-why-move/data/chroma_index/why-move-v1


In [None]:
# Embed first 100 titles and upsert into Chroma (v0.5+)

import chromadb
from chromadb.utils import embedding_functions  # (not required, but handy later)

print("=== Test insert (clean) — 100 titles into Chroma ===")

#  Slice a safe test batch
df_test = df.head(100).copy()
assert {"title", "doc_id", "date", "source"}.issubset(df_test.columns), "Missing required columns"
df_test["title"] = df_test["title"].astype(str).str.strip()
df_test = df_test[df_test["title"] != ""]
print(f"Rows selected for test: {len(df_test)}")

#  Prepare inputs
texts = df_test["title"].tolist()
ids = [f"{i}-t" for i in df_test["doc_id"].tolist()]  # '-t' suffix to mark test inserts
metas = df_test[["date", "source", "doc_id"]].to_dict(orient="records")

#  Embed (normalize for cosine similarity)
print("Encoding test titles...")
embs = model.encode(texts, normalize_embeddings=True).tolist()
print(f"Embeddings shape: ({len(embs)}, {len(embs[0]) if embs else '??'})")

#  Connect to persistent Chroma collection
COLLECTION_NAME = "why-move-v1"
client = chromadb.PersistentClient(path=str(INDEX_DIR))
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    metadata={"hnsw:space": "cosine"},
)
print(f"Collection ready: {collection.name}")

# Upsert
collection.upsert(
    ids=ids,
    documents=texts,
    embeddings=embs,
    metadatas=metas,
)

# Confirm
count = collection.count()
print(f" Test upsert OK. Collection count now: {count}")

# Quick query sanity check
q = texts[0][:48]
res = collection.query(query_texts=[q], n_results=3)
print("\nSample query:", q)
print("Top matches:")
for t in res.get("documents", [[]])[0]:
    print(" -", (t[:80] + "...") if len(t) > 80 else t)

print("\nNote: This was a 100-row smoke test. Next we’ll batch through all rows safely.")

=== Test insert (clean) — 100 titles into Chroma ===
Rows selected for test: 100
Encoding test titles...


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Embeddings shape: (100, 384)
Collection ready: why-move-v1
✅ Test upsert OK. Collection count now: 100


/Users/valentinreateguirangel/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:10<00:00, 8.05MiB/s]
Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given



Sample query: Agilent Technologies Announces Pricing of $5…… M
Top matches:
 - Agilent Technologies Announces Pricing of $5…… Million of Senior Notes
 - Agilent Technologies (A) Surpasses Q4 Earnings and Revenue Estimates
 - Agilent Technologies (A) Earnings Expected to Grow: Should You Buy?

Note: This was a 100-row smoke test. Next we’ll batch through all rows safely.


In [8]:
#  Batch-embed all titles and upsert into Chroma (resume-friendly)

import json
import math
from datetime import datetime
from pathlib import Path
import chromadb
import numpy as np

print("=== Batch embedding & upsert into Chroma ===")

# --- Config (tweak these safely) ---
COLLECTION_NAME = "why-move-v1"
BATCH_SIZE = 1000          # adjust based on your machine (memory vs speed)
MAX_ROWS = None            # e.g., 100_000 for a partial run; None = all rows
START_IDX = 0              # set >0 to resume from a later row

# --- Input dataframe checks ---
assert {"title","doc_id","date","source"}.issubset(df.columns), "df missing required columns"
df_work = df.copy()
if MAX_ROWS is not None:
    df_work = df_work.iloc[:MAX_ROWS]

total_rows = len(df_work)
print(f"Total rows considered this run: {total_rows:,}")
assert total_rows > 0, "No rows to process."

# Clean titles and drop empties
df_work["title"] = df_work["title"].astype(str).str.strip()
df_work = df_work[df_work["title"] != ""]
total_rows = len(df_work)
print(f"Rows after stripping empty titles: {total_rows:,}")

# Make sure doc_id uniqueness
dupes = df_work["doc_id"].duplicated().sum()
if dupes:
    print(f"[WARN] Found {dupes} duplicate doc_id rows; keeping first occurrences.")
    df_work = df_work.drop_duplicates(subset=["doc_id"])
    total_rows = len(df_work)

# --- Connect Chroma persistent collection ---
client = chromadb.PersistentClient(path=str(INDEX_DIR))
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    metadata={"hnsw:space": "cosine"},
)
print(f"Collection: {collection.name}")

# --- Batching ---
num_batches = math.ceil((total_rows - START_IDX) / BATCH_SIZE) if total_rows > START_IDX else 0
print(f"Processing from index {START_IDX:,} in {num_batches} batches of {BATCH_SIZE}.")

processed = 0
for i in range(START_IDX, total_rows, BATCH_SIZE):
    j = min(i + BATCH_SIZE, total_rows)
    batch = df_work.iloc[i:j]

    texts = batch["title"].tolist()
    ids   = batch["doc_id"].tolist()
    metas = batch[["date","source","doc_id"]].to_dict(orient="records")

    # Embed — normalize for cosine similarity
    embs = model.encode(texts, normalize_embeddings=True)
    if isinstance(embs, list):
        embs = np.asarray(embs, dtype=np.float32)
    assert embs.shape[0] == len(batch), f"Embedding count mismatch at [{i}:{j}]"

    # Upsert into collection (idempotent for re-runs)
    collection.upsert(
        ids=ids,
        documents=texts,
        embeddings=embs.tolist(),
        metadatas=metas,
    )

    processed += len(batch)
    if ((i // BATCH_SIZE) + 1) % 10 == 0 or j == total_rows:
        print(f"  - Up to row {j:,} / {total_rows:,} (this run). Collection count: {collection.count()}")

print(f"\n Done. Total processed this run: {processed:,}")
print(f"Chroma collection count now: {collection.count():,}")

# --- Write a tiny manifest for reproducibility ---
manifest = {
    "collection": COLLECTION_NAME,
    "persist_directory": str(INDEX_DIR),
    "model": "sentence-transformers/all-MiniLM-L6-v2",
    "created_or_updated_at": datetime.utcnow().isoformat() + "Z",
    "batch_size": BATCH_SIZE,
    "rows_indexed_this_run": processed,
    "total_rows_input_this_run": total_rows,
    "schema": ["date","title","source","doc_id"],
}
manifest_path = Path("../data/index_manifest.json").resolve()
with open(manifest_path, "w") as f:
    json.dump(manifest, f, indent=2)
print(f"Manifest written to: {manifest_path}")

=== Batch embedding & upsert into Chroma ===
Total rows considered this run: 887,221


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Rows after stripping empty titles: 887,221
Collection: why-move-v1
Processing from index 0 in 888 batches of 1000.
  - Up to row 10,000 / 887,221 (this run). Collection count: 10100
  - Up to row 20,000 / 887,221 (this run). Collection count: 20100
  - Up to row 30,000 / 887,221 (this run). Collection count: 30100
  - Up to row 40,000 / 887,221 (this run). Collection count: 40100
  - Up to row 50,000 / 887,221 (this run). Collection count: 50100
  - Up to row 60,000 / 887,221 (this run). Collection count: 60100
  - Up to row 70,000 / 887,221 (this run). Collection count: 70100
  - Up to row 80,000 / 887,221 (this run). Collection count: 80100
  - Up to row 90,000 / 887,221 (this run). Collection count: 90100
  - Up to row 100,000 / 887,221 (this run). Collection count: 100100
  - Up to row 110,000 / 887,221 (this run). Collection count: 110100
  - Up to row 120,000 / 887,221 (this run). Collection count: 120100
  - Up to row 130,000 / 887,221 (this run). Collection count: 130100
  - Up