In [1]:
# RAG · Retrieval sanity test for a single event date

from pathlib import Path
import pandas as pd
import chromadb

print("=== RAG · M5.A: Retrieval sanity test ===")

# --- Knobs for this run (edit, then re-run the cell) ---
QUERY_TEXT = "Apple AAPL"    # text handed to the Chroma query
TOP_K = 8                    # number of hits to ask for
TARGET_DATE = "2011-08-09"   # event day under inspection, written YYYY-MM-DD
WINDOW_DAYS = 2              # half-width of the date window: TARGET_DATE ± this many days

# --- Locations on disk ---
COLLECTION_NAME = "why-move-v1"
DATA_DIR = Path("../data").resolve()   # made absolute relative to the current working directory
INDEX_DIR = DATA_DIR.joinpath("chroma_index", "why-move-v1").resolve()

# --- Load event days (for info) ---
# Purely informational: reports whether TARGET_DATE is listed in event_days.csv.
# Every failure mode here prints a note and moves on instead of aborting the cell.
ev_path = DATA_DIR / "event_days.csv"
if ev_path.exists():
    df_ev = pd.read_csv(ev_path)
    if "Date" in df_ev.columns:
        # Stringify only the "Date" column (not the whole frame) and compare on the
        # first 10 characters, so values with a trailing time part still match YYYY-MM-DD.
        is_event_day = (df_ev["Date"].astype(str).str[:10] == TARGET_DATE).any()
        print(f"Event days loaded: {len(df_ev)} rows. Looking for {TARGET_DATE}…",
              "FOUND" if is_event_day else "not found in file")
    else:
        print(f"Event days loaded: {len(df_ev)} rows, but no 'Date' column — skipping this info check.")
else:
    print("Note: event_days.csv not found — skipping this info check.")

# --- Load cleaned news ---
news_path = DATA_DIR.joinpath("news_clean.csv")
assert news_path.exists(), f"Missing {news_path}"

# Read the text columns with pandas' "string" dtype rather than letting read_csv infer them.
text_columns = ("date", "title", "source", "doc_id")
df_news = pd.read_csv(news_path, dtype=dict.fromkeys(text_columns, "string"))
print(f"News rows: {len(df_news):,}")

# --- Build the date window list (YYYY-MM-DD strings) ---
# One entry per calendar day from TARGET_DATE - WINDOW_DAYS to TARGET_DATE + WINDOW_DAYS, inclusive.
td = pd.to_datetime(TARGET_DATE)
half_width = pd.Timedelta(days=WINDOW_DAYS)
window = pd.date_range(start=td - half_width, end=td + half_width, freq="D")
window_str = [day.strftime("%Y-%m-%d") for day in window]
print(f"Date window ({-WINDOW_DAYS}..+{WINDOW_DAYS}): {window_str}")

# --- Quick candidate count in CSV (sanity only) ---
# Exact string match of the "date" column against the window days.
in_window = df_news["date"].isin(window_str)
candidates = df_news.loc[in_window]
print(f"CSV candidates in window: {len(candidates):,}")

# --- Connect to Chroma persistent collection ---
# get_or_create_collection does not fail when the collection is absent: it hands back
# an empty one. Remember whether the index directory was already there, and check the
# item count, so that a wrong path or an unbuilt index is reported here instead of
# surfacing later as an unexplained "0 results".
index_preexisting = INDEX_DIR.exists()
client = chromadb.PersistentClient(path=str(INDEX_DIR))
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    metadata={"hnsw:space": "cosine"},
)
if collection.count() == 0:
    reason = ("the collection holds no items" if index_preexisting
              else "the index directory did not exist before connecting")
    print(f"Warning: collection '{COLLECTION_NAME}' at {INDEX_DIR} is empty ({reason}) — "
          "queries below will return 0 results.")

# --- Query Chroma with a metadata filter on 'date' (within window) ---
def do_query(window_list, k):
    """Query the Chroma collection for QUERY_TEXT, restricted to the given dates.

    Args:
        window_list: Date strings to accept; matched against the ``date``
            metadata field using the ``$in`` operator.
        k: Desired number of hits. At most 20 are requested per call.

    Returns:
        ``None`` when ``window_list`` is empty; otherwise the value returned
        by ``collection.query``.
    """
    if len(window_list) != 0:
        request = {
            "query_texts": [QUERY_TEXT],
            "n_results": min(k, 20),  # guard: don't request more than 20 at once
            "where": {"date": {"$in": window_list}},
        }
        return collection.query(**request)
    return None

res = do_query(window_str, TOP_K)

# --- Fallbacks if nothing found in a tight window ---
def _has_hits(result):
    """Return True when a query result holds at least one id for the first query text."""
    if not result:
        return False
    ids = result.get("ids") or [[]]
    return len(ids) > 0 and len(ids[0]) > 0

# Retry with progressively wider windows (±5, then ±7) until something comes back.
# A stage is skipped when it is not strictly wider than what was already searched,
# so a large WINDOW_DAYS is never "widened" into a narrower or identical window,
# and the message always reports the window that actually came up empty.
searched_days = WINDOW_DAYS
for wider_days in (5, 7):
    if _has_hits(res):
        break
    if wider_days <= searched_days:
        continue
    print(f"0 candidates in ±{searched_days} days → widening to ±{wider_days}…")
    window = pd.date_range(td - pd.Timedelta(days=wider_days),
                           td + pd.Timedelta(days=wider_days), freq="D")
    window_str = window.strftime("%Y-%m-%d").tolist()
    res = do_query(window_str, TOP_K)
    searched_days = wider_days

# --- Pretty preview ---
# Only one query text was sent, so the hits for it sit at index 0 of each result field.
if res:
    docs = res.get("documents", [[]])[0]
    metas = res.get("metadatas", [[]])[0]
    scores = res.get("distances", [[]])[0]  # distances: for cosine space, smaller means closer
else:
    docs, metas, scores = [], [], []

print(f"\nRetrieved: {len(docs)} (top-{TOP_K}) in window {window_str[0]} .. {window_str[-1]}")
if len(docs) > 0:
    # One record per hit; long document text is clipped to 120 characters for display.
    rows = [
        {
            "date": meta.get("date", ""),
            "source": meta.get("source", ""),
            "title": doc if len(doc) <= 120 else doc[:120] + "…",
            "approx_score": round(float(dist), 4) if isinstance(dist, (int, float)) else dist,
        }
        for meta, doc, dist in zip(metas, docs, scores)
    ]
    df_prev = pd.DataFrame(rows)
    df_prev = df_prev.sort_values(["date", "approx_score"]).reset_index(drop=True)
    display(df_prev)
else:
    print("No results — try a larger window or different QUERY_TEXT.")

print("\n Retrieval sanity test done. If this looks good, next we’ll wire this into the LLM JSON step.")

=== RAG · M5.A: Retrieval sanity test ===
Event days loaded: 145 rows. Looking for 2011-08-09… FOUND


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


News rows: 887,221
Date window (-2..+2): ['2011-08-07', '2011-08-08', '2011-08-09', '2011-08-10', '2011-08-11']
CSV candidates in window: 383


Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given



Retrieved: 8 (top-8) in window 2011-08-07 .. 2011-08-11


Unnamed: 0,date,source,title,approx_score
0,2011-08-09,Zacks,Smartphones Drive U.S. Cellular 2Q - Analyst Blog,0.695
1,2011-08-09,Zacks,StanCorp Financial (SFG) - Bear of the Day,0.7248
2,2011-08-10,webmaster,Nokia's U.S. N9 Plans; AOL Tanks,0.6158
3,2011-08-10,webmaster,"5 Stocks to Watch: Bank of America, Cisco",0.7232
4,2011-08-11,webmaster,AOL Plans $250 Million Stock Buyback,0.6997
5,2011-08-11,Zacks,TV on Tablets - A Reality - Analyst Blog,0.7083
6,2011-08-11,webmaster,AOL to Buy Back $250M in Stock,0.72
7,2011-08-11,Zacks,Activision Promotes PROTOTYPE 2 - Analyst Blog,0.7206



 Retrieval sanity test done. If this looks good, next we’ll wire this into the LLM JSON step.


In [2]:
import os

# Report only whether the variable is set; the key's value is never printed.
key_present = os.environ.get("OPENAI_API_KEY") is not None
print("Key visible to Python?", key_present)

Key visible to Python? False
