# Evaluation and Metrics
Run a small test set (10–20 questions) to sanity-check correctness and refusals.

### Load and/or store the model with auto-CUDA

In [48]:
import os

# Environment flags, set before any Hugging Face import runs:
#   * TRANSFORMERS_NO_TF / TRANSFORMERS_NO_FLAX -- ask Hugging Face to skip the
#     TensorFlow (TF) and Flax backends so TensorFlow is never imported.
#   * TF_CPP_MIN_LOG_LEVEL -- quiets TF logs if something still pulls it in
#     ("2" hides INFO and WARNING messages).
os.environ.update(
    {
        "TRANSFORMERS_NO_TF": "1",
        "TRANSFORMERS_NO_FLAX": "1",
        "TF_CPP_MIN_LOG_LEVEL": "2",
    }
)

In [57]:
# General imports 
import os, sys, importlib, json, faiss
from pathlib import Path

# --- Needed for evaluating test_prompts ---
import time, pandas as pd
from datetime import datetime
from sentence_transformers import SentenceTransformer

import scripts.nb01_helper as h
importlib.reload(h)

<module 'scripts.nb01_helper' from 'C:\\Users\\oneps\\Documents\\Research_Dev_Documents\\DataEden_Github\\TEPP-2-SlideHunt-Repo\\SlideHunt\\scripts\\nb01_helper.py'>

In [53]:
# Bind only the helper functions this notebook actually uses to short names
load_store, make_router, search = h.load_store, h.make_router, h.search

# Robust import of nb01_helper from scripts/ 
# That way, every notebook can import from scripts/ safely, even if it’s 
# nested in notebooks/ or being run from a different working directory.

def find_repo_root(marker_rel="scripts/nb01_helper.py", max_up=6, start=None):
    """
    Walk upward from a starting directory until we find the repo root.

    The repo root is the first directory (beginning with the starting
    directory itself) for which ``<dir>/<marker_rel>`` exists.

    Args:
        marker_rel (str): Relative path from repo root to a file that must exist.
        max_up (int): How many parent levels to search above the starting
            directory. The starting directory itself is always checked too.
        start (str | Path | None): Directory to start from. Defaults to the CWD.
    Returns:
        Resolved Path to the repo root, or the resolved starting directory
        (CWD by default) if the marker was not found.
    """
    here = (Path.cwd() if start is None else Path(start)).resolve()
    probe = here
    # max_up + 1 probes: the starting directory plus `max_up` parents above it.
    for _ in range(max_up + 1):
        if (probe / marker_rel).exists():
            return probe
        if probe.parent == probe:
            # Filesystem root reached: its parent is itself, so stop early.
            break
        probe = probe.parent
    return here

# Resolve the repo root once and put it at the front of sys.path unless it is
# already listed, so `scripts.*` can be imported from any working directory.
# NOTE(review): `import scripts.nb01_helper` appears in an earlier cell than this
# bootstrap; on a fresh kernel started outside the repo root that import would
# run before sys.path is patched -- confirm the intended cell order.
repo_root = find_repo_root()
if sys.path.count(str(repo_root)) == 0:
    sys.path[:0] = [str(repo_root)]

In [35]:
# Restore the persisted retrieval store (FAISS index + parallel lists of fact
# snippets and metadata) that was previously written by save_store().
index, facts, metas = load_store()

# Query embedder -- must be the same model as the vectors stored in FAISS,
# otherwise query and stored embeddings will not match.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Guard: the index must hold exactly one vector per fact and per metadata dict.
assert len(facts) == index.ntotal and len(metas) == index.ntotal, "Index/facts/metas are misaligned!"

# Route embeddings are computed once here for auto domain detection
# ("technical", "career", etc.).
router = make_router(model)


In [54]:
# Sanity-check prompt set fed to search() and evaluate_prompts() below.
# NOTE(review): the misspellings, sentence fragments, and informal phrasing in
# several prompts (e.g. "pahse", "Where can I find information on?") look
# intentional, to probe retriever robustness -- confirm before "fixing" them.
test_prompts = [
    "When does phase 2 begin?",
    "Any way of saying June 9th?",
    "Where can I find my instructor's email?",
    "Under Course Team Contact Information?",
    "What was the last TLAB about?",
    "An explanation of making a music recommender.",
    "What lecture slides do we review pivot tables?",
    "What lecture slides did we learn about control flow?",
    "Can you give me a bullet point list of the most important concepts covered about SQL?",
    # Same question as the first prompt, reworded and with a typo.
    "I'd like to know when pahse 2 commences, so I can prepare, thanks!",
    "Give me a summary of P2W2's material.",
    "Where did we define precision vs. recall?",
    "Explain the linear regression formula.",
    "What are the steps for PCA?",
    "What's the difference between an L1 and L2 penalty?",
    # Deliberately incomplete question (no object after "on").
    "Where can I find information on?",
    "So I want to know more about XGboost and its trade-offs with AdaBoost.",
    "Which slides describe the bias-variance trade-off (summary + cite both pages)?",
    "pls where did u all show that 'log loss' thing ??? and that \"slide w/ the blue curve comparing roc stuff—where?\"",
    # NOTE(review): presumably a refusal probe (asks about other students' data) -- confirm.
    "Can I see other students grades??"
]

In [55]:
# Run each query through the search function and print the top-k citations.
for q in test_prompts:
    scope, hits = search(
        query=q,               # natural language question
        model=model,           # sentence-transformers embedder
        index=index,           # FAISS vector index
        facts=facts,           # list of fact text snippets
        metas=metas,           # metadata dicts (course/module/item info)
        router_emb=router,     # domain router embeddings
        k=4,                   # return top-4 matches
        scope="auto"           # let router decide technical/career/all
    )

    # Print query + resolved scope
    print(f"\nQ: {q}   [scope={scope}]")

    # If no results, notify and skip
    if not hits:
        print("  (no hits)")
        continue

    # Otherwise, print out citation-style matches with score.
    # The loop variable is named `hit`, not `h`: `h` is the module alias for
    # scripts.nb01_helper, and rebinding it here would break any later
    # `h.<func>` lookup or `importlib.reload(h)` when cells are re-run.
    for hit in hits:
        m = hit["meta"]
        cite = f"{m['course_name']} > {m['module_name']} > {m['item_title']} ({m['type']})"
        if m.get("url"):  # add URL if available
            cite += f"  [{m['url']}]"
        print(f"  {hit['score']:.3f} :: {cite}")



Q: When does phase 2 begin?   [scope=all]
  0.476 :: IF '25 Data Science Cohort A > P2W12 (8/25 - 8/29) End of Phase Project Week > [TEPP] Phase 2 Portfolio Project Checkpoint #2 (Due 8/27) (Assignment)
  0.434 :: IF '25 Data Science Cohort A > P2W11 (8/18-8/22) Agents & End of Phase Project > [TEPP] Phase 2 Portfolio Project Checkpoint #1 (Due 8/20) (Assignment)
  0.426 :: IF '25 Data Science Cohort A > P2W11 (8/18-8/22) Agents & End of Phase Project > [TEPP] Phase 2 Portfolio Project (Due 8/29) (Assignment)
  0.425 :: IF '25 Data Science Cohort A > P2W1 (6/9-6/13) Python & SQL Review > 💻 W1D1 (6/9) Introduction to Phase 2 (Page)  [https://tkh.instructure.com/courses/172/pages/w1d1-6-slash-9-introduction-to-phase-2]

Q: Any way of saying June 9th?   [scope=technical]
  0.230 :: Foundations Course > Week 1: Foundations For Success (Jan. 27th-Jan. 30th) > (W1D1) After Class Assignment: Reflecting on Your Support System (Assignment)
  0.225 :: IF '25 Data Science Cohort A > P2W7 (7/21-7

### 🧪 Evaluate Test Prompts (latency + scope + citation)

In [58]:
# -------------------------------------------------------------------
# Optional: set BASE once (repo root) via env var or hardcoded fallback
# Also if modifying additional prompts or saving to csv file
# BASE is only used for saving the CSV; if unset, we fallback to CWD/outputs.
# -------------------------------------------------------------------

# Point this ONCE to the repo root (hardcoded in env var)
# Used when search() is imported from scripts.nb01_helper and the store is loaded.
#BASE = Path(os.getenv("SLIDEHUNT", r"C:\Copy The SLIDEHUNTER_BASE\From .env file\and Place Here")).resolve()

# Make sure BASE is resolvable: only when no earlier cell has defined it,
# fall back to the SLIDEHUNTER_BASE env var, or the CWD if that is unset.
if "BASE" not in globals():
    BASE = Path(os.getenv("SLIDEHUNTER_BASE", os.getcwd())).resolve()

# Evaluate a list of test queries against the search() retriever
# with auto-routing and top-1 citations.
def _format_citation(meta):
    """Build a 'course > module > item (type) [url]' string from one metadata dict."""
    cite = f"{meta['course_name']} > {meta['module_name']} > {meta['item_title']} ({meta['type']})"
    if meta.get("url"):  # URL is optional
        cite += f" [{meta['url']}]"
    return cite


def evaluate_prompts(prompts, k=4, scope="auto", save_csv=True):
    """
    Evaluate retrieval quality over a list of prompts using the `search` retriever.

    For each prompt, this:
      1) Calls `search(...)` (auto-routing by default),
      2) Records latency (seconds), chosen scope, top-1 score,
      3) Builds a human-readable citation (course > module > item [+url]),
      4) Adds a wall-clock timestamp for when the row was evaluated.

    The table and a short summary (coverage, average top-1 score, top-1 domain
    counts) are printed; an empty prompt list yields an empty table with the
    usual columns instead of raising.

    NOTE:
      - This function expects the following globals to already exist in the notebook:
        `search`, `model` (SentenceTransformer), `index` (FAISS), `facts` (List[str]),
        `metas` (List[dict]), `router` (from make_router(model)), and `BASE` (Path,
        only needed when `save_csv` is True).
      - `latency_s` is the elapsed time (seconds) for the search call, measured
        with `time.perf_counter()` (monotonic, high resolution).

    Args:
        prompts (List[str]): Test questions.
        k (int, optional): Top-k to retrieve (default 4).
        scope (str, optional): "auto", "technical", "career", or "all" (default "auto").
        save_csv (bool, optional): Save results to <BASE>/outputs/eval_prompts.csv (default True).

    Returns:
        pd.DataFrame: One row per prompt with latency, top-1 score, domain, citation, and timestamp.
    """
    # Fixed column order; passing it to DataFrame keeps the schema stable even
    # when `prompts` is empty.
    columns = ["query", "scope", "latency_s", "top_score", "top_domain", "citation", "evaluated_at"]

    rows = []
    for q in prompts:
        t0 = time.perf_counter()  # monotonic timer; wall-clock time can jump or be too coarse
        # Stateless helper; we pass in all artifacts explicitly
        sc, hits = search(
            query=q,               # natural-language question
            model=model,           # SentenceTransformers embedder (must match index build)
            index=index,           # FAISS vector index
            facts=facts,           # fact snippets (aligned with FAISS vectors)
            metas=metas,           # metadata dicts (aligned 1:1 with facts)
            router_emb=router,     # domain router embeddings
            k=k,                   # top-k results to return
            scope=scope            # "auto" lets router pick technical/career/all
        )
        dt = time.perf_counter() - t0                      # elapsed seconds for this query
        ts = datetime.now().isoformat(timespec="seconds")  # wall-clock timestamp

        # Start from the "no hits" row and overwrite the top-1 fields if we got results.
        row = {
            "query": q,
            "scope": sc,                 # scope selected (e.g., 'technical', 'career', 'all')
            "latency_s": round(dt, 3),   # elapsed time in SECONDS (the timing metric)
            "top_score": None,
            "top_domain": None,
            "citation": "(no hits)",
            "evaluated_at": ts,          # wall-clock time the row was produced
        }
        if hits:
            top = hits[0]
            meta = top["meta"]
            row.update(
                top_score=round(top["score"], 3),
                top_domain=meta.get("domain"),
                citation=_format_citation(meta),
            )
        rows.append(row)

    # Build the result table
    df = pd.DataFrame(rows, columns=columns)

    # Pretty-print the table in the notebook
    with pd.option_context("display.max_colwidth", 100, "display.width", 140):
        print(df.to_string(index=False))

    # Optionally save to CSV
    if save_csv:
        outdir = Path(BASE) / "outputs"
        outdir.mkdir(parents=True, exist_ok=True)
        df.to_csv(outdir / "eval_prompts.csv", index=False)

    # Short summary section
    has_hit = df["citation"] != "(no hits)"
    coverage = f"{has_hit.mean():.0%}" if len(df) else "n/a"
    scores = df["top_score"].dropna()
    avg_score = None if scores.empty else round(scores.mean(), 3)
    print(f"\nCoverage: {coverage}   Avg top_score: {avg_score}")
    print("By domain (top-1):")
    print(df["top_domain"].value_counts(dropna=True))

    return df

# Evaluate the full prompt set with auto-routing. The returned DataFrame is
# discarded here; the function prints the table and writes the CSV itself.
_ = evaluate_prompts(prompts=test_prompts, k=4, scope="auto", save_csv=True)

                                                                                                           query     scope  latency_s  top_score top_domain                                                                                                                                                                                                                                   citation        evaluated_at
                                                                                        When does phase 2 begin?       all      0.036      0.476  technical                                                                                      IF '25 Data Science Cohort A > P2W12 (8/25 - 8/29) End of Phase Project Week > [TEPP] Phase 2 Portfolio Project Checkpoint #2 (Due 8/27) (Assignment) 2025-08-29T18:54:11
                                                                                     Any way of saying June 9th? technical      0.022      0.230  technical                       

In [62]:
## Some EDA of outputs/eval_prompts.csv
# Read back the exact file evaluate_prompts() writes (<BASE>/outputs/eval_prompts.csv).
# A CWD-relative "../outputs/..." path only points at that file when the notebook
# happens to run from a direct child directory of BASE.
slidehunter_df = pd.read_csv(Path(BASE) / "outputs" / "eval_prompts.csv")
slidehunter_df


Unnamed: 0,query,scope,latency_s,top_score,top_domain,citation,evaluated_at
0,When does phase 2 begin?,all,0.036,0.476,technical,IF '25 Data Science Cohort A > P2W12 (8/25 - 8...,2025-08-29T18:54:11
1,Any way of saying June 9th?,technical,0.022,0.23,technical,Foundations Course > Week 1: Foundations For S...,2025-08-29T18:54:12
2,Where can I find my instructor's email?,technical,0.025,0.332,technical,IF '25 Data Science Cohort A > Fellow Resource...,2025-08-29T18:54:12
3,Under Course Team Contact Information?,all,0.02,0.444,technical,IF '25 Data Science Cohort A > Fellow Resource...,2025-08-29T18:54:12
4,What was the last TLAB about?,technical,0.023,0.334,technical,IF '25 Data Science Cohort A > P2W9 (8/4-8/8) ...,2025-08-29T18:54:12
5,An explanation of making a music recommender.,technical,0.02,0.465,technical,IF '25 Data Science Cohort A > P2W9 (8/4-8/8) ...,2025-08-29T18:54:12
6,What lecture slides do we review pivot tables?,technical,0.023,0.563,technical,IF '25 Data Science Cohort A > P1W6 (4/14-4/18...,2025-08-29T18:54:12
7,What lecture slides did we learn about control...,technical,0.022,0.389,technical,Foundations Course > Week 1: Foundations For S...,2025-08-29T18:54:12
8,Can you give me a bullet point list of the mos...,technical,0.02,0.577,technical,IF '25 Data Science Cohort A > P1W9 (5/5-5/9)...,2025-08-29T18:54:12
9,"I'd like to know when pahse 2 commences, so I ...",all,0.029,0.361,technical,IF '25 Data Science Cohort A > P2W1 (6/9-6/13)...,2025-08-29T18:54:12
