# 03 — Tiny Evaluation
Run a small test set (10–20 Qs) to sanity-check correctness and refusals.

## Load store & model with auto-CUDA 

In [3]:
import os

# Keep Hugging Face transformers off the TensorFlow/Flax backends so TensorFlow (TF)
# is never imported. These must be set before `transformers` is first imported.
# NOTE(review): transformers' backend detection reads USE_TF / USE_FLAX; the
# TRANSFORMERS_NO_* names are kept for backward compatibility but may be ignored
# by the library — confirm against the installed transformers version.
os.environ.update({
    "TRANSFORMERS_NO_TF": "1",
    "TRANSFORMERS_NO_FLAX": "1",
    "USE_TF": "0",
    "USE_FLAX": "0",
})

# Quiet TF's C++ logging if something still pulls it in.
# 0 = show all, 1 = hide INFO, 2 = hide INFO + WARNING, 3 = hide INFO + WARNING + ERROR
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"


In [None]:
# Small hand-written query set used to sanity-check the retriever.
# Each string is passed verbatim to evaluate_prompts() below; do not "fix" the
# wording of any prompt — oddities appear to be part of the test.
test_prompts = [
    "When does phase 2 begin?",
    "Any way of saying June 9th?",
    "Where can I find my instructor's email?",
    "Under Course Team Contact Information?",
    "What was the last TLAB about?",
    "An explanation of making a music recommender.",
    "What lecture slides do we review pivot tables?",
    "What lecture slides did we learn about control flow?",
    "Can you give me a bullet point list of the most important concepts covered about SQL?",
    "I'd like to know when pahse 2 commences, so I can prepare, thanks!",  # misspelling ("pahse") presumably intentional — typo-robustness check
    "Give me a summary of P2W2's material.",
    "Where did we define precision vs. recall?",
    "Explain the linear regression formula.",
    "What are the steps for PCA?",
    "What's the difference between an L1 and L2 penalty?",
    "Where can I find information on?",  # truncated question — presumably an incomplete-query case
    "So I want to know more about XGboost and its trade-offs with AdaBoost.",
    "Which slides describe the bias-variance trade-off (summary + cite both pages)?",
    "pls where did u all show that 'log loss' thing ??? and that \"slide w/ the blue curve comparing roc stuff—where?\"",  # noisy, informal phrasing with nested quotes
    "Can I see other students grades??"  # presumably a should-refuse / out-of-scope case — TODO confirm
]

🧪 Evaluate a list of prompts (latency + scope + citation)

In [None]:
# Evaluate prompts with auto-routing and top-1 citation
import time, pandas as pd


# Evaluate a list of test queries against the search() retriever
def evaluate_prompts(prompts, k=4, scope="auto", save_csv=True):
    """Run each prompt through ``search()`` and tabulate the top-1 result.

    For every query this records the scope returned by ``search()``, the search
    latency, and the score / domain / citation of the first hit. The table is
    printed, optionally written to ``<BASE>/outputs/eval_prompts.csv``, and
    followed by a short summary (coverage, mean top score, top-1 domain counts).

    Relies on two names from the notebook namespace: ``search(q, k=..., scope=...)``,
    which must return ``(scope, hits)`` where each hit is a dict with ``"meta"``
    and ``"score"`` keys, and ``BASE``, the directory under which ``outputs/`` is
    created when ``save_csv`` is true.

    Args:
        prompts: Iterable of query strings. May be empty.
        k: Number of hits requested from ``search()``.
        scope: Scope argument forwarded to ``search()``.
        save_csv: When true, write the table to ``<BASE>/outputs/eval_prompts.csv``.

    Returns:
        pandas.DataFrame with columns ``query``, ``scope``, ``latency_s``,
        ``top_score``, ``top_domain`` and ``citation`` (one row per prompt).
        For an empty ``prompts`` the frame has those columns and zero rows, and
        nothing is written to disk.

    Raises:
        KeyError: If the top hit's metadata lacks ``course_name``, ``module_name``,
            ``item_title`` or ``type``.
    """
    columns = ["query", "scope", "latency_s", "top_score", "top_domain", "citation"]

    rows = []
    for q in prompts:
        # perf_counter is monotonic and high-resolution; time.time() can jump
        # with wall-clock adjustments and is coarse on some platforms.
        t0 = time.perf_counter()
        sc, hits = search(q, k=k, scope=scope)   # scope actually used + ranked hits
        dt = time.perf_counter() - t0            # seconds

        # Start from the "no hits" row and fill in top-1 details when available.
        row = {
            "query": q,
            "scope": sc,
            "latency_s": round(dt, 3),
            "top_score": None,
            "top_domain": None,
            "citation": "(no hits)",
        }
        if hits:
            h0 = hits[0]
            m = h0["meta"]
            # Human-readable citation for the top hit; URL appended only if present.
            cite = f"{m['course_name']} › {m['module_name']} › {m['item_title']} ({m['type']})"
            if m.get('url'):
                cite += f" [{m['url']}]"
            # float() so numpy/torch scalar scores round to a clean Python float
            # instead of carrying float32 representation noise into the table.
            row["top_score"] = round(float(h0["score"]), 3)
            row["top_domain"] = m.get("domain")
            row["citation"] = cite
        rows.append(row)

    # Explicit columns keep the schema stable even when there are no rows.
    df = pd.DataFrame(rows, columns=columns)

    if df.empty:
        # Nothing to summarise; avoid NaN coverage and don't overwrite a
        # previous results file with an empty one.
        print("No prompts to evaluate.")
        return df

    # Pretty-print the full table.
    with pd.option_context("display.max_colwidth", 80, "display.width", 120):
        print(df.to_string(index=False))

    if save_csv:
        outdir = os.path.join(BASE, "outputs")
        os.makedirs(outdir, exist_ok=True)
        path = os.path.join(outdir, "eval_prompts.csv")
        df.to_csv(path, index=False)
        print("\n saved:", path)

    # Quick summary: share of prompts with at least one hit, and mean top-1 score.
    coverage = (df["citation"] != "(no hits)").mean()
    scores = df["top_score"].dropna()
    avg_score = None if scores.empty else round(scores.mean(), 3)
    print(f"\nCoverage: {coverage:.0%}   Avg top_score: {avg_score}")
    print("By domain (top-1):")
    print(df["top_domain"].value_counts(dropna=True))

    return df

# Evaluate the full test_prompts set, letting search() pick the scope ("auto").
# The returned DataFrame is bound to `_` so it is not echoed as cell output;
# evaluate_prompts() already prints the table and summary itself.
_ = evaluate_prompts(
    test_prompts,
    k=4,
    scope="auto",
    save_csv=True,
)
