In [None]:
# Cell 1 — Imports & metric functions
import csv
import hashlib
import math
import os
from collections import defaultdict
from typing import List, Dict, Any

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Robust doc id extractor (works with LangChain Document objects)
def _get_doc_id(doc):
    if hasattr(doc, "metadata") and isinstance(doc.metadata, dict):
        for key in ("id", "source", "doc_id", "document_id"):
            if key in doc.metadata:
                return str(doc.metadata[key])
        if "source" in doc.metadata:
            return str(doc.metadata["source"])
    if hasattr(doc, "id"):
        return str(getattr(doc, "id"))
    return str(hash(getattr(doc, "page_content", "")[:200]))

def precision_at_k(retrieved_ids: List[str], relevant_ids: set, k: int) -> float:
    """Share of the first ``k`` retrieved ids that appear in ``relevant_ids``.

    The denominator is the number of ids actually present in the top-k
    window (which may be smaller than ``k``); an empty window scores 0.0.
    """
    window = retrieved_ids[:k]
    window_size = len(window)
    if window_size == 0:
        return 0.0
    hits = [doc_id for doc_id in window if doc_id in relevant_ids]
    return len(hits) / window_size

def recall_at_k(retrieved_ids: List[str], relevant_ids: set, k: int) -> float:
    """Fraction of the relevant ids that appear among the first ``k`` results.

    Each relevant id is counted at most once. The retrieved list can contain
    the same id several times (for example when several chunks share one
    ``source`` and therefore one id); counting every occurrence would let
    recall exceed 1.0.

    Args:
        retrieved_ids: Ranked ids, best first.
        relevant_ids: Ids judged relevant for the query.
        k: Cutoff rank.

    Returns:
        A value in [0.0, 1.0]; 0.0 when there are no relevant ids.
    """
    relevant = set(relevant_ids)
    if not relevant:
        return 0.0
    found = relevant.intersection(retrieved_ids[:k])
    return len(found) / len(relevant)

def reciprocal_rank(retrieved_ids: List[str], relevant_ids: set) -> float:
    """Return 1/rank of the first relevant id (ranks start at 1), or 0.0 if none is found."""
    first_hit = (
        1.0 / rank
        for rank, doc_id in enumerate(retrieved_ids, start=1)
        if doc_id in relevant_ids
    )
    return next(first_hit, 0.0)

def dcg_at_k(retrieved_ids: List[str], rel_scores: Dict[str, float], k:int) -> float:
    """Discounted cumulative gain over the first ``k`` retrieved ids.

    Each position contributes ``(2**rel - 1) / log2(rank + 1)`` with ranks
    starting at 1; ids missing from ``rel_scores`` have relevance 0.0.
    """
    total = 0.0
    for offset, doc_id in enumerate(retrieved_ids[:k]):
        gain = 2 ** rel_scores.get(doc_id, 0.0) - 1
        # offset is zero-based, so rank + 1 == offset + 2.
        discount = math.log2(offset + 2)
        total += gain / discount
    return total

def ndcg_at_k(retrieved_ids: List[str], rel_scores: Dict[str, float], k:int) -> float:
    """Normalized DCG at ``k``: actual DCG divided by the ideal DCG.

    The ideal ranking is the ``k`` largest values of ``rel_scores`` in
    descending order. Returns 0.0 when the ideal DCG is zero.
    """
    actual = dcg_at_k(retrieved_ids, rel_scores, k)

    best_gains = sorted(rel_scores.values(), reverse=True)[:k]
    ideal = 0.0
    for offset, rel in enumerate(best_gains):
        ideal += (2 ** rel - 1) / math.log2(offset + 2)

    return actual / ideal if ideal != 0 else 0.0

def evaluate_retriever(
    vectorstore,
    test_queries: List[Dict[str,Any]],
    k_values: List[int] = [1,3,5],
    return_per_query: bool = False
):
    """Score a vector store's retrieval quality on a set of labelled queries.

    For every query the store is searched once with ``k = max(k_values)``;
    precision, recall and nDCG are then computed at each cutoff from that
    single ranked list. The reciprocal rank is computed once per query on
    the full retrieved list, so the reported ``MRR`` is the same for every k.

    Args:
        vectorstore: Object exposing ``similarity_search(query, k=...)`` that
            returns documents accepted by ``_get_doc_id``.
        test_queries: Dicts with a ``"query"`` string, an optional
            ``"relevant_docs"`` iterable of ids, and an optional
            ``"rel_scores"`` mapping of id -> graded relevance. When
            ``rel_scores`` is missing or empty, every relevant id gets
            relevance 1.0.
        k_values: Cutoff ranks to report. If empty, no search is performed
            and the summary is empty.
        return_per_query: When true, also return one metrics dict per query.

    Returns:
        ``{"summary": {k: {...averaged metrics...}}}`` and, if requested,
        ``"per_query"``: a list of per-query metric dicts.
    """
    # Snapshot the cutoffs so the shared default list is never aliased.
    cutoffs = list(k_values)
    # Loop-invariant: one search depth serves every query.
    max_k = max(cutoffs, default=0)

    def _mean(values):
        """Arithmetic mean, or 0.0 for an empty list."""
        return sum(values) / len(values) if values else 0.0

    results = {k: {"precision": [], "recall": [], "rr": [], "ndcg": []} for k in cutoffs}
    per_query = []

    for q in test_queries:
        query_text = q["query"]
        relevant_ids = set(map(str, q.get("relevant_docs", [])))
        graded = q.get("rel_scores")
        if graded:
            rel_scores = {str(d): float(s) for d, s in graded.items()}
        else:
            rel_scores = {rid: 1.0 for rid in relevant_ids}

        # With no cutoffs there is nothing to score, so skip the search.
        docs = vectorstore.similarity_search(query_text, k=max_k) if cutoffs else []
        retrieved_ids = [_get_doc_id(d) for d in docs]
        # Independent of k, so computed once per query.
        rr = reciprocal_rank(retrieved_ids, relevant_ids)

        query_metrics = {"query": query_text}
        for k in cutoffs:
            p = precision_at_k(retrieved_ids, relevant_ids, k)
            r = recall_at_k(retrieved_ids, relevant_ids, k)
            ndcg = ndcg_at_k(retrieved_ids, rel_scores, k)
            results[k]["precision"].append(p)
            results[k]["recall"].append(r)
            results[k]["rr"].append(rr)
            results[k]["ndcg"].append(ndcg)
            query_metrics[f"P@{k}"] = p
            query_metrics[f"R@{k}"] = r
            query_metrics["RR"] = rr
            query_metrics[f"nDCG@{k}"] = ndcg

        per_query.append(query_metrics)

    summary = {}
    for k in cutoffs:
        summary[k] = {
            "precision@{}".format(k): _mean(results[k]["precision"]),
            "recall@{}".format(k): _mean(results[k]["recall"]),
            "MRR": _mean(results[k]["rr"]),
            "nDCG@{}".format(k): _mean(results[k]["ndcg"]),
        }

    if return_per_query:
        return {"summary": summary, "per_query": per_query}
    return {"summary": summary}

In [2]:
# Cell 2 — Load the existing Chroma DB (or use in-memory variable)
# Reuse a `vectorstore` variable if one already exists in this kernel;
# otherwise load the persisted collection from ./chroma_db.
try:
    vectorstore  # noqa: F821
    print("Using existing 'vectorstore' variable in notebook.")
except NameError:
    # Prefer the dedicated package; fall back to the community import only
    # when that package is not installed.
    try:
        from langchain_chroma import Chroma
    except ImportError:
        from langchain_community.vectorstores import Chroma

    persist_directory = "./chroma_db"
    if os.path.exists(persist_directory):
        print("Loading Chroma from", persist_directory)
        # Use the same embedding function you used originally (HuggingFaceEmbeddings)
        try:
            from langchain_community.embeddings import HuggingFaceEmbeddings
            embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
            vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
        except Exception as exc:
            # Best-effort fallback, but say so: without an explicit embedding
            # function, queries are embedded by whatever default the Chroma
            # wrapper applies, which may not match the stored vectors.
            print(
                "WARNING: could not set up HuggingFaceEmbeddings "
                f"({type(exc).__name__}: {exc}); loading Chroma without an explicit embedding function."
            )
            vectorstore = Chroma(persist_directory=persist_directory)
        print("Chroma loaded.")
    else:
        raise RuntimeError("No 'vectorstore' variable and no './chroma_db' directory found. Run Task 3 to build it first.")

Loading Chroma from ./chroma_db


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


Chroma loaded.


In [3]:
# Cell 3 — Inspect documents stored in Chroma
# Raw document texts read from the underlying collection when the wrapper
# exposes it (private attribute, so guarded); None otherwise.
docs_all = vectorstore._collection.get()["documents"] if hasattr(vectorstore, "_collection") else None

# Pull a sample through the public search API using an empty query.
# NOTE(review): how an empty query is ranked depends on the vector store;
# treat the result as an arbitrary sample, not a complete listing.
sample_docs = vectorstore.similarity_search("", k=50)
print(f"Found {len(sample_docs)} chunks (sample). Showing id + preview:\n")
for i, d in enumerate(sample_docs):
    doc_id = _get_doc_id(d)
    preview = getattr(d, "page_content", "")[:220].replace("\n", " ")
    print(f"[{i}] id={doc_id}  preview={preview}")
    if i >= 29:  # show at most 30 entries
        break

# If sample_docs is empty, try reading splits if you have the 'splits' variable
if not sample_docs:
    print("No sample from vectorstore; checking for 'splits' variable in notebook...")
    try:
        for i, d in enumerate(splits):  # noqa: F821
            print(f"[{i}] id={_get_doc_id(d)} preview={d.page_content[:200].replace(chr(10),' ')}")
            if i >= 29:
                break
    except NameError:
        # 'splits' only exists if an earlier cell or notebook defined it.
        print("No 'splits' variable is defined in this notebook; nothing to preview.")
    except Exception as exc:
        # Still best-effort, but report the failure instead of hiding it.
        print(f"Could not preview 'splits': {type(exc).__name__}: {exc}")

Found 1 chunks (sample). Showing id + preview:

[0] id=e706b047-2538-4191-abdd-a686efc58efa  preview=Artificial Intelligence (AI) enables machines to learn from experience,     adapt to new inputs, and perform human-like tasks.


In [None]:
# Cell 4 — Prepare test queries (two ways)

# Option A: small hand-written test set (recommended for correctness).
# Each row is (name, query, relevant doc id, expected answer summary for DeepEval).
# The relevant doc id is compared against the ids produced by `_get_doc_id`
# during evaluation, so it has to match an id that is actually in the store.
test_queries_manual = [
    dict(name=name, query=query, relevant_docs=[doc_id], expected_answer=expected)
    for name, query, doc_id, expected in (
        (
            "ai_capabilities",
            "What can artificial intelligence systems do?",
            "ai_overview",
            "AI systems can learn from experience, adapt to new inputs, and perform human-like tasks such as perception, reasoning, planning, language understanding, and decision making across many domains.",
        ),
        (
            "deep_learning_definition",
            "Give a short definition of deep learning.",
            "deep_learning_intro",
            "Deep learning uses multi-layer neural networks to learn hierarchical feature representations from data and excels on tasks like vision, speech, and language when sufficient data and compute are available.",
        ),
        (
            "training_process",
            "How do neural networks learn during training?",
            "neural_network_training",
            "Neural networks learn by comparing predictions to ground truth, computing loss, backpropagating errors, and updating weights with gradient-based optimization over many epochs.",
        ),
        (
            "ml_vs_dl",
            "Differentiate machine learning and deep learning in one sentence.",
            "ml_vs_dl",
            "Machine learning covers many algorithms including supervised and unsupervised methods, while deep learning uses deep neural networks to learn end-to-end representations from raw inputs.",
        ),
    )
]

# Option B: heuristic auto-generation (quick, not perfect)
# Turns each of the first N docs into a query built from its own opening
# words, with that same doc labelled as the only relevant result.
def create_heuristic_queries_from_docs(docs, n_queries=5):
    """Build one self-referential test query per document.

    For each of the first ``n_queries`` docs, the query is the first six
    whitespace-separated words longer than two characters from
    ``page_content`` (empty string if the attribute is missing), and
    ``relevant_docs`` holds that document's id from ``_get_doc_id``.
    """
    def _short_query(doc):
        tokens = getattr(doc, "page_content", "").split()
        kept = [tok for tok in tokens if len(tok) > 2]
        return " ".join(kept[:6])

    return [
        {"query": _short_query(doc), "relevant_docs": [_get_doc_id(doc)]}
        for doc in docs[:n_queries]
    ]

# Build the heuristic set from the docs sampled in the inspection cell
# (at most 5 queries, one per sampled doc).
test_queries_auto = create_heuristic_queries_from_docs(sample_docs, n_queries=5)

# Choose which set the evaluation cell will use. `test_queries` is the name
# read by the evaluation call below.
test_queries = test_queries_manual  # <--- swap in test_queries_auto to try the heuristic set
# Print every selected query dict so the ids in "relevant_docs" can be
# checked by eye before running the evaluation.
print("Prepared test_queries (preview):")
for t in test_queries:
    print(t)

Prepared test_queries (preview):
{'query': 'What can AI do?', 'relevant_docs': ["<replace_with_doc_id_that_contains 'Artificial Intelligence' >"]}
{'query': 'Explain deep learning in short', 'relevant_docs': ['<replace_with_another_doc_id>']}


In [5]:
# Cell 5 — Evaluate
# Cutoff ranks at which precision / recall / nDCG are reported.
k_values = [1, 3, 5]
# return_per_query=True adds a "per_query" list next to the "summary" dict;
# the CSV export cell reads results["per_query"].
results = evaluate_retriever(vectorstore, test_queries, k_values=k_values, return_per_query=True)

# Averaged metrics: one block per cutoff k, four decimals per metric.
# NOTE(review): every metric is 0.0 when none of the "relevant_docs" ids
# equals an id returned by _get_doc_id for the retrieved chunks — check the
# ids printed by the inspection cell if the summary is all zeros.
print("=== SUMMARY ===")
for k, metrics in results["summary"].items():
    print(f"K={k}:")
    for metric_name, val in metrics.items():
        print(f"  {metric_name}: {val:.4f}")

# Raw per-query metric dicts, one line per query.
print("\n=== Per-query results ===")
for pq in results["per_query"]:
    print(pq)

=== SUMMARY ===
K=1:
  precision@1: 0.0000
  recall@1: 0.0000
  MRR: 0.0000
  nDCG@1: 0.0000
K=3:
  precision@3: 0.0000
  recall@3: 0.0000
  MRR: 0.0000
  nDCG@3: 0.0000
K=5:
  precision@5: 0.0000
  recall@5: 0.0000
  MRR: 0.0000
  nDCG@5: 0.0000

=== Per-query results ===
{'query': 'What can AI do?', 'P@1': 0.0, 'R@1': 0.0, 'RR': 0.0, 'nDCG@1': 0.0, 'P@3': 0.0, 'R@3': 0.0, 'nDCG@3': 0.0, 'P@5': 0.0, 'R@5': 0.0, 'nDCG@5': 0.0}
{'query': 'Explain deep learning in short', 'P@1': 0.0, 'R@1': 0.0, 'RR': 0.0, 'nDCG@1': 0.0, 'P@3': 0.0, 'R@3': 0.0, 'nDCG@3': 0.0, 'P@5': 0.0, 'R@5': 0.0, 'nDCG@5': 0.0}


In [None]:
# Cell 6 — Build a simple RAG QA helper (LangChain + OpenAI)
# Fail early with a clear message instead of at the first model call.
if not os.getenv("OPENAI_API_KEY"):
    raise EnvironmentError(
        "OPENAI_API_KEY must be set before running the QA chain or DeepEval metrics."
    )

# Retrieve the top 3 chunks per question.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# The template is assembled from adjacent string literals; the previous
# version mixed triple- and single-quote delimiters, which is a SyntaxError.
# {context} and {question} are the two template variables filled at call time.
qa_prompt = ChatPromptTemplate.from_template(
    "You are a helpful assistant. Answer the user's question using only the provided context.\n"
    "Context:\n{context}\n\nQuestion: {question}\n\n"
    "Answer in two concise sentences that stay faithful to the context."
)

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.2)
parser = StrOutputParser()


def run_rag(question: str):
    """Answer ``question`` from retrieved context.

    Fetches documents with the notebook-level ``retriever``, joins their
    ``page_content`` with blank lines as the context, and runs it through
    ``qa_prompt | llm | parser``.

    Returns:
        A ``(answer, docs)`` tuple: the parsed model output and the
        retrieved documents it was based on.
    """
    docs = retriever.invoke(question)
    chunks = [doc.page_content for doc in docs]
    chain = qa_prompt | llm | parser
    payload = {"context": "\n\n".join(chunks), "question": question}
    answer = chain.invoke(payload)
    return answer, docs


In [6]:
# Cell 7 — Save results to CSV
# Writes one row per evaluated query; columns are the metric names of the
# first row in alphabetical order.
out_csv = "rag_evaluation_per_query.csv"
per_query = results["per_query"]
if not per_query:
    print("No per-query results to save.")
else:
    keys = sorted(per_query[0])
    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        writer.writerows(per_query)
    print("Per-query results saved to", out_csv)

Per-query results saved to rag_evaluation_per_query.csv
