# DeepEval RAG Evaluation
This notebook evaluates the existing Chroma-based RAG pipeline with DeepEval metrics.


In [1]:
from __future__ import annotations

import os
from dataclasses import dataclass
from typing import List, Dict, Any
from pathlib import Path

# Workaround for DeepEval permission issues
# DeepEval tries to create a .deepeval directory in the current working directory
# If you get permission errors, ensure you have write access to the current directory
# or run the notebook from a location where you have write permissions

# Store original working directory before any changes
_ORIGINAL_CWD = os.getcwd()
deepeval_dir = Path(".deepeval")

try:
    # Try to create and test write access to .deepeval directory
    if not deepeval_dir.exists():
        deepeval_dir.mkdir(exist_ok=True, mode=0o755)
    # Test write permissions
    test_file = deepeval_dir / ".test_write"
    test_file.write_text("test")
    test_file.unlink()
    print(f"✅ DeepEval config directory ready: {deepeval_dir.absolute()}")
except PermissionError:
    # If we can't write to current directory, try user's temp directory
    import tempfile
    temp_base = Path(tempfile.gettempdir())
    new_cwd = temp_base / "deepeval_work"
    new_cwd.mkdir(exist_ok=True, mode=0o755)
    os.chdir(new_cwd)
    print(f"⚠️  Changed working directory to: {new_cwd}")
    print(f"   (Original: {_ORIGINAL_CWD})")
    print("   DeepEval will create .deepeval here instead.")

from deepeval.evaluate import evaluate
from deepeval.metrics import (
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric,
    FaithfulnessMetric,
    AnswerRelevancyMetric,
)
from deepeval.models import GPTModel
from deepeval.test_case import LLMTestCase

from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


✅ DeepEval config directory ready: d:\cellula_NLP\task_rag\.deepeval


  from .autonotebook import tqdm as notebook_tqdm

For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  return _bootstrap._gcd_import(name[level:], package, level)


In [2]:
# Use absolute path for Chroma DB to avoid issues if working directory changes
PERSIST_DIRECTORY = str(Path(_ORIGINAL_CWD) / "chroma_db")
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
GENERATION_MODEL = "gpt-4o-mini"
DEEP_EVAL_MODEL = "gpt-4o-mini"
RETRIEVE_TOP_K = 3

# Workaround for langchain-openai version compatibility
# If you get a 'proxies' error, try: pip install --upgrade langchain-openai openai
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


@dataclass
class RagQuery:
    name: str
    query: str
    expected_answer: str
    relevant_doc_ids: List[str]


TEST_QUERIES: List[RagQuery] = [
    RagQuery(
        name="ai_capabilities",
        query="What can artificial intelligence systems do?",
        expected_answer=(
            "AI systems learn from experience, adapt to new inputs, and perform human-like tasks "
            "such as perception, reasoning, planning, language understanding, and decision making."
        ),
        relevant_doc_ids=["ai_overview"],
    ),
    RagQuery(
        name="deep_learning_definition",
        query="Give a short definition of deep learning.",
        expected_answer=(
            "Deep learning is a machine learning approach that uses multi-layer neural networks to learn "
            "hierarchical feature representations, enabling strong performance on tasks like vision, speech, and language."
        ),
        relevant_doc_ids=["deep_learning_intro"],
    ),
    RagQuery(
        name="training_process",
        query="How do neural networks learn during training?",
        expected_answer=(
            "Neural networks compare predictions to ground truth, compute loss, backpropagate errors, and update weights with gradient-based optimisation across many epochs."
        ),
        relevant_doc_ids=["neural_network_training"],
    ),
    RagQuery(
        name="ml_vs_dl",
        query="Differentiate machine learning and deep learning in one sentence.",
        expected_answer=(
            "Machine learning spans many algorithms including supervised and unsupervised methods, while deep learning relies on deep neural networks that learn end-to-end representations from raw inputs."
        ),
        relevant_doc_ids=["ml_vs_dl"],
    ),
]


In [3]:
def ensure_openai_key() -> None:
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise EnvironmentError(
            "OPENAI_API_KEY must be set in the environment before running DeepEval benchmarks.\n"
            "Set it using: os.environ['OPENAI_API_KEY'] = 'your-actual-api-key'"
        )
    # Check if it's a placeholder
    if "your_key" in api_key.lower() or "here" in api_key.lower() or len(api_key) < 20:
        raise EnvironmentError(
            f"OPENAI_API_KEY appears to be a placeholder value: '{api_key[:10]}...'\n"
            "Please set a valid API key using: os.environ['OPENAI_API_KEY'] = 'sk-...'"
        )


def load_vectorstore() -> Chroma:
    if not os.path.exists(PERSIST_DIRECTORY):
        raise FileNotFoundError(
            f"Chroma directory '{PERSIST_DIRECTORY}' not found. Run the indexing step before evaluation."
        )

    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    return Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings)


def build_rag_pipeline(vectorstore: Chroma):
    retriever = vectorstore.as_retriever(search_kwargs={"k": RETRIEVE_TOP_K})

    prompt = ChatPromptTemplate.from_template(
        """You are a helpful assistant. Answer the user's question using only the provided context.
Context:
{context}

Question: {question}

Provide a concise answer that stays strictly faithful to the given context."""
    )

    # Initialize ChatOpenAI - handle version compatibility issues
    # The 'proxies' error is due to langchain-openai/openai version mismatch
    # Try multiple initialization methods to find one that works
    llm = None
    init_methods = [
        # Method 1: Standard initialization
        lambda: ChatOpenAI(model=GENERATION_MODEL, temperature=0.2),
        # Method 2: With model_name (older versions)
        lambda: ChatOpenAI(model_name=GENERATION_MODEL, temperature=0.2),
        # Method 3: With explicit openai_api_key
        lambda: ChatOpenAI(
            model=GENERATION_MODEL,
            temperature=0.2,
            openai_api_key=os.getenv("OPENAI_API_KEY")
        ),
    ]
    
    for method in init_methods:
        try:
            llm = method()
            break
        except (TypeError, ValueError) as e:
            if "proxies" in str(e):
                # If proxies error persists, the issue is in the package versions
                # User should run: pip install --upgrade langchain-openai openai
                continue
            else:
                continue
    
    if llm is None:
        raise RuntimeError(
            "Failed to initialize ChatOpenAI. This is likely due to a version mismatch. "
            "Try running: pip install --upgrade langchain-openai openai"
        )
    
    parser = StrOutputParser()

    def run(question: str) -> Dict[str, Any]:
        docs: List[Document] = retriever.invoke(question)
        context = "\n\n".join(doc.page_content for doc in docs)
        answer = (prompt | llm | parser).invoke({"context": context, "question": question})
        return {
            "answer": answer,
            "docs": docs,
        }

    return run


def _doc_id(doc: Document) -> str:
    metadata = getattr(doc, "metadata", {}) or {}
    for key in ("id", "source", "doc_id", "document_id"):
        if key in metadata:
            return str(metadata[key])
    if hasattr(doc, "id"):
        return str(doc.id)
    return str(abs(hash(doc.page_content)) % (10**16))


def build_test_cases(generate_answer, queries: List[RagQuery]) -> List[LLMTestCase]:
    test_cases: List[LLMTestCase] = []
    for spec in queries:
        result = generate_answer(spec.query)
        docs: List[Document] = result["docs"]
        contexts = [doc.page_content for doc in docs]
        retrieved_ids = [_doc_id(doc) for doc in docs]

        test_cases.append(
            LLMTestCase(
                input=spec.query,
                actual_output=result["answer"],
                expected_output=spec.expected_answer,
                context=contexts,
                retrieval_context=contexts,
                additional_metadata={
                    "retrieved_doc_ids": retrieved_ids,
                    "expected_doc_ids": spec.relevant_doc_ids,
                    "query_name": spec.name,
                },
            )
        )
    return test_cases


def build_metrics() -> List[Any]:
    judge_model = GPTModel(model=DEEP_EVAL_MODEL)
    return [
        ContextualPrecisionMetric(model=judge_model, threshold=0.5),
        ContextualRecallMetric(model=judge_model, threshold=0.5),
        ContextualRelevancyMetric(model=judge_model, threshold=0.5),
        FaithfulnessMetric(model=judge_model, threshold=0.5),
        AnswerRelevancyMetric(model=judge_model, threshold=0.5),
    ]


In [4]:
# Set your OpenAI API key here (if not already set in environment)
# Replace 'your-api-key-here' with your actual OpenAI API key
# You can get one from: https://platform.openai.com/account/api-keys

if not os.getenv("OPENAI_API_KEY") or "your_key" in os.getenv("OPENAI_API_KEY", "").lower():
    
    # Uncomment and set your API key below:
    os.environ["OPENAI_API_KEY"] = "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"
    print("⚠️  Please set your OPENAI_API_KEY in the cell above or in your environment variables")
else:
    print("✅ OPENAI_API_KEY is set")


⚠️  Please set your OPENAI_API_KEY in the cell above or in your environment variables


In [5]:
ensure_openai_key()
vectorstore = load_vectorstore()
rag_runner = build_rag_pipeline(vectorstore)
test_cases = build_test_cases(rag_runner, TEST_QUERIES)
metrics = build_metrics()

print(f"Prepared {len(test_cases)} DeepEval test cases. Running evaluation...\n")
test_results = evaluate(
    test_cases=test_cases,
    metrics=metrics,
    run_async=False,
    show_indicator=True,
    print_results=True,
    write_cache=False,
    use_cache=False,
)

# Calculate and display metric pass rates
from collections import defaultdict
metric_counts = defaultdict(int)
metric_successes = defaultdict(int)

for result in test_results:
    for metric_data in result.metrics_data:
        metric_name = metric_data.name
        metric_counts[metric_name] += 1
        if metric_data.success:
            metric_successes[metric_name] += 1

print("\n" + "=" * 70)
print("Overall Metric Pass Rates")
print("=" * 70)
for metric_name in metric_counts:
    pass_rate = metric_successes[metric_name] / metric_counts[metric_name]
    print(f"  {metric_name}: {pass_rate:.2%} ({metric_successes[metric_name]}/{metric_counts[metric_name]})")
print("=" * 70)


  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


PermissionError: [Errno 13] Permission denied: '.deepeval'