# RAG evaluation
What this notebook does:
- Runs all retrieval tests and summarizes MRR, nDCG, and keyword coverage.
- Groups retrieval quality by category.
- Runs LLM-as-judge answer evaluation (may incur model cost).

## Retrieval evaluation

In [None]:
import pandas as pd
from evaluation_func.eval import evaluate_all_retrieval, evaluate_all_answers
# Retrieval evaluation: run every retrieval test, collect one row per test,
# then report overall means and a per-category MRR breakdown.

# Column order for the per-test table. Passing it to the DataFrame constructor
# keeps the columns present even when no tests ran, so column lookups below
# cannot fail with a KeyError on an empty result set.
RETRIEVAL_COLUMNS = [
    "test_index",
    "category",
    "question",
    "mrr",
    "ndcg",
    "coverage",
    "keywords_found",
    "total_keywords",
    "progress",
]

retrieval_rows = []
# evaluate_all_retrieval() yields (test, result, progress) triples;
# test_index is the zero-based position of the test in that stream.
for test_index, (test, result, progress) in enumerate(evaluate_all_retrieval()):
    retrieval_rows.append({
        "test_index": test_index,
        "category": test.category,
        "question": test.question,
        "mrr": result.mrr,
        "ndcg": result.ndcg,
        "coverage": result.keyword_coverage,
        "keywords_found": result.keywords_found,
        "total_keywords": result.total_keywords,
        "progress": progress,
    })

retrieval_df = pd.DataFrame(retrieval_rows, columns=RETRIEVAL_COLUMNS)
print(f"Completed retrieval tests: {len(retrieval_df)}")

if retrieval_df.empty:
    # Nothing to aggregate; say so instead of failing on a missing column.
    print("No retrieval tests were run; skipping retrieval summary.")
else:
    # One-row table with the mean of each retrieval metric over all tests.
    # NOTE(review): the column is labelled "*_pct" but holds the plain mean of
    # result.keyword_coverage -- confirm keyword_coverage is already a percentage.
    retrieval_summary = pd.DataFrame({
        "mrr_mean": [retrieval_df['mrr'].mean()],
        "ndcg_mean": [retrieval_df['ndcg'].mean()],
        "coverage_mean_pct": [retrieval_df['coverage'].mean()],
    })

    # Mean MRR per test category, best-performing category first.
    category_mrr = (
        retrieval_df.groupby('category')['mrr']
        .mean()
        .reset_index(name='avg_mrr')
        .sort_values('avg_mrr', ascending=False)
    )

    display(retrieval_summary)
    display(category_mrr)
    display(retrieval_df.head())

## Answer evaluation

In [None]:
# Answer evaluation (LLM judge): run every answer test, collect one row per
# test, then report overall means and a per-category accuracy breakdown.

# Toggle answer evaluation (LLM judge). Set to False to avoid model calls/cost.
RUN_ANSWER_EVAL = True

# Column order for the per-test table. Passing it to the DataFrame constructor
# keeps the columns present even when no tests ran, so column lookups below
# cannot fail with a KeyError on an empty result set.
ANSWER_COLUMNS = [
    "test_index",
    "category",
    "question",
    "accuracy",
    "completeness",
    "relevance",
    "feedback",
    "progress",
]

if RUN_ANSWER_EVAL:
    answer_rows = []
    # evaluate_all_answers() yields (test, result, progress) triples;
    # test_index is the zero-based position of the test in that stream.
    for test_index, (test, result, progress) in enumerate(evaluate_all_answers()):
        answer_rows.append({
            "test_index": test_index,
            "category": test.category,
            "question": test.question,
            "accuracy": result.accuracy,
            "completeness": result.completeness,
            "relevance": result.relevance,
            "feedback": result.feedback,
            "progress": progress,
        })

    answer_df = pd.DataFrame(answer_rows, columns=ANSWER_COLUMNS)
    print(f"Completed answer tests: {len(answer_df)}")

    if answer_df.empty:
        # Nothing to aggregate; say so instead of failing on a missing column.
        print("No answer tests were run; skipping answer summary.")
    else:
        # One-row table with the mean of each judge score over all tests.
        answer_summary = pd.DataFrame({
            "accuracy_mean": [answer_df['accuracy'].mean()],
            "completeness_mean": [answer_df['completeness'].mean()],
            "relevance_mean": [answer_df['relevance'].mean()],
        })

        # Mean accuracy per test category, best-performing category first.
        category_accuracy = (
            answer_df.groupby('category')['accuracy']
            .mean()
            .reset_index(name='avg_accuracy')
            .sort_values('avg_accuracy', ascending=False)
        )

        display(answer_summary)
        display(category_accuracy)
        display(answer_df.head())
else:
    print("Skipping answer evaluation; set RUN_ANSWER_EVAL = True to run.")