# RAGAS Evaluation Notebook

This notebook demonstrates evaluating retrieval quality using [RAGAS](https://github.com/explodinggradients/ragas).
It assumes you have already ingested data (`03_ingest_index.ipynb`) and, optionally, run the comparisons (`06_end_to_end_comparison.ipynb`).

Prerequisites (one-time):
- Install: `poetry run pip install ragas datasets evaluate`
- Ensure `.env` sets `OPENAI_API_KEY`: RAGAS's LLM-based metrics (such as those used below) call OpenAI by default.


In [None]:
from __future__ import annotations

from typing import List, Dict

from ai_interviewer_pm.retrieval.vectorstore import similarity_search, build_vectorstore
from ai_interviewer_pm.ingestion.chunkers import TimestampChunker
from ai_interviewer_pm.settings import settings
from pathlib import Path

# Assemble a small demo corpus from the .vtt files found under settings.data_dir.
vtt_paths = Path(settings.data_dir).glob('*.vtt')
texts = [vtt.read_text(encoding='utf-8') for vtt in vtt_paths]
if not texts:
    print('No .vtt files in data/. Add files and re-run ingestion for meaningful RAGAS evaluation.')

# Split every text and keep chunk texts and chunk metadata in parallel lists.
chunker = TimestampChunker()
chunks, metas = [], []
for raw_text in texts:
    pieces = chunker.split(raw_text, metadata={'chunker': 'timestamp'})
    chunks += [piece.text for piece in pieces]
    metas += [piece.metadata for piece in pieces]

# Build a store only when there is something to index; otherwise it stays None.
store = build_vectorstore(chunks, metas) if chunks else None
store


In [None]:
# Build a toy QA dataset for RAGAS from retrieval hits. In a real setup, prepare
# a curated dataset instead.
# NOTE(review): no 'answer' field is set and 'ground_truth' is left empty;
# answer-based and ground-truth-based metrics presumably need real values --
# confirm against the RAGAS version in use.
queries = [
    'How to handle stakeholder conflict?',
    'How to structure an interview answer using STAR?',
]

# Each example mixes str values with a list of contexts, so the value type is
# not str.
examples: List[Dict[str, object]] = []
if store is not None:
    for q in queries:
        hits = similarity_search(store, q, k=3)
        # Each hit unpacks as a 3-tuple; only its first element is kept as context.
        contexts = [d for (d, _m, _s) in hits]
        examples.append({'question': q, 'contexts': contexts, 'ground_truth': ''})
examples


In [None]:
# RAGAS evaluation (requires the ragas and datasets packages).
# Metrics: Answer Relevancy, Faithfulness, Context Precision/Recall
# This is a best-effort demo cell: problems are printed rather than raised, with
# missing packages reported separately from failures during evaluation.
try:
    from ragas import evaluate
    from ragas.metrics import (
        answer_relevancy, faithfulness, context_precision, context_recall
    )
    import datasets as hfds

    if examples:
        ds = hfds.Dataset.from_list(examples)
        result = evaluate(
            ds,
            metrics=[answer_relevancy, faithfulness, context_precision, context_recall],
        )
        print(result)
    else:
        print('No examples assembled for RAGAS. Ensure data is ingested and queries return contexts.')
except ImportError as e:
    # A missing package is a setup problem; see the install step in the prerequisites.
    print('RAGAS dependencies not installed:', e)
except Exception as e:
    # Include the exception type: some exceptions (e.g. KeyError) have terse messages.
    print(f'RAGAS failed to run ({type(e).__name__}):', e)
