## Imports & Setup

In [1]:
import os
import sys
import math
import json
import re
from typing import List, Optional

from dotenv import load_dotenv
from pydantic import BaseModel, Field

# So "src" imports work when running from project root
sys.path.append(os.path.abspath("."))

from src.database import initialize_database
from src.retrieval import Retriever
from src.llm import LLMEngine
from src.config import CHROMA_PATH  # just to confirm path / debug
from src.config import TOP_K, RERANK_TOP_K


from llama_index.llms.groq import Groq
from llama_index.core.llms import ChatMessage

# Load variables from the project's .env file. override=True asks python-dotenv
# to let .env values replace variables already present in the environment.
load_dotenv(override=True)

# Echo the configuration this run uses so the recorded output documents it.
print("CHROMA_PATH:", CHROMA_PATH)
print(f"‚öôÔ∏è Evaluation using TOP_K={TOP_K}, RERANK_TOP_K={RERANK_TOP_K}")



CHROMA_PATH: c:\My Projects\RepoMind\data\chromadb
‚öôÔ∏è Evaluation using TOP_K=15, RERANK_TOP_K=5


## Initialize Embeddings, Index, and LLM

In [2]:
# Build the three module-level objects the rest of the notebook relies on:
# the embedding/vector-store setup, `retriever`, and `llm_engine`.
# NOTE(review): the saved run of this notebook later fails at the first retrieval with
# "Collection expecting embedding with dimension of 1024, got 384". That suggests the
# persisted Chroma collection was built with a different (1024-dim) embedding model than
# the one loaded here; confirm the configured embedding model or re-index before running.
print("üîß Initializing embedding model and vector store...")
initialize_database()  # sets global Settings.embed_model and returns vector store

# `retriever` is read as a global by fetch_context().
print("üîç Initializing Retriever (with reranker)...")
retriever = Retriever(use_reranker=True)

# `llm_engine` is read as a global by answer_question().
print("üß† Initializing LLMEngine (for RAG answers)...")
llm_engine = LLMEngine()

print("‚úÖ RAG stack ready!")


üîß Initializing embedding model and vector store...
üîÑ Loading Embedding Model: BAAI/bge-small-en-v1.5...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

‚úÖ Embedding Model Loaded.
üîç Initializing Retriever (with reranker)...
üìÇ Loading Index from c:\My Projects\RepoMind\data\chromadb...
‚ö†Ô∏è No index metadata found, creating index from vector store...
‚úÖ Index created from existing vector store
üöÄ Initializing Reranker: cross-encoder/ms-marco-MiniLM-L-6-v2...


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

üß† Initializing LLMEngine (for RAG answers)...
üß† Initializing LLM: llama-3.3-70b-versatile...
‚úÖ RAG stack ready!


## Test Definitions Loader

In [4]:
class TestQuestion(BaseModel):
    """One evaluation case, parsed from a single line of the tests JSONL file."""

    # The query sent to the retriever and the RAG pipeline.
    question: str
    # Gold answer the judge LLM compares the generated answer against.
    reference_answer: str
    # Strings looked for (case-insensitively, as substrings) in retrieved node text
    # by the retrieval metrics; optional, defaults to an empty list.
    keywords: List[str] = []
    # Optional label copied into the result tables.
    category: Optional[str] = None


def load_tests(path: str = "data/tests.jsonl") -> List[TestQuestion]:
    """
    Read a JSONL file of test cases and return them as TestQuestion objects.

    Every non-blank line must be a JSON object of the form
    {"question": "...", "reference_answer": "...", "keywords": [...], "category": "..."}.
    Lines that are empty after stripping whitespace are ignored.

    Args:
        path: location of the JSONL file.

    Returns:
        The parsed test cases, in file order.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Strip lazily so the file is still consumed one line at a time.
        stripped_lines = (raw_line.strip() for raw_line in f)
        tests: List[TestQuestion] = [
            TestQuestion.model_validate_json(payload)
            for payload in stripped_lines
            if payload
        ]

    print(f"‚úÖ Loaded {len(tests)} tests from {path}")
    return tests

# Quick sanity check: try loading the default test file. A missing file is reported
# with a warning instead of raising, so the notebook can continue defining helpers.
# Other errors (e.g. a malformed line) still propagate.
try:
    _ = load_tests()
except FileNotFoundError:
    print("‚ö†Ô∏è data/tests.jsonl not found yet ‚Äì create it before running full evaluation.")


‚úÖ Loaded 14 tests from data/tests.jsonl


## RAG Glue: Fetch Context & Answer Question

In [5]:
def fetch_context(question: str, top_k: int | None = None, use_rerank: bool | None = None):
    """
    Retrieve the nodes relevant to a question via the module-level `retriever`.

    Args:
        question: the user query, forwarded as `query_text`.
        top_k: number of initial candidates; None is passed through unchanged so the
            Retriever can apply its own default.
        use_rerank: forwarded as `rerank` (whether cross-encoder reranking is applied);
            None is passed through unchanged.

    Returns:
        Whatever `retriever.search` returns for these arguments.
    """
    return retriever.search(query_text=question, top_k=top_k, rerank=use_rerank)


def answer_question(question: str):
    """
    Run the full RAG pipeline for one question.

    Context is fetched with fetch_context() using its defaults, then handed to the
    module-level `llm_engine` together with the question.

    Returns:
        (answer, nodes): the engine's reply and the nodes it was given.
    """
    context_nodes = fetch_context(question)
    return llm_engine.chat(question, context_nodes), context_nodes


## Metrics Models

In [6]:
class RetrievalEval(BaseModel):
    """Evaluation metrics for retrieval performance on a single test question."""

    # mrr and ndcg are per-keyword scores averaged over the test's keywords
    # (see evaluate_retrieval); both are 0.0 when the test has no keywords.
    mrr: float = Field(description="Mean Reciprocal Rank - average across all keywords")
    ndcg: float = Field(description="Normalized Discounted Cumulative Gain (binary relevance)")
    # A keyword counts as found when its reciprocal rank is > 0.
    keywords_found: int = Field(description="Number of keywords found in top-k results")
    total_keywords: int = Field(description="Total number of keywords to find")
    # evaluate_retrieval stores this on a 0-100 scale, not 0-1.
    keyword_coverage: float = Field(description="Percentage of keywords found")


class AnswerEval(BaseModel):
    """LLM-as-a-judge evaluation of answer quality.

    Instances are built from the judge model's JSON reply. The 1-5 scale is only
    stated in the field descriptions; the fields are plain floats with no range
    constraint, so out-of-range scores are not rejected by validation.
    """

    # Free-text justification returned by the judge.
    feedback: str = Field(
        description="Concise feedback on the answer quality, comparing it to the reference answer and evaluating based on the retrieved context"
    )
    # Factual correctness relative to the reference answer.
    accuracy: float = Field(
        description="How factually correct is the answer compared to the reference answer? 1 (wrong. any wrong answer must score 1) to 5 (ideal - perfectly accurate)."
    )
    # Coverage of all aspects of the question.
    completeness: float = Field(
        description="How complete is the answer in addressing all aspects of the question? 1 (very poor) to 5 (ideal)."
    )
    # How directly the answer addresses the question asked.
    relevance: float = Field(
        description="How relevant is the answer to the specific question asked? 1 (very poor) to 5 (ideal)."
    )


## Node Text Helper + Retrieval Metrics

In [7]:
def get_node_text(node) -> str:
    """
    Best-effort extraction of the text carried by a retrieved node.

    Tries, in order: `node.get_content()`, the `node.text` attribute, and finally
    `str(node)`. If `get_content()` raises, the error is swallowed on purpose and the
    next option is used.
    """
    try_getter = hasattr(node, "get_content")
    if try_getter:
        try:
            return node.get_content()
        except Exception:
            # Deliberate fallback: a failing accessor must not abort the evaluation.
            try_getter = False

    return node.text if hasattr(node, "text") else str(node)


def calculate_mrr(keyword: str, retrieved_nodes: List) -> float:
    """
    Reciprocal rank of the first node whose text contains `keyword`.

    Matching is a case-insensitive substring test; ranks start at 1.
    Returns 0.0 when no node contains the keyword.
    """
    needle = keyword.lower()
    # Generator is lazy, so scanning stops at the first matching node.
    first_hit = next(
        (
            position
            for position, candidate in enumerate(retrieved_nodes, start=1)
            if needle in get_node_text(candidate).lower()
        ),
        None,
    )
    if first_hit is None:
        return 0.0
    return 1.0 / first_hit


def calculate_dcg(relevances: List[int], k: int) -> float:
    """
    Discounted Cumulative Gain over the first `k` relevance values.

    The item at 1-based rank r contributes relevance / log2(r + 1).
    Returns 0.0 for an empty list or a non-positive `k`.
    """
    total = 0.0
    # max(k, 0) keeps a negative k from turning into a "drop the tail" slice.
    # Accumulate sequentially (not via sum()) so float results stay bit-identical.
    for rank, gain in enumerate(relevances[: max(k, 0)], start=1):
        total += gain / math.log2(rank + 1)
    return total


def calculate_ndcg(keyword: str, retrieved_nodes: List, k: int = 10) -> float:
    """
    nDCG@k for one keyword with binary relevance (case-insensitive substring match).

    The ideal ordering is derived from the retrieved nodes themselves (their relevance
    flags sorted descending), so the score reflects how well the hits that were
    retrieved are ordered. Returns 0.0 when no node in the top `k` contains the keyword.
    """
    needle = keyword.lower()

    # 1 for a node containing the keyword, 0 otherwise.
    hit_flags = []
    for candidate in retrieved_nodes[:k]:
        hit_flags.append(int(needle in get_node_text(candidate).lower()))

    actual_gain = calculate_dcg(hit_flags, k)
    best_gain = calculate_dcg(sorted(hit_flags, reverse=True), k)

    if best_gain > 0:
        return actual_gain / best_gain
    return 0.0


## Single-Test Retrieval Evaluation

In [8]:
def evaluate_retrieval(
    test: TestQuestion,
    use_rerank: bool = True,
) -> RetrievalEval:
    """
    Score retrieval quality for one test question against its keywords.

    TOP_K initial candidates are always requested. The evaluation window is then:
        - use_rerank truthy: the top RERANK_TOP_K nodes after cross-encoder reranking
        - use_rerank falsy:  the top TOP_K nodes from pure vector search
    (capped at the number of nodes actually returned).

    Returns:
        RetrievalEval with MRR and nDCG averaged over the keywords, the count of
        keywords found, and coverage as a percentage (0-100). All scores are 0.0 for
        a test without keywords.
    """
    rerank_enabled = bool(use_rerank)

    # Both modes issue the same query; only the rerank flag and window size differ.
    candidates = fetch_context(
        test.question,
        top_k=TOP_K,
        use_rerank=rerank_enabled,
    )
    window_limit = RERANK_TOP_K if rerank_enabled else TOP_K
    k_eval = min(window_limit, len(candidates))
    eval_nodes = candidates[:k_eval]

    # Per-keyword scores over the evaluation window.
    reciprocal_ranks = []
    ndcg_values = []
    for term in test.keywords:
        reciprocal_ranks.append(calculate_mrr(term, eval_nodes))
        ndcg_values.append(calculate_ndcg(term, eval_nodes, k=k_eval))

    mean_rr = sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0.0
    mean_ndcg = sum(ndcg_values) / len(ndcg_values) if ndcg_values else 0.0

    # A keyword is "found" when some node in the window contains it (reciprocal rank > 0).
    found_count = len([rr for rr in reciprocal_ranks if rr > 0])
    keyword_count = len(test.keywords)
    if keyword_count > 0:
        coverage_pct = found_count / keyword_count * 100
    else:
        coverage_pct = 0.0

    return RetrievalEval(
        mrr=mean_rr,
        ndcg=mean_ndcg,
        keywords_found=found_count,
        total_keywords=keyword_count,
        keyword_coverage=coverage_pct,
    )


## Judge LLM

In [9]:
# Initialize the judge LLM (served via Groq) used by evaluate_answer().

# NOTE(review): confirm this model id against Groq's current model list before running;
# adjust if it is not available on your account.
JUDGE_MODEL_NAME = "llama-3.2-3b-instruct"  # free OSS model on Groq, adjust if needed

# The key comes from the environment (populated from .env by load_dotenv above);
# fail fast with a clear message rather than at the first judge call.
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    raise ValueError("‚ùå GROQ_API_KEY not set in environment. Add it to your .env file.")

print(f"üßë‚Äç‚öñÔ∏è Initializing Judge LLM: {JUDGE_MODEL_NAME} ...")
judge_llm = Groq(model=JUDGE_MODEL_NAME, api_key=groq_api_key)
print("‚úÖ Judge LLM ready!")


üßë‚Äç‚öñÔ∏è Initializing Judge LLM: llama-3.2-3b-instruct ...
‚úÖ Judge LLM ready!


In [10]:
# Parse the JSON object in the judge response into an AnswerEval

def parse_answer_eval_from_response(raw_content: str) -> AnswerEval:
    """
    Extract the judge's JSON object from its raw reply and parse it into AnswerEval.

    The reply is scanned left to right for the first `{` at which a complete JSON
    object can be decoded. That object alone is validated, so surrounding prose,
    markdown fences, or stray braces before/after the JSON do not break parsing.
    (A greedy `{.*}` match would instead span from the first `{` to the last `}`.)

    If no decodable object is found, the function falls back to the greedy
    first-`{`-to-last-`}` span so malformed JSON still surfaces pydantic's own
    validation error.

    Args:
        raw_content: text returned by the judge LLM.

    Returns:
        The validated AnswerEval.

    Raises:
        ValueError: if the reply contains no `{...}` span at all.
        pydantic.ValidationError: if the extracted JSON does not fit AnswerEval.
    """
    decoder = json.JSONDecoder()

    brace_at = raw_content.find("{")
    while brace_at != -1:
        tail = raw_content[brace_at:]
        try:
            decoded, consumed = decoder.raw_decode(tail)
        except json.JSONDecodeError:
            decoded = None
        if isinstance(decoded, dict):
            # Validate the exact JSON text so parsing stays on the same pydantic path.
            return AnswerEval.model_validate_json(tail[:consumed])
        brace_at = raw_content.find("{", brace_at + 1)

    # Fallback: widest brace-delimited span, handed to pydantic for error reporting.
    match = re.search(r"\{.*\}", raw_content, re.DOTALL)
    if not match:
        raise ValueError(f"Could not find JSON object in judge response:\n{raw_content}")

    return AnswerEval.model_validate_json(match.group(0))


In [11]:
# Evaluate answer quality for a single test (LLM-as-a-judge)

def evaluate_answer(test: TestQuestion) -> tuple[AnswerEval, str, list]:
    """
    Answer one test question with the RAG pipeline and have the judge LLM grade it.

    The judge is shown the question, the generated answer and the reference answer
    (the retrieved nodes are returned to the caller but are not part of the prompt).

    Returns:
        (AnswerEval, generated_answer, retrieved_nodes)
    """
    # Produce the candidate answer together with the nodes that backed it.
    generated_answer, context_nodes = answer_question(test.question)

    # Instructions that pin the judge to a bare JSON reply.
    system_prompt = (
        "You are an expert evaluator assessing the quality of answers.\n"
        "You MUST respond with a single JSON object with these keys:\n"
        "  feedback (string), accuracy (number), completeness (number), relevance (number).\n"
        "Each score is from 1 to 5. If the answer is wrong, accuracy MUST be 1.\n"
        "Do NOT include any text before or after the JSON."
    )

    # The material to grade.
    user_prompt = f"""
Question:
{test.question}

Generated Answer:
{generated_answer}

Reference Answer:
{test.reference_answer}

Please evaluate the generated answer on three dimensions:
1. Accuracy: How factually correct is it compared to the reference answer? Only give 5/5 for perfect answers.
2. Completeness: How thoroughly does it address all aspects of the question, covering all the information from the reference answer?
3. Relevance: How well does it directly answer the specific question asked, giving no additional information?

Return ONLY a JSON object with keys: feedback, accuracy, completeness, relevance.
"""

    # Ask the judge.
    judge_reply = judge_llm.chat(
        [
            ChatMessage(role="system", content=system_prompt),
            ChatMessage(role="user", content=user_prompt),
        ]
    )

    # Chat responses expose the text under .message.content; otherwise use str().
    if hasattr(judge_reply, "message"):
        judge_text = judge_reply.message.content
    else:
        judge_text = str(judge_reply)

    verdict = parse_answer_eval_from_response(judge_text)
    return verdict, generated_answer, context_nodes

# Again, quick manual check if you have tests:
# tests = load_tests()
# eval_res, gen_ans, ctx = evaluate_answer(tests[0])
# eval_res


In [12]:
# Run evaluation over all tests and aggregate results

import pandas as pd
from tqdm.auto import tqdm

def evaluate_all_retrieval(tests: List[TestQuestion], use_rerank: bool = True) -> pd.DataFrame:
    """
    Run evaluate_retrieval() on every test and collect the metrics in a DataFrame.

    Each row holds the test's position, question, category, the rerank flag used,
    and the RetrievalEval metrics. A progress bar is shown while iterating.
    """

    def _as_row(position: int, case: TestQuestion) -> dict:
        # One flat record per test; key order fixes the DataFrame column order.
        metrics = evaluate_retrieval(case, use_rerank=use_rerank)
        return {
            "index": position,
            "question": case.question,
            "category": case.category,
            "use_rerank": use_rerank,
            "mrr": metrics.mrr,
            "ndcg": metrics.ndcg,
            "keywords_found": metrics.keywords_found,
            "total_keywords": metrics.total_keywords,
            "keyword_coverage": metrics.keyword_coverage,
        }

    progress = tqdm(tests, desc=f"Retrieval Eval (rerank={use_rerank})")
    return pd.DataFrame([_as_row(pos, case) for pos, case in enumerate(progress)])



def evaluate_all_answers(tests: List[TestQuestion]) -> pd.DataFrame:
    """
    Run evaluate_answer() on every test and collect the judge's scores in a DataFrame.

    Each row holds the test's position, question, category, the three judge scores,
    the judge's feedback, and both the generated and the reference answer.
    A progress bar is shown while iterating.
    """

    def _as_row(position: int, case: TestQuestion) -> dict:
        # The retrieved nodes returned by evaluate_answer are not needed in the table.
        verdict, generated_answer, _ = evaluate_answer(case)
        return {
            "index": position,
            "question": case.question,
            "category": case.category,
            "accuracy": verdict.accuracy,
            "completeness": verdict.completeness,
            "relevance": verdict.relevance,
            "feedback": verdict.feedback,
            "generated_answer": generated_answer,
            "reference_answer": case.reference_answer,
        }

    progress = tqdm(tests, desc="Answer Eval")
    return pd.DataFrame([_as_row(pos, case) for pos, case in enumerate(progress)])

# Run both evaluations (requires data/tests.jsonl and a working vector store).
# Each test triggers retrieval and, for the answer eval, a RAG call plus a judge call.
tests = load_tests()

# Retrieval metrics with cross-encoder reranking enabled.
retrieval_reranked_df = evaluate_all_retrieval(tests, use_rerank=True)
# Optional baseline without reranking (pure vector search); currently disabled:
# retrieval_baseline_df = evaluate_all_retrieval(tests, use_rerank=False)

# LLM-as-a-judge scores for the generated answers.
answer_df = evaluate_all_answers(tests)

# Preview the first rows of both result tables (displayed as a tuple).
retrieval_reranked_df.head(), answer_df.head()

‚úÖ Loaded 14 tests from data/tests.jsonl


Retrieval Eval (rerank=True):   0%|          | 0/14 [00:00<?, ?it/s]

üîç Searching for: 'What is the main purpose of the Requests library and what problem does it solve compared to using the Python standard library?'


InvalidArgumentError: Collection expecting embedding with dimension of 1024, got 384

In [None]:
# Print summary statistics for both result tables.
# Depends on retrieval_reranked_df and answer_df from the evaluation cell above;
# if that cell raised, these names are undefined.
print("üìä Retrieval Metrics Summary")
print(retrieval_reranked_df[["mrr", "ndcg", "keyword_coverage"]].describe())


print("\nüìä Answer Metrics Summary")
print(answer_df[["accuracy", "completeness", "relevance"]].describe())
