In [1]:
from pathlib import Path
import yaml
with open("config.yml", 'r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)
path_fname = cfg['path_fname']
fname = cfg['fname']
output_dir = cfg['output_dir']

In [2]:
# Lod chunks from json file
import json
with open("Documents\\SORA_chunks_cleaned_manual.json", 'r', encoding='utf-8') as f:
    chunks = json.load(f)
print(f"Number of chunks: {len(chunks)}")

Number of chunks: 120


In [3]:
from PreProcessing.embeddingToolsFAISSv2 import EmbeddingToolFAISS
embedder = EmbeddingToolFAISS( output_dir=Path(output_dir), index_backend="faiss")
embeddings = embedder.load_index()


  from .autonotebook import tqdm as notebook_tqdm


✅ Loaded FAISS index with 120 vectors from PreProcessing\ProcessedFiles\index\faiss.index
   • Loaded metadata for 120 documents from PreProcessing\ProcessedFiles\index\docs.json


In [4]:
from RAG.ragv2 import RAG
rag_system = RAG(embedding_tool=embedder, chunks=chunks, default_mode="hybrid", reranker="colbert")

In [5]:
from LLM.LLM_openAI_Chatbot import LLMChatbot 

llm = LLMChatbot(rag_system=rag_system)

In [6]:
skysafe_questions = [
    {
        "chunk_index": 10,
        "direct_match": "How is the intrinsic ground risk class defined in Step 2 of the SORA process, and what type of hazard does it address?",
        "synonym_paraphrase": "How is the intrinsic ground risk characterized with respect to the likelihood of a person being struck following a loss of control, assuming a reasonable level of safety?",
        "reworked_question": "When an applicant prepares a SORA for a new UAS operation, what elements must be identified and combined to establish the unmitigated ground risk before any risk reduction measures are applied?"
    },
    {
        "chunk_index": 21,
        "direct_match": "How is the intrinsic unmanned aircraft system ground risk class determined based on maximum UAS characteristic dimension, typical kinetic energy, and operational scenario?",
        "synonym_paraphrase": "How is the intrinsic ground risk characterized using the largest UAS dimension, the expected kinetic energy, and the type of operation being conducted?",
        "reworked_question": "If an applicant is assessing a UAS with a maximum characteristic dimension of 3 metres and typical kinetic energy below 34 kilojoules, how does the intrinsic ground risk class differ between visual line of sight and beyond visual line of sight operations over sparsely populated and populated areas?"
    },
    {
        "chunk_index": 80,
        "direct_match": "How is air risk strategic mitigation defined, and what are the two main categories into which strategic mitigations are divided?",
        "synonym_paraphrase": "How is strategic air risk mitigation characterized in terms of procedures and operational limits applied before take off, and how are operator controlled measures distinguished from authority established structures and rules?",
        "reworked_question": "When preparing an operation in airspace with potential manned aircraft encounters, what types of pre flight restrictions or shared airspace frameworks can be used to reduce encounter rates or limit exposure time, and which of these are under the control of the UAS operator?"
    },
    {
        "chunk_index": 113,
        "direct_match": "What is the scope of the Predefined Risk Assessment PDRA G01 Version 1.3 in terms of aircraft characteristics, type of operation, and airspace?",
        "synonym_paraphrase": "How is the scope of PDRA G01 characterized with respect to maximum UAS size, expected kinetic energy, operational mode, and airspace conditions?",
        "reworked_question": "If an operator plans a beyond visual line of sight operation using a UAS with a maximum characteristic dimension of 3 metres and typical kinetic energy below 34 kilojoules, what operational, altitude, and airspace constraints must be met for the operation to fall within the scope of PDRA G01?"
    }
]


In [7]:
import json
import os
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple, Any
from collections import defaultdict

# ==================== RETRIEVAL METRICS ====================

def compute_retrieval_metrics(
    expected_chunk_id: int,
    retrieved_chunks: List[Dict],
    k_values: List[int] = [1, 3, 5, 10]
) -> Dict[str, Any]:
    """
    Compute retrieval metrics for a single query.
    
    Returns:
        - hit_at_k: dict of {k: bool} indicating if expected chunk is in top-k
        - mrr: reciprocal rank of expected chunk (0 if not found)
        - rank: actual rank of expected chunk (None if not found)
    """
    # Extract chunk indices from retrieved results
    retrieved_ids = [chunk.get("chunk_index") for chunk in retrieved_chunks]
    
    # Find rank of expected chunk
    rank = None
    for i, chunk_id in enumerate(retrieved_ids):
        if chunk_id == expected_chunk_id:
            rank = i + 1  # 1-indexed
            break
    
    # Compute hit@k for various k values
    hit_at_k = {}
    for k in k_values:
        hit_at_k[f"hit@{k}"] = rank is not None and rank <= k
    
    # Compute MRR
    mrr = 1.0 / rank if rank is not None else 0.0
    
    return {
        "expected_chunk_id": expected_chunk_id,
        "rank": rank,
        "mrr": mrr,
        **hit_at_k
    }


# ==================== GROUNDING ASSESSMENT ====================

def assess_grounding_with_llm(
    question: str,
    answer: str,
    retrieved_chunks: List[Dict],
    expected_chunk_id: int,
    llm_client
) -> Dict[str, Any]:
    """
    Use LLM to assess if the answer is grounded in retrieved evidence.
    
    Categories:
        - SUPPORTED: All factual statements can be linked to retrieved evidence
        - UNSUPPORTED: At least one factual claim is not in retrieved context
        - INCOMPLETE: Answer avoids claims but fails to answer despite evidence present
    """
    # Build context from retrieved chunks
    context_parts = []
    for chunk in retrieved_chunks[:10]:  # Limit context for assessment
        idx = chunk.get("chunk_index")
        text = chunk.get("chunk_text", "")[:500]  # Truncate for assessment
        context_parts.append(f"[Chunk {idx}]: {text}")
    
    context_str = "\n\n".join(context_parts)
    
    assessment_prompt = f"""You are an answer grounding assessor. Evaluate if the given answer is properly grounded in the retrieved evidence.

QUESTION:
{question}

ANSWER TO ASSESS:
{answer}

RETRIEVED EVIDENCE:
{context_str}

EXPECTED CHUNK ID: {expected_chunk_id}

Classify the answer into exactly one category:
1. SUPPORTED - All factual statements in the answer can be traced to the retrieved evidence
2. UNSUPPORTED - At least one factual claim in the answer cannot be found in the retrieved evidence
3. INCOMPLETE - The answer avoids making claims or is too vague, despite relevant evidence being present

Also check if the expected chunk (ID: {expected_chunk_id}) was actually used to support the answer.

Respond in this exact JSON format:
{{
    "grounding_category": "SUPPORTED" or "UNSUPPORTED" or "INCOMPLETE",
    "expected_chunk_used": true or false,
    "reasoning": "brief explanation"
}}"""

    try:
        response = llm_client.client.chat.completions.create(
            model=llm_client.model_name,
            messages=[{"role": "user", "content": assessment_prompt}],
            max_tokens=500,
            temperature=0.1
        )
        
        response_text = response.choices[0].message.content.strip()
        
        # Parse JSON response
        import re
        json_match = re.search(r'\{[^}]+\}', response_text, re.DOTALL)
        if json_match:
            result = json.loads(json_match.group())
            return {
                "grounding_category": result.get("grounding_category", "UNKNOWN"),
                "expected_chunk_used": result.get("expected_chunk_used", False),
                "reasoning": result.get("reasoning", "")
            }
    except Exception as e:
        print(f"Grounding assessment error: {e}")
    
    return {
        "grounding_category": "UNKNOWN",
        "expected_chunk_used": False,
        "reasoning": "Assessment failed"
    }


def simple_grounding_heuristic(
    answer: str,
    retrieved_chunks: List[Dict],
    expected_chunk_id: int
) -> Dict[str, Any]:
    """
    Simple heuristic-based grounding assessment (faster, no LLM call).
    
    Checks:
    - If answer references chunk indices
    - If expected chunk text appears to be used
    - Basic answer quality indicators
    """
    answer_lower = answer.lower()
    
    # Check if expected chunk is in retrieved results
    expected_in_retrieved = any(
        chunk.get("chunk_index") == expected_chunk_id 
        for chunk in retrieved_chunks
    )
    
    # Get expected chunk text for overlap check
    expected_chunk_text = ""
    for chunk in retrieved_chunks:
        if chunk.get("chunk_index") == expected_chunk_id:
            expected_chunk_text = chunk.get("chunk_text", "").lower()
            break
    
    # Simple keyword overlap between answer and expected chunk
    if expected_chunk_text:
        answer_words = set(answer_lower.split())
        chunk_words = set(expected_chunk_text.split())
        # Remove common stop words
        stop_words = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 
                      'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
                      'would', 'could', 'should', 'may', 'might', 'must', 'shall',
                      'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from',
                      'and', 'or', 'but', 'if', 'then', 'else', 'when', 'where',
                      'that', 'this', 'these', 'those', 'it', 'its'}
        answer_words -= stop_words
        chunk_words -= stop_words
        
        overlap = len(answer_words & chunk_words)
        overlap_ratio = overlap / len(answer_words) if answer_words else 0
    else:
        overlap_ratio = 0
    
    # Heuristic classification
    if not expected_in_retrieved:
        category = "UNSUPPORTED"  # Expected chunk not even retrieved
    elif overlap_ratio > 0.3:
        category = "SUPPORTED"  # Good overlap with expected chunk
    elif len(answer.split()) < 20:
        category = "INCOMPLETE"  # Very short answer
    elif "don't know" in answer_lower or "cannot" in answer_lower or "no information" in answer_lower:
        category = "INCOMPLETE"
    else:
        category = "UNSUPPORTED"  # Low overlap, likely using other sources
    
    return {
        "grounding_category": category,
        "expected_chunk_used": expected_in_retrieved and overlap_ratio > 0.2,
        "overlap_ratio": round(overlap_ratio, 3),
        "expected_in_retrieved": expected_in_retrieved
    }


# ==================== EVALUATION RUNNER ====================

class EvaluationRunner:
    """
    Runs evaluation on RAG pipeline for skysafe_questions.
    Saves results per chunk and skips already processed chunks.
    """
    
    QUESTION_VARIANTS = ["direct_match", "synonym_paraphrase", "reworked_question"]
    K_VALUES = [1, 3, 5, 10]
    
    def __init__(
        self,
        llm,
        rag_system,
        use_llm_grounding: bool = False,
        top_k: int = 50,
        ce_keep_k: int = 10,
        output_dir: str = "Results/RegulatoryAssistant"
    ):
        self.llm = llm
        self.rag_system = rag_system
        self.use_llm_grounding = use_llm_grounding
        self.top_k = top_k
        self.ce_keep_k = ce_keep_k
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.results = []
    
    def _get_chunk_result_filename(self, chunk_id: int) -> Path:
        """Generate filename for a specific chunk's results."""
        return self.output_dir / f"chunk_{chunk_id}_results.json"
    
    def _is_chunk_processed(self, chunk_id: int) -> bool:
        """Check if a chunk has already been processed."""
        return self._get_chunk_result_filename(chunk_id).exists()
    
    def _load_chunk_results(self, chunk_id: int) -> List[Dict]:
        """Load previously saved results for a chunk."""
        filepath = self._get_chunk_result_filename(chunk_id)
        if filepath.exists():
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
                return data.get("results", [])
        return []
    
    def _save_chunk_results(self, chunk_id: int, chunk_results: List[Dict], question_set: Dict) -> None:
        """Save results for a single chunk to its own file."""
        filepath = self._get_chunk_result_filename(chunk_id)
        
        # Compute chunk-level metrics
        chunk_metrics = self._compute_chunk_metrics(chunk_results)
        
        data = {
            "chunk_id": chunk_id,
            "timestamp": datetime.now().isoformat(),
            "config": {
                "top_k": self.top_k,
                "ce_keep_k": self.ce_keep_k,
                "use_llm_grounding": self.use_llm_grounding
            },
            "question_set": question_set,
            "results": chunk_results,
            "chunk_metrics": chunk_metrics
        }
        
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        
        print(f"    ✓ Saved results to {filepath.name}")
    
    def _compute_chunk_metrics(self, chunk_results: List[Dict]) -> Dict[str, Any]:
        """Compute metrics for a single chunk's results."""
        if not chunk_results:
            return {}
        
        metrics = {
            "num_variants": len(chunk_results),
            "retrieval": {},
            "grounding": {}
        }
        
        # Retrieval metrics
        for k in self.K_VALUES:
            hits = sum(1 for r in chunk_results if r["retrieval"].get(f"hit@{k}"))
            metrics["retrieval"][f"hit@{k}"] = hits
            metrics["retrieval"][f"hit@{k}_pct"] = round(hits / len(chunk_results) * 100, 1)
        
        mrr_sum = sum(r["retrieval"].get("mrr", 0) for r in chunk_results)
        metrics["retrieval"]["mrr"] = round(mrr_sum / len(chunk_results), 3)
        
        # Grounding metrics
        for cat in ["SUPPORTED", "UNSUPPORTED", "INCOMPLETE", "UNKNOWN"]:
            count = sum(1 for r in chunk_results if r["grounding"].get("grounding_category") == cat)
            metrics["grounding"][cat] = count
        
        expected_used = sum(1 for r in chunk_results if r["grounding"].get("expected_chunk_used"))
        metrics["grounding"]["expected_chunk_used"] = expected_used
        
        return metrics
    
    def run_single_query(
        self,
        question: str,
        expected_chunk_id: int,
        variant_type: str,
        query_set_index: int
    ) -> Dict[str, Any]:
        """Run RAG pipeline for a single query and compute metrics."""
        
        print(f"    [{variant_type}] {question[:70]}...")
        
        # Execute RAG pipeline
        response = self.llm.answer(
            question,
            chat_history=[],
            top_k=self.top_k,
            ce_keep_k=self.ce_keep_k,
            stream=False,
            print_sources=False,
            print_prompt=False,
            reasoning_effort='medium'
        )
        
        answer = response.get("answer", "")
        retrieved_chunks = response.get("chunks", [])
        sources = response.get("sources", [])
        
        # Compute retrieval metrics
        retrieval_metrics = compute_retrieval_metrics(
            expected_chunk_id,
            retrieved_chunks,
            k_values=self.K_VALUES
        )
        
        # Compute grounding metrics
        if self.use_llm_grounding:
            grounding_metrics = assess_grounding_with_llm(
                question, answer, retrieved_chunks, expected_chunk_id, self.llm
            )
        else:
            grounding_metrics = simple_grounding_heuristic(
                answer, retrieved_chunks, expected_chunk_id
            )
        
        result = {
            "query_set_index": query_set_index,
            "variant_type": variant_type,
            "question": question,
            "expected_chunk_id": expected_chunk_id,
            "answer": answer,
            "num_retrieved": len(retrieved_chunks),
            "sources": sources,
            "retrieval": retrieval_metrics,
            "grounding": grounding_metrics,
            "retrieved_chunk_ids": [c.get("chunk_index") for c in retrieved_chunks[:10]]
        }
        
        return result
    
    def run_evaluation(self, questions: List[Dict]) -> List[Dict]:
        """Run full evaluation on all questions and variants, skipping already processed chunks."""
        
        print(f"\n{'='*60}")
        print(f"Starting Evaluation: {len(questions)} question sets x {len(self.QUESTION_VARIANTS)} variants")
        print(f"Output directory: {self.output_dir}")
        print(f"{'='*60}\n")
        
        self.results = []
        processed_count = 0
        skipped_count = 0
        
        for i, q_set in enumerate(questions):
            expected_chunk_id = q_set.get("chunk_index")
            
            # Check if already processed
            if self._is_chunk_processed(expected_chunk_id):
                print(f"[{i+1}/{len(questions)}] Chunk {expected_chunk_id}: SKIPPED (already processed)")
                # Load existing results
                existing_results = self._load_chunk_results(expected_chunk_id)
                self.results.extend(existing_results)
                skipped_count += 1
                continue
            
            print(f"\n[{i+1}/{len(questions)}] Processing Chunk {expected_chunk_id}")
            print("-" * 50)
            
            chunk_results = []
            
            for variant in self.QUESTION_VARIANTS:
                question = q_set.get(variant)
                if question:
                    result = self.run_single_query(
                        question, expected_chunk_id, variant, i
                    )
                    chunk_results.append(result)
                    self.results.append(result)
            
            # Save results for this chunk immediately
            self._save_chunk_results(expected_chunk_id, chunk_results, q_set)
            processed_count += 1
        
        print(f"\n{'='*60}")
        print(f"Evaluation Complete!")
        print(f"  - Processed: {processed_count} chunks")
        print(f"  - Skipped (already done): {skipped_count} chunks")
        print(f"  - Total results: {len(self.results)} queries")
        print(f"{'='*60}\n")
        
        return self.results
    
    def load_all_results(self, questions: List[Dict]) -> List[Dict]:
        """Load all results from saved chunk files."""
        all_results = []
        for q_set in questions:
            chunk_id = q_set.get("chunk_index")
            chunk_results = self._load_chunk_results(chunk_id)
            all_results.extend(chunk_results)
        self.results = all_results
        return all_results
    
    def compute_aggregate_metrics(self) -> Dict[str, Any]:
        """Compute aggregate metrics grouped by question variant."""
        
        metrics_by_variant = defaultdict(lambda: {
            "count": 0,
            "hit@1": 0, "hit@3": 0, "hit@5": 0, "hit@10": 0,
            "mrr_sum": 0.0,
            "grounding": {"SUPPORTED": 0, "UNSUPPORTED": 0, "INCOMPLETE": 0, "UNKNOWN": 0},
            "expected_chunk_used": 0
        })
        
        for r in self.results:
            variant = r["variant_type"]
            m = metrics_by_variant[variant]
            m["count"] += 1
            
            # Retrieval metrics
            for k in self.K_VALUES:
                if r["retrieval"].get(f"hit@{k}"):
                    m[f"hit@{k}"] += 1
            m["mrr_sum"] += r["retrieval"].get("mrr", 0)
            
            # Grounding metrics
            category = r["grounding"].get("grounding_category", "UNKNOWN")
            m["grounding"][category] += 1
            if r["grounding"].get("expected_chunk_used"):
                m["expected_chunk_used"] += 1
        
        # Compute final percentages and averages
        aggregated = {}
        for variant, m in metrics_by_variant.items():
            n = m["count"]
            if n == 0:
                continue
            
            aggregated[variant] = {
                "count": n,
                "hit@1": round(m["hit@1"] / n * 100, 1),
                "hit@3": round(m["hit@3"] / n * 100, 1),
                "hit@5": round(m["hit@5"] / n * 100, 1),
                "hit@10": round(m["hit@10"] / n * 100, 1),
                "mrr": round(m["mrr_sum"] / n, 3),
                "grounded_rate": round(m["grounding"]["SUPPORTED"] / n * 100, 1),
                "unsupported_rate": round(m["grounding"]["UNSUPPORTED"] / n * 100, 1),
                "incomplete_rate": round(m["grounding"]["INCOMPLETE"] / n * 100, 1),
                "expected_chunk_used_rate": round(m["expected_chunk_used"] / n * 100, 1)
            }
        
        # Overall metrics
        total = len(self.results)
        if total > 0:
            aggregated["overall"] = {
                "count": total,
                "hit@1": round(sum(1 for r in self.results if r["retrieval"].get("hit@1")) / total * 100, 1),
                "hit@3": round(sum(1 for r in self.results if r["retrieval"].get("hit@3")) / total * 100, 1),
                "hit@5": round(sum(1 for r in self.results if r["retrieval"].get("hit@5")) / total * 100, 1),
                "hit@10": round(sum(1 for r in self.results if r["retrieval"].get("hit@10")) / total * 100, 1),
                "mrr": round(sum(r["retrieval"].get("mrr", 0) for r in self.results) / total, 3),
                "grounded_rate": round(sum(1 for r in self.results if r["grounding"].get("grounding_category") == "SUPPORTED") / total * 100, 1),
            }
        
        return aggregated
    
    def print_report(self) -> None:
        """Print a formatted evaluation report."""
        
        agg = self.compute_aggregate_metrics()
        
        print(f"\n{'='*70}")
        print("EVALUATION REPORT")
        print(f"{'='*70}\n")
        
        print("RETRIEVAL METRICS (% of queries)")
        print("-" * 70)
        print(f"{'Variant':<25} {'Hit@1':>8} {'Hit@3':>8} {'Hit@5':>8} {'Hit@10':>8} {'MRR':>8}")
        print("-" * 70)
        
        for variant in self.QUESTION_VARIANTS + ["overall"]:
            if variant in agg:
                m = agg[variant]
                print(f"{variant:<25} {m['hit@1']:>7.1f}% {m['hit@3']:>7.1f}% {m['hit@5']:>7.1f}% {m['hit@10']:>7.1f}% {m['mrr']:>8.3f}")
        
        print(f"\n{'='*70}")
        print("GROUNDING METRICS (% of queries)")
        print("-" * 70)
        print(f"{'Variant':<25} {'Grounded':>10} {'Unsupported':>12} {'Incomplete':>12} {'Chunk Used':>12}")
        print("-" * 70)
        
        for variant in self.QUESTION_VARIANTS:
            if variant in agg:
                m = agg[variant]
                print(f"{variant:<25} {m['grounded_rate']:>9.1f}% {m['unsupported_rate']:>11.1f}% {m['incomplete_rate']:>11.1f}% {m['expected_chunk_used_rate']:>11.1f}%")
        
        print(f"\n{'='*70}\n")
    
    def save_aggregate_metrics(self) -> str:
        """Save aggregate metrics summary to a single file."""
        
        metrics_file = self.output_dir / "evaluation_aggregate_metrics.json"
        agg_metrics = self.compute_aggregate_metrics()
        
        # List processed chunks
        processed_chunks = [
            int(f.stem.replace("chunk_", "").replace("_results", ""))
            for f in self.output_dir.glob("chunk_*_results.json")
        ]
        
        with open(metrics_file, 'w', encoding='utf-8') as f:
            json.dump({
                "timestamp": datetime.now().isoformat(),
                "config": {
                    "top_k": self.top_k,
                    "ce_keep_k": self.ce_keep_k,
                    "use_llm_grounding": self.use_llm_grounding,
                    "num_queries": len(self.results)
                },
                "processed_chunks": sorted(processed_chunks),
                "metrics": agg_metrics
            }, f, indent=2, ensure_ascii=False)
        
        print(f"Aggregate metrics saved to: {metrics_file}")
        return str(metrics_file)

print("Evaluation framework loaded successfully!")

Evaluation framework loaded successfully!


In [8]:
# Initialize the evaluation runner
# Results are saved per-chunk to: Results/RegulatoryAssistant/chunk_{id}_results.json
evaluator = EvaluationRunner(
    llm=llm,
    rag_system=rag_system,
    use_llm_grounding=True,  # Set to True for LLM-based grounding assessment
    top_k=50,
    ce_keep_k=10,
    output_dir="Results/RegulatoryAssistant"
)

# Run evaluation - automatically skips already processed chunks
results = evaluator.run_evaluation(skysafe_questions)

# Print formatted report (includes both new and loaded results)
evaluator.print_report()

# Save aggregate metrics summary
evaluator.save_aggregate_metrics()


Starting Evaluation: 4 question sets x 3 variants
Output directory: Results\RegulatoryAssistant


[1/4] Processing Chunk 10
--------------------------------------------------
    [direct_match] How is the intrinsic ground risk class defined in Step 2 of the SORA p...
Answer: **Facts (directly from the provided context)**  
- Step 2 of the SORA process establishes the *intrinsic ground risk class* of a UAS operation.  
- The intrinsic ground risk relates to the risk of a person being struck by the unmanned aircraft in the event of a loss of control, assuming a reasonable level of safety.  
- To determine the class, the applicant must:  
  1. Identify the maximum characteristic dimension of the UAS (e.g., wingspan, blade diameter, or overall dimension).  
  2. Define the area at risk (area of operation) including the operational volume (flight geography and contingency volume) and a ground‑risk buffer.  
  3. Combine the operational scenario (sparsely populated, populated, or assemblies

'Results\\RegulatoryAssistant\\evaluation_aggregate_metrics.json'

In [None]:
# Optional: View detailed results per question
def display_per_question_analysis(results: List[Dict]) -> None:
    """Display detailed analysis for each question."""
    
    for r in results:
        rank = r["retrieval"].get("rank")
        rank_str = f"Rank {rank}" if rank else "Not found"
        grounding = r["grounding"].get("grounding_category", "UNKNOWN")
        
        print(f"\n{'='*60}")
        print(f"Question Set {r['query_set_index']+1} | {r['variant_type']}")
        print(f"Expected Chunk: {r['expected_chunk_id']} | {rank_str}")
        print(f"Grounding: {grounding}")
        print(f"-"*60)
        print(f"Q: {r['question'][:100]}...")
        print(f"A: {r['answer'][:200]}...")
        print(f"Retrieved chunks (top 5): {r['retrieved_chunk_ids'][:5]}")

# Uncomment to see detailed analysis:
display_per_question_analysis(results)


Question Set 1 | direct_match
Expected Chunk: 10 | Rank 1
Grounding: SUPPORTED
------------------------------------------------------------
Q: How is the intrinsic ground risk class defined in Step 2 of the SORA process, and what type of hazar...
A: **Facts (directly from the provided context)**  
- Step 2 of the SORA process establishes the *intrinsic ground risk class* of a UAS operation.  
- The intrinsic ground risk relates to the risk of a p...
Retrieved chunks (top 5): [10]

Question Set 1 | synonym_paraphrase
Expected Chunk: 10 | Rank 1
Grounding: UNKNOWN
------------------------------------------------------------
Q: How is the intrinsic ground risk characterized with respect to the likelihood of a person being stru...
A: **Facts (from the provided context)**  
1. The intrinsic ground risk class is defined as the risk of a person being struck by the UAS in the event of a loss of control, *assuming a reasonable level of...
Retrieved chunks (top 5): [10]

Question Set 1 | rework

In [11]:
# To reload all results from saved files (useful for generating reports without re-running):
evaluator.load_all_results(skysafe_questions)
evaluator.print_report()


EVALUATION REPORT

RETRIEVAL METRICS (% of queries)
----------------------------------------------------------------------
Variant                      Hit@1    Hit@3    Hit@5   Hit@10      MRR
----------------------------------------------------------------------
direct_match                100.0%   100.0%   100.0%   100.0%    1.000
synonym_paraphrase           75.0%    75.0%   100.0%   100.0%    0.800
reworked_question            75.0%    75.0%    75.0%    75.0%    0.750
overall                      83.3%    83.3%    91.7%    91.7%    0.850

GROUNDING METRICS (% of queries)
----------------------------------------------------------------------
Variant                     Grounded  Unsupported   Incomplete   Chunk Used
----------------------------------------------------------------------
direct_match                   50.0%         0.0%         0.0%        50.0%
synonym_paraphrase             25.0%        25.0%         0.0%        50.0%
reworked_question              50.0%        25