In [1]:
import numpy as np
import pandas as pd
import os
import csv
import re
import math
from dotenv import load_dotenv
import json
from openai import OpenAI
from sentence_transformers import SentenceTransformer
import glob
import PyPDF2 
import hashlib
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from PyPDF2 import PdfReader
import requests

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Project root; the .env file read below is expected directly inside this folder.
# NOTE(review): absolute, machine-specific path - adjust before running on another machine.
path = '/home/cptaswadu/new-rescue/RESCUE-n8n'
# Load <path>/.env so the os.getenv() lookups below can see the keys (must run first).
load_dotenv(dotenv_path=os.path.join(path, ".env"))
# os.getenv returns None when a variable is missing; no validation happens here.
openai_api_key = os.getenv("OPEN_AI_API_KEY")
perplexity_api_key = os.getenv("PERPLEXITY_API_KEY")
# Module-level OpenAI client built from the key read above.
chatgpt_client = OpenAI(api_key=openai_api_key)

In [None]:
class RAGPolicyRetriever:
    def __init__(self, policy_folder_path, openai_api_key=None, perplexity_api_key=None, llm_model="gpt-4o", cache_dir=None, embedder_id = "all-MiniLM-L6-v2"):
        # Initialize class variables and embedder
        self.policy_folder_path = policy_folder_path
        self.policies = {}
        self.embeddings = {}
        self.embedder = SentenceTransformer(embedder_id)
        self.llm_model = llm_model
        # Set up cache directory and model clients (OpenAI / Perplexity)
        self.openai_client = OpenAI(api_key=openai_api_key) if openai_api_key else None
        self.perplexity_api_key = perplexity_api_key
        self.cache_dir = cache_dir or os.path.join(
            os.path.dirname(policy_folder_path), "..", "cache"
        )
        self.cache_dir = os.path.abspath(self.cache_dir)
        self.embedder_id = "all-MiniLM-L6-v2"

    def load_policies(self):
        # Load all pdf policies and extract content / compute MD5
        pdf_files = glob.glob(os.path.join(self.policy_folder_path, "*.pdf")) # find all the pdf files from the path
        self.allowed_prefixes = sorted({os.path.basename(p).split("_")[0] for p in pdf_files}) # Build allowed insurer tokens from filename prefixes
        self.doc_md5s = {}
        for pdf_file in pdf_files:
            with open(pdf_file, "rb") as f: # open the pdf file in binary mode
                reader = PyPDF2.PdfReader(f) 
                text = "".join(page.extract_text() or "" for page in reader.pages) # extract text from each page and concatenate

            fname = os.path.basename(pdf_file) # Filename only
            self.policies[fname] = text # Store text by filename
            self.doc_md5s[fname] = self.calculate_pdf_md5(pdf_file) # Compute and store MD5 hash for file
        print(f"‚úÖ Loaded {len(self.policies)} policies.") 

    def calculate_pdf_md5(self, pdf_path):
        # Compute content MD5 of a PDF file
        with open(pdf_path, 'rb') as f:
            return hashlib.md5(f.read()).hexdigest() # calculate the md5 hash of the pdf file (hexadecimal string)

    def embed_policies(self):
        # Create or load cached embeddings for all policies

        # Ensure policies are loaded before embedding
        if not getattr(self, "doc_md5s", None):
            raise RuntimeError("call load_policies() before embed_policies()")
        
        # Build corpus-level hash for cache invalidation
        items = [f"{name}:{self.doc_md5s[name]}" for name in sorted(self.policies.keys())]
        corpus_hash = hashlib.md5("\n".join(items).encode("utf-8")).hexdigest()

        cache_root = os.path.join(self.cache_dir, self.embedder_id, corpus_hash)
        os.makedirs(cache_root, exist_ok=True)
        
        # Paths for cached names and vectors
        names_path = os.path.join(cache_root, "doc_names.json")
        vecs_path  = os.path.join(cache_root, "embeddings.npy")

        # Load cached names and vectors
        if os.path.exists(names_path) and os.path.exists(vecs_path):
            try:
                with open(names_path, "r", encoding="utf-8") as f:
                    names = json.load(f)
                vecs = np.load(vecs_path)
                if (len(names) == len(vecs) and set(names) == set(self.policies.keys())):
                    self.embeddings = {name: vecs[i] for i, name in enumerate(names)}
                    print(f"‚úÖ Loaded embeddings from cache ({len(names)} docs).")
                    return
            except Exception as e:
                print(f"‚ö†Ô∏è Failed to load cache. Recomputing‚Ä¶ ({e})")

        # Compute embeddings from scratch
        self.embeddings = {}
        names = []
        vec_list = []
        for doc_name, doc_text in self.policies.items():
            vec = self.embedder.encode([doc_text])[0]
            self.embeddings[doc_name] = vec
            names.append(doc_name)
            vec_list.append(vec)

        # Persist names and vectors to cache
        try:
            with open(names_path, "w", encoding="utf-8") as f:
                json.dump(names, f, ensure_ascii=False)
            np.save(vecs_path, np.stack(vec_list, axis=0))
            print(f"‚úÖ Embeddings created & cached ({len(names)} docs).")
        except Exception as e:
            print(f"‚ö†Ô∏è Failed to write cache: {e}")

    def clean_json_response(self, response_text):
        # Extract and parse clean JSON from LLM output
        original = response_text.strip()

        # Step 0: Check for hallucinated greeting (Perplexity fallback)
        if "how can I assist you" in original.lower() or "insurance-related questions" in original.lower():
            raise ValueError("Perplexity returned generic assistant response instead of JSON.")

        # Step 1: Try direct parsing
        try:
            return json.loads(original)
        except json.JSONDecodeError:
            pass

        # Step 2: Remove code block wrappers
        cleaned = re.sub(r"^```(?:json)?\s*|\s*```$", "", original, flags=re.IGNORECASE).strip()
        try:
            return json.loads(cleaned)
        except json.JSONDecodeError:
            pass

        # Step 3: Try to extract the first {...} JSON-like block
        match = re.search(r"(\{[\s\S]*?\})", original)
        if match:
            try:
                return json.loads(match.group(1))
            except json.JSONDecodeError:
                pass

        raise ValueError("No valid JSON found in the response.")
    
    def extract_insurance_and_test(self, patient_info):
        # Extract insurance and test type from patient information using LLM
        allowed = ", ".join([f'"{p}"' for p in getattr(self, "allowed_prefixes", [])])
        prompt = f"""Return STRICT JSON ONLY with keys "insurance","test"

Given the patient information below, identify:
- insurance: choose exactly one from [{allowed}] that best matches the wording
  (e.g., "UnitedHealthcare", "UHC", "Federal Employee Program", "FEP Blue" ‚Üí map to the closest allowed token).
- test: choose one of ["WES","WGS","CMA","BRCA1/2"].

PATIENT INFORMATION:
{patient_info}
"""
        messages = [
            {"role": "system", "content": "You are an information extraction system for genetic testing insurance."}, # assigning system role
            {"role": "user", "content": prompt} 
        ]

        if self.llm_model.startswith("gpt"):
    # Check if it's a GPT-5 model and conditionally set temperature parameter
            api_params = {
                "model": self.llm_model,
                "messages": messages
            }
    
    # Only add temperature parameter for non-GPT-5 models
            if "gpt-5" not in self.llm_model.lower():
                api_params["temperature"] = 0
    
            response = self.openai_client.chat.completions.create(**api_params)
            output = response.choices[0].message.content.strip()

        elif self.llm_model == "perplexity":
            headers = {
                "Authorization": f"Bearer {self.perplexity_api_key}", 
                "Content-Type": "application/json"
            }
            data = {
                "model": "sonar-pro",
                "messages": messages,
                "temperature": 0
            }
            url = "https://api.perplexity.ai/chat/completions"
            res = requests.post(url, headers=headers, json=data)
            output = res.json()["choices"][0]["message"]["content"].strip()

        else:
            raise ValueError("Unsupported LLM model")

        try: # parsing
            output_json = self.clean_json_response(output)
            insurance = output_json.get("insurance", None)
            test_name = output_json.get("test", None)
            return insurance, test_name
        
        except Exception as e:
            print(f"‚ùó JSON parsing error in extract_insurance_and_test: {e}")
            print(f"üîç Raw output: {output}")
            return None, None

    def filter_policies_by_insurance(self, insurance_name): 
        # Filter policies by insurer token found in filename & policy text
        # Return all policies if no insurance name provided
        if not insurance_name:
            return self.policies
    
        # Clean insurance name: remove spaces and convert to lowercase
        insurance_clean = insurance_name.replace(" ", "").lower()
        filtered = {}
    
        for doc_name, doc_text in self.policies.items():
        # Clean document name for comparison
            doc_name_clean = doc_name.replace(" ", "").lower()
        
            # First attempt: Match against filename (fast)
            if insurance_clean in doc_name_clean:
                filtered[doc_name] = doc_text # BCBS_FEP_204102 Whole Exome and.pdf -> bcbs_fep_204102wholeexomeand.pdf

            # Second attempt: Match against document content beginning (fallback)

            elif insurance_clean in doc_text.lower()[:1000]:
                filtered[doc_name] = doc_text 
    
        return filtered

    def get_test_keywords(self, test_name):
        # Map test name to a keyword list for filtering
        if not test_name:
            return []
        
        test_keywords_map = {
    "brca1/2": [
        "brca", "brca1", "brca2", "brca1/2", "brca 1/2"
        "breast cancer", "ovarian cancer", "pancreatic cancer", "prostate cancer", "metastatic"
    ],
    "wes": [
        "whole exome sequencing", "wes", "exome sequencing",
        "multiple congenital anomalies", "neurodevelopmental disorder", 
        "developmental delay", "unexplained disorder", "autism", "organ abnormality"
    ],
    "wgs": [
        "whole genome sequencing", "wgs", "genome sequencing",
        "congenital disorder", "fetal anomalies", 
        "unexplained anomalies", "developmental disorder", "organ system abnormality"
    ],
    "cma": [
        "chromosomal microarray", "cma", "copy number variation", 
        "developmental delay", "intellectual disability", "autism", 
        "congenital anomalies", "global developmental delay"
    ]
}
    
        test_lower = test_name.lower()
        for test_type, keywords in test_keywords_map.items():
            if test_type in test_lower:
                return keywords
    
        return [test_name.lower()]


    def filter_by_test_keywords(self, policies_dict, test_name):
        # Filter policies whose text contains any test keyword
        if not test_name or not policies_dict:
            return policies_dict
        
        test_keywords = self.get_test_keywords(test_name)
        filtered = {}
    
        for doc_name, doc_text in policies_dict.items():
            doc_text_lower = doc_text.lower()
            for keyword in test_keywords:
                if keyword in doc_text_lower:
                    filtered[doc_name] = doc_text
                    break  
                
        return filtered

    def find_top_policies(self, patient_info, insurance_name, test_name=None, top_k=5):
        # Retrieve top-k policies by cosine similarity
        filtered_policies = self.filter_policies_by_insurance(insurance_name)
        if not filtered_policies:
            print("‚ùó No policies matched the insurance. Using all policies.")
            filtered_policies = self.policies

        if test_name and filtered_policies:
            test_filtered = self.filter_by_test_keywords(filtered_policies, test_name)
            if test_filtered:
                print(f"‚úÖ Found {len(test_filtered)} policies matching test '{test_name}'")
                filtered_policies = test_filtered
            else:
                print(f"‚ö†Ô∏è No policies matched test '{test_name}'. Using insurance-filtered policies.")

        query_embedding = self.embedder.encode([patient_info])[0]
        scored_policies = []
        for doc_name, doc_text in filtered_policies.items():
            doc_embedding = self.embeddings[doc_name]
            score = cosine_similarity([query_embedding], [doc_embedding])[0][0]
            scored_policies.append((doc_name, score, doc_text))

        scored_policies.sort(key=lambda x: x[1], reverse=True)
        return scored_policies[:top_k]

    def rerank_policies(self, patient_info, candidates):
        # Use LLM to pick the best among candidates
        candidate_texts = [c[2][:500].replace("\n", " ") for c in candidates]

        prompt = f"""You are an expert insurance policy analyst specializing in genetic testing coverage.

You will be given patient information and a list of candidate insurance policies.
Select the policy that BEST COVERS the patient's specific genetic test and medical condition.

Patient Information:
{patient_info}

Candidate Policies:"""

        for idx, text in enumerate(candidate_texts, 1):
            prompt += f"\n\nPolicy {idx}:\n{text}"

        prompt += """

Please answer with only the number of the most appropriate policy.
Do not explain. Just output the number.

Answer:"""

        messages = [
            {"role": "system", "content": "You are an information extraction system for ranking the most appropriate insurance policies."},
            {"role": "user", "content": prompt}
        ]

        if self.llm_model.startswith("gpt"):
    # Check if it's a GPT-5 model and conditionally set temperature parameter
            api_params = {
                "model": self.llm_model,
                "messages": messages
            }
    
    # Only add temperature parameter for non-GPT-5 models
            if "gpt-5" not in self.llm_model.lower():
                api_params["temperature"] = 0
    
            response = self.openai_client.chat.completions.create(**api_params)
            result = response.choices[0].message.content.strip()

        elif self.llm_model == "perplexity":
            headers = {
                "Authorization": f"Bearer {self.perplexity_api_key}",
                "Content-Type": "application/json"
            }
            data = {
                "model": "sonar-pro",
                "messages": messages,
                "temperature": 0
            }
            url = "https://api.perplexity.ai/chat/completions"
            res = requests.post(url, headers=headers, json=data)
            result = res.json()["choices"][0]["message"]["content"].strip()
        else:
            raise ValueError("Unsupported LLM model")

        match = re.search(r'(\d+)', result)
        selected_idx = int(match.group(1)) - 1 if match else 0
        return candidates[selected_idx]

    def find_policies_with_matching_check(self, patient_info, expected_md5, top_k=10):
        # End-to-end retrieval + (optional) rerank + MD5 match
        insurance, test = self.extract_insurance_and_test(patient_info)
        candidates = self.find_top_policies(patient_info, insurance, test, top_k=top_k)

        # If top_k=1, rerank to pick best; else check in order
        if top_k == 1:
            print("üîÑ Performing reranking for top-1")
            best_policy = self.rerank_policies(patient_info, candidates)
            doc_name, doc_text = best_policy[0], best_policy[2]
        
            pdf_path = os.path.join(self.policy_folder_path, doc_name)
            predicted_md5 = self.calculate_pdf_md5(pdf_path)
        
            if predicted_md5 == expected_md5:
                return doc_name, doc_text, predicted_md5
        else:
            # Check candidates in order until MD5 match found when top_k > 1
            for doc_name, score, doc_text in candidates:
                pdf_path = os.path.join(self.policy_folder_path, doc_name)
                predicted_md5 = self.calculate_pdf_md5(pdf_path)
            
                if predicted_md5 == expected_md5:
                    return doc_name, doc_text, predicted_md5
    
        return None, None, None


In [None]:
class QnAExecutor:
    # Initialize the executor with necessary parameters
    def __init__(self, questions_list, llm_model="gpt-4o", openai_client=None, perplexity_api_key=None):
        self.questions_list = questions_list
        self.formatted_questions = self.format_questions()
        self.llm_model = llm_model
        self.openai_client = openai_client
        self.perplexity_api_key = perplexity_api_key

    def format_question_block(self, q, indent=2):
        # Format a single question block with indentation
        indent_str = " " * indent
        question_line = f"{q['question']}"
        question_line += f"\n{indent_str}Options: {', '.join(q['options'])}"
        return question_line


    def format_questions(self):
        # Join all questions into a single prompt chunk
        return "\n\n".join([
            f"{q['id']}. {self.format_question_block(q)}"
            for q in self.questions_list
        ])


    def clean_json_response(self, response_text):
        # Clean and extract JSON from the response text
        original = response_text.strip()

        # Step 0: Check for hallucinated greeting (Perplexity fallback)
        if "how can I assist you" in original.lower() or "insurance-related questions" in original.lower():
            raise ValueError("Perplexity returned generic assistant response instead of JSON.")

        # Step 1: Try direct parsing
        try:
            return json.loads(original)
        except json.JSONDecodeError:
            pass

        # Step 2: Remove code block wrappers
        cleaned = re.sub(r"^```(?:json)?\s*|\s*```$", "", original, flags=re.IGNORECASE).strip()
        try:
            return json.loads(cleaned)
        except json.JSONDecodeError:
            pass

        # Step 3: Try to extract the first {...} JSON-like block
        match = re.search(r"(\{[\s\S]*?\})", original)
        if match:
            try:
                return json.loads(match.group(1))
            except json.JSONDecodeError:
                pass

        raise ValueError("No valid JSON found in the response.")

    def run_qna(self, patient_info, policy_name, policy_text, case_id, retrieval_model, qna_model, predicted_md5 = None, top_k=None):
        prompt = f"""
You are a clinical insurance assistant specializing in genetic testing coverage policies.
You MUST answer in JSON format only.
You will be given:

1. Patient clinical information (including their insurance provider, plan type, and state of residence).
2. Official insurance policy document text (strictly use this policy content for insurance coverage decision making).

Instructions:
- Answer all questions strictly based on the insurance policy document provided.
- Do NOT refer to general guidelines or policies from other insurance providers.
- If policy document does not clearly specify rules, you MAY use patient's clinical information to infer answers carefully.
- Do NOT assume coverage criteria from other insurers or general clinical guidelines unless explicitly stated in the policy.
- Output answers in JSON format ONLY.

Focus on sections for uploaded policy document:
- **Age criteria**
- **Medical necessity criteria**
- **Prior test criteria**
- **Family history information** 
- **Related CPT codes**
- **Coverage criteria**
- **Counseling / Provider criteria**

Patient Information:
{patient_info}

Insurance Policy Document (source: {policy_name})
{policy_text}

Based on the uploaded policy document and patient information, answer these questions:
{self.formatted_questions}

Output your answers in JSON format only, with no explanation.
Your response must follow this exact structure:
{{
  "Q0": "WES",
  "Q1": "Yes",
  "Q2": "Not Specified",
  "Q3": "Not Specified",
  "Q4": "No",
  "Q5": "No", 
  "Q6": "Not Specified",
  "Q7": "81415",
  "Q8": "No"
}}

Answer options for each question:
- Q0: ["WES", "WGS", "BRCA1/2", "CMA"]
- Q1: ["Yes", "No", "Not Specified"]
- Q2: ["Yes", "No", "Not Specified"]
- Q3: ["Yes", "No", "Not Specified"]
- Q4: ["Yes", "No", "Not Specified"]
- Q5: ["Yes", "No", "Not Specified"]
- Q6: ["Yes", "No", "Not Specified"]
- Q7: ["81162", "81277", "81228", "81415", "81425", "Not Specified"]
- Q8: ["Yes", "No"]
"""

        messages = [
            {"role": "system", "content": "You are a clinical insurance assistant."},
            {"role": "user", "content": prompt}
        ]

        if qna_model.startswith("gpt"):
            api_params = {
                "model": qna_model,  # üëà ÏàòÏ†ï
                "messages": messages
            }
            if "gpt-5" not in qna_model.lower():
                api_params["temperature"] = 0
            response = self.openai_client.chat.completions.create(**api_params)
            result_content = response.choices[0].message.content.strip()
        
        elif qna_model == "perplexity":  # üëà ÏàòÏ†ï
            headers = {
                "Authorization": f"Bearer {self.perplexity_api_key}",
                "Content-Type": "application/json"
            }
            data = {
                "model": "sonar-pro",
                "messages": messages,
                "temperature": 0
            }
            url = "https://api.perplexity.ai/chat/completions"
            res = requests.post(url, headers=headers, json=data)
            result_content = res.json()["choices"][0]["message"]["content"].strip()
        else:
            raise ValueError(f"Unsupported QnA model: {qna_model}")

        result_json = {}

        try:
            result_json = self.clean_json_response(result_content)
            final_result = result_json.copy() 
            
            if predicted_md5 is not None:
                final_result["predicted_md5"] = predicted_md5

            model_name = f"{retrieval_model}_{qna_model}"
            if top_k is not None:
                save_dir = f"/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample/{model_name}/top{top_k}"
            else:
                save_dir = f"/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample/{model_name}"
    
            os.makedirs(save_dir, exist_ok=True)
            filename = os.path.join(save_dir, f"{case_id}_qna_result.json")
            with open(filename, "w") as f:
                json.dump(final_result, f, indent=2)

            print(f"‚úÖ QnA result saved to {filename}")

        except Exception as e:
            print("‚ùó JSON parsing error:", e)
            final_result = {
                "error": "JSON parsing failed",
                "raw_content": result_content
            }

        print("QnA Result JSON:", final_result)
        return final_result


In [5]:
# JSON file that defines the QnA questions.
# NOTE(review): absolute, machine-specific path - adjust before running on another machine.
questions_file_path = "/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/dataset/Insurance_Genetic_Testing_QA.json"

with open(questions_file_path, "r") as f:
    questions_data = json.load(f)

# The question definitions sit under the top-level "questions" key; QnAExecutor
# reads the 'id', 'question' and 'options' fields of each entry.
questions_list = questions_data["questions"]

In [None]:
# Experiment driver: for every (retrieval model, QnA model) pair and every top-k
# value, retrieve a policy for each sample case, run QnA on matched cases, and
# write one matching-statistics CSV per (pair, top-k).
# NOTE(review): all paths below are absolute and machine-specific.
case_file_path = "/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/dataset/sample_qna_free_text.json"
with open(case_file_path, "r") as f:
    case_ex = json.load(f)

retrieval_models = ["gpt-5-mini", "perplexity"]
qna_models = ["gpt-5-mini", "perplexity"]

# Cartesian product: every retrieval model is paired with every QnA model
model_combinations = [(r, q) for r in retrieval_models for q in qna_models]

# Created here but not referenced again within this cell
evaluation_dir = f"/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample/evaluation/individual"
os.makedirs(evaluation_dir, exist_ok=True)


top_k_values = [1, 3, 5, 10]
for retrieval_model, qna_model in model_combinations:
    print(f"\nüöÄ Running: {retrieval_model}_{qna_model}")
    # NOTE(review): model_name is not used later in this cell (model_folder is rebuilt below)
    model_name = f"{retrieval_model}_{qna_model}"
    
    # NOTE(review): a new retriever is built for every combination, so the
    # policies are loaded and embed_policies() runs once per combination.
    retriever = RAGPolicyRetriever(
        policy_folder_path="/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/dataset/insurance_policy",
        openai_api_key=openai_api_key,
        perplexity_api_key=perplexity_api_key,
        llm_model=retrieval_model
    )
    retriever.load_policies()
    retriever.embed_policies()

    # The executor shares the retriever's OpenAI client
    executor = QnAExecutor(
        questions_list=questions_list,
        llm_model=qna_model,
        openai_client=retriever.openai_client,
        perplexity_api_key=perplexity_api_key
    )

    for k in top_k_values:
        print(f"\n=== Processing with Top-{k} ===")
        
        # One record per case for this (combination, k)
        matching_stats = []

        for case in case_ex:
            case_id = case["id"]
            patient_info = case["patient_info"]
            expected_md5 = case["expected_md5"]  

            try:
                # result is (policy_name, policy_text, predicted_md5), or three
                # Nones when no retrieved policy has the expected MD5
                result = retriever.find_policies_with_matching_check(
                    patient_info, expected_md5, top_k=k)
                
                matching_stats.append({
                    'case_id': case_id,
                    'matched': result[0] is not None,
                    'policy_name': result[0] if result[0] else None,
                    'predicted_md5': result[2] if result[2] else None
                })
                
                # QnA only runs for cases whose expected policy was retrieved
                if result[0] is not None:
                    policy_name, policy_text, predicted_md5 = result
                    executor.run_qna(
                        patient_info=patient_info,
                        policy_name=policy_name,
                        policy_text=policy_text,
                        case_id=f"{case_id}_top{k}",
                        retrieval_model=retrieval_model,
                        qna_model=qna_model,
                        predicted_md5=predicted_md5,
                        top_k=k
                    )
                    print(f"‚úÖ {case_id} (Top-{k}): QnA executed")
                else:
                    print(f"‚ùå {case_id} (Top-{k}): No matching policy - QnA skipped")
                
            except Exception as e:
                # NOTE(review): if run_qna raises after the success record was
                # appended above, this adds a second record for the same case.
                # This record has an 'error' key and no 'predicted_md5' key.
                matching_stats.append({
                    'case_id': case_id,
                    'matched': False,
                    'policy_name': None,
                    'error': str(e)
                })
                print(f"‚ùå Failed on {case_id} ({retrieval_model}_{qna_model}, Top-{k}): {e}")

        # Stats go next to the QnA results: <results root>/<combination>/top<k>/
        model_folder = f"{retrieval_model}_{qna_model}"
        topk_dir = os.path.join(
            "/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample",
            model_folder,
            f"top{k}"
        )
        os.makedirs(topk_dir, exist_ok=True)

        stats_df = pd.DataFrame(matching_stats)
        stats_path = os.path.join(
            topk_dir,
            f"{model_folder}_sample_top{k}_matching.csv"
        )
        stats_df.to_csv(stats_path, index=False)
        print(f"üìä Matching stats saved: {stats_path}")


üöÄ Running: gpt-5-mini_gpt-5-mini
‚úÖ Loaded 789 policies.
‚úÖ Embeddings created & cached (789 docs).

=== Processing with Top-1 ===
‚úÖ Found 11 policies matching test 'WES'
üîÑ Performing reranking for top-1
‚ùå Case10917 (Top-1): No matching policy - QnA skipped
‚úÖ Found 135 policies matching test 'WES'
üîÑ Performing reranking for top-1
‚ùå Case8051 (Top-1): No matching policy - QnA skipped
‚úÖ Found 11 policies matching test 'WES'
üîÑ Performing reranking for top-1
‚úÖ QnA result saved to /home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample/gpt-5-mini_gpt-5-mini/top1/Case11124_top1_qna_result.json
QnA Result JSON: {'Q0': 'WES', 'Q1': 'Yes', 'Q2': 'No', 'Q3': 'Yes', 'Q4': 'Not Specified', 'Q5': 'Yes', 'Q6': 'Yes', 'Q7': '81415', 'Q8': 'No', 'predicted_md5': '4fadf6b3ca9d4d08131cb31365e3aa7d'}
‚úÖ Case11124 (Top-1): QnA executed
‚úÖ Found 11 policies matching test 'WES'
üîÑ Performing reranking for top-1
‚ùå Case7376 (Top-1): No matching policy - 

In [None]:
def _to_bool(x):
    if isinstance(x, (int, float, bool)):
        return bool(x)
    return str(x).strip().lower() in {"1", "true", "yes", "y", "t"}

def calculate_policy_match_rates(base_dir):
    """Collect the policy match rate of every ``*_matching.csv`` under ``base_dir``.

    Expected layout: ``<base_dir>/<model folder>/top<K>/<name>_matching.csv``.
    The folder named "evaluation" and anything that is not a directory are
    ignored, as are CSVs that cannot be read or lack a "matched" column.

    Returns:
        A list with one dict per CSV, with keys "Model_Combination" (filename
        without the "_matching.csv" suffix), "Total_Cases", "Matched_Cases"
        and "Policy_Match_Rate" (percentage string with two decimals).
    """
    summary_rows = []

    for model_dir_name in os.listdir(base_dir):
        model_dir = os.path.join(base_dir, model_dir_name)
        if model_dir_name == "evaluation" or not os.path.isdir(model_dir):
            continue

        for topk_name in os.listdir(model_dir):
            topk_dir = os.path.join(model_dir, topk_name)
            if not topk_name.startswith("top") or not os.path.isdir(topk_dir):
                continue

            for stats_file in os.listdir(topk_dir):
                if not stats_file.endswith("_matching.csv"):
                    continue

                csv_path = os.path.join(topk_dir, stats_file)
                try:
                    stats = pd.read_csv(csv_path)
                except Exception as e:
                    print(f"‚ö†Ô∏è Failed to read {csv_path}: {e}")
                    continue

                if "matched" not in stats.columns:
                    print(f"‚ö†Ô∏è 'matched' column not found in {csv_path}. Skipped.")
                    continue

                # Normalise whatever the CSV stored into real booleans before counting
                flags = stats["matched"].map(_to_bool)
                n_total = len(flags)
                n_matched = int(flags.sum())
                if n_total > 0:
                    rate = n_matched / n_total * 100
                else:
                    rate = 0.0

                summary_rows.append({
                    "Model_Combination": stats_file.replace("_matching.csv", ""),
                    "Total_Cases": n_total,
                    "Matched_Cases": n_matched,
                    "Policy_Match_Rate": f"{rate:.2f}%"
                })

    if not summary_rows:
        print(f"‚ö†Ô∏è No matching CSVs found under model/topk folders in: {base_dir}")

    return summary_rows


def get_policy_match_rate(model_name, csv_file, base_dir=None):
    """Look up the policy match rate that belongs to one merged QnA CSV.

    Args:
        model_name: Not used by the lookup; kept for call compatibility.
        csv_file: Filename of the form ``<model combo>_top<K>.csv``.
        base_dir: Results root containing ``<model combo>/top<K>/``. Defaults
            to the project's fixed results folder.

    Returns:
        ``(rate, total, matched)`` where ``rate`` is a percentage string with
        two decimals, or ``("N/A", "N/A", "N/A")`` when the top-k cannot be
        parsed from the filename, or the matching CSV is missing, unreadable
        or lacks a "matched" column.
    """
    if base_dir is None:
        base_dir = "/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample"

    # Split "<model combo>_top<K>" into its two parts
    model_topk = csv_file.replace(".csv", "")
    m = re.search(r"_top(\d+)$", model_topk)
    if not m:
        print(f"‚ö†Ô∏è Could not parse top-k from filename: {csv_file}")
        return "N/A", "N/A", "N/A"

    k = m.group(1)
    base_model = model_topk[:m.start()]

    folder_path = os.path.join(base_dir, base_model, f"top{k}")

    # Two naming schemes are accepted; the first existing file wins
    filename_candidates = [
        f"{base_model}_top{k}_matching.csv",
        f"{base_model}_sample_top{k}_matching.csv",
    ]

    for fname in filename_candidates:
        matching_stats_path = os.path.join(folder_path, fname)
        if os.path.exists(matching_stats_path):
            try:
                match_df = pd.read_csv(matching_stats_path)
            except Exception as e:
                print(f"‚ö†Ô∏è Failed to read {matching_stats_path}: {e}")
                return "N/A", "N/A", "N/A"

            if "matched" not in match_df.columns:
                print(f"‚ö†Ô∏è 'matched' column not found in {matching_stats_path}")
                return "N/A", "N/A", "N/A"

            # Compute and return (rate%, total, matched)
            matched_series = match_df["matched"].map(_to_bool)
            total_attempted = len(matched_series)
            matched_count = int(matched_series.sum())
            match_rate = (matched_count / total_attempted * 100) if total_attempted > 0 else 0.0
            return f"{match_rate:.2f}%", total_attempted, matched_count

    # Nothing found: list the folder to help debugging
    try:
        listing = os.listdir(folder_path)
        print(f"‚ö†Ô∏è Matching file not found in {folder_path}. Files: {listing}")
    except Exception as e:
        print(f"‚ö†Ô∏è Matching file not found and failed to list {folder_path}: {e}")

    return "N/A", "N/A", "N/A"

In [None]:
def merge_qna_jsons_to_csv(folder_path, output_csv_path):
    """Flatten every ``*_qna_result.json`` file in a folder into a single CSV.

    Each JSON file becomes one row. The row's ``case_id`` is the file name
    without the ``_qna_result.json`` suffix; list values are joined with
    ``"; "`` and every other value is stored unchanged. Files that fail to
    parse are reported and skipped. Nothing is written when no file could be
    merged.

    Args:
        folder_path: Directory that holds the per-case JSON results.
        output_csv_path: Destination path of the merged CSV.
    """
    all_data = []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the merged CSV reproducible from run to run.
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith("_qna_result.json"):
            continue

        # e.g. "Case884_qna_result.json" -> "Case884"
        case_id = file.replace("_qna_result.json", "")
        json_path = os.path.join(folder_path, file)

        # Explicit UTF-8 so decoding does not depend on the machine's locale.
        with open(json_path, "r", encoding="utf-8") as f:
            try:
                result = json.load(f)
                flat_result = {"case_id": case_id}

                for k, v in result.items():
                    if isinstance(v, list):
                        flat_result[k] = "; ".join(map(str, v))
                    else:
                        flat_result[k] = v

                all_data.append(flat_result)
            except Exception as e:
                # Best effort: one bad file must not abort the whole merge.
                print(f"‚ùó Failed to parse {file}: {e}")

    if all_data:
        df = pd.DataFrame(all_data)
        df.to_csv(output_csv_path, index=False)
        print(f"‚úÖ Merged CSV saved to: {output_csv_path}")
    else:
        print(f"‚ö†Ô∏è No valid QnA result files found in: {folder_path}")


def merge_all_combinations_to_csv(base_dir):
    """Merge the per-case QnA JSON files of every ``top*`` sub-folder into a CSV.

    Every directory directly under ``base_dir`` except ``evaluation`` is
    scanned. For each of its sub-directories whose name starts with ``top``,
    ``merge_qna_jsons_to_csv`` is called with the output path
    ``<sub-directory>/<folder name>_top<k>.csv``.

    Args:
        base_dir: Root directory that contains one folder per model combination.
    """
    for combo_name in os.listdir(base_dir):
        combo_dir = os.path.join(base_dir, combo_name)
        # Skip plain files and the folder that holds evaluation outputs.
        if combo_name == "evaluation" or not os.path.isdir(combo_dir):
            continue

        for entry in os.listdir(combo_dir):
            entry_dir = os.path.join(combo_dir, entry)
            if not entry.startswith("top") or not os.path.isdir(entry_dir):
                continue

            # "top3" -> "3"
            k_label = entry.replace("top", "")
            merged_csv = os.path.join(entry_dir, f"{combo_name}_top{k_label}.csv")
            merge_qna_jsons_to_csv(entry_dir, merged_csv)

# Merge the per-case QnA JSON results into one CSV for every top-k sub-folder
# found under the sample results directory.
merge_all_combinations_to_csv(
    base_dir="/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample"
)


‚úÖ Merged CSV saved to: /home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample/gpt-5-mini_gpt-5-mini/top10/gpt-5-mini_gpt-5-mini_top10.csv
‚úÖ Merged CSV saved to: /home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample/gpt-5-mini_gpt-5-mini/top1/gpt-5-mini_gpt-5-mini_top1.csv
‚úÖ Merged CSV saved to: /home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample/gpt-5-mini_gpt-5-mini/top3/gpt-5-mini_gpt-5-mini_top3.csv
‚úÖ Merged CSV saved to: /home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample/gpt-5-mini_gpt-5-mini/top5/gpt-5-mini_gpt-5-mini_top5.csv
‚úÖ Merged CSV saved to: /home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample/perplexity_gpt-5-mini/top10/perplexity_gpt-5-mini_top10.csv
‚úÖ Merged CSV saved to: /home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample/perplexity_gpt-5-mini/top1/perplexity_gpt-5-mini_top1.csv
‚úÖ Merged CSV sav

In [None]:
def get_rag_combination_files(base_dir, include_matching=False):
    """Collect the CSV files stored in ``<base_dir>/<folder>/top*/``.

    The ``evaluation`` folder and anything that is not a directory are
    ignored at both levels.

    Args:
        base_dir: Root directory that contains one folder per model combination.
        include_matching: When true every ``.csv`` file is returned; otherwise
            files ending in ``_matching.csv`` are left out.

    Returns:
        A list of ``(full_path, file_name)`` tuples in directory-listing order.
    """
    collected = []

    for combo_name in os.listdir(base_dir):
        combo_dir = os.path.join(base_dir, combo_name)
        if combo_name == "evaluation" or not os.path.isdir(combo_dir):
            continue

        for sub_name in os.listdir(combo_dir):
            sub_dir = os.path.join(combo_dir, sub_name)
            if not sub_name.startswith("top") or not os.path.isdir(sub_dir):
                continue

            try:
                entries = os.listdir(sub_dir)
            except Exception as e:
                print(f"‚ö†Ô∏è Failed to list {sub_dir}: {e}")
                continue

            # Keep CSVs only; matching files are dropped unless asked for.
            collected.extend(
                (os.path.join(sub_dir, entry), entry)
                for entry in entries
                if entry.endswith(".csv")
                and (include_matching or not entry.endswith("_matching.csv"))
            )

    return collected


In [None]:
def evaluate_csv_results(csv_path, gold_answers, model_name, base_dir=None):
    """Score one merged QnA CSV against the gold answers and save the statistics.

    Each row is one case. Its ``case_id`` (with any ``_top...`` suffix removed)
    is looked up in ``gold_answers``; for every question Q0-Q8 present in the
    gold entry, prediction and gold answer are compared as stripped strings
    (exact match). Rows without a gold entry are reported and skipped.

    Files written under ``<base_dir>/evaluation/individual/<base_model>/``:
      * ``<file_type>_case_level.csv`` - accuracy and counts per case
      * ``<file_type>_overall.csv``    - mean/std/min/max/median accuracy, plus
        policy-match figures when ``<csv stem>_matching.csv`` exists in
        ``<base_dir>/<model folder>/top<k>/`` and has a ``matched`` column.

    Args:
        csv_path: Path of the QnA result CSV; must contain a ``case_id`` column.
        gold_answers: Mapping of case id -> {question id: gold answer}.
        model_name: Combination label such as ``"<model>_top3"``; the part
            before ``_top`` / ``_aggregated`` names the output sub-directory.
        base_dir: Root of the results tree. When omitted, the sample results
            directory this notebook was written against is used.

    Returns:
        ``(case_df, overall_df)``, or ``(None, None)`` when no row was scored.
    """
    if base_dir is None:
        base_dir = "/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample"

    def _to_bool(x):
        # A blank CSV cell is read as NaN and bool(float("nan")) is True, so
        # missing values must be rejected before the generic numeric branch
        # or they would be counted as successful matches.
        if x is None or (isinstance(x, float) and np.isnan(x)):
            return False
        if isinstance(x, (int, float, bool)):
            return bool(x)
        return str(x).strip().lower() in {"1", "true", "yes", "y", "t"}

    df_results = pd.read_csv(csv_path)

    # 1) Prepare model directory
    base_model = model_name.split("_top")[0] if "_top" in model_name else model_name.split("_aggregated")[0]
    model_dir = os.path.join(base_dir, "evaluation", "individual", base_model)
    os.makedirs(model_dir, exist_ok=True)

    # 2) Determine file type (topk/aggregated); it prefixes the output names
    csv_filename = os.path.basename(csv_path)
    if "_aggregated" in csv_filename or "_aggregated" in model_name:
        file_type = "aggregated"
    else:
        model_topk = csv_filename.replace(".csv", "")
        if "_top" in model_topk:
            n = model_topk.split("_top")[1]
            file_type = f"top{n}"
        else:
            file_type = "top1"

    case_level_stats = []

    print(f"üîÑ Evaluating {len(df_results)} cases from {csv_path}")

    # 3) Evaluate each case (only Q0-Q8 are scored)
    question_ids = ["Q0", "Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]
    for _, row in df_results.iterrows():
        case_id = row["case_id"]
        # str() keeps the membership test from raising on non-string ids
        # (numeric or missing case_id cells).
        clean_case_id = case_id.split("_top")[0] if "_top" in str(case_id) else case_id

        gold_result = gold_answers.get(clean_case_id)
        if gold_result is None:
            print(f"‚ö†Ô∏è No gold standard found for {case_id}")
            continue

        predicted_result = row.to_dict()

        correct_count = 0
        total_count = 0

        for qid in question_ids:
            if qid not in gold_result:
                continue

            pred_answer = predicted_result.get(qid, "")
            gold_answer = gold_result.get(qid, "")

            # Normalize NaN/list/string; a missing prediction becomes "".
            if pred_answer is None or (isinstance(pred_answer, float) and np.isnan(pred_answer)):
                pred_answer = ""
            if isinstance(pred_answer, list):
                pred_answer = ", ".join(map(str, pred_answer))
            if isinstance(gold_answer, list):
                gold_answer = ", ".join(map(str, gold_answer))

            pred_answer = str(pred_answer).strip()
            gold_answer = str(gold_answer).strip()

            correct_count += 1 if pred_answer == gold_answer else 0
            total_count += 1

        accuracy = correct_count / total_count * 100 if total_count > 0 else 0.0

        case_level_stats.append({
            "case_id": case_id,
            "accuracy": accuracy,
            "correct_count": correct_count,
            "total_count": total_count
        })
        print(f"‚úÖ {case_id}: {accuracy:.2f}% accuracy ({correct_count}/{total_count})")

    if not case_level_stats:
        return None, None

    # 4) Save case-level statistics; a file written under the older
    #    "<k>_case_level.csv" name is removed so only one copy remains.
    case_df = pd.DataFrame(case_level_stats)
    if file_type.startswith("top"):
        n = file_type.replace("top", "")
        legacy_case = os.path.join(model_dir, f"{n}_case_level.csv")
        if n.isdigit() and os.path.exists(legacy_case):
            os.remove(legacy_case)

    case_csv_path = os.path.join(model_dir, f"{file_type}_case_level.csv")
    case_df.to_csv(case_csv_path, index=False)
    print(f"‚úÖ Case-level statistics saved to: {case_csv_path}")

    # 5) Overall statistics (numpy's std, i.e. ddof=0)
    all_accuracies = case_df["accuracy"].values
    overall_stats = {
        "total_cases": len(case_level_stats),
        "mean_accuracy": all_accuracies.mean(),
        "std_accuracy":  all_accuracies.std(),
        "min_accuracy":  all_accuracies.min(),
        "max_accuracy":  all_accuracies.max(),
        "median_accuracy": np.median(all_accuracies),
    }

    # 6) (Optional) add the match rate from the same model/top-k folder
    model_topk_noext = os.path.splitext(csv_filename)[0]
    m = re.search(r"_top(\d+)$", model_topk_noext)
    if m:
        k = m.group(1)
        model_folder = model_topk_noext[:m.start()]
        matching_filename = f"{model_topk_noext}_matching.csv"
        matching_stats_path = os.path.join(base_dir, model_folder, f"top{k}", matching_filename)
        if os.path.exists(matching_stats_path):
            mdf = pd.read_csv(matching_stats_path)

            if "matched" not in mdf.columns:
                # Without the column there is nothing to count; skip the
                # optional block instead of failing after partial output.
                print(f"‚ö†Ô∏è 'matched' column not found in {matching_stats_path}")
            else:
                matched_series = mdf["matched"].map(_to_bool)
                total_attempted = len(matched_series)
                matched_count = int(matched_series.sum())
                match_rate_pct = (matched_count / total_attempted * 100) if total_attempted > 0 else 0.0

                overall_stats.update({
                    "policy_match_rate": match_rate_pct,
                    "total_attempted": total_attempted,
                    "matched_count": matched_count,
                })

    overall_df = pd.DataFrame([overall_stats])

    # 7) Save overall statistics (+ legacy filename cleanup)
    if file_type.startswith("top"):
        n = file_type.replace("top", "")
        legacy_overall = os.path.join(model_dir, f"{n}_overall.csv")
        if n.isdigit() and os.path.exists(legacy_overall):
            os.remove(legacy_overall)

    overall_csv_path = os.path.join(model_dir, f"{file_type}_overall.csv")
    overall_df.to_csv(overall_csv_path, index=False)
    print(f"‚úÖ Overall statistics saved to: {overall_csv_path}")

    print(f"Total cases evaluated: {overall_stats['total_cases']}")
    print(f"QnA Accuracy (Mean %, Std): {overall_stats['mean_accuracy']:.2f}%, {overall_stats['std_accuracy']:.2f}%")
    print(f"Min accuracy: {overall_stats['min_accuracy']:.2f}%")
    print(f"Max accuracy: {overall_stats['max_accuracy']:.2f}%")
    print(f"Median accuracy: {overall_stats['median_accuracy']:.2f}%")

    return case_df, overall_df

In [None]:
def evaluate_all_folders_with_summary(base_dir, gold_answers, summary_output_csv):
    """Evaluate every model/top-k result CSV and write one summary row per model.

    Each CSV returned by ``get_rag_combination_files`` is scored with
    ``evaluate_csv_results``. The case-level rows are grouped by the part of
    the file name before ``_top`` (or ``_aggregated``), so all top-k runs of
    a model combination are pooled. Policy-match counts are summed over that
    model's ``*_matching.csv`` files.

    Args:
        base_dir: Root directory that contains one folder per model combination.
        gold_answers: Mapping of case id -> {question id: gold answer}.
        summary_output_csv: Destination of the summary CSV. A falsy value
            selects ``<base_dir>/evaluation/comparative/model_summary.csv``.

    Returns:
        None. The summary is written to disk and printed.
    """
    def _is_matched(value):
        # Blank cells are read as NaN and bool(float("nan")) is True; reject
        # missing values first so they are not counted as matches.
        if value is None or (isinstance(value, float) and np.isnan(value)):
            return False
        if isinstance(value, (int, float, bool)):
            return bool(value)
        return str(value).strip().lower() in {"1", "true", "yes", "y", "t"}

    summary_records = []

    # 1) Collect the QnA result CSVs (matching files excluded)
    csv_files_tuples = get_rag_combination_files(base_dir, include_matching=False)
    if not csv_files_tuples:
        print("‚ö†Ô∏è No RAG combination CSV files found")
        return

    print(f"üìä Evaluating {len(csv_files_tuples)} model combinations")

    # Case-level rows grouped by model: {'<base_model>': [ {case_id, accuracy, ...}, ... ]}
    model_results = {}

    # 2) Evaluate each model/top-k CSV and accumulate its cases by model
    for csv_path, csv_file in csv_files_tuples:
        model_topk = csv_file.replace('.csv', '')
        print(f"üîÑ Evaluating {model_topk}")

        case_df, overall_df = evaluate_csv_results(
            csv_path=csv_path,
            gold_answers=gold_answers,
            model_name=model_topk
        )

        if case_df is not None:
            base_model = model_topk.split("_top")[0] if "_top" in model_topk else model_topk.split("_aggregated")[0]
            model_results.setdefault(base_model, []).extend(case_df.to_dict('records'))

    # 3) Collect matching files
    all_files_with_matching = get_rag_combination_files(base_dir, include_matching=True)

    for model_name, all_cases in model_results.items():
        if not all_cases:
            continue

        # Accuracy pooled over every question of every case of this model
        total_cases = len(all_cases)
        total_correct = sum(case['correct_count'] for case in all_cases)
        total_questions = sum(case['total_count'] for case in all_cases)
        overall_accuracy = (total_correct / total_questions * 100) if total_questions > 0 else 0.0
        case_accuracies = [case['accuracy'] for case in all_cases]

        # Matching files of this model across all top-k folders, named
        # '{model_name}_top{k}_matching.csv' or '{model_name}_sample_top{k}_matching.csv'
        prefixes = [model_name, f"{model_name}_sample"]
        matching_files_for_model = [
            (full, name)
            for (full, name) in all_files_with_matching
            if name.endswith("_matching.csv")
            and any(name.startswith(f"{p}_top") for p in prefixes)
        ]

        total_attempted = 0
        matched_count = 0
        for full_path, _ in matching_files_for_model:
            # One unusable matching file should not abort the whole summary;
            # report it and keep going with the remaining files.
            try:
                mdf = pd.read_csv(full_path)
            except Exception as e:
                print(f"‚ö†Ô∏è Failed to read {full_path}: {e}")
                continue
            if "matched" not in mdf.columns:
                print(f"‚ö†Ô∏è 'matched' column not found in {full_path}")
                continue

            matched_series = mdf["matched"].map(_is_matched)
            total_attempted += len(matched_series)
            matched_count += int(matched_series.sum())

        if total_attempted > 0:
            match_rate_str = f"{(matched_count / total_attempted * 100):.2f}%"
        else:
            match_rate_str = "N/A"

        summary_records.append({
            "Model_Name": model_name,
            "Overall_Accuracy": f"{overall_accuracy:.2f}%",
            "Case_Accuracy_Mean": f"{np.mean(case_accuracies):.2f}%",
            "Case_Accuracy_Std": f"{np.std(case_accuracies):.2f}%",
            "Total_Cases": total_cases,
            "Total_Questions": total_questions,
            "Total_Correct": total_correct,
            "Policy_Match_Rate": match_rate_str,
            "Total_Attempted": total_attempted if total_attempted > 0 else "N/A",
            "Matched_Count": matched_count if total_attempted > 0 else "N/A",
        })

    # 4) Save summary
    if summary_records:
        comparative_dir = os.path.join(base_dir, "evaluation", "comparative")
        os.makedirs(comparative_dir, exist_ok=True)

        # Use the provided path if given, otherwise the default location
        out_path = summary_output_csv if summary_output_csv else os.path.join(comparative_dir, "model_summary.csv")

        summary_df = pd.DataFrame(summary_records)
        summary_df.to_csv(out_path, index=False)
        print(f"\n‚úÖ Summary saved to: {out_path}")
        print(summary_df)


In [12]:
# Root of the per-combination result folders and location of the gold answers.
base_dir = "/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample"
ground_truth_path = "/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/dataset/sample_ground_truth.json"
with open(ground_truth_path, "r") as f:
    ground_truth = json.load(f)

# An empty summary_output_csv is falsy, so the summary is written to the default
# <base_dir>/evaluation/comparative/model_summary.csv.
evaluate_all_folders_with_summary(
    base_dir=base_dir,
    gold_answers=ground_truth,
    summary_output_csv="" 
)


üìä Evaluating 16 model combinations
üîÑ Evaluating gpt-5-mini_gpt-5-mini_top10
üîÑ Evaluating 10 cases from /home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample/gpt-5-mini_gpt-5-mini/top10/gpt-5-mini_gpt-5-mini_top10.csv
‚úÖ Case11124_top10: 55.56% accuracy (5/9)
‚úÖ Case18257_top10: 88.89% accuracy (8/9)
‚úÖ Case19321_top10: 88.89% accuracy (8/9)
‚úÖ Case7376_top10: 66.67% accuracy (6/9)
‚úÖ Case4512_top10: 66.67% accuracy (6/9)
‚úÖ Case8051_top10: 77.78% accuracy (7/9)
‚úÖ Case9349_top10: 88.89% accuracy (8/9)
‚úÖ Case10363_top10: 77.78% accuracy (7/9)
‚úÖ Case10451_top10: 77.78% accuracy (7/9)
‚úÖ Case10917_top10: 77.78% accuracy (7/9)
‚úÖ Case-level statistics saved to: /home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample/evaluation/individual/gpt-5-mini_gpt-5-mini/top10_case_level.csv
‚úÖ Overall statistics saved to: /home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample/evaluation/individual/gpt-5-

In [None]:
def evaluate_all_models_combined(base_dir, gold_answers):
    """Pool every model/top-k result CSV into one set of global statistics.

    All QnA CSVs found by ``get_rag_combination_files`` are scored against
    ``gold_answers`` on questions Q0-Q8 (exact match of stripped strings).
    Policy-match figures are gathered per file via ``get_policy_match_rate``
    and summed. Results are written to ``<base_dir>/evaluation/combined``.

    Args:
        base_dir: Root directory that contains one folder per model combination.
        gold_answers: Mapping of case id -> {question id: gold answer}.

    Returns:
        ``(combined_stats_path, matching_details_path)``; the second item is
        None when no matching data was found. ``(None, None)`` when there is
        no result CSV at all.
    """
    print(f"üîÑ Collecting all raw results from {base_dir}")

    # 1) QnA result files only; *_matching.csv is excluded here
    result_files = get_rag_combination_files(base_dir, include_matching=False)
    if not result_files:
        print("‚ö†Ô∏è No RAG combination CSV files found")
        return None, None

    question_ids = ["Q0", "Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Q7", "Q8"]
    grand_correct = 0
    grand_total = 0
    per_case_accuracy = []

    # 2) Score every row of every file and accumulate the global counters
    for result_path, _unused_name in result_files:
        try:
            frame = pd.read_csv(result_path)
        except Exception as e:
            print(f"‚ö†Ô∏è Failed to read {result_path}: {e}")
            continue

        if "case_id" not in frame.columns:
            print(f"‚ö†Ô∏è 'case_id' column not found in {result_path}. Skipped.")
            continue

        for _, record in frame.iterrows():
            raw_id = record["case_id"]
            lookup_id = raw_id.split("_top")[0] if "_top" in raw_id else raw_id

            reference = gold_answers.get(lookup_id)
            if reference is None:
                continue

            hits = 0
            asked = 0
            for qid in question_ids:
                if qid not in reference:
                    continue

                predicted = record.get(qid, "")
                expected = reference.get(qid, "")

                # A missing prediction is compared as the empty string.
                if predicted is None or (isinstance(predicted, float) and np.isnan(predicted)):
                    predicted = ""
                if isinstance(predicted, list):
                    predicted = ", ".join(map(str, predicted))
                if isinstance(expected, list):
                    expected = ", ".join(map(str, expected))

                asked += 1
                if str(predicted).strip() == str(expected).strip():
                    hits += 1

            if asked == 0:
                continue
            per_case_accuracy.append(hits / asked * 100.0)
            grand_correct += hits
            grand_total += asked

    # 3) Global accuracy and its binomial standard error (both in percent)
    if grand_total > 0 and len(per_case_accuracy) > 0:
        fraction = grand_correct / grand_total
        stats_summary = {
            "total_cases_evaluated": len(per_case_accuracy),
            "total_questions": int(grand_total),
            "total_correct": int(grand_correct),
            "overall_accuracy": round(fraction * 100.0, 2),
            "overall_accuracy_se": round(
                math.sqrt(fraction * (1.0 - fraction) / grand_total) * 100.0, 2
            ),
        }
    else:
        stats_summary = {
            "total_cases_evaluated": 0,
            "total_questions": 0,
            "total_correct": 0,
            "overall_accuracy": "N/A",
            "overall_accuracy_se": "N/A",
        }

    # 4) Global match rate summed over every model/top-k file
    matching_rows = []
    for _unused_path, name in result_files:
        rate, attempted, matched = get_policy_match_rate("", name)
        if rate == "N/A":
            continue
        matching_rows.append({
            "model_combination": name.replace(".csv", ""),
            "total_attempted": attempted,
            "matched_count": matched,
            "match_rate": rate
        })

    if matching_rows:
        attempted_sum = sum(entry["total_attempted"] for entry in matching_rows)
        matched_sum = sum(entry["matched_count"] for entry in matching_rows)
        if attempted_sum > 0:
            pooled_rate = matched_sum / attempted_sum * 100.0
        else:
            pooled_rate = 0.0
        stats_summary.update({
            "overall_match_rate": round(pooled_rate, 2),
            "overall_match_total_attempted": int(attempted_sum),
            "overall_match_total_matched": int(matched_sum),
        })
    else:
        stats_summary.update({
            "overall_match_rate": "N/A",
            "overall_match_total_attempted": "N/A",
            "overall_match_total_matched": "N/A",
        })

    # 5) Persist: one statistics file, plus matching details when available
    combined_dir = os.path.join(base_dir, "evaluation", "combined")
    os.makedirs(combined_dir, exist_ok=True)

    combined_stats_path = os.path.join(combined_dir, "all_models_combined_statistics.csv")
    pd.DataFrame([stats_summary]).to_csv(combined_stats_path, index=False)
    print(f"‚úÖ Combined statistics saved: {combined_stats_path}")

    matching_details_path = None
    if matching_rows:
        matching_details_path = os.path.join(combined_dir, "all_models_matching_details.csv")
        pd.DataFrame(matching_rows).to_csv(matching_details_path, index=False)
        print(f"‚úÖ Matching details saved: {matching_details_path}")
    else:
        print("‚ö†Ô∏è No matching details to save")

    print("\nüéâ Combined evaluation completed!")
    print(f"Results saved to: {combined_dir}")

    return combined_stats_path, matching_details_path

# Run the pooled evaluation; base_dir and ground_truth come from an earlier cell.
evaluate_all_models_combined(
    base_dir=base_dir,
    gold_answers=ground_truth
)

üîÑ Collecting all raw results from /home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample
‚úÖ Combined statistics saved: /home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample/evaluation/combined/all_models_combined_statistics.csv
‚úÖ Matching details saved: /home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample/evaluation/combined/all_models_matching_details.csv

üéâ Combined evaluation completed!
Results saved to: /home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample/evaluation/combined


('/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample/evaluation/combined/all_models_combined_statistics.csv',
 '/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/results/LLM_QnA/RAG/sample/evaluation/combined/all_models_matching_details.csv')

In [None]:
def analyze_rag_question_accuracy_combined(base_dir, ground_truth, output_dir):
    """Score each question Q0-Q8 over all RAG combinations pooled together.

    Every QnA CSV found by ``get_rag_combination_files`` is loaded and
    concatenated. Per question, predictions are compared with the gold answer
    (exact match of stripped strings) for every row whose case has a gold
    entry containing that question. Questions are then ranked by accuracy and
    labelled Easy (> 80), Medium (> 50) or Hard.

    Written to ``<base_dir>/evaluation/combined``:
    ``overall_question_accuracy.csv`` and ``experiment_summary.csv``.

    Args:
        base_dir: Root directory that contains one folder per model combination.
        ground_truth: Mapping of case id -> {question id: gold answer}.
        output_dir: Accepted but not read by this function.

    Returns:
        ``(overall_stats, difficulty_df)``, or None when no CSV was found or
        none could be loaded.
    """
    result_files = get_rag_combination_files(base_dir, include_matching=False)
    if not result_files:
        print("‚ö†Ô∏è No RAG combination CSV files found")
        return None

    file_names = [name for _, name in result_files]
    print(f"üìã Found RAG combinations: {[f.replace('.csv', '') for f in file_names]}")

    question_columns = ['Q0','Q1','Q2','Q3','Q4','Q5','Q6','Q7','Q8']

    # Load every file, tagging its rows with the combination they came from
    frames = []
    for path, name in result_files:
        try:
            frame = pd.read_csv(path)
        except Exception as e:
            print(f"‚ö†Ô∏è Failed to read {path}: {e}")
            continue
        frame['rag_combination'] = name.replace('.csv', '')
        frames.append(frame)
        print(f"‚úÖ Loaded {len(frame)} cases from {name}")

    if not frames:
        print("‚ö†Ô∏è No valid CSVs loaded")
        return None

    combined_df = pd.concat(frames, ignore_index=True)
    print(f"üìä Total combined cases: {len(combined_df)} across {len(file_names)} RAG combinations")

    # One pass over the rows, updating a [correct, total] counter per question
    tracked = [q for q in question_columns if q in combined_df.columns]
    tally = {q: [0, 0] for q in tracked}
    for _, record in combined_df.iterrows():
        raw_id = record.get('case_id', '')
        lookup_id = raw_id.split("_top")[0] if "_top" in str(raw_id) else raw_id
        reference = ground_truth.get(lookup_id)
        if reference is None:
            continue

        for q in tracked:
            if q not in reference:
                continue

            predicted = record.get(q, "")
            expected = reference.get(q, "")

            # A missing prediction is compared as the empty string.
            if predicted is None or (isinstance(predicted, float) and np.isnan(predicted)):
                predicted = ""
            if isinstance(predicted, list):
                predicted = ", ".join(map(str, predicted))
            if isinstance(expected, list):
                expected = ", ".join(map(str, expected))

            tally[q][1] += 1
            if str(predicted).strip() == str(expected).strip():
                tally[q][0] += 1

    overall_stats = {}
    for q in tracked:
        hits, seen = tally[q]
        overall_stats[q] = {
            "accuracy_pct": (hits / seen * 100.0) if seen > 0 else 0.0,
            "correct": hits,
            "total": seen,
        }

    rule = "=" * 60
    print("\n" + rule)
    print("üìä OVERALL RAG QUESTION ACCURACY (All Combinations Combined)")
    print(rule)
    for q, s in overall_stats.items():
        print(f"{q}: {s['accuracy_pct']:.3f}% ({s['correct']}/{s['total']})")

    def _difficulty(pct):
        """Map an accuracy percentage to its difficulty label."""
        if pct > 80:
            return 'Easy'
        if pct > 50:
            return 'Medium'
        return 'Hard'

    # Highest accuracy first; sorted() is stable, so ties keep question order
    ranked = sorted(overall_stats.items(), key=lambda item: item[1]['accuracy_pct'], reverse=True)
    difficulty_rows = []
    for q, stats in ranked:
        difficulty_rows.append({
            'Question': q,
            'Overall_Accuracy_pct': stats['accuracy_pct'],
            'Correct_Total': f"{stats['correct']}/{stats['total']}",
            'Difficulty_Level': _difficulty(stats['accuracy_pct']),
        })
    difficulty_df = pd.DataFrame(difficulty_rows)

    print("\n" + rule)
    print("üìä RAG QUESTION DIFFICULTY RANKING")
    print(rule)
    if not difficulty_df.empty:
        print(difficulty_df.to_string(index=False))

    combined_dir = os.path.join(base_dir, "evaluation", "combined")
    os.makedirs(combined_dir, exist_ok=True)

    accuracy_rows = []
    for q, s in overall_stats.items():
        accuracy_rows.append({
            'Question': q,
            'Accuracy_pct': s['accuracy_pct'],
            'Correct': s['correct'],
            'Total': s['total'],
        })
    overall_df = pd.DataFrame(accuracy_rows)

    overall_out = os.path.join(combined_dir, "overall_question_accuracy.csv")
    overall_df.to_csv(overall_out, index=False)
    print(f"\n‚úÖ Overall question accuracy saved: {overall_out}")

    difficulty_out = os.path.join(combined_dir, "experiment_summary.csv")
    difficulty_df.to_csv(difficulty_out, index=False)
    print(f"‚úÖ RAG difficulty ranking saved: {difficulty_out}")

    return overall_stats, difficulty_df

def analyze_rag_question_accuracy_by_combination(base_dir, ground_truth, output_dir):
    """Score each question Q0-Q8 separately for every RAG combination.

    For each QnA CSV found by ``get_rag_combination_files``, predictions are
    compared with the gold answers (exact match of stripped strings) and an
    accuracy percentage is computed per question column present in the file.

    Written to ``<base_dir>/evaluation/comparative``:
    ``question_accuracy_by_model.csv`` (one row per combination, 0.0 where a
    question has no result) and ``detailed_question_analysis.json``.

    Args:
        base_dir: Root directory that contains one folder per model combination.
        ground_truth: Mapping of case id -> {question id: gold answer}.
        output_dir: Accepted but not read by this function.

    Returns:
        ``(all_results, results_df)``, or None when no CSV was found.
    """
    result_files = get_rag_combination_files(base_dir, include_matching=False)
    if not result_files:
        print("‚ö†Ô∏è No RAG combination CSV files found")
        return None

    question_columns = ['Q0','Q1','Q2','Q3','Q4','Q5','Q6','Q7','Q8']
    all_results = {}

    for path, name in result_files:
        combo_label = name.replace('.csv', '')
        try:
            frame = pd.read_csv(path)
        except Exception as e:
            print(f"‚ö†Ô∏è Failed to read {path}: {e}")
            continue

        # One pass over the rows, updating a [correct, total] counter per question
        tracked = [q for q in question_columns if q in frame.columns]
        tally = {q: [0, 0] for q in tracked}
        for _, record in frame.iterrows():
            raw_id = record.get('case_id', '')
            lookup_id = raw_id.split("_top")[0] if "_top" in str(raw_id) else raw_id
            reference = ground_truth.get(lookup_id)
            if reference is None:
                continue

            for q in tracked:
                if q not in reference:
                    continue

                predicted = record.get(q, "")
                expected = reference.get(q, "")

                # A missing prediction is compared as the empty string.
                if predicted is None or (isinstance(predicted, float) and np.isnan(predicted)):
                    predicted = ""
                if isinstance(predicted, list):
                    predicted = ", ".join(map(str, predicted))
                if isinstance(expected, list):
                    expected = ", ".join(map(str, expected))

                tally[q][1] += 1
                if str(predicted).strip() == str(expected).strip():
                    tally[q][0] += 1

        per_question = {}
        for q in tracked:
            hits, seen = tally[q]
            per_question[q] = {
                "accuracy_pct": (hits / seen * 100.0) if seen > 0 else 0.0,
                "correct": hits,
                "total": seen,
            }

        all_results[combo_label] = per_question
        print(f"‚úÖ Processed {combo_label}: {len(frame)} cases")

    # Wide table: one row per combination (sorted by name), one column per question
    table_rows = []
    for combo_label in sorted(all_results.keys()):
        scores = all_results[combo_label]
        table_row = {'RAG_Combination': combo_label}
        table_row.update({q: scores.get(q, {}).get('accuracy_pct', 0.0) for q in question_columns})
        table_rows.append(table_row)

    results_df = pd.DataFrame(table_rows)
    rule = "=" * 60
    print("\n" + rule)
    print("üìä RAG COMBINATION-WISE QUESTION ACCURACY (%, higher is better)")
    print(rule)
    if not results_df.empty:
        print(results_df.round(2).to_string(index=False))

    comparative_dir = os.path.join(base_dir, "evaluation", "comparative")
    os.makedirs(comparative_dir, exist_ok=True)

    combination_out = os.path.join(comparative_dir, "question_accuracy_by_model.csv")
    results_df.to_csv(combination_out, index=False)
    print(f"\n‚úÖ Model-wise question accuracy saved: {combination_out}")

    detailed_out = os.path.join(comparative_dir, "detailed_question_analysis.json")
    with open(detailed_out, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)
    print(f"‚úÖ Detailed question analysis saved: {detailed_out}")

    return all_results, results_df


# Usage example:
def run_complete_rag_question_analysis(base_dir, ground_truth, output_dir):
    """Run both RAG question-accuracy analyses and return all of their results.

    The combined analysis (``analyze_rag_question_accuracy_combined``) is run
    first, followed by the per-combination analysis
    (``analyze_rag_question_accuracy_by_combination``). A progress banner is
    printed before each step and a completion message afterwards.

    Args:
        base_dir: Forwarded unchanged to both analysis functions; also used to
            build the ``<base_dir>/evaluation`` path shown in the final message.
        ground_truth: Forwarded unchanged to both analysis functions.
        output_dir: Forwarded unchanged to both analysis functions.

    Returns:
        A 4-tuple ``(overall_stats, difficulty_df, detailed_results,
        combination_df)``: the two values returned by the combined analysis
        followed by the two values returned by the per-combination analysis.

    Any exception raised by either analysis function propagates to the caller.
    """
    # The emoji are written as \N{...} escapes so this source stays pure ASCII.
    # The previous literal characters had been mis-decoded (UTF-8 read as a
    # legacy 8-bit encoding) and were printed as mojibake instead of emoji.
    rocket = "\N{ROCKET}"
    party = "\N{PARTY POPPER}"
    folder = "\N{FILE FOLDER}"
    # Appended to a digit, this pair renders the digit as a keycap emoji.
    keycap = "\N{VARIATION SELECTOR-16}\N{COMBINING ENCLOSING KEYCAP}"

    print(f"{rocket} Starting Complete RAG Question Analysis")
    print("=" * 60)

    # 1) Overall analysis: every RAG combination pooled together.
    print(f"\n1{keycap} Running Overall Analysis (All RAG Combinations Combined)")
    overall_stats, difficulty_df = analyze_rag_question_accuracy_combined(
        base_dir=base_dir,
        ground_truth=ground_truth,
        output_dir=output_dir,
    )

    # 2) Combination-wise analysis: one set of accuracies per RAG combination.
    print(f"\n2{keycap} Running Combination-wise Analysis")
    detailed_results, combination_df = analyze_rag_question_accuracy_by_combination(
        base_dir=base_dir,
        ground_truth=ground_truth,
        output_dir=output_dir,
    )

    print(f"\n{party} Complete RAG Question Analysis Finished!")
    print(f"{folder} All results saved under: {os.path.join(base_dir, 'evaluation')}")
    return overall_stats, difficulty_df, detailed_results, combination_df

# Run analysis
# Calls run_complete_rag_question_analysis once and binds its four return
# values as notebook-level names.
# `base_dir` and `ground_truth` are not defined in this cell; presumably they
# are set in an earlier cell -- TODO confirm this survives Restart & Run All.
# NOTE(review): output_dir is passed as an empty string, while the wrapper's
# final message reports results under <base_dir>/evaluation -- confirm that an
# empty output_dir is what the analysis functions expect.
overall_stats, difficulty_df, detailed_results, combination_df = run_complete_rag_question_analysis(
    base_dir=base_dir,
    ground_truth=ground_truth,
    output_dir=""
)


üöÄ Starting Complete RAG Question Analysis

1Ô∏è‚É£ Running Overall Analysis (All RAG Combinations Combined)
üìã Found RAG combinations: ['gpt-5-mini_gpt-5-mini_top10', 'gpt-5-mini_gpt-5-mini_top1', 'gpt-5-mini_gpt-5-mini_top3', 'gpt-5-mini_gpt-5-mini_top5', 'perplexity_gpt-5-mini_top10', 'perplexity_gpt-5-mini_top1', 'perplexity_gpt-5-mini_top3', 'perplexity_gpt-5-mini_top5', 'gpt-5-mini_perplexity_top10', 'gpt-5-mini_perplexity_top1', 'gpt-5-mini_perplexity_top3', 'gpt-5-mini_perplexity_top5', 'perplexity_perplexity_top10', 'perplexity_perplexity_top1', 'perplexity_perplexity_top3', 'perplexity_perplexity_top5']
‚úÖ Loaded 10 cases from gpt-5-mini_gpt-5-mini_top10.csv
‚úÖ Loaded 2 cases from gpt-5-mini_gpt-5-mini_top1.csv
‚úÖ Loaded 6 cases from gpt-5-mini_gpt-5-mini_top3.csv
‚úÖ Loaded 9 cases from gpt-5-mini_gpt-5-mini_top5.csv
‚úÖ Loaded 10 cases from perplexity_gpt-5-mini_top10.csv
‚úÖ Loaded 2 cases from perplexity_gpt-5-mini_top1.csv
‚úÖ Loaded 6 cases from perplexity_gpt-5-