# RAG System

In [48]:
import os
import pickle
import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

import numpy as np
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

@dataclass
class DocumentChunk:
    """One retrievable piece of document text plus its bookkeeping data."""

    # The chunk's text content (this is what gets embedded and returned).
    text: str
    # Free-form metadata describing the chunk. Annotated with typing.Any:
    # the builtin ``any`` is a function, not a type.
    metadata: Dict[str, Any]
    # Identifier string for the chunk.
    chunk_id: str

class FocusedThaiMedicalProcessor:
    def __init__(self, model_name: str = "BAAI/bge-m3"):
        """Load the embedding model and initialise empty chunk storage.

        Args:
            model_name: Name or path handed to ``SentenceTransformer``. If
                loading it raises, a multilingual MiniLM model is loaded
                instead.

        After construction ``self.model_name`` holds the name of the model
        that was actually loaded (requested or fallback).
        """
        fallback_name = "paraphrase-multilingual-MiniLM-L12-v2"
        try:
            self.model = SentenceTransformer(model_name)
            # Remember which model is really in use so it can be reported
            # accurately later (e.g. in saved metadata).
            self.model_name = model_name
            print("BGE-M3 model loaded successfully!")
        except Exception as e:
            print(f"Error loading model {model_name}: {e}")
            print("Trying fallback model...")
            self.model = SentenceTransformer(fallback_name)
            self.model_name = fallback_name
            print("Fallback model loaded!")

        # Initialize storage for embeddings and chunks
        self.chunks = []
        self.embeddings = None

    def extract_focused_sections(self, pdf_path: str, page_offset: int = 5) -> Dict[str, List[Dict[str, Any]]]:
        """Extract the two target sections page by page, keeping page numbers.

        Args:
            pdf_path: Path passed to ``PdfReader``.
            page_offset: Value added to a printed (table-of-contents) page
                number to obtain the 1-based page position inside the PDF.

        Returns:
            Mapping of section name to a list of
            ``{"page_number": int, "text": str}`` dicts. ``page_number`` is
            the 1-based position of the page in the PDF file and ``text`` has
            been passed through ``_clean_text``.
        """
        reader = PdfReader(pdf_path)
        total_pages = len(reader.pages)

        # Printed page ranges of the wanted sections, 1-indexed and INCLUSIVE.
        target_sections = {
            'general_knowledge': (4, 23),            # general knowledge section
            'investigation_guidelines': (24, 61),    # investigation guidelines section
        }

        extracted_sections: Dict[str, List[Dict[str, Any]]] = {}

        for section_name, (start_page, end_page) in target_sections.items():
            # 0-based index of the first page of the section.
            first_index = max(start_page + page_offset - 1, 0)
            # The 0-based index of the LAST page is end_page + page_offset - 1,
            # so the exclusive stop for range() is one past it. Using the last
            # index itself as the stop silently dropped the final page of
            # every section.
            stop_index = min(end_page + page_offset, total_pages)

            print(
                f"Extracting {section_name}: TOC pages {start_page}-{end_page} "
                f"(actual PDF pages {first_index + 1}-{stop_index})"
            )

            pages_list: List[Dict[str, Any]] = []
            for p in range(first_index, stop_index):
                # extract_text() may return None; treat that as an empty page.
                page_text = self._clean_text(reader.pages[p].extract_text() or "")
                pages_list.append({
                    "page_number": p + 1,  # human-readable (1-based)
                    "text": page_text
                })

            extracted_sections[section_name] = pages_list

        return extracted_sections


    def _clean_text(self, text: str) -> str:
        """Clean up extracted text"""
        # Remove multiple newlines
        text = re.sub(r'\n\s*\n', '\n\n', text)
        # Remove extra spaces
        text = re.sub(r' +', ' ', text)
        # Keep Thai characters, English, numbers, and basic punctuation
        text = re.sub(r'[^\u0E00-\u0E7F\u0020-\u007E\u00A0-\u00FF\n\r\t]', '', text)
        return text.strip()

    def create_focused_chunks(self, sections: Dict[str, List[Dict[str, any]]], chunk_size: int = 800) -> List[DocumentChunk]:
        """
        Pack page-wise section text into chunks that remember their pages.

        ``sections`` maps a section name to a list of
        ``{'page_number': int, 'text': str}`` dicts. A plain string is also
        accepted for a section and is treated as one page without a number.

        Paragraphs (split on blank lines) are appended to the current chunk
        until adding the next one would push it past ``chunk_size``
        characters; a single paragraph longer than ``chunk_size`` is kept
        whole. Returns one ``DocumentChunk`` per packed chunk, in order.
        """

        def _snapshot(body, seen_pages):
            # Freeze the running buffer into a chunk record with page info.
            ordered = sorted(n for n in seen_pages if n is not None)
            return {
                "text": body.strip(),
                "page_start": ordered[0] if ordered else None,
                "page_end": ordered[-1] if ordered else None,
                "pages": ordered
            }

        collected: List[DocumentChunk] = []

        for section_name, pages in sections.items():
            # Legacy callers may pass one big string instead of a page list.
            if isinstance(pages, str):
                pages = [{"page_number": None, "text": pages}]

            total_chars = sum(len(pg["text"]) for pg in pages if pg.get("text"))
            print(f"\nChunking {section_name} ({total_chars} characters across {len(pages)} pages)")

            # Flatten pages into (page_number, paragraph) pairs, skipping blanks.
            paragraphs = [
                (pg.get("page_number"), piece.strip())
                for pg in pages
                for piece in (pg.get("text") or "").split("\n\n")
                if piece.strip()
            ]

            packed = []
            buffer = ""
            buffer_pages = set()
            for page_no, piece in paragraphs:
                joined = f"{buffer}\n\n{piece}" if buffer else piece
                if buffer and len(joined) > chunk_size:
                    # Buffer is full: emit it and start over with this paragraph.
                    if buffer.strip():
                        packed.append(_snapshot(buffer, buffer_pages))
                    buffer, buffer_pages = piece, {page_no}
                    continue
                buffer = joined
                if page_no is not None:
                    buffer_pages.add(page_no)

            # Whatever is left over forms the final chunk of the section.
            if buffer.strip():
                packed.append(_snapshot(buffer, buffer_pages))

            section_total = len(packed)
            for position, record in enumerate(packed):
                collected.append(DocumentChunk(
                    text=record["text"],
                    metadata={
                        'section_name': section_name,
                        'chunk_index': position,
                        'section_total_chunks': section_total,
                        'language': 'thai',
                        'source': 'thai_medical_guide',
                        'length': len(record["text"]),
                        'page_start': record["page_start"],
                        'page_end': record["page_end"],
                        'pages': record["pages"],
                    },
                    chunk_id=f"{section_name}_chunk_{position}"
                ))

            print(f"  Created {section_total} chunks for {section_name}")

        print(f"\nTotal chunks created: {len(collected)}")
        return collected


    def _split_into_chunks(self, text: str, chunk_size: int) -> List[str]:
        """Split text into chunks by paragraphs and size"""
        paragraphs = text.split('\n\n')
        chunks = []
        current_chunk = ""

        for paragraph in paragraphs:
            # If adding this paragraph would exceed chunk size and we have content
            if len(current_chunk + paragraph) > chunk_size and current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = paragraph
            else:
                current_chunk += "\n\n" + paragraph if current_chunk else paragraph

        # Add the last chunk
        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks

    def generate_embeddings(self, chunks: List[DocumentChunk], batch_size: int = 32) -> np.ndarray:
        """Generate embeddings for chunks"""
        texts = [chunk.text for chunk in chunks]

        print(f"Generating embeddings for {len(texts)} chunks...")

        try:
            embeddings = self.model.encode(
                texts,
                batch_size=batch_size,
                show_progress_bar=True,
                convert_to_numpy=True
            )
            print(f"Embeddings generated! Shape: {embeddings.shape}")
            return embeddings
        except Exception as e:
            print(f"Error with batch_size {batch_size}: {e}")
            print("Retrying with smaller batch...")
            embeddings = self.model.encode(
                texts,
                batch_size=8,
                show_progress_bar=True,
                convert_to_numpy=True
            )
            print(f"Embeddings generated with smaller batch! Shape: {embeddings.shape}")
            return embeddings

    def build_system(self, pdf_path: str, chunk_size: int = 800):
        """Run extraction, chunking and embedding, storing results on ``self``.

        Args:
            pdf_path: PDF path forwarded to ``extract_focused_sections``.
            chunk_size: Chunk size forwarded to ``create_focused_chunks``.

        Side effects:
            Sets ``self.chunks`` and ``self.embeddings`` and prints progress.
        """
        print("=== Building Focused Thai Medical System ===")

        # Step 1: Extract focused sections
        print("\nStep 1: Extracting target sections...")
        sections = self.extract_focused_sections(pdf_path)

        # Show what we extracted. Each section is a list of page dicts, so
        # len(pages) is a page count; characters must be summed over pages.
        for section_name, pages in sections.items():
            char_count = sum(len(page.get("text") or "") for page in pages)
            print(f"  {section_name}: {len(pages)} pages, {char_count} characters")

        # Step 2: Create chunks
        print("\nStep 2: Creating chunks...")
        self.chunks = self.create_focused_chunks(sections, chunk_size)

        # Step 3: Generate embeddings
        print("\nStep 3: Generating embeddings...")
        self.embeddings = self.generate_embeddings(self.chunks)

        print("\n=== System Ready! ===")
        print(f"Total chunks: {len(self.chunks)}")

        # Show chunk distribution by section
        section_counts = {}
        for chunk in self.chunks:
            section = chunk.metadata['section_name']
            section_counts[section] = section_counts.get(section, 0) + 1
        print(f"Chunk distribution: {section_counts}")

    def save_system(self, filepath: str):
        """Save the system to local files"""
        print(f"üíæ Saving system to {filepath}...")

        # Prepare data to save
        data = {
            'chunks': self.chunks,
            'embeddings': self.embeddings,
            'model_info': {
                'model_name': 'BAAI/bge-m3',
                'embedding_dim': self.embeddings.shape[1] if self.embeddings is not None else None,
                'num_chunks': len(self.chunks)
            }
        }

        # Save with pickle
        with open(filepath, 'wb') as f:
            pickle.dump(data, f)

        file_size = os.path.getsize(filepath) / (1024 * 1024)  # MB
        print(f"‚úÖ System saved successfully! File size: {file_size:.2f} MB")

    def load_system(self, filepath: str):
        """Load the system from local files"""
        print(f"üìÇ Loading system from {filepath}...")

        with open(filepath, 'rb') as f:
            data = pickle.load(f)

        self.chunks = data['chunks']
        self.embeddings = data['embeddings']

        print(f"‚úÖ System loaded successfully!")
        print(f"  Chunks: {len(self.chunks)}")
        print(f"  Embeddings shape: {self.embeddings.shape}")
        print(f"  Model info: {data.get('model_info', 'N/A')}")

    def search(self, query: str, top_k: int = 5, section_filter: str = None) -> List[Dict]:
        """Search for relevant chunks"""
        if self.embeddings is None or not self.chunks:
            raise ValueError("System not built or loaded. Please build or load system first.")

        print(f"üîç Searching: '{query}'")

        # Filter chunks by section if specified
        if section_filter:
            filtered_indices = [
                i for i, chunk in enumerate(self.chunks)
                if chunk.metadata['section_name'] == section_filter
            ]
            filtered_embeddings = self.embeddings[filtered_indices]
            filtered_chunks = [self.chunks[i] for i in filtered_indices]
            print(f"  Filtering to section: {section_filter} ({len(filtered_chunks)} chunks)")
        else:
            filtered_embeddings = self.embeddings
            filtered_chunks = self.chunks
            filtered_indices = list(range(len(self.chunks)))

        # Generate query embedding
        query_embedding = self.model.encode([query])

        # Calculate similarities
        similarities = cosine_similarity(query_embedding, filtered_embeddings)[0]

        # Get top-k results
        top_indices = np.argsort(similarities)[::-1][:top_k]

        results = []
        for idx in top_indices:
            chunk = filtered_chunks[idx]
            results.append({
                'text': chunk.text,
                'metadata': chunk.metadata,
                'score': float(similarities[idx]),
                'chunk_id': chunk.chunk_id,
                'relevance_score': float(similarities[idx])
            })

        print(f"  Found {len(results)} results")
        return results

#     def answer_question(self, question: str, top_k: int = 3, section_filter: str = None) -> str:
#         """Answer question based on retrieved context"""
#         # Search for relevant chunks
#         results = self.search(question, top_k, section_filter)

#         if not results:
#             return "‡πÑ‡∏°‡πà‡∏û‡∏ö‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ó‡∏µ‡πà‡πÄ‡∏Å‡∏µ‡πà‡∏¢‡∏ß‡∏Ç‡πâ‡∏≠‡∏á‡∏Å‡∏±‡∏ö‡∏Ñ‡∏≥‡∏ñ‡∏≤‡∏°"

#         # Combine context
#         context_parts = []
#         for r in results:
#             section_name = r['metadata']['section_name']
#             context_parts.append(f"[{section_name}]: {r['text'][:200]}")

#         context = f"{'...'*3}\n\n".join(context_parts)

#         # Generate answer
#         answer = f"""‡∏ï‡∏≤‡∏°‡πÄ‡∏≠‡∏Å‡∏™‡∏≤‡∏£‡πÅ‡∏ô‡∏ß‡∏ó‡∏≤‡∏á‡∏Å‡∏≤‡∏£‡∏™‡∏≠‡∏ö‡∏™‡∏ß‡∏ô‡πÅ‡∏•‡∏∞‡∏Ñ‡∏ß‡∏ö‡∏Ñ‡∏∏‡∏°‡πÇ‡∏£‡∏Ñ‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡πÄ‡∏õ‡πá‡∏ô‡∏û‡∏¥‡∏©:

# {context}

# ‡∏´‡∏°‡∏≤‡∏¢‡πÄ‡∏´‡∏ï‡∏∏: ‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏à‡∏≤‡∏Å‡∏™‡πà‡∏ß‡∏ô {', '.join(set([r['metadata']['section_name'] for r in results]))}
# ‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏Å‡∏µ‡πà‡∏¢‡∏ß‡∏Ç‡πâ‡∏≠‡∏á: {[f"{r['score']:.3f}" for r in results]}"""

#         return answer

def create_system(pdf_path: str, save_filename: str = "medical_system.pkl",
                 chunk_size: int = 800, model_name: str = "models/bge-m3"):
    """Return a ready processor, loading a saved one or building it fresh.

    ``save_filename`` gets a ``.pkl`` suffix appended when it lacks one. If
    that file exists it is loaded; otherwise the system is built from
    ``pdf_path`` with ``chunk_size`` and then saved under that name.
    """
    target = save_filename if save_filename.endswith('.pkl') else save_filename + '.pkl'

    # The model is needed in both branches (queries are embedded at search time).
    system = FocusedThaiMedicalProcessor(model_name)

    if not os.path.exists(target):
        # No cache on disk yet: build from the PDF and persist the result.
        print(f"üöÄ Building new system: {target}")
        system.build_system(pdf_path, chunk_size)
        system.save_system(target)
    else:
        print(f"üìÇ Found existing system: {target}")
        system.load_system(target)

    return system

## RAG System Testing

In [None]:
import json
from typing import Dict, List
from groq import Groq

# Initialize Groq client.
# The key is read from the GROQ_API_KEY environment variable so the secret
# does not have to be written into the source; when the variable is unset the
# previous behaviour (an empty key) is kept.
import os

GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
client = Groq(api_key=GROQ_API_KEY)

class RAGEnhancementSystem:
    def __init__(self, rag_system):
        """
        Store the retrieval system used to look up guideline passages.

        Args:
            rag_system: An already built or loaded retrieval object. This
                class only calls ``rag_system.search(text, top_k=...)`` on it
                (presumably a FocusedThaiMedicalProcessor).
        """
        self.rag_system = rag_system
        
    def extract_actionable_items(self, analysis_json: str) -> List[str]:
        """
        ‡πÅ‡∏¢‡∏Å‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏à‡∏≤‡∏Å Actionable_Instructions ‡πÅ‡∏•‡∏∞ Future_plan_add-ons
        
        Args:
            analysis_json: JSON string ‡∏à‡∏≤‡∏Å‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå pipeline ‡∏´‡∏•‡∏±‡∏Å
            
        Returns:
            List of actionable items to query
        """
        try:
            data = json.loads(analysis_json)
            items = []
            
            # Extract from Actionable_Instructions
            actionable = data.get("Actionable_Instructions", {})
            
            # Next_24-72_hr
            next_24_72 = actionable.get("Next_24-72_hr", [])
            for item in next_24_72:
                items.append({
                    "text": item,
                    "category": "Next_24-72_hr",
                    "type": "actionable"
                })
            
            # Next_1-2_weeks
            next_1_2_weeks = actionable.get("Next_1-2_weeks", [])
            for item in next_1_2_weeks:
                items.append({
                    "text": item,
                    "category": "Next_1-2_weeks", 
                    "type": "actionable"
                })
            
            # Future_plan_add-ons
            future_plans = data.get("Future_plan_add-ons", [])
            for item in future_plans:
                items.append({
                    "text": item,
                    "category": "Future_plan_add-ons",
                    "type": "future_plan"
                })
            
            print(f"üìã ‡πÅ‡∏¢‡∏Å‡πÑ‡∏î‡πâ {len(items)} ‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏Å‡∏≤‡∏£ query:")
            for i, item in enumerate(items, 1):
                print(f"   {i}. [{item['category']}] {item['text'][:60]}...")
            
            return items
            
        except json.JSONDecodeError as e:
            print(f"‚ùå Error parsing JSON: {e}")
            return []
        except Exception as e:
            print(f"‚ùå Error extracting items: {e}")
            return []

    def query_each_item(self, items: List[Dict], top_k: int = 3) -> List[Dict]:
        """
        Query ‡πÅ‡∏ï‡πà‡∏•‡∏∞ item ‡πÉ‡∏ô RAG system
        
        Args:
            items: List of items ‡∏à‡∏≤‡∏Å extract_actionable_items
            top_k: ‡∏à‡∏≥‡∏ô‡∏ß‡∏ô results ‡∏ó‡∏µ‡πà‡∏ï‡πâ‡∏≠‡∏á‡∏Å‡∏≤‡∏£‡∏à‡∏≤‡∏Å‡πÅ‡∏ï‡πà‡∏•‡∏∞ query
            
        Returns:
            List ‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡∏Å‡∏≤‡∏£ query ‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î
        """
        all_results = []
        
        print(f"\nüîç ‡∏Å‡∏≥‡∏•‡∏±‡∏á query {len(items)} ‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£‡πÉ‡∏ô RAG system...")
        
        for i, item in enumerate(items, 1):
            # print(f"\n--- Query {i}/{len(items)} ---")
            # print(f"‡∏´‡∏°‡∏ß‡∏î: {item['category']}")
            # print(f"‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°: {item['text']}")
            
            # Query ‡πÉ‡∏ô RAG system
            try:
                results = self.rag_system.search(item['text'], top_k=top_k)
                
                item_result = {
                    'original_item': item,
                    'query_results': results,
                    'query_success': True
                }
                
                # print(f"‚úÖ ‡∏û‡∏ö {len(results)} ‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå")
                for j, result in enumerate(results, 1):
                    section = self._translate_section_name(result['metadata']['section_name'])
                    score = result['score']
                    print(f"   {j}. {section} (‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô: {score:.4f})")
                
            except Exception as e:
                print(f"‚ùå Error querying: {e}")
                item_result = {
                    'original_item': item,
                    'query_results': [],
                    'query_success': False,
                    'error': str(e)
                }
            
            all_results.append(item_result)
        
        return all_results

    def _translate_section_name(self, section_name: str) -> str:
        """‡πÅ‡∏õ‡∏•‡∏á‡∏ä‡∏∑‡πà‡∏≠ section ‡πÄ‡∏õ‡πá‡∏ô‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢"""
        translations = {
            'general_knowledge': '‡∏Ñ‡∏ß‡∏≤‡∏°‡∏£‡∏π‡πâ‡∏ó‡∏±‡πà‡∏ß‡πÑ‡∏õ‡∏Ç‡∏≠‡∏á‡πÇ‡∏£‡∏Ñ‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡πÄ‡∏õ‡πá‡∏ô‡∏û‡∏¥‡∏©',
            'investigation_guidelines': '‡πÅ‡∏ô‡∏ß‡∏ó‡∏≤‡∏á‡∏Å‡∏≤‡∏£‡∏™‡∏≠‡∏ö‡∏™‡∏ß‡∏ô‡∏Å‡∏≤‡∏£‡∏£‡∏∞‡∏ö‡∏≤‡∏î‡πÇ‡∏£‡∏Ñ‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡πÄ‡∏õ‡πá‡∏ô‡∏û‡∏¥‡∏©'
        }
        return translations.get(section_name, section_name)

    def analyze_with_llm(self, original_json: str, rag_results: List[Dict]) -> str:
        """
        Ask the LLM to refine the recommendations using the retrieved context.

        Args:
            original_json: Original analysis JSON from the pipeline.
            rag_results: Per-item retrieval outcomes (see ``query_each_item``).

        Returns:
            The model reply re-serialised as pretty-printed JSON when it
            parses; the cleaned raw reply when it does not; or a string
            starting with ``"Error in analysis: "`` when the request fails.
        """
        # Build both chat messages up front (context first, then prompts).
        context_block = self._format_rag_context(rag_results)
        chat_messages = [
            {"role": "system", "content": self._create_analysis_prompt()},
            {"role": "user", "content": self._create_user_prompt(original_json, context_block)},
        ]

        print("\nüß† ‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå‡∏î‡πâ‡∏ß‡∏¢ LLM...")

        try:
            completion = client.chat.completions.create(
                model="qwen/qwen3-32b",
                messages=chat_messages,
                temperature=0.2,
                max_tokens=4000,
                timeout=60
            )

            # Drop reasoning tags / code fences before attempting to parse.
            raw_reply = strip_think(completion.choices[0].message.content.strip())

            try:
                return json.dumps(json.loads(raw_reply), ensure_ascii=False, indent=2)
            except Exception as e:
                # Not valid JSON: hand the text back so the caller can decide.
                print(f"‚ùå JSON parse error: {e}")
                return raw_reply
        except Exception as e:
            print(f"‚ùå Error in LLM analysis: {e}")
            return f"Error in analysis: {str(e)}"

    def _format_rag_context(self, rag_results: List[Dict]) -> str:
        """‡∏à‡∏±‡∏î‡∏£‡∏π‡∏õ‡πÅ‡∏ö‡∏ö‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏à‡∏≤‡∏Å RAG ‡πÄ‡∏õ‡πá‡∏ô context ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö LLM"""
        
        context_parts = []
        
        for i, result in enumerate(rag_results, 1):
            original_item = result['original_item']
            query_results = result.get('query_results', [])
            
            context_parts.append(f"\n=== ‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£‡∏ó‡∏µ‡πà {i} ===")
            context_parts.append(f"‡∏´‡∏°‡∏ß‡∏î: {original_item['category']}")
            context_parts.append(f"‡∏Ç‡πâ‡∏≠‡πÄ‡∏™‡∏ô‡∏≠‡πÅ‡∏ô‡∏∞‡πÄ‡∏î‡∏¥‡∏°: {original_item['text']}")
            
            if query_results:
                context_parts.append(f"‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏™‡∏ô‡∏±‡∏ö‡∏™‡∏ô‡∏∏‡∏ô‡∏à‡∏≤‡∏Å‡πÄ‡∏≠‡∏Å‡∏™‡∏≤‡∏£ ({len(query_results)} ‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£):")
                
                for j, qr in enumerate(query_results, 1):
                    section_thai = self._translate_section_name(qr['metadata']['section_name'])
                    score = qr['score']
                    text_preview = qr['text'][:300] + "..." if len(qr['text']) > 300 else qr['text']
                    
                    context_parts.append(f"\n  {j}. ‡πÅ‡∏´‡∏•‡πà‡∏á‡∏ó‡∏µ‡πà‡∏°‡∏≤: {section_thai}")
                    context_parts.append(f"     ‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏Å‡∏µ‡πà‡∏¢‡∏ß‡∏Ç‡πâ‡∏≠‡∏á: {score:.4f}")
                    context_parts.append(f"     ‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤: {text_preview}")
                    context_parts.append(f"     ‡∏´‡∏ô‡πâ‡∏≤: {qr['metadata'].get('page_start','?')}-{qr['metadata'].get('page_end','?')}")
            else:
                context_parts.append("‡πÑ‡∏°‡πà‡∏û‡∏ö‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏™‡∏ô‡∏±‡∏ö‡∏™‡∏ô‡∏∏‡∏ô‡∏ó‡∏µ‡πà‡πÄ‡∏Å‡∏µ‡πà‡∏¢‡∏ß‡∏Ç‡πâ‡∏≠‡∏á")
        
        return "\n".join(context_parts)

    def _create_analysis_prompt(self) -> str:
        """Return the fixed system prompt for guideline-based refinement.

        The prompt is a constant string (no interpolation). It tells the model
        to improve each recommendation only when the guideline material is
        relevant, to flag that via ``Use_RAG``, and to answer strictly in JSON.
        """

        return """You are an expert in foodborne disease outbreak investigation and control.
    You are tasked with refining preliminary recommendations using the official guideline document as a reference.

    Your responsibilities:
    1. Analyze whether information from the guideline is relevant to improve the recommendation.
    2. If relevant, enhance the recommendation to make it more accurate, complete, and aligned with the guideline.
    3. If not relevant, keep the original recommendation unchanged.
    4. Clearly specify whether RAG information was used or not.

    Evaluation criteria:
    - Consistency with guideline content
    - Scientific accuracy
    - Practical feasibility
    - Completeness of recommendations

    Response format:
    Return the output strictly in JSON format. 
    For each recommendation:
    - Always include: Original, Enhanced, and Use_RAG.
    - If Use_RAG = true ‚Üí also include Guideline_Reference, Page_Number, and Relevance_Score (decimal with 4 digits).
    - If Use_RAG = false ‚Üí do not include those fields."""


    def _create_user_prompt(self, original_json: str, rag_context: str) -> str:
        """Build the user message sent alongside the system prompt.

        Args:
            original_json: Original analysis JSON, inserted verbatim.
            rag_context: Pre-formatted guideline context, inserted verbatim.

        Returns:
            The two inputs followed by the JSON structure the model must
            return. The template is an f-string, so the doubled braces
            (``{{`` / ``}}``) render as literal braces in the output.
        """

        return f"""
    Original Analysis JSON:
    {original_json}

    Supporting context from guideline document:
    {rag_context}

    Please analyze and return the results strictly as a valid JSON object in the following structure:

    {{
    "Analysis_Summary": {{
        "Total_Items_Analyzed": <int>,
        "Items_Enhanced_With_Guidelines": <int>,
        "Items_Kept_Original": <int>
    }},
    "Enhanced_Recommendations": {{
        "Next_24-72_hr": [
        {{
            "Original": "<original recommendation>",
            "Enhanced": "<enhanced recommendation>",
            "Use_RAG": true,
            "Guideline_Reference": "<specific section or paragraph reference>",
            "Page_Number": "<page number>",
            "Relevance_Score": "<decimal with 4 digits>"
        }},
        {{
            "Original": "<original recommendation>",
            "Enhanced": "<enhanced recommendation>",
            "Use_RAG": false
        }}
        ],
        "Next_1-2_weeks": [
        {{
            "Original": "<original recommendation>",
            "Enhanced": "<enhanced recommendation>",
            "Use_RAG": true,
            "Guideline_Reference": "<specific section or paragraph reference>",
            "Page_Number": "<page number>",
            "Relevance_Score": "<decimal with 4 digits>"
        }}
        ],
        "Future_plan_add-ons": [
        {{
            "Original": "<original recommendation>",
            "Enhanced": "<enhanced recommendation>",
            "Use_RAG": false
        }}
        ]
    }},
    "Enhancement_Notes": [
        "<note about how the recommendation was enhanced or why it was unchanged>"
    ]
    }}

    Notes:
    - All recommendations must remain in Thai, but all JSON keys and structure must be in English.
    - If Use_RAG = false, copy Original into Enhanced.
    - Output must be pure JSON only. Do not include <think>, comments, or markdown fences.
    """

    def run_enhancement(self, analysis_json: str, top_k: int = 3) -> str:
        """
        ‡∏£‡∏±‡∏ô‡∏Å‡∏£‡∏∞‡∏ö‡∏ß‡∏ô‡∏Å‡∏≤‡∏£ enhancement ‡πÅ‡∏ö‡∏ö‡πÄ‡∏ï‡πá‡∏° ‡πÅ‡∏•‡πâ‡∏ß '‡∏£‡∏ß‡∏°‡∏ú‡∏•' ‡∏Å‡∏•‡∏±‡∏ö‡πÑ‡∏õ‡πÄ‡∏õ‡πá‡∏ô JSON ‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢
        """
        print("üöÄ ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏Å‡∏£‡∏∞‡∏ö‡∏ß‡∏ô‡∏Å‡∏≤‡∏£ RAG Enhancement")
        print("=" * 60)

        # Step 1: Extract
        items = self.extract_actionable_items(analysis_json)
        if not items:
            return "Error: ‡πÑ‡∏°‡πà‡∏™‡∏≤‡∏°‡∏≤‡∏£‡∏ñ‡πÅ‡∏¢‡∏Å‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏à‡∏≤‡∏Å JSON ‡πÑ‡∏î‡πâ"

        # Step 2: Query RAG
        print(f"\nüìö ‡∏Å‡∏≥‡∏•‡∏±‡∏á query {len(items)} ‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£‡πÉ‡∏ô knowledge base...")
        rag_results = self.query_each_item(items, top_k=top_k)

        # Step 3: LLM Analysis
        print(f"\nüîç ‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏Å‡∏µ‡πà‡∏¢‡∏ß‡∏Ç‡πâ‡∏≠‡∏á‡∏Ç‡∏≠‡∏á‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•...")
        enhanced_result = self.analyze_with_llm(analysis_json, rag_results)

        # Step 4: Merge ‡πÄ‡∏Ç‡πâ‡∏≤‡∏Å‡∏±‡∏ö original
        print(f"\nüß© ‡∏£‡∏ß‡∏°‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå RAG ‡∏Å‡∏±‡∏ö‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡πÄ‡∏î‡∏¥‡∏°...")
        merged_final = merge_enhancements_with_original(analysis_json, enhanced_result)

        print(f"\n‚úÖ ‡∏Å‡∏£‡∏∞‡∏ö‡∏ß‡∏ô‡∏Å‡∏≤‡∏£ RAG Enhancement ‡πÄ‡∏™‡∏£‡πá‡∏à‡∏™‡∏¥‡πâ‡∏ô")
        return merged_final

def strip_think(text: str) -> str:
    """Strip ``<think>...</think>`` spans and markdown fence markers.

    Non-string input is returned untouched. The result is whitespace-trimmed.
    """
    if isinstance(text, str):
        # DOTALL lets a reasoning span extend over several lines.
        without_reasoning = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
        # Removes both bare ``` and ```json markers (the fenced body is kept).
        return re.sub(r"```(?:json)?", "", without_reasoning).strip()
    return text

def _sanitize_item(it: dict) -> dict:
    """
    Enforce rules:
      - If Use_RAG == False: Enhanced must equal Original, and remove guideline fields.
      - If Use_RAG == True: ensure required fields exist and normalize Relevance_Score to 4 decimals.
    """
    it = dict(it)  # shallow copy
    original = it.get("Original", "")
    use_rag = bool(it.get("Use_RAG", False))

    it["Original"] = original

    if not use_rag:
        it["Use_RAG"] = False
        it["Enhanced"] = original
        # drop guideline-only fields if present
        for k in ("Guideline_Reference", "Page_Number", "Relevance_Score"):
            it.pop(k, None)
    else:
        it["Use_RAG"] = True
        # ensure Enhanced exists
        it["Enhanced"] = it.get("Enhanced", original)
        # normalize Relevance_Score
        rs = it.get("Relevance_Score", None)
        if rs is not None and rs != "":
            try:
                it["Relevance_Score"] = f"{float(rs):.4f}"
            except Exception:
                # keep as-is if cannot coerce
                pass
        # coerce Page_Number to string if present
        if "Page_Number" in it and it["Page_Number"] is not None:
            it["Page_Number"] = str(it["Page_Number"])

    return it

def _sanitize_llm_output(llm: dict) -> dict:
    """
    Sanitize LLM JSON in-place: enforce the Use_RAG rules across all buckets.
    """
    llm = dict(llm)
    enh = dict(llm.get("Enhanced_Recommendations", {}))
    for key in ("Next_24-72_hr", "Next_1-2_weeks", "Future_plan_add-ons"):
        arr = enh.get(key) or []
        new_arr = []
        for it in arr:
            if isinstance(it, dict):
                new_arr.append(_sanitize_item(it))
            else:
                # if LLM returned raw strings unexpectedly, wrap them
                new_arr.append({"Original": it, "Enhanced": it, "Use_RAG": False})
        enh[key] = new_arr
    llm["Enhanced_Recommendations"] = enh
    return llm

def _index_by_original(items):
    """Build dict: original_text -> sanitized item_dict"""
    idx = {}
    for it in items or []:
        if not isinstance(it, dict):
            # tolerate stray strings
            it = {"Original": str(it), "Enhanced": str(it), "Use_RAG": False}
        it = _sanitize_item(it)  # <-- enforce rules here
        orig = it.get("Original", "")
        if orig:
            idx[orig] = it
    return idx

def _merge_bucket(orig_list, idx_enh):
    """
    Merge a bucket with enforcement:
      - Match by Original text.
      - If found in idx_enh: use the sanitized item.
      - If not found: produce {Original, Enhanced=Original, Use_RAG=False}.
      - Preserve original order.

    Args:
        orig_list: Entries of the original bucket (strings or item dicts).
            ``None`` (e.g. a JSON ``null`` bucket) is treated as empty.
        idx_enh: Mapping of Original text -> enhanced item dict.

    Returns:
        A new list with one item dict per entry of ``orig_list``.
    """
    merged = []
    # `or []` keeps a null/missing bucket from raising on iteration.
    for entry in orig_list or []:
        if isinstance(entry, dict):
            original_text = entry.get("Original") or entry.get("Enhanced") or ""
            if original_text in idx_enh:
                merged.append(_sanitize_item(idx_enh[original_text]))  # ensure sanitized
            else:
                # NOTE(review): when the dict carries neither Original nor
                # Enhanced text, the dict itself is used as the fallback
                # value below - confirm that is intended.
                base = {
                    "Original": original_text or entry,
                    "Enhanced": entry.get("Enhanced", original_text or entry),
                    "Use_RAG": bool(entry.get("Use_RAG", False)),
                }
                # final sanitize
                merged.append(_sanitize_item(base))
        else:
            # entry is string
            if entry in idx_enh:
                merged.append(_sanitize_item(idx_enh[entry]))
            else:
                merged.append({
                    "Original": entry,
                    "Enhanced": entry,  # enforce Use_RAG=False => same as original
                    "Use_RAG": False
                })
    return merged

def merge_enhancements_with_original(original_json_str: str, llm_json_str: str) -> str:
    """
    Merge LLM-enhanced recommendations back into the original analysis JSON.

    Steps:
      - parse both sides
      - sanitize LLM output
      - index the enhanced items by Original text and merge them into
        Actionable_Instructions (Next_24-72_hr, Next_1-2_weeks) and
        Future_plan_add-ons
      - attach Enhancement_Notes (if any)

    Args:
        original_json_str: JSON object text of the original analysis.
        llm_json_str: Raw LLM reply. <think> blocks and a surrounding
            Markdown code fence are removed before parsing.

    Returns:
        Pretty-printed JSON text. If the original is not a parseable JSON
        object the result is ``{"error": ...}``. If the LLM reply is not a
        parseable JSON object, the original is returned with an added
        ``Enhancement_Notes`` list describing the failure.
    """
    # parse original
    try:
        original = json.loads(original_json_str)
    except (TypeError, ValueError) as e:
        return json.dumps({"error": f"Cannot parse original_json: {e}"}, ensure_ascii=False, indent=2)
    if not isinstance(original, dict):
        # Everything below relies on key access; a list/scalar cannot be merged.
        return json.dumps(
            {"error": f"Cannot parse original_json: expected a JSON object, got {type(original).__name__}"},
            ensure_ascii=False,
            indent=2,
        )

    # parse LLM (strip <think> / fences first)
    llm_clean = strip_think(llm_json_str or "")
    llm_clean = re.sub(r"^```[A-Za-z]*\s*|\s*```$", "", llm_clean.strip())
    try:
        llm_raw = json.loads(llm_clean)
        if not isinstance(llm_raw, dict):
            raise ValueError(f"expected a JSON object, got {type(llm_raw).__name__}")
    except (TypeError, ValueError) as e:
        final_out = dict(original)
        final_out["Enhancement_Notes"] = ["LLM output could not be parsed as JSON.", str(e)]
        return json.dumps(final_out, ensure_ascii=False, indent=2)

    # A null/non-object Enhanced_Recommendations must not break sanitizing.
    llm_raw = dict(llm_raw)
    if not isinstance(llm_raw.get("Enhanced_Recommendations"), dict):
        llm_raw["Enhanced_Recommendations"] = {}

    # sanitize the whole LLM block first
    llm = _sanitize_llm_output(llm_raw)

    # pull enhanced buckets and index them by Original text
    enh = llm.get("Enhanced_Recommendations") or {}
    idx_24_72 = _index_by_original(enh.get("Next_24-72_hr") or [])
    idx_1_2w = _index_by_original(enh.get("Next_1-2_weeks") or [])
    idx_future = _index_by_original(enh.get("Future_plan_add-ons") or [])

    # merge Actionable_Instructions (tolerate a null / non-object value)
    actionable = original.get("Actionable_Instructions")
    if not isinstance(actionable, dict):
        actionable = {}
    actionable["Next_24-72_hr"] = _merge_bucket(actionable.get("Next_24-72_hr") or [], idx_24_72)
    actionable["Next_1-2_weeks"] = _merge_bucket(actionable.get("Next_1-2_weeks") or [], idx_1_2w)
    original["Actionable_Instructions"] = actionable

    # merge Future_plan_add-ons
    original["Future_plan_add-ons"] = _merge_bucket(original.get("Future_plan_add-ons") or [], idx_future)

    # attach Enhancement_Notes at root
    notes = llm.get("Enhancement_Notes") or []
    if notes:
        original["Enhancement_Notes"] = notes

    return json.dumps(original, ensure_ascii=False, indent=2)

# ==================== Test functions ====================

def test_rag_enhancement():
    """Manually exercise the RAG enhancement system on a canned analysis.

    Builds/loads the RAG system with ``create_system``, wraps it in
    ``RAGEnhancementSystem``, runs ``run_enhancement`` on the sample JSON
    below with ``top_k=1`` and prints the result. Any exception is caught and
    reported with ``print``; nothing is returned or asserted.
    """

    # Sample JSON in the shape produced by the main pipeline
    sample_analysis = '''{
    "Actions_Adequacy": {
        "Adequacy": false,
        "Recommendations": [
        "‡∏î‡∏≥‡πÄ‡∏ô‡∏¥‡∏ô‡∏Å‡∏≤‡∏£‡∏õ‡∏¥‡∏î‡∏™‡∏ñ‡∏≤‡∏ô‡∏ó‡∏µ‡πà‡∏£‡πâ‡∏≤‡∏ô‡∏Ç‡πâ‡∏≤‡∏ß‡∏´‡∏°‡∏π‡πÅ‡∏î‡∏á‡∏ó‡∏±‡∏ô‡∏ó‡∏µ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏Ñ‡∏ß‡∏ö‡∏Ñ‡∏∏‡∏°‡∏Å‡∏≤‡∏£‡πÅ‡∏û‡∏£‡πà‡∏Å‡∏£‡∏∞‡∏à‡∏≤‡∏¢‡πÄ‡∏ä‡∏∑‡πâ‡∏≠‡πÅ‡∏•‡∏∞‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡∏™‡∏∏‡∏Ç‡∏≤‡∏†‡∏¥‡∏ö‡∏≤‡∏•‡∏≠‡∏¢‡πà‡∏≤‡∏á‡πÄ‡∏Ç‡πâ‡∏°‡∏á‡∏ß‡∏î",
        "‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£‡∏ù‡∏∂‡∏Å‡∏≠‡∏ö‡∏£‡∏°‡∏™‡∏∏‡∏Ç‡∏≤‡∏†‡∏¥‡∏ö‡∏≤‡∏•‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡πÉ‡∏´‡πâ‡∏ú‡∏π‡πâ‡∏õ‡∏£‡∏∞‡∏Å‡∏≠‡∏ö‡∏Å‡∏≤‡∏£‡πÅ‡∏•‡∏∞‡∏ú‡∏π‡πâ‡∏ä‡πà‡∏ß‡∏¢‡πÇ‡∏î‡∏¢‡πÄ‡∏£‡πá‡∏ß‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î ‡∏û‡∏£‡πâ‡∏≠‡∏°‡∏ï‡∏¥‡∏î‡∏ï‡∏≤‡∏°‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏≠‡∏ö‡∏£‡∏°‡πÄ‡∏õ‡πá‡∏ô‡∏£‡∏∞‡∏¢‡∏∞",
        "‡∏™‡∏±‡πà‡∏á‡∏´‡πâ‡∏≤‡∏°‡∏Å‡∏≤‡∏£‡∏à‡∏±‡∏î‡∏™‡πà‡∏á‡∏´‡∏£‡∏∑‡∏≠‡∏à‡∏≥‡∏´‡∏ô‡πà‡∏≤‡∏¢‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡∏à‡∏≤‡∏Å‡πÅ‡∏´‡∏•‡πà‡∏á‡∏ó‡∏µ‡πà‡πÄ‡∏Å‡∏¥‡∏î‡πÄ‡∏´‡∏ï‡∏∏‡∏à‡∏ô‡∏Å‡∏ß‡πà‡∏≤‡∏à‡∏∞‡πÑ‡∏î‡πâ‡∏£‡∏±‡∏ö‡∏Å‡∏≤‡∏£‡∏£‡∏±‡∏ö‡∏£‡∏≠‡∏á‡∏Ñ‡∏ß‡∏≤‡∏°‡∏õ‡∏•‡∏≠‡∏î‡∏†‡∏±‡∏¢‡∏à‡∏≤‡∏Å‡∏´‡∏ô‡πà‡∏ß‡∏¢‡∏á‡∏≤‡∏ô‡∏™‡∏≤‡∏ò‡∏≤‡∏£‡∏ì‡∏™‡∏∏‡∏Ç"
        ]
    },
    "Actionable_Instructions": {
        "Next_24-72_hr": [
        "‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡πÅ‡∏•‡∏∞‡∏Ñ‡∏ß‡∏ö‡∏Ñ‡∏∏‡∏°‡∏Ñ‡∏•‡∏≠‡∏£‡∏µ‡∏ô‡πÉ‡∏ô‡∏ô‡πâ‡∏≥‡∏≠‡∏∏‡∏õ‡πÇ‡∏†‡∏Ñ‡∏ö‡∏£‡∏¥‡πÇ‡∏†‡∏Ñ‡∏Ç‡∏≠‡∏á‡∏£‡πâ‡∏≤‡∏ô‡∏Ç‡πâ‡∏≤‡∏ß‡∏´‡∏°‡∏π‡πÅ‡∏î‡∏á‡πÉ‡∏´‡πâ‡∏™‡∏≠‡∏î‡∏Ñ‡∏•‡πâ‡∏≠‡∏á‡∏Å‡∏±‡∏ö‡∏°‡∏≤‡∏ï‡∏£‡∏ê‡∏≤‡∏ô‡∏™‡∏∏‡∏Ç‡∏≠‡∏ô‡∏≤‡∏°‡∏±‡∏¢",
        "‡πÄ‡∏Å‡πá‡∏ö‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡∏ó‡∏µ‡πà‡πÄ‡∏´‡∏•‡∏∑‡∏≠‡πÅ‡∏•‡∏∞‡∏≠‡∏∏‡∏õ‡∏Å‡∏£‡∏ì‡πå‡∏™‡∏±‡∏°‡∏ú‡∏±‡∏™‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡πÄ‡∏û‡∏¥‡πà‡∏°‡πÄ‡∏ï‡∏¥‡∏°‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏ï‡∏£‡∏ß‡∏à‡∏´‡∏≤‡πÄ‡∏ä‡∏∑‡πâ‡∏≠‡πÉ‡∏ô‡∏´‡πâ‡∏≠‡∏á‡∏õ‡∏è‡∏¥‡∏ö‡∏±‡∏ï‡∏¥‡∏Å‡∏≤‡∏£",
        "‡∏à‡∏±‡∏î‡∏ó‡∏≥‡πÅ‡∏ú‡∏ô‡πÄ‡∏ù‡πâ‡∏≤‡∏£‡∏∞‡∏ß‡∏±‡∏á‡∏ú‡∏π‡πâ‡∏™‡∏±‡∏°‡∏ú‡∏±‡∏™‡πÉ‡∏Å‡∏•‡πâ‡∏ä‡∏¥‡∏î‡∏à‡∏≤‡∏Å‡∏á‡∏≤‡∏ô‡∏å‡∏≤‡∏õ‡∏ô‡∏Å‡∏¥‡∏à‡∏®‡∏û‡∏ó‡∏µ‡πà‡∏¢‡∏±‡∏á‡πÑ‡∏°‡πà‡πÑ‡∏î‡πâ‡∏£‡∏±‡∏ö‡∏Å‡∏≤‡∏£‡∏ï‡∏¥‡∏î‡∏ï‡∏≤‡∏°"
        ],
        "Next_1-2_weeks": [
        "‡∏î‡∏≥‡πÄ‡∏ô‡∏¥‡∏ô‡∏Å‡∏≤‡∏£‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡∏™‡∏†‡∏≤‡∏û‡πÅ‡∏ß‡∏î‡∏•‡πâ‡∏≠‡∏°‡πÅ‡∏•‡∏∞‡∏£‡∏∞‡∏ö‡∏ö‡∏™‡∏∏‡∏Ç‡∏≤‡∏†‡∏¥‡∏ö‡∏≤‡∏•‡∏Ç‡∏≠‡∏á‡∏£‡πâ‡∏≤‡∏ô‡∏Ç‡πâ‡∏≤‡∏ß‡∏´‡∏°‡∏π‡πÅ‡∏î‡∏á‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏•‡∏∞‡πÄ‡∏≠‡∏µ‡∏¢‡∏î",
        "‡∏à‡∏±‡∏î‡∏ó‡∏≥‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô‡∏™‡∏£‡∏∏‡∏õ‡∏Å‡∏≤‡∏£‡∏™‡∏≠‡∏ö‡∏™‡∏ß‡∏ô‡∏£‡∏∞‡∏ö‡∏≤‡∏î‡∏ß‡∏¥‡∏ó‡∏¢‡∏≤‡πÅ‡∏•‡∏∞‡∏™‡∏≤‡πÄ‡∏´‡∏ï‡∏∏‡∏Å‡∏≤‡∏£‡∏£‡∏∞‡∏ö‡∏≤‡∏î‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô‡∏´‡∏ô‡πà‡∏ß‡∏¢‡∏á‡∏≤‡∏ô‡∏ó‡∏µ‡πà‡πÄ‡∏Å‡∏µ‡πà‡∏¢‡∏ß‡∏Ç‡πâ‡∏≠‡∏á",
        "‡∏ï‡∏¥‡∏î‡∏ï‡∏≤‡∏°‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏ï‡∏£‡∏ß‡∏à‡∏´‡πâ‡∏≠‡∏á‡∏õ‡∏è‡∏¥‡∏ö‡∏±‡∏ï‡∏¥‡∏Å‡∏≤‡∏£‡∏ó‡∏∏‡∏Å‡∏ä‡πà‡∏≠‡∏á‡∏ó‡∏≤‡∏á‡πÅ‡∏•‡∏∞‡∏õ‡∏£‡∏±‡∏ö‡πÅ‡∏ú‡∏ô‡∏Ñ‡∏ß‡∏ö‡∏Ñ‡∏∏‡∏°‡πÇ‡∏£‡∏Ñ‡∏ï‡∏≤‡∏°‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡πÉ‡∏´‡∏°‡πà"
        ]
    },
    "Flaws_Gaps": [
        "‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏°‡∏≤‡∏ï‡∏£‡∏Å‡∏≤‡∏£‡∏Ñ‡∏ß‡∏ö‡∏Ñ‡∏∏‡∏°‡πÄ‡∏£‡πà‡∏á‡∏î‡πà‡∏ß‡∏ô‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏Å‡∏≤‡∏£‡∏õ‡∏¥‡∏î‡∏´‡∏£‡∏∑‡∏≠‡∏Å‡∏±‡∏Å‡∏Å‡∏±‡∏ô‡πÅ‡∏´‡∏•‡πà‡∏á‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡∏ó‡∏µ‡πà‡πÄ‡∏õ‡πá‡∏ô‡πÅ‡∏´‡∏•‡πà‡∏á‡πÅ‡∏û‡∏£‡πà‡πÄ‡∏ä‡∏∑‡πâ‡∏≠",
        "‡∏Ç‡∏≤‡∏î‡∏Å‡∏≤‡∏£‡πÄ‡∏ù‡πâ‡∏≤‡∏£‡∏∞‡∏ß‡∏±‡∏á‡∏ú‡∏π‡πâ‡∏™‡∏±‡∏°‡∏ú‡∏±‡∏™‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡πÇ‡∏î‡∏¢‡∏ï‡∏£‡∏á (‡πÄ‡∏ä‡πà‡∏ô ‡∏ú‡∏π‡πâ‡∏õ‡∏£‡∏∞‡∏Å‡∏≠‡∏ö‡∏Å‡∏≤‡∏£‡πÅ‡∏•‡∏∞‡∏ú‡∏π‡πâ‡∏ä‡πà‡∏ß‡∏¢) ‡∏≠‡∏¢‡πà‡∏≤‡∏á‡πÄ‡∏õ‡πá‡∏ô‡∏£‡∏∞‡∏ö‡∏ö",
        "‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Å‡∏≤‡∏£‡∏ö‡∏±‡∏á‡∏Ñ‡∏±‡∏ö‡πÉ‡∏ä‡πâ‡∏°‡∏≤‡∏ï‡∏£‡∏Å‡∏≤‡∏£‡∏õ‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∏‡∏á‡∏™‡∏∏‡∏Ç‡∏≤‡∏†‡∏¥‡∏ö‡∏≤‡∏•‡∏™‡∏ñ‡∏≤‡∏ô‡∏ó‡∏µ‡πà‡∏à‡∏±‡∏î‡∏á‡∏≤‡∏ô‡∏´‡∏£‡∏∑‡∏≠‡∏£‡πâ‡∏≤‡∏ô‡∏Ç‡πâ‡∏≤‡∏ß‡∏´‡∏°‡∏π‡πÅ‡∏î‡∏á‡πÉ‡∏ô‡∏ó‡∏±‡∏ô‡∏ó‡∏µ"
    ],
    "Future_plan_add-ons": [
        "‡πÄ‡∏û‡∏¥‡πà‡∏°‡∏Å‡∏≤‡∏£‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå‡πÄ‡∏ä‡∏¥‡∏á‡∏õ‡∏£‡∏¥‡∏°‡∏≤‡∏ì‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏±‡∏°‡∏û‡∏±‡∏ô‡∏ò‡πå‡∏£‡∏∞‡∏´‡∏ß‡πà‡∏≤‡∏á‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡πÅ‡∏•‡∏∞‡∏≠‡∏≤‡∏Å‡∏≤‡∏£‡∏ú‡∏π‡πâ‡∏õ‡πà‡∏ß‡∏¢‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏¢‡∏∑‡∏ô‡∏¢‡∏±‡∏ô‡∏™‡∏≤‡πÄ‡∏´‡∏ï‡∏∏‡∏´‡∏•‡∏±‡∏Å",
        "‡∏à‡∏±‡∏î‡∏ó‡∏≥‡πÅ‡∏ô‡∏ß‡∏ó‡∏≤‡∏á‡∏Å‡∏≤‡∏£‡∏õ‡πâ‡∏≠‡∏á‡∏Å‡∏±‡∏ô‡πÇ‡∏£‡∏Ñ‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡πÄ‡∏õ‡πá‡∏ô‡∏û‡∏¥‡∏©‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏á‡∏≤‡∏ô‡∏û‡∏¥‡∏ò‡∏µ‡∏Å‡∏£‡∏£‡∏°‡πÉ‡∏ô‡∏ä‡∏∏‡∏°‡∏ä‡∏ô",
        "‡∏ï‡∏¥‡∏î‡∏ï‡∏≤‡∏°‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏ï‡∏£‡∏ß‡∏à‡πÇ‡∏Ñ‡∏•‡∏¥‡∏ü‡∏≠‡∏£‡πå‡∏°‡πÅ‡∏ö‡∏Ñ‡∏ó‡∏µ‡πÄ‡∏£‡∏µ‡∏¢‡πÉ‡∏ô‡∏ô‡πâ‡∏≥‡πÅ‡∏•‡∏∞‡∏≠‡∏∏‡∏õ‡∏Å‡∏£‡∏ì‡πå‡∏™‡∏±‡∏°‡∏ú‡∏±‡∏™‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡πà‡∏¢‡∏á‡∏£‡∏∞‡∏¢‡∏∞‡∏¢‡∏≤‡∏ß"
    ],
    "Rationale": [
        "‡∏Å‡∏≤‡∏£‡πÑ‡∏°‡πà‡∏õ‡∏¥‡∏î‡πÅ‡∏´‡∏•‡πà‡∏á‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡∏ó‡∏±‡∏ô‡∏ó‡∏µ‡∏≠‡∏≤‡∏à‡∏ô‡∏≥‡πÑ‡∏õ‡∏™‡∏π‡πà‡∏Å‡∏≤‡∏£‡∏£‡∏∞‡∏ö‡∏≤‡∏î‡∏ã‡πâ‡∏≥‡πÉ‡∏ô‡∏Å‡∏•‡∏∏‡πà‡∏°‡∏ú‡∏π‡πâ‡∏£‡πà‡∏ß‡∏°‡∏á‡∏≤‡∏ô‡∏≠‡∏∑‡πà‡∏ô‡∏´‡∏£‡∏∑‡∏≠‡∏ä‡∏∏‡∏°‡∏ä‡∏ô‡πÉ‡∏Å‡∏•‡πâ‡πÄ‡∏Ñ‡∏µ‡∏¢‡∏á",
        "‡∏Å‡∏≤‡∏£‡πÑ‡∏°‡πà‡πÄ‡∏ù‡πâ‡∏≤‡∏£‡∏∞‡∏ß‡∏±‡∏á‡∏ú‡∏π‡πâ‡∏™‡∏±‡∏°‡∏ú‡∏±‡∏™‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡πÇ‡∏î‡∏¢‡∏ï‡∏£‡∏á‡∏≠‡∏≤‡∏à‡∏ó‡∏≥‡πÉ‡∏´‡πâ‡πÑ‡∏°‡πà‡∏™‡∏≤‡∏°‡∏≤‡∏£‡∏ñ‡∏ï‡∏£‡∏ß‡∏à‡∏à‡∏±‡∏ö‡∏Å‡∏≤‡∏£‡πÅ‡∏û‡∏£‡πà‡πÄ‡∏ä‡∏∑‡πâ‡∏≠‡πÉ‡∏ô‡∏ß‡∏á‡∏Å‡∏ß‡πâ‡∏≤‡∏á‡πÑ‡∏î‡πâ‡∏ó‡∏±‡∏ô‡πÄ‡∏ß‡∏•‡∏≤",
        "‡∏Å‡∏≤‡∏£‡πÑ‡∏°‡πà‡∏ö‡∏±‡∏á‡∏Ñ‡∏±‡∏ö‡πÉ‡∏ä‡πâ‡∏°‡∏≤‡∏ï‡∏£‡∏Å‡∏≤‡∏£‡∏õ‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∏‡∏á‡∏™‡∏∏‡∏Ç‡∏≤‡∏†‡∏¥‡∏ö‡∏≤‡∏•‡πÉ‡∏ô‡∏ó‡∏±‡∏ô‡∏ó‡∏µ‡∏≠‡∏≤‡∏à‡∏ó‡∏≥‡πÉ‡∏´‡πâ‡πÄ‡∏Å‡∏¥‡∏î‡∏Å‡∏≤‡∏£‡∏õ‡∏ô‡πÄ‡∏õ‡∏∑‡πâ‡∏≠‡∏ô‡∏ã‡πâ‡∏≥‡πÉ‡∏ô‡∏≠‡∏ô‡∏≤‡∏Ñ‡∏ï"
    ],
    "Response_Time": "65.71 ‡∏ß‡∏¥‡∏ô‡∏≤‡∏ó‡∏µ",
    "Model_Used": "qwen/qwen3-32b"
    }'''

    print("üß™ ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏ó‡∏î‡∏™‡∏≠‡∏ö RAG Enhancement System")
    print("=" * 60)

    # Load the RAG system. The original note here said medical_system.pkl must
    # already exist, but the call below uses my_medical_ragv2.pkl.
    # NOTE(review): confirm which file name is intended.
    try:
        rag_system = create_system("Docs/Guideline.pdf", "my_medical_ragv2.pkl")

        # Create the enhancement system around the loaded RAG system
        enhancer = RAGEnhancementSystem(rag_system)

        # Run the enhancement, using one retrieved passage per item (top_k=1)
        result = enhancer.run_enhancement(sample_analysis, top_k=1)

        print("\n" + "=" * 60)
        print("üìä ‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡∏Å‡∏≤‡∏£ Enhancement:")
        print("=" * 60)
        print(result)

        # Saving the result is currently disabled:
        # with open("enhanced_analysis.json", "w", encoding="utf-8") as f:
        #     f.write(result)
        # (followed by a confirmation print naming enhanced_analysis.json)

    except Exception as e:
        # Best-effort manual check: report the problem instead of raising.
        print(f"‚ùå Error in testing: {e}")
        print("‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡∏ß‡πà‡∏≤‡∏°‡∏µ‡πÑ‡∏ü‡∏•‡πå medical_system.pkl ‡πÅ‡∏•‡∏∞ import ‡∏ñ‡∏π‡∏Å‡∏ï‡πâ‡∏≠‡∏á")

In [50]:
test_rag_enhancement()

üß™ ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏ó‡∏î‡∏™‡∏≠‡∏ö RAG Enhancement System
BGE-M3 model loaded successfully!
üìÇ Found existing system: my_medical_ragv2.pkl
üìÇ Loading system from my_medical_ragv2.pkl...
‚úÖ System loaded successfully!
  Chunks: 69
  Embeddings shape: (69, 1024)
  Model info: {'model_name': 'BAAI/bge-m3', 'embedding_dim': 1024, 'num_chunks': 69}
üöÄ ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏Å‡∏£‡∏∞‡∏ö‡∏ß‡∏ô‡∏Å‡∏≤‡∏£ RAG Enhancement
üìã ‡πÅ‡∏¢‡∏Å‡πÑ‡∏î‡πâ 9 ‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏Å‡∏≤‡∏£ query:
   1. [Next_24-72_hr] ‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡πÅ‡∏•‡∏∞‡∏Ñ‡∏ß‡∏ö‡∏Ñ‡∏∏‡∏°‡∏Ñ‡∏•‡∏≠‡∏£‡∏µ‡∏ô‡πÉ‡∏ô‡∏ô‡πâ‡∏≥‡∏≠‡∏∏‡∏õ‡πÇ‡∏†‡∏Ñ‡∏ö‡∏£‡∏¥‡πÇ‡∏†‡∏Ñ‡∏Ç‡∏≠‡∏á‡∏£‡πâ‡∏≤‡∏ô‡∏Ç‡πâ‡∏≤‡∏ß‡∏´‡∏°‡∏π‡πÅ‡∏î‡∏á‡πÉ‡∏´‡πâ‡∏™...
   2. [Next_24-72_hr] ‡πÄ‡∏Å‡πá‡∏ö‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡∏ó‡∏µ‡πà‡πÄ‡∏´‡∏•‡∏∑‡∏≠‡πÅ‡∏•‡∏∞‡∏≠‡∏∏‡∏õ‡∏Å‡∏£‡∏ì‡πå‡∏™‡∏±‡∏°‡∏ú‡∏±‡∏™‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡πÄ‡∏û‡∏¥‡πà‡∏°‡πÄ‡∏ï‡∏¥‡∏°‡πÄ‡∏û‡∏∑‡πà‡∏≠...
   3. [Next_24-72_hr] ‡∏à‡∏±‡∏î‡∏ó‡∏≥‡πÅ‡∏ú‡∏ô‡πÄ‡∏ù‡πâ‡∏≤‡∏£‡∏∞‡∏ß‡∏±‡∏á‡∏ú‡∏π‡πâ‡∏™‡∏±‡∏°‡∏

# Main Pipeline

In [None]:
import os
import re
import json
import time
import pandas as pd
from pathlib import Path
from pypdf import PdfReader
from groq import Groq
from typing import Dict, List, Optional, Tuple

# ==================== CONFIG ====================
# Groq model identifiers used by the pipeline.
MODEL_NAME_QWEN = "qwen/qwen3-32b"
MODEL_NAME_LLAMA = "llama-3.1-8b-instant"
LLAMA_SHORT_NAME = "llama3.1-8b"

# Read the key from the environment only. Assigning a literal (even an empty
# placeholder) to os.environ here would overwrite a key supplied by the shell
# and make the check below fail unconditionally; it also risks committing a
# secret to the notebook.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

if not GROQ_API_KEY:
    raise ValueError("Please set GROQ_API_KEY environment variable")

client = Groq(api_key=GROQ_API_KEY)

# ==================== PDF EXTRACTION ====================
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of the PDF at ``pdf_path``, lightly cleaned.

    Page texts are joined with a newline (a page with no extractable text
    contributes an empty string). Runs of spaces/tabs collapse to one space,
    runs of two or more newlines collapse to exactly two, and the result is
    stripped of leading/trailing whitespace.
    """
    page_texts = []
    for page in PdfReader(pdf_path).pages:
        page_texts.append(page.extract_text() or "")

    joined = "\n".join(page_texts)
    # Basic cleanup: horizontal whitespace first, then blank-line runs.
    single_spaced = re.sub(r"[ \t]+", " ", joined)
    return re.sub(r"\n{2,}", "\n\n", single_spaced).strip()

# ==================== SECTION PARSING ====================
# Literal section-heading strings expected in a report. They are used to
# build HEADER_FULLLINE below and by split_report() to isolate a heading on
# its own paragraph.
SECTION_KEYWORDS = (
    "‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏õ‡πá‡∏ô‡∏°‡∏≤", "‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏™‡∏≠‡∏ö‡∏™‡∏ß‡∏ô", "‡∏™‡∏¥‡πà‡∏á‡∏ó‡∏µ‡πà‡∏î‡∏≥‡πÄ‡∏ô‡∏¥‡∏ô‡∏Å‡∏≤‡∏£‡πÑ‡∏õ‡πÅ‡∏•‡πâ‡∏ß",
    "‡∏™‡∏¥‡πà‡∏á‡∏ó‡∏µ‡πà‡∏à‡∏∞‡∏î‡∏≥‡πÄ‡∏ô‡∏¥‡∏ô‡∏Å‡∏≤‡∏£‡∏ï‡πà‡∏≠‡πÑ‡∏õ", "‡∏Ç‡πâ‡∏≠‡πÄ‡∏™‡∏ô‡∏≠‡πÅ‡∏ô‡∏∞", "‡∏Ç‡πâ‡∏≠‡πÄ‡∏™‡∏ô‡∏≠‡πÅ‡∏ô‡∏∞‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏û‡∏¥‡∏à‡∏≤‡∏£‡∏ì‡∏≤",
    "‡∏•‡∏á‡∏ä‡∏∑‡πà‡∏≠", "‡∏ó‡∏µ‡∏°‡∏õ‡∏è‡∏¥‡∏ö‡∏±‡∏ï‡∏¥‡∏Å‡∏≤‡∏£‡∏™‡∏≠‡∏ö‡∏™‡∏ß‡∏ô"
)

# Start of a list item: optional indent, then either a (possibly dotted)
# number followed by "." or ")", or a single bullet/dash character, then at
# least one whitespace character.
BULLET_START = re.compile(r'^\s*(?:\d+(?:\.\d+)*[.)]|[‚Ä¢\-‚Äì‚Äî])\s+')
# Punctuation treated as "sentence/closing" characters. Callers use
# .search(), so a hit anywhere in the string counts; only the last
# alternative is anchored to the end of the string with `$`.
SENT_END = re.compile(r'[.!?‚Ä¶]|[)\]]|‚Äù|‚Äô|"|\'|‡∏ô\.$')

# Start of a fragment that looks like the continuation of the previous
# paragraph: optional indent, then digits immediately followed by a
# non-digit/non-space character, or one of a fixed list of words
# (presumably conjunctions/prepositions - verify), followed by `\b`.
CONTINUATION_START = re.compile(
    r'^\s*(?:\d+[^\d\s]|‡πÅ‡∏•‡∏∞|‡∏´‡∏£‡∏∑‡∏≠|‡πÇ‡∏î‡∏¢|‡πÅ‡∏ï‡πà|‡∏£‡∏ß‡∏°‡∏ñ‡∏∂‡∏á|‡∏ó‡∏±‡πâ‡∏á‡∏ô‡∏µ‡πâ|‡∏ã‡∏∂‡πà‡∏á|‡∏ó‡∏µ‡πà|‡∏à‡∏≤‡∏Å|‡πÉ‡∏ô|‡∏Ç‡∏≠‡∏á|‡∏ï‡πà‡∏≠‡∏°‡∏≤|‡∏≠‡∏µ‡∏Å‡∏ó‡∏±‡πâ‡∏á|‡∏£‡∏ß‡∏°‡∏ñ‡∏∂‡∏á)\b'
)

# One case-insensitive pattern per keyword, matching text that consists
# solely of that keyword with optional surrounding whitespace.
HEADER_FULLLINE = [
    re.compile(rf'^\s*{re.escape(k)}\s*$', flags=re.I) 
    for k in SECTION_KEYWORDS
]

def is_header(text: str) -> bool:
    """Decide whether ``text`` should be treated as a section heading.

    Blank text is never a heading. Text that equals one of the known
    section keywords (HEADER_FULLLINE) always is. Otherwise a heuristic
    applies: at most 40 characters after stripping, no SENT_END character
    anywhere in it, and not starting like a bullet item.
    """
    candidate = text.strip()
    if not candidate:
        return False

    # Exact keyword headings win outright.
    for pattern in HEADER_FULLLINE:
        if pattern.match(candidate):
            return True

    # Fallback: short, punctuation-free, non-bullet text has a heading shape.
    return (
        len(candidate) <= 40
        and not SENT_END.search(candidate)
        and not BULLET_START.match(candidate)
    )

def split_report(text: str) -> List[str]:
    """Break raw report text into cleaned paragraphs.

    The work happens in four stages:
      1. Normalize line endings, drop trailing spaces/tabs and form feeds.
      2. Put every known section keyword that sits alone on a line into its
         own paragraph, then split on blank lines.
      3. Glue a short (< 80 chars) single-line paragraph onto the previous
         one when the previous paragraph's last line has no SENT_END
         character or the fragment starts like a continuation. Headings and
         all-bullet paragraphs are never glued.
      4. Inside each paragraph, join hard-wrapped lines with a space, but
         keep a line break before and after bullet lines.
    """
    # Stage 1: normalization
    normalized = re.sub(r'\r\n?', '\n', text)
    normalized = re.sub(r'[ \t]+$', '', normalized, flags=re.M)
    normalized = normalized.replace('\f', '')

    # Stage 2: isolate keyword headings, then split into raw paragraphs
    for keyword in SECTION_KEYWORDS:
        normalized = re.sub(rf'(?m)^\s*({re.escape(keyword)})(?:\s*)$', r'\n\1\n', normalized)

    blocks = []
    for piece in re.split(r'\n\s*\n+', normalized):
        piece = piece.strip()
        if piece:
            blocks.append(piece)

    # Stage 3: repair paragraphs that were split in the wrong place
    merged = []
    for block in blocks:
        block_lines = block.splitlines()
        starts_new_paragraph = (
            not merged
            or is_header(block)
            or (
                bool(block_lines)
                and all(BULLET_START.match(ln) or not ln.strip() for ln in block_lines)
            )
        )
        if starts_new_paragraph:
            merged.append(block)
            continue

        previous = merged[-1]
        glue_to_previous = (
            len(block) < 80
            and len(block_lines) == 1
            and (
                not SENT_END.search(previous.splitlines()[-1])
                or CONTINUATION_START.match(block)
                or re.match(r'^\s*\d+([^\d]|\s|$)', block)
            )
        )
        if glue_to_previous:
            merged[-1] = (previous.rstrip() + ' ' + block.lstrip()).strip()
        else:
            merged.append(block)

    # Stage 4: undo hard wraps inside each paragraph
    paragraphs = []
    for para in merged:
        pieces = [seg.strip() for seg in para.splitlines() if seg.strip()]
        if not pieces:
            continue
        joined = [pieces[0]]
        for seg in pieces[1:]:
            if BULLET_START.match(seg) or BULLET_START.match(joined[-1]):
                joined.append(seg)
            else:
                joined[-1] = (joined[-1].rstrip() + ' ' + seg.lstrip()).strip()
        paragraphs.append('\n'.join(joined).strip())

    return paragraphs

def extract_sections(paras: List[str]) -> Dict[str, str]:
    """Group paragraphs under the heading paragraph that precedes them.

    Each paragraph for which ``is_header`` is true opens a section keyed by
    its stripped text. The following non-heading paragraphs, stripped and
    joined with a blank line, form its body. Paragraphs before the first
    heading are dropped. A repeated heading overwrites the earlier body.
    """
    sections = {}
    current_head = None
    body = []

    for para in paras:
        if is_header(para):
            # Close the section that was open, then start the new one.
            if current_head is not None:
                sections[current_head] = "\n\n".join(body).strip()
            current_head = para.strip()
            body = []
        elif current_head is not None:
            body.append(para.strip())

    if current_head is not None:
        sections[current_head] = "\n\n".join(body).strip()
    return sections

# Canonical section key -> heading fragments that identify that section.
# pick_section() searches each fragment case-insensitively as a substring of
# the parsed headings. Each list mixes non-English (presumably Thai - verify)
# and English spellings.
ALIAS = {
    "situation": ["‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏õ‡πá‡∏ô‡∏°‡∏≤", "‡∏™‡∏ñ‡∏≤‡∏ô‡∏Å‡∏≤‡∏£‡∏ì‡πå", "background", "situation"],
    "findings": ["‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏™‡∏≠‡∏ö‡∏™‡∏ß‡∏ô", "‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö", "‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏®‡∏∂‡∏Å‡∏©‡∏≤", "findings"],
    "actions_done": ["‡∏™‡∏¥‡πà‡∏á‡∏ó‡∏µ‡πà‡∏î‡∏≥‡πÄ‡∏ô‡∏¥‡∏ô‡∏Å‡∏≤‡∏£‡πÑ‡∏õ‡πÅ‡∏•‡πâ‡∏ß", "‡∏°‡∏≤‡∏ï‡∏£‡∏Å‡∏≤‡∏£‡∏ó‡∏µ‡πà‡∏î‡∏≥‡πÄ‡∏ô‡∏¥‡∏ô‡∏Å‡∏≤‡∏£‡πÑ‡∏õ‡πÅ‡∏•‡πâ‡∏ß", "actions taken", "actions done"],
    "actions_next": ["‡∏™‡∏¥‡πà‡∏á‡∏ó‡∏µ‡πà‡∏à‡∏∞‡∏î‡∏≥‡πÄ‡∏ô‡∏¥‡∏ô‡∏Å‡∏≤‡∏£‡∏ï‡πà‡∏≠‡πÑ‡∏õ", "‡∏°‡∏≤‡∏ï‡∏£‡∏Å‡∏≤‡∏£‡∏ó‡∏µ‡πà‡∏à‡∏∞‡∏î‡∏≥‡πÄ‡∏ô‡∏¥‡∏ô‡∏Å‡∏≤‡∏£", "next steps", "next actions"],
}

def pick_section(sections: Dict[str, str], key: str) -> str:
    """Return the body of the first section whose heading matches ``key``.

    ``key`` is looked up in ALIAS. Headings are scanned in the mapping's
    iteration order, and the first heading that contains any alias
    (case-insensitive substring match) wins. Returns "" when the key is
    unknown or nothing matches.
    """
    aliases = ALIAS.get(key, [])
    for heading, body in sections.items():
        if any(re.search(re.escape(alias), heading, flags=re.I) for alias in aliases):
            return body
    return ""

# ==================== SUMMARIZATION ====================
def strip_think(text: str) -> str:
    """Remove ``<think>...</think>`` blocks from ``text`` and trim whitespace.

    Matching is non-greedy and spans newlines, so every closed block is
    removed independently. ``None`` or an empty string (for example a
    completion with no message content) yields "" instead of raising.
    """
    if not text:
        return ""
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

def run_groq(system_prompt: str, user_prompt: str, model_name: str, max_tokens: int = 2000) -> str:
    """Send one system+user exchange to Groq and return the cleaned reply.

    The request uses temperature 0.2, top_p 0.9 and a 60 second timeout. The
    first choice's content is passed through ``strip_think``. Any exception
    is printed and reported to the caller as the string ``"Error: <message>"``
    rather than raised.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    try:
        completion = client.chat.completions.create(
            model=model_name,
            messages=conversation,
            temperature=0.2,
            top_p=0.9,
            max_tokens=max_tokens,
            timeout=60,
        )
        reply = completion.choices[0].message.content
        return strip_think(reply)
    except Exception as exc:
        print(f"Groq API Error with {model_name}: {exc}")
        return f"Error: {str(exc)}"


# Summarization prompts
# System prompt for the situation summary; summarize_sections() passes it to
# run_groq(). It asks for exactly five "- <header>: <content>" lines.
SITUATION_SYSTEM = (
    "You are an assistant that summarizes outbreak investigation reports in Thai.\n"
    "Task: Summarize the '‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏õ‡πá‡∏ô‡∏°‡∏≤' (Situation) section concisely and EXACTLY in the format below.\n"
    "STRICT FORMAT RULES:\n"
    "1) Return EXACTLY 5 lines. No extra lines, no blank lines, no leading/trailing spaces.\n"
    "2) Each line starts with a hyphen, a space, the header, a colon, a single space, then the content.\n"
    "3) Content must be on the SAME LINE as the header (do NOT break to a new line).\n"
    "4) Use semicolons ' ; ' to separate multiple facts in the SAME LINE.\n"
    "5) Use only information from the source text (no guessing). Numbers/dates/places must be exact.\n"
    "6) If a field is missing, write '‡πÑ‡∏°‡πà‡∏£‡∏∞‡∏ö‡∏∏'.\n"
    "OUTPUT (exactly these 5 lines, in this order):\n"
    "- ‡∏™‡∏ñ‡∏≤‡∏ô‡∏ó‡∏µ‡πà/‡πÄ‡∏´‡∏ï‡∏∏‡∏Å‡∏≤‡∏£‡∏ì‡πå: <content>\n"
    "- ‡∏ä‡πà‡∏ß‡∏á‡πÄ‡∏ß‡∏•‡∏≤/‡∏ß‡∏±‡∏ô‡∏™‡∏≥‡∏Ñ‡∏±‡∏ç: <content>\n"
    "- ‡∏Å‡∏•‡∏∏‡πà‡∏°‡πÄ‡∏õ‡πâ‡∏≤‡∏´‡∏°‡∏≤‡∏¢/‡∏ú‡∏π‡πâ‡πÄ‡∏™‡∏µ‡πà‡∏¢‡∏á: <content>\n"
    "- ‡∏™‡∏≤‡πÄ‡∏´‡∏ï‡∏∏/‡∏¢‡∏≤‡∏ô‡∏û‡∏≤‡∏´‡∏ô‡∏∞‡∏™‡∏á‡∏™‡∏±‡∏¢ (‡∏ñ‡πâ‡∏≤‡∏°‡∏µ): <content>\n"
    "- ‡∏ß‡∏±‡∏ï‡∏ñ‡∏∏‡∏õ‡∏£‡∏∞‡∏™‡∏á‡∏Ñ‡πå‡∏Å‡∏≤‡∏£‡∏™‡∏≠‡∏ö‡∏™‡∏ß‡∏ô: <content>"
)

# System prompt for the findings summary; summarize_sections() passes it to
# run_groq(). It asks for exactly ten "- <header>: <content>" lines.
FINDINGS_SYSTEM = (
    "You are an assistant that summarizes outbreak investigation reports in Thai.\n"
    "Task: Summarize the '‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏™‡∏≠‡∏ö‡∏™‡∏ß‡∏ô' (Findings) section concisely and EXACTLY in the format below.\n"
    "STRICT FORMAT RULES:\n"
    "1) Return EXACTLY 10 lines. No extra lines, no blank lines, no leading/trailing spaces.\n"
    "2) Each line starts with a hyphen, a space, the header, a colon, a single space, then the content.\n"
    "3) All content MUST remain on the SAME LINE as its header (do NOT wrap to a new line).\n"
    "4) Use semicolons ' ; ' to separate multiple facts in the SAME LINE; keep original numbers/dates/times.\n"
    "5) Use only information from the source text (no guessing). If missing, write '‡πÑ‡∏°‡πà‡∏£‡∏∞‡∏ö‡∏∏'.\n"
    "OUTPUT (exactly these 10 lines, in this order):\n"
    "- ‡∏ú‡∏π‡πâ‡∏õ‡πà‡∏ß‡∏¢/‡∏™‡∏≥‡∏£‡∏ß‡∏à/‡∏≠‡∏±‡∏ï‡∏£‡∏≤‡∏õ‡πà‡∏ß‡∏¢: <content>\n"
    "- ‡πÄ‡∏û‡∏®‚Äì‡∏≠‡∏≤‡∏¢‡∏∏: <content>\n"
    "- ‡∏≠‡∏≤‡∏Å‡∏≤‡∏£‡πÄ‡∏î‡πà‡∏ô: <content>\n"
    "- ‡πÄ‡∏™‡πâ‡∏ô‡πÇ‡∏Ñ‡πâ‡∏á‡∏Å‡∏≤‡∏£‡∏£‡∏∞‡∏ö‡∏≤‡∏î: <content>\n"
    "- ‡∏¢‡∏≤‡∏ô‡∏û‡∏≤‡∏´‡∏ô‡∏∞‡∏™‡∏á‡∏™‡∏±‡∏¢: <content>\n"
    "- ‡∏Å‡∏≤‡∏£‡∏£‡∏±‡∏Å‡∏©‡∏≤: <content>\n"
    "- ‡∏Å‡∏≤‡∏£‡∏Å‡∏£‡∏∞‡∏à‡∏≤‡∏¢‡∏û‡∏∑‡πâ‡∏ô‡∏ó‡∏µ‡πà: <content>\n"
    "- ‡∏ú‡∏•‡πÅ‡∏•‡πá‡∏ö: <content>\n"
    "- ‡∏™‡∏¥‡πà‡∏á‡πÅ‡∏ß‡∏î‡∏•‡πâ‡∏≠‡∏°: <content>\n"
    "- ‡πÑ‡∏ó‡∏°‡πå‡πÑ‡∏•‡∏ô‡πå‡∏≠‡∏≤‡∏´‡∏≤‡∏£: <content>"
)


def summarize_sections(situation_text: str, findings_text: str, verbose: bool = True) -> Tuple[str, str]:
    """Summarize the situation and findings sections with the Qwen model.

    Each text is sent to ``run_groq`` with its own system prompt, a trailing
    `` /nothink`` marker and a 1500-token limit. When ``verbose`` is true a
    step banner is printed before the calls, and the summary lengths and
    bodies afterwards.

    Returns:
        ``(situation_summary, findings_summary)``.
    """
    banner = "=" * 60
    if verbose:
        print("\n" + banner)
        print("STEP 3: CREATING SUMMARIES")
        print(banner)
        print(f"Using {MODEL_NAME_QWEN} for summarization...")

    # Use Qwen for summarization (since it's better at structured output)
    situation_summary = run_groq(SITUATION_SYSTEM, f'{situation_text} /nothink', MODEL_NAME_QWEN, 1500)
    findings_summary = run_groq(FINDINGS_SYSTEM, f'{findings_text} /nothink', MODEL_NAME_QWEN, 1500)

    if not verbose:
        return situation_summary, findings_summary

    def show(title: str, body: str) -> None:
        # Title, a 40-dash rule, then the summary itself.
        print(title)
        print("-" * 40)
        print(body)

    print("‚úì Summaries completed")
    print(f"Situation summary length: {len(situation_summary)} characters")
    print(f"Findings summary length: {len(findings_summary)} characters")
    show("\nSITUATION SUMMARY:", situation_summary)
    show("\nFINDINGS SUMMARY:", findings_summary)

    return situation_summary, findings_summary

# ==================== ANALYSIS ====================
# System prompt for the adequacy analysis; analyze_report_with_model() passes
# it to run_groq(). It fixes the allowed JSON key names and demands raw JSON
# with no Markdown or explanatory text.
SYSTEM_PROMPT = (
    "You are a Thai-speaking field epidemiology reviewer. "
    "Input: Situation, Findings, Actions, and Future Plan summaries from the Spot Report "
    "(‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô‡∏Å‡∏≤‡∏£‡∏™‡∏≠‡∏ö‡∏™‡∏ß‡∏ô‡πÄ‡∏ö‡∏∑‡πâ‡∏≠‡∏á‡∏ï‡πâ‡∏ô) of a foodborne outbreak. "
    "Task: evaluate adequacy of Actions already taken (‡∏™‡∏¥‡πà‡∏á‡∏ó‡∏µ‡πà‡∏î‡∏≥‡πÄ‡∏ô‡∏¥‡∏ô‡∏Å‡∏≤‡∏£‡πÑ‡∏õ‡πÅ‡∏•‡πâ‡∏ß), "
    "state if they are sufficient, and suggest improvements if not. "
    "Also provide recommendations to strengthen the Future Plan. "
    "Output must be raw JSON only with English key names and Thai content. "
    "Use only these exact key names: Actions_Adequacy, Adequacy, Recommendations, "
    "Actionable_Instructions, Next_24-72_hr, Next_1-2_weeks, Flaws_Gaps, Future_plan_add-ons, Rationale, Response_Time. "
    "Start directly with { and end with }. No markdown, no explanatory text."
)

# Evaluation-criteria text; analyze_report_with_model() strips it and inserts
# it into the {RUBRIC} slot of USER_PROMPT_TEMPLATE.
RUBRIC = """
‡πÄ‡∏Å‡∏ì‡∏ë‡πå‡∏û‡∏¥‡∏à‡∏≤‡∏£‡∏ì‡∏≤:
- ‡∏™‡∏≠‡∏î‡∏Ñ‡∏•‡πâ‡∏≠‡∏á‡∏Å‡∏±‡∏ö‡∏†‡∏≤‡∏û‡∏£‡∏∞‡∏ö‡∏≤‡∏î‡∏ß‡∏¥‡∏ó‡∏¢‡∏≤‡πÅ‡∏•‡∏∞‡∏™‡∏ñ‡∏≤‡∏ô‡∏Å‡∏≤‡∏£‡∏ì‡πå
- ‡∏°‡∏≤‡∏ï‡∏£‡∏Å‡∏≤‡∏£‡∏Ñ‡∏ß‡∏ö‡∏Ñ‡∏∏‡∏°‡πÄ‡∏£‡πà‡∏á‡∏î‡πà‡∏ß‡∏ô (‡∏≠‡∏≤‡∏´‡∏≤‡∏£ ‡∏ô‡πâ‡∏≥ ‡∏™‡∏∏‡∏Ç‡∏≤‡∏†‡∏¥‡∏ö‡∏≤‡∏• ‡∏ú‡∏π‡πâ‡∏™‡∏±‡∏°‡∏ú‡∏±‡∏™‡∏≠‡∏≤‡∏´‡∏≤‡∏£ ‡∏≠‡∏∏‡∏õ‡∏Å‡∏£‡∏ì‡πå‡∏î‡∏¥‡∏ö/‡∏™‡∏∏‡∏Å)
- ‡∏Å‡∏≤‡∏£‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£‡∏ú‡∏π‡πâ‡∏õ‡πà‡∏ß‡∏¢/‡πÄ‡∏ù‡πâ‡∏≤‡∏£‡∏∞‡∏ß‡∏±‡∏á
- ‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏¥‡πà‡∏á‡πÅ‡∏ß‡∏î‡∏•‡πâ‡∏≠‡∏°‡πÅ‡∏•‡∏∞‡∏™‡∏∏‡∏Ç‡∏≤‡∏†‡∏¥‡∏ö‡∏≤‡∏•‡∏™‡∏ñ‡∏≤‡∏ô‡∏ó‡∏µ‡πà
- ‡∏Å‡∏≤‡∏£‡πÄ‡∏Å‡πá‡∏ö‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á/‡∏ú‡∏•‡πÅ‡∏•‡∏ö
- ‡πÅ‡∏ú‡∏ô‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå‡πÄ‡∏ä‡∏¥‡∏á‡∏™‡∏ñ‡∏¥‡∏ï‡∏¥‡πÅ‡∏•‡∏∞‡∏Å‡∏≤‡∏£‡∏ö‡∏π‡∏£‡∏ì‡∏≤‡∏Å‡∏≤‡∏£‡πÅ‡∏•‡∏ö
"""

# str.format() template for the user message of the adequacy analysis.
# Placeholders: {SITUATION}, {FINDINGS}, {ACTIONS}, {FUTURE_PLAN}, {RUBRIC}.
# Braces of the embedded JSON example are doubled ({{ }}) so that format()
# emits them literally. The trailing .strip() removes the leading/trailing
# newline of the triple-quoted literal.
USER_PROMPT_TEMPLATE = """
[SITUATION]
{SITUATION}

[FINDINGS]
{FINDINGS}

[ACTIONS]
{ACTIONS}

[FUTURE_PLAN]
{FUTURE_PLAN}

[‡πÄ‡∏Å‡∏ì‡∏ë‡πå‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô]
{RUBRIC}

‡∏Ñ‡∏≥‡∏™‡∏±‡πà‡∏á‡πÄ‡∏≠‡∏≤‡∏ï‡πå‡∏û‡∏∏‡∏ï: ‡∏™‡∏£‡πâ‡∏≤‡∏á JSON ‡πÇ‡∏î‡∏¢‡πÉ‡∏ä‡πâ key names ‡πÄ‡∏õ‡πá‡∏ô‡∏†‡∏≤‡∏©‡∏≤‡∏≠‡∏±‡∏á‡∏Å‡∏§‡∏©‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏ô‡∏µ‡πâ‡∏ó‡∏∏‡∏Å‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏£:

{{
  "Actions_Adequacy": {{
    "Adequacy": false,
    "Recommendations": [
      "‡∏Ç‡πâ‡∏≠‡πÄ‡∏™‡∏ô‡∏≠‡πÅ‡∏ô‡∏∞‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢ 1",
      "‡∏Ç‡πâ‡∏≠‡πÄ‡∏™‡∏ô‡∏≠‡πÅ‡∏ô‡∏∞‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢ 2"
    ]
  }},
  "Actionable_Instructions": {{
    "Next_24-72_hr": [
      "‡∏Å‡∏≤‡∏£‡∏Å‡∏£‡∏∞‡∏ó‡∏≥‡πÄ‡∏£‡πà‡∏á‡∏î‡πà‡∏ß‡∏ô‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢ 1",
      "‡∏Å‡∏≤‡∏£‡∏Å‡∏£‡∏∞‡∏ó‡∏≥‡πÄ‡∏£‡πà‡∏á‡∏î‡πà‡∏ß‡∏ô‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢ 2"
    ],
    "Next_1-2_weeks": [
      "‡∏Å‡∏≤‡∏£‡∏Å‡∏£‡∏∞‡∏ó‡∏≥‡∏£‡∏∞‡∏¢‡∏∞‡∏Å‡∏•‡∏≤‡∏á‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢ 1",
      "‡∏Å‡∏≤‡∏£‡∏Å‡∏£‡∏∞‡∏ó‡∏≥‡∏£‡∏∞‡∏¢‡∏∞‡∏Å‡∏•‡∏≤‡∏á‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢ 2"
    ]
  }},
  "Flaws_Gaps": [
    "‡∏Ç‡πâ‡∏≠‡∏ö‡∏Å‡∏û‡∏£‡πà‡∏≠‡∏á‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢ 1",
    "‡∏Ç‡πâ‡∏≠‡∏ö‡∏Å‡∏û‡∏£‡πà‡∏≠‡∏á‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢ 2"
  ],
  "Future_plan_add-ons": [
    "‡∏™‡∏¥‡πà‡∏á‡∏ó‡∏µ‡πà‡∏Ñ‡∏ß‡∏£‡πÄ‡∏û‡∏¥‡πà‡∏°‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢ 1",
    "‡∏™‡∏¥‡πà‡∏á‡∏ó‡∏µ‡πà‡∏Ñ‡∏ß‡∏£‡πÄ‡∏û‡∏¥‡πà‡∏°‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢ 2"
  ],
  "Rationale": [
    "‡πÄ‡∏´‡∏ï‡∏∏‡∏ú‡∏•‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢ 1",
    "‡πÄ‡∏´‡∏ï‡∏∏‡∏ú‡∏•‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢ 2"
  ],
  "Response_Time": "21 ‡∏ß‡∏±‡∏ô"
}}

‡∏Ç‡πâ‡∏≠‡∏Å‡∏≥‡∏´‡∏ô‡∏î‡∏™‡∏≥‡∏Ñ‡∏±‡∏ç:
- ‡πÉ‡∏ä‡πâ key names ‡πÄ‡∏õ‡πá‡∏ô‡∏†‡∏≤‡∏©‡∏≤‡∏≠‡∏±‡∏á‡∏Å‡∏§‡∏©‡πÄ‡∏ó‡πà‡∏≤‡∏ô‡∏±‡πâ‡∏ô ‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏Ç‡πâ‡∏≤‡∏á‡∏ï‡πâ‡∏ô‡∏ó‡∏∏‡∏Å‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏£
- ‡∏´‡πâ‡∏≤‡∏°‡πÉ‡∏ä‡πâ‡∏´‡∏±‡∏ß‡∏Ç‡πâ‡∏≠ # ‡∏´‡∏£‡∏∑‡∏≠‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏≠‡∏∑‡πà‡∏ô‡πÄ‡∏õ‡πá‡∏ô key names
- ‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡πÉ‡∏ô values ‡πÅ‡∏•‡∏∞ arrays ‡πÄ‡∏õ‡πá‡∏ô‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢
- Adequacy ‡πÉ‡∏™‡πà true ‡∏´‡∏£‡∏∑‡∏≠ false ‡πÄ‡∏ó‡πà‡∏≤‡∏ô‡∏±‡πâ‡∏ô
- ‡∏ï‡πâ‡∏≠‡∏á‡∏≠‡πâ‡∏≤‡∏á‡∏≠‡∏¥‡∏á‡∏ö‡∏£‡∏¥‡∏ö‡∏ó‡πÄ‡∏´‡∏ï‡∏∏‡∏Å‡∏≤‡∏£‡∏ì‡πå‡∏ô‡∏µ‡πâ‡πÇ‡∏î‡∏¢‡πÄ‡∏â‡∏û‡∏≤‡∏∞ (‡∏≠‡∏≤‡∏´‡∏≤‡∏£ ‡∏™‡∏ñ‡∏≤‡∏ô‡∏ó‡∏µ‡πà ‡∏Ñ‡∏•‡∏≠‡∏£‡∏µ‡∏ô ‡∏≠‡∏∏‡∏õ‡∏Å‡∏£‡∏ì‡πå‡∏î‡∏¥‡∏ö/‡∏™‡∏∏‡∏Å ‡∏ú‡∏π‡πâ‡∏™‡∏±‡∏°‡∏ú‡∏±‡∏™‡∏≠‡∏≤‡∏´‡∏≤‡∏£)
- ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°‡πà‡∏û‡∏ö‡∏Ç‡πâ‡∏≠‡∏ö‡∏Å‡∏û‡∏£‡πà‡∏≠‡∏á ‡πÉ‡∏´‡πâ‡πÉ‡∏™‡πà ["‡πÑ‡∏°‡πà‡∏û‡∏ö‡∏Ç‡πâ‡∏≠‡∏ö‡∏Å‡∏û‡∏£‡πà‡∏≠‡∏á‡∏™‡∏≥‡∏Ñ‡∏±‡∏ç"] ‡πÉ‡∏ô Flaws_Gaps
- ‡πÄ‡∏≠‡∏≤‡∏ï‡πå‡∏û‡∏∏‡∏ï‡∏ï‡πâ‡∏≠‡∏á‡πÄ‡∏õ‡πá‡∏ô JSON ‡∏ó‡∏µ‡πà valid ‡πÇ‡∏î‡∏¢‡∏ï‡∏£‡∏á ‡∏´‡πâ‡∏≤‡∏°‡∏°‡∏µ ```json wrapper ‡∏´‡∏£‡∏∑‡∏≠ markdown formatting
- ‡∏´‡πâ‡∏≤‡∏°‡∏°‡∏µ‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏≠‡∏ò‡∏¥‡∏ö‡∏≤‡∏¢‡∏´‡∏£‡∏∑‡∏≠‡∏Ñ‡∏≥‡∏≠‡∏ò‡∏¥‡∏ö‡∏≤‡∏¢‡πÄ‡∏û‡∏¥‡πà‡∏°‡πÄ‡∏ï‡∏¥‡∏°‡∏ô‡∏≠‡∏Å‡∏à‡∏≤‡∏Å JSON
""".strip()

def analyze_report_with_model(situation: str, findings: str, actions: str, future_plan: str, 
                             model_name: str, analysis_type: str = "", verbose: bool = True) -> Tuple[str, float]:
    """Analyze report with specified model and return JSON output with response time.

    Args:
        situation: Situation text (raw section or summary).
        findings: Findings text (raw section or summary).
        actions: Actions-taken text; a placeholder is sent when it is empty.
        future_plan: Planned-actions text; a placeholder is sent when empty.
        model_name: Groq model identifier passed to ``run_groq``.
        analysis_type: Label used only in the progress message.
        verbose: Print progress messages when true.

    Returns:
        ``(output, response_time)``. ``response_time`` is the wall-clock
        duration of the model call in seconds. When the reply parses as a
        JSON object, ``output`` is that object re-serialized with
        ``Response_Time`` and ``Model_Used`` overwritten. Otherwise the
        model's reply is returned unchanged.
    """
    
    if verbose:
        print(f"\nRunning {analysis_type} with {model_name}...")
    
    user_prompt = USER_PROMPT_TEMPLATE.format(
        SITUATION=situation.strip(),
        FINDINGS=findings.strip(),
        ACTIONS=actions.strip() or "(‡πÑ‡∏°‡πà‡πÑ‡∏î‡πâ‡πÉ‡∏™‡πà‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏°‡∏≤)",
        FUTURE_PLAN=future_plan.strip() or "(‡πÑ‡∏°‡πà‡πÑ‡∏î‡πâ‡πÉ‡∏™‡πà‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏°‡∏≤)",
        RUBRIC=RUBRIC.strip()
    )
    
    start_time = time.time()
    output = run_groq(SYSTEM_PROMPT, user_prompt, model_name, 3000)
    response_time = time.time() - start_time
    
    if verbose:
        print(f"‚úì Completed in {response_time:.2f}s")
    
    # Try to parse and add response time to JSON
    try:
        parsed = json.loads(output)
    except json.JSONDecodeError:
        parsed = None

    # Only a JSON object can take the extra keys. A valid JSON array, string
    # or number would raise TypeError on key assignment, so such replies are
    # passed through untouched, like unparseable ones.
    if isinstance(parsed, dict):
        parsed["Response_Time"] = f"{response_time:.2f} ‡∏ß‡∏¥‡∏ô‡∏≤‡∏ó‡∏µ"
        parsed["Model_Used"] = model_name
        output = json.dumps(parsed, ensure_ascii=False, indent=2)
    
    return output, response_time


def _update_response_time(json_str: str, model_seconds: float, rag_seconds: float, model_name: str) -> str:
    """Stamp the combined model + RAG duration and the model name into a JSON string.

    The two durations are converted with ``float`` and summed before anything
    else, so a non-numeric duration raises. If ``json_str`` cannot be parsed,
    updated and re-serialized for any reason, it is returned unchanged.
    """
    combined_seconds = float(model_seconds) + float(rag_seconds)
    try:
        payload = json.loads(json_str)
        payload["Response_Time"] = f"{combined_seconds:.2f} ‡∏ß‡∏¥‡∏ô‡∏≤‡∏ó‡∏µ"
        payload["Model_Used"] = model_name
        rendered = json.dumps(payload, ensure_ascii=False, indent=2)
    except Exception:
        # Parsing/updating failed: hand back the input as it was.
        rendered = json_str
    return rendered

# ==================== MAIN PIPELINE ====================
def process_pdf_file(pdf_path: str, verbose: bool = True, rag_system: Optional[object] = None) -> Dict[str, any]:
    """Process a single PDF file with both models and return analysis results.
    
    Args:
        pdf_path: path to PDF
        verbose: print progress
        rag_system: pre-built RAG system to reuse; if None, will create one here
    """
    filename = Path(pdf_path).stem
    
    if verbose:
        print("\n" + "="*80)
        print(f"PROCESSING FILE: {filename}")
        print("="*80)
    
    try:
        # Step 1: Extract text
        if verbose:
            print("\n" + "="*60)
            print("STEP 1: EXTRACTING TEXT FROM PDF")
            print("="*60)
        
        raw_text = extract_text_from_pdf(pdf_path)
        
        if verbose:
            print(f"‚úì Text extracted: {len(raw_text)} characters")
            print(f"Preview (first 200 chars): {raw_text[:200]}...")
        
        # Step 2: Parse sections
        if verbose:
            print("\n" + "="*60)
            print("STEP 2: PARSING SECTIONS")
            print("="*60)
        
        paras = split_report(raw_text)
        sections = extract_sections(paras)
        
        situation_text = pick_section(sections, "situation")
        findings_text = pick_section(sections, "findings")
        actions_text = pick_section(sections, "actions_done")
        future_text = pick_section(sections, "actions_next")
        
        if verbose:
            print(f"‚úì Found {len(paras)} paragraphs")
            print(f"‚úì Identified {len(sections)} sections:")
            for section_name in sections.keys():
                print(f"    - {section_name}")
            
            print(f"\nExtracted key sections:")
            print(f"  Situation: {len(situation_text)} characters")
            print(f"  Findings: {len(findings_text)} characters")
            print(f"  Actions: {len(actions_text)} characters") 
            print(f"  Future: {len(future_text)} characters")
            
            # Show preview of original sections
            print("\nORIGINAL SITUATION (first 300 chars):")
            print("-" * 40)
            print(situation_text[:300] + ("..." if len(situation_text) > 300 else ""))
            
            print("\nORIGINAL FINDINGS (first 300 chars):")
            print("-" * 40)
            print(findings_text[:300] + ("..." if len(findings_text) > 300 else ""))
        
        # Step 3: Summarize (using Qwen)
        situation_summary, findings_summary = summarize_sections(
            situation_text, findings_text, verbose=verbose
        )
        
        # Step 4: Run all 4 analyses
        if verbose:
            print("\n" + "="*60)
            print("STEP 4: RUNNING ANALYSES WITH BOTH MODELS")
            print("="*60)
        
        # Qwen analyses
        qwen_no_summary_result, qwen_no_summary_time = analyze_report_with_model(
            situation_text, findings_text, actions_text, future_text,
            MODEL_NAME_QWEN, "Qwen No-Summary", verbose=verbose
        )
        
        qwen_summary_result, qwen_summary_time = analyze_report_with_model(
            situation_summary, findings_summary, actions_text, future_text,
            MODEL_NAME_QWEN, "Qwen Summary", verbose=verbose
        )
        
        # Llama analyses
        llama_no_summary_result, llama_no_summary_time = analyze_report_with_model(
            situation_text, findings_text, actions_text, future_text,
            MODEL_NAME_LLAMA, "Llama No-Summary", verbose=verbose
        )
        
        llama_summary_result, llama_summary_time = analyze_report_with_model(
            situation_summary, findings_summary, actions_text, future_text,
            MODEL_NAME_LLAMA, "Llama Summary", verbose=verbose
        )
        
        if verbose:
            print("\n" + "="*60)
            print("STEP 5: PROCESSING SUMMARY")
            print("="*60)
            print(f"‚úì File: {filename}")
            print(f"‚úì Qwen no-summary: {qwen_no_summary_time:.2f}s")
            print(f"‚úì Qwen summary: {qwen_summary_time:.2f}s")  
            print(f"‚úì Llama no-summary: {llama_no_summary_time:.2f}s")
            print(f"‚úì Llama summary: {llama_summary_time:.2f}s")
            print(f"‚úì Total model time: {(qwen_no_summary_time + qwen_summary_time + llama_no_summary_time + llama_summary_time):.2f}s")
            print(f"‚úì All analyses completed successfully")
            
            # Show response length comparison
            print(f"\nResponse lengths comparison:")
            print(f"  Qwen no-summary: {len(qwen_no_summary_result)} chars")
            print(f"  Qwen summary: {len(qwen_summary_result)} chars")
            print(f"  Llama no-summary: {len(llama_no_summary_result)} chars")
            print(f"  Llama summary: {len(llama_summary_result)} chars")

        # ---------------------------
        # STEP 6: RAG ENHANCEMENT
        # ---------------------------
        if verbose:
            print("\n" + "="*60)
            print("STEP 6: RAG ENHANCEMENT (MERGE WITH GUIDELINES)")
            print("="*60)

        # rag_total_start = time.time()

        # Setup/Load RAG system (once or per call if None)
        rag_setup_start = time.time()
        if rag_system is None:
            if verbose:
                print("‚Ä¢ Loading/Building RAG system...")
            rag_system = create_system("Docs/Guideline.pdf", "my_medical_ragv2.pkl")
        rag_setup_time = time.time() - rag_setup_start
        if verbose:
            print(f"‚úì RAG system ready ({rag_setup_time:.2f}s)")

        enhancer = RAGEnhancementSystem(rag_system)

        # Qwen RAG
        rag_qwen_start = time.time()
        qwen_summary_rag = enhancer.run_enhancement(qwen_summary_result, top_k=1)
        rag_qwen_time = time.time() - rag_qwen_start
        qwen_summary_rag = _update_response_time(
            qwen_summary_rag, 
            model_seconds=qwen_summary_time, 
            rag_seconds=rag_qwen_time, 
            model_name=MODEL_NAME_QWEN
        )
        if verbose:
            print(f"‚úì Qwen summary ‚Üí RAG enhanced in {rag_qwen_time:.2f}s")

        # Llama RAG
        rag_llama_start = time.time()
        llama_summary_rag = enhancer.run_enhancement(llama_summary_result, top_k=1)
        rag_llama_time = time.time() - rag_llama_start
        if verbose:
            print(f"‚úì Llama summary ‚Üí RAG enhanced in {rag_llama_time:.2f}s")
        # rag_total_time = time.time() - rag_total_start
        
        llama_summary_rag = _update_response_time(
            llama_summary_rag, 
            model_seconds=llama_summary_time, 
            rag_seconds=rag_llama_time, 
            model_name=MODEL_NAME_LLAMA
        )
        if verbose:
            # print(f"‚úì Total RAG time: {rag_total_time:.2f}s")
            print("‚úì RAG enhancement completed")

        summaries_output = f"""SITUATION SUMMARY
{'-'*40}
{situation_summary}
        
FINDINGS SUMMARY
{'-'*40}
{findings_summary}"""
        
        return {
            "file": Path(pdf_path).stem,
            "situation_findings_summary": summaries_output,
            f"{MODEL_NAME_QWEN}_no-summary": qwen_no_summary_result,
            f"{MODEL_NAME_QWEN}_summary": qwen_summary_result,
            f"{MODEL_NAME_QWEN}_summary_rag": qwen_summary_rag,
            f"{LLAMA_SHORT_NAME}_no-summary": llama_no_summary_result,
            f"{LLAMA_SHORT_NAME}_summary": llama_summary_result,
            f"{LLAMA_SHORT_NAME}_summary_rag": llama_summary_rag,
            "processing_success": True
        }
        
    except Exception as e:
        if verbose:
            print(f"\n‚ùå ERROR: {str(e)}")
            print("Processing failed")
        
        return {
            "file": filename,
            f"{MODEL_NAME_QWEN}_no-summary": f"Error: {str(e)}",
            f"{MODEL_NAME_QWEN}_summary": f"Error: {str(e)}",
            f"{MODEL_NAME_QWEN}_summary_rag": f"Error: {str(e)}",
            f"{LLAMA_SHORT_NAME}_no-summary": f"Error: {str(e)}",
            f"{LLAMA_SHORT_NAME}_summary": f"Error: {str(e)}",
            f"{LLAMA_SHORT_NAME}_summary_rag": f"Error: {str(e)}",
            "processing_success": False
        }

def process_multiple_pdfs(
    pdf_paths: List[str],
    verbose: bool = True,
    *,
    guideline_pdf: str = "Docs/Guideline.pdf",
    rag_cache_path: str = "my_medical_ragv2.pkl",
    delay_seconds: float = 2.0,
) -> pd.DataFrame:
    """Process many PDFs. Build/load RAG system once and reuse.

    Args:
        pdf_paths: Paths of the report PDFs to analyse, processed in order.
        verbose: Forwarded to ``process_pdf_file`` to toggle progress output.
        guideline_pdf: Guideline PDF handed to ``create_system`` as its first
            argument.
        rag_cache_path: Cache file handed to ``create_system`` as its second
            argument.
        delay_seconds: Pause inserted between two consecutive files. No pause
            follows the last file. Zero or a negative value disables it.

    Returns:
        A DataFrame with one row per input PDF and a fixed column order.
        Columns that no result dict supplied are added and filled with
        ``None``; this happens, for example, when every file fails, because
        the error result of ``process_pdf_file`` carries no
        ``"situation_findings_summary"`` key.
    """
    # Materialise once so that any iterable works and the last item is known.
    pdf_paths = list(pdf_paths)

    # Load/build the RAG system a single time; every file below reuses it.
    rag_system = create_system(guideline_pdf, rag_cache_path)

    results = []
    last_index = len(pdf_paths) - 1
    for index, pdf_path in enumerate(pdf_paths):
        # Hand the shared RAG system in so process_pdf_file does not reload it.
        results.append(
            process_pdf_file(pdf_path, verbose=verbose, rag_system=rag_system)
        )
        # Pause only between files (presumably to ease load on the model
        # backend - TODO confirm); sleeping after the final file is pure waste.
        if delay_seconds > 0 and index < last_index:
            time.sleep(delay_seconds)

    df = pd.DataFrame(results)
    columns = [
        "file",
        "situation_findings_summary",
        f"{MODEL_NAME_QWEN}_no-summary",
        f"{MODEL_NAME_QWEN}_summary",
        f"{MODEL_NAME_QWEN}_summary_rag",
        f"{LLAMA_SHORT_NAME}_no-summary",
        f"{LLAMA_SHORT_NAME}_summary",
        f"{LLAMA_SHORT_NAME}_summary_rag",
        "processing_success",
    ]
    # Guarantee a stable schema even when some keys never appeared.
    for column in columns:
        if column not in df.columns:
            df[column] = None
    return df[columns]

In [64]:
import pandas as pd

# --- Select the PDF files to process (method A: list them explicitly) ---
pdf_paths = [
    "Docs/Exsum_food_poisoning.pdf",
    # "Docs/Another_report.pdf",
]

# Run the pipeline for multiple files (the function builds/loads the RAG
# system once and reuses it for every file).
df_multi = process_multiple_pdfs(pdf_paths, verbose=True)

# Save the results. No encoding is passed here, so pandas' default is used;
# pass encoding="utf-8-sig" if the file will be opened in Excel so that Thai
# text displays correctly.
df_multi.to_csv("analysis_results_rag_multi.csv", index=False)


BGE-M3 model loaded successfully!
📂 Found existing system: my_medical_ragv2.pkl
📂 Loading system from my_medical_ragv2.pkl...
✅ System loaded successfully!
  Chunks: 69
  Embeddings shape: (69, 1024)
  Model info: {'model_name': 'BAAI/bge-m3', 'embedding_dim': 1024, 'num_chunks': 69}

PROCESSING FILE: Exsum_food_poisoning

STEP 1: EXTRACTING TEXT FROM PDF
✓ Text extracted: 10049 characters
Preview (first 200 chars): รายงานการสอบสวนเบื้องต้น การระบาดโรคอาหารเป็นพิษ และอุจจาระร่วงเฉียบพลัน 
 ตำบลคือเวียง อำเภอดอกคำใต้ จังหวัดพะเยา วันที่ 17-19 มิถุนายน 2568 
 
ความเป็นมา 
วันที่ 15 มิถุนายน พ.ศ. 2568 เวลา 18.00 น. ...

STEP 2: PARSING SECTIONS
✓ Found 14 paragraphs
✓ Identifi

## Answer

In [2]:
import pandas as pd
# Load a previously saved results table and render it in the notebook.
# NOTE(review): this reads 'analysis_results_rag_latest.csv', not the
# 'analysis_results_rag_multi.csv' written by the pipeline cell - confirm
# which file is the intended input.
rag = pd.read_csv('analysis_results_rag_latest.csv')
display(rag)

Unnamed: 0,file,situation_findings_summary,qwen/qwen3-32b_no-summary,qwen/qwen3-32b_summary,qwen/qwen3-32b_summary_rag,llama3.1-8b_no-summary,llama3.1-8b_summary,llama3.1-8b_summary_rag,processing_success
0,Exsum_food_poisoning,SITUATION SUMMARY\n---------------------------...,"{\n ""Actions_Adequacy"": {\n ""Adequacy"": fa...","{\n ""Actions_Adequacy"": {\n ""Adequacy"": fa...","{\n ""Actions_Adequacy"": {\n ""Adequacy"": fa...","{\n ""Actions_Adequacy"": {\n ""Adequacy"": fa...","{\n ""Actions_Adequacy"": {\n ""Adequacy"": fa...","{\n ""Actions_Adequacy"": {\n ""Adequacy"": fa...",True


In [16]:
# Export the loaded results table to an Excel workbook, omitting the index.
rag.to_excel('analysis_results_rag_latest.xlsx', index=False)

In [12]:
# Print the full content of the cell at row 0, column position 2.
# NOTE(review): positional lookup; the output looks like a model's JSON
# answer - confirm which column sits at position 2, or select it by name.
print(rag.iloc[0, 2])

{
  "Actions_Adequacy": {
    "Adequacy": false,
    "Recommendations": [
      "ดำเนินการปิดสถานที่ร้านข้าวหมูแดงชั่วคราวจนกว่าจะได้รับการรับรองมาตรฐานสุขาภิบาลอาหาร",
      "จัดทำแผนควบคุมการแพร่กระจายเชื้อโดยเน้นการแยกภาชนะดิบ/สุกและทำความสะอาดอุปกรณ์ทั้งหมดด้วยสารฆ่าเชื้อที่เหมาะสม",
      "จัดการเฝ้าระวังผู้ร่วมงานศพที่ยังไม่แสดงอาการเป็นระยะเวลา 7 วัน (ระยะฟักตัวเฉลี่ยของ EPEC คือ 1-3 วัน)",
      "‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£‡πÉ‡∏´‡πâ‡∏ú‡∏π‡