In [143]:
import difflib
import hashlib
import json
import logging
import os
import re
import warnings
from dataclasses import dataclass
from datetime import datetime, timezone
from enum import Enum
from pathlib import Path
from typing import Dict, List, Tuple, Optional

import fitz  # PyMuPDF
import pdfplumber
import PyPDF2
from langchain.schema import SystemMessage, HumanMessage
from langchain_community.chat_models import ChatOpenAI

In [None]:
# Suppress every UserWarning for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)

# Configure root logging at INFO and create the module-level logger that the
# classes defined below use for progress and error reporting.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class ModelChoice(Enum):
    """Verdict labels available when two model responses are compared.

    Each member's value is the lowercase string form of its name.
    """

    # One of the two models is selected.
    MODEL_A = "model_a"
    MODEL_B = "model_b"
    # No single model is selected.
    NEITHER = "neither"
    BOTH_GOOD = "both_good"


@dataclass
class PDFContent:
    """Text extracted from a PDF together with details of the extraction."""

    # Path of the PDF the text was read from.
    original_pdf_path: str
    # The extracted text itself.
    extracted_content: str
    # Name of the extraction backend that produced the text.
    extraction_method: str
    # Formatted date/time string recorded for the extraction.
    extraction_timestamp: str
    # Hex digest of the extracted text.
    content_hash: str
    # Path of the file the extracted text was written to.
    log_file_path: str


@dataclass
class EvaluatorPrompt:
    """The original evaluator prompt plus the requirements derived from it.

    Only ``original_prompt`` is mandatory. Every other field defaults to
    ``None`` and is therefore annotated as ``Optional``.
    """

    # The prompt text exactly as supplied by the evaluator.
    original_prompt: str
    # Optional free-text description of the overall task.
    task_description: Optional[str] = None
    # Requirement strings extracted from the prompt.
    requirements: Optional[List[str]] = None
    # Success criteria extracted from the prompt.
    success_criteria: Optional[List[str]] = None
    # Aspects flagged as most important.
    priority_aspects: Optional[List[str]] = None


@dataclass
class PDFBasedAnalysisRequest:
    """Everything needed to run one PDF-based evaluation.

    ``evaluator_prompt`` and ``pdf_content`` are mandatory. The remaining
    fields are optional; those defaulting to ``None`` are annotated as
    ``Optional``.
    """

    # The evaluator prompt and its derived requirements.
    evaluator_prompt: EvaluatorPrompt
    # The text extracted from the PDF to be analysed.
    pdf_content: PDFContent
    # Optional free-text description of the overall task.
    task_description: Optional[str] = None
    # Label for the kind of analysis requested.
    analysis_type: str = "pdf_content_focused"
    # Extra evaluation criteria supplied by the caller, if any.
    custom_evaluation_criteria: Optional[List[str]] = None
    # Prompts already issued earlier in the session, if any.
    prompt_history: Optional[List[str]] = None

In [None]:
class PDFExtractionEngine:
    """Extract text from a PDF by trying several backends in order.

    Backends are attempted in the order listed in ``extraction_methods``
    (PyMuPDF, pdfplumber, PyPDF2, then a raw byte scan). The first backend
    returning non-blank text wins, and its text is written to a log file
    whose relative name is derived from the PDF name.
    """

    # Maximum number of fragments listed by the raw byte-scan fallback.
    MAX_RAW_FRAGMENTS = 50

    # Regexes the raw fallback uses to find text-like spans in the PDF bytes.
    RAW_TEXT_PATTERNS = (
        r"\((.*?)\)",  # Text in parentheses
        r"/Title\s*\((.*?)\)",  # Titles
        r"/Subject\s*\((.*?)\)",  # Subjects
        r"/Author\s*\((.*?)\)",  # Authors
        r"BT\s+(.*?)\s+ET",  # PDF text blocks
    )

    def __init__(self):
        """Register the extraction backends in the order they are tried."""
        self.extraction_methods = [
            ("pymupdf_robust", self._extract_with_pymupdf_robust),
            ("pdfplumber_safe", self._extract_with_pdfplumber_safe),
            ("pypdf2_safe", self._extract_with_pypdf2_safe),
            ("raw_text_fallback", self._extract_raw_text_fallback),
        ]

    def extract_pdf_to_log(
        self, pdf_path: str, output_format: str = "log"
    ) -> PDFContent:
        """
        Extract PDF content and save it to a log/text file.

        Args:
            pdf_path: Path to PDF file
            output_format: ``"log"`` writes a header before the text and uses
                a ``.log`` extension; any other value writes the bare text to
                a ``.txt`` file.

        Returns:
            PDFContent object with extraction details

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
            RuntimeError: If no backend produced non-blank text.
        """
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        # Create output file path (local time keeps file names familiar).
        pdf_name = Path(pdf_path).stem
        file_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_extension = "log" if output_format == "log" else "txt"
        log_file_path = f"{pdf_name}_extracted_{file_stamp}.{output_extension}"

        extracted_text = ""
        successful_method = None

        # Try extraction methods in order of robustness
        for method_name, method_func in self.extraction_methods:
            try:
                logger.info(f"Attempting extraction with {method_name}...")
                candidate = method_func(pdf_path)
            except Exception as e:
                logger.error(f"‚ùå Error with {method_name}: {str(e)}")
                continue

            if candidate and candidate.strip():
                extracted_text = candidate
                successful_method = method_name
                logger.info(f"‚úÖ Extraction successful with {method_name}")
                break

            logger.warning(f"‚ö†Ô∏è {method_name} extracted no content")

        # Decide on the recorded winner, not on leftover text: a blank result
        # from an earlier backend must never be treated as a success.
        if successful_method is None:
            raise RuntimeError("Failed to extract content with any method")

        # One real UTC instant is shared by the log header and the returned
        # metadata so the "UTC" label is truthful and both values agree.
        now_utc = datetime.now(timezone.utc)
        extraction_timestamp = now_utc.strftime("%Y-%m-%d %H:%M:%S UTC")
        header_timestamp = now_utc.strftime("%Y-%m-%d %H:%M:%S")
        current_user = (
            os.environ.get("USER") or os.environ.get("USERNAME") or "unknown"
        )

        # Save to log file
        try:
            with open(
                log_file_path, "w", encoding="utf-8", errors="ignore"
            ) as output_file:
                if output_format == "log":
                    # Add log header
                    output_file.write(f"[{extraction_timestamp}] PDF Extraction Log\n")
                    output_file.write("=" * 60 + "\n")
                    output_file.write(f"Source PDF: {pdf_path}\n")
                    output_file.write(f"Extraction Method: {successful_method}\n")
                    output_file.write(f"User: {current_user}\n")
                    output_file.write(f"Timestamp: {header_timestamp}\n")
                    output_file.write("=" * 60 + "\n\n")

                output_file.write(extracted_text)
        except Exception as e:
            logger.error(f"Error saving to log file: {str(e)}")
            raise

        # Calculate content hash
        content_hash = hashlib.md5(extracted_text.encode("utf-8")).hexdigest()

        return PDFContent(
            original_pdf_path=pdf_path,
            extracted_content=extracted_text,
            extraction_method=successful_method,
            extraction_timestamp=extraction_timestamp,
            content_hash=content_hash,
            log_file_path=log_file_path,
        )

    @staticmethod
    def _pymupdf_page_text(page, page_number: int) -> str:
        """Return one PyMuPDF page's text, retrying once with explicit flags."""
        try:
            page_text = page.get_text()
        except Exception as e:
            logger.warning(f"Error in page {page_number}: {str(e)}")
            try:
                page_text = page.get_text(
                    "text", flags=fitz.TEXT_PRESERVE_WHITESPACE
                )
            except Exception:
                return f"[Error extracting text from page {page_number}]\n"
        return page_text + "\n" if page_text else ""

    def _extract_with_pymupdf_robust(self, pdf_path: str) -> str:
        """Extract text with PyMuPDF, tolerating per-page failures.

        Returns the text of all pages, each preceded by a separator line, or
        ``""`` if the document cannot be opened or iterated.
        """
        chunks = []
        try:
            # The context manager closes the document on error paths too.
            with fitz.open(pdf_path) as doc:
                for page_num in range(len(doc)):
                    try:
                        page = doc.load_page(page_num)
                        chunks.append(
                            f"\n{'='*20} P√°gina {page_num + 1} {'='*20}\n"
                        )
                        chunks.append(self._pymupdf_page_text(page, page_num + 1))

                        # Additional content info
                        images = page.get_images()
                        if images:
                            chunks.append(f"[Images found: {len(images)}]\n")

                    except Exception as e:
                        chunks.append(
                            f"\n[Error processing page {page_num + 1}: {str(e)}]\n"
                        )

            return "".join(chunks)

        except Exception as e:
            logger.error(f"General PyMuPDF error: {str(e)}")
            return ""

    @staticmethod
    def _format_pdfplumber_tables(tables, page_num: int) -> str:
        """Render pdfplumber tables as pipe-separated rows under a header."""
        if not tables:
            return ""
        rendered = [f"\n[TABLES - Page {page_num}]\n"]
        for table_num, table in enumerate(tables, 1):
            rendered.append(f"\nTable {table_num}:\n")
            for row in table:
                if row:
                    cells = [str(cell) if cell else "" for cell in row]
                    rendered.append(" | ".join(cells) + "\n")
        return "".join(rendered)

    def _extract_with_pdfplumber_safe(self, pdf_path: str) -> str:
        """Extract layout-preserving text and tables with pdfplumber.

        Returns the accumulated text, or ``""`` if the PDF cannot be opened.
        """
        chunks = []
        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    try:
                        chunks.append(f"\n{'='*20} P√°gina {page_num} {'='*20}\n")

                        page_text = page.extract_text(
                            x_tolerance=2,
                            y_tolerance=2,
                            layout=True,
                            x_density=7.25,
                            y_density=13,
                        )

                        if page_text:
                            chunks.append(page_text + "\n")

                        # Tables are best-effort: a failure is logged and the
                        # page text already collected is kept.
                        try:
                            chunks.append(
                                self._format_pdfplumber_tables(
                                    page.extract_tables(), page_num
                                )
                            )
                        except Exception as e:
                            logger.warning(
                                f"Table extraction failed on page {page_num}: {str(e)}"
                            )

                    except Exception as e:
                        chunks.append(f"\n[Error on page {page_num}: {str(e)}]\n")

            return "".join(chunks)

        except Exception as e:
            logger.error(f"pdfplumber error: {str(e)}")
            return ""

    @staticmethod
    def _pypdf2_unreadable_note(page) -> str:
        """Describe a PyPDF2 page whose text extraction raised."""
        try:
            has_contents = "/Contents" in page
        except Exception:
            has_contents = False
        if has_contents:
            return "[Content detected but not extractable]\n"
        return "[Page with no extractable text content]\n"

    def _extract_with_pypdf2_safe(self, pdf_path: str) -> str:
        """Extract text with PyPDF2 in non-strict mode.

        Returns the accumulated text, or ``""`` if the file cannot be read.
        """
        chunks = []
        try:
            with open(pdf_path, "rb") as file:
                pdf_reader = PyPDF2.PdfReader(file, strict=False)

                for page_num, page in enumerate(pdf_reader.pages, 1):
                    try:
                        chunks.append(f"\n{'='*20} P√°gina {page_num} {'='*20}\n")

                        try:
                            page_text = page.extract_text()
                            if page_text:
                                chunks.append(page_text + "\n")
                        except Exception:
                            # Always leave a note for an unreadable page.
                            chunks.append(self._pypdf2_unreadable_note(page))

                    except Exception as e:
                        chunks.append(f"[Error on page {page_num}: {str(e)}]\n")

            return "".join(chunks)

        except Exception as e:
            logger.error(f"PyPDF2 error: {str(e)}")
            return ""

    def _extract_raw_text_fallback(self, pdf_path: str) -> str:
        """Last resort: scan the raw PDF bytes for text-like fragments.

        Fragments longer than two characters are kept, capped at
        ``MAX_RAW_FRAGMENTS``, and numbered consecutively from 1. The result
        is never empty: it carries a banner, or a fixed failure message if
        the file cannot be read.
        """
        try:
            with open(pdf_path, "rb") as file:
                content = file.read()
        except Exception as e:
            logger.error(f"Fallback method error: {str(e)}")
            return "Could not extract content with any available method."

        parts = [
            "=" * 50 + "\n",
            "RAW TEXT EXTRACTION (FALLBACK METHOD)\n",
            "=" * 50 + "\n\n",
        ]

        try:
            # latin-1 maps every byte to a character, so decoding cannot fail.
            content_str = content.decode("latin-1", errors="ignore")

            found_text = []
            for pattern in self.RAW_TEXT_PATTERNS:
                found_text.extend(
                    re.findall(pattern, content_str, re.MULTILINE | re.DOTALL)
                )

            if found_text:
                parts.append("Text fragments found:\n\n")
                # Filter before capping and numbering so the listing has no
                # gaps and short noise does not use up the fragment budget.
                usable = [
                    fragment.strip()
                    for fragment in found_text
                    if len(fragment.strip()) > 2
                ][: self.MAX_RAW_FRAGMENTS]
                for i, clean_fragment in enumerate(usable, 1):
                    parts.append(f"{i}. {clean_fragment}\n")
            else:
                parts.append("No readable text fragments found.\n")
                parts.append(f"File size: {len(content)} bytes\n")
                parts.append(
                    "File may contain only images or be heavily encoded.\n"
                )

        except Exception as e:
            parts.append(f"Error in raw analysis: {str(e)}\n")

        return "".join(parts)

In [None]:
class EnhancedCodeReviewerAgent:
    """
    Enhanced agent for PDF-based code evaluation.
    Sends complete log content for LLM to identify model responses.
    """

    def __init__(
        self, api_key: str, model: str = "gpt-4o-mini", temperature: float = 0
    ):
        """Initialize the enhanced code reviewer agent."""
        self.llm = ChatOpenAI(model=model, temperature=temperature, api_key=api_key)
        self.pdf_extractor = PDFExtractionEngine()

    def _calculate_content_hash(self, content: str) -> str:
        """Calculate MD5 hash of content."""
        return hashlib.md5(content.encode("utf-8")).hexdigest()

    def _get_timestamp(self) -> str:
        """Get current timestamp."""
        return "2025-08-10 22:12:32"

    def process_pdf_conversation(self, pdf_path: str) -> PDFContent:
        """
        Process PDF file and extract content to log file.

        Args:
            pdf_path: Path to the PDF file to process

        Returns:
            PDFContent object with extraction details
        """
        logger.info(f"Processing PDF: {pdf_path}")
        return self.pdf_extractor.extract_pdf_to_log(pdf_path, output_format="log")

    def _extract_requirements_from_prompt(
        self, prompt: str, task_description: str = None
    ) -> EvaluatorPrompt:
        """
        Extract requirements and success criteria from the evaluator prompt.
        """
        context_section = ""
        if task_description:
            context_section = f"""
            
            TASK CONTEXT: "{task_description}"
            This context should inform the requirements extraction and help orient toward production-ready code quality.
            """

        extraction_prompt = f"""
        Analyze the following prompt and extract the key requirements and success criteria:
    
        PROMPT: "{prompt}"{context_section}
    
        Please identify:
        1. Main functional requirements (what the code should do)
        2. Technical requirements (language, frameworks, specific approaches)
        3. Quality requirements (performance, security, maintainability, production-readiness)
        4. Success criteria (how to measure if the implementation is successful)
    
        Format your response as:
        
        **BASE_PROMPT**
        {prompt}
        
        **FUNCTIONAL_REQUIREMENTS:**
        - [requirement 1]
        - [requirement 2]
    
        **TECHNICAL_REQUIREMENTS:**
        - [requirement 1]
        - [requirement 2]
    
        **QUALITY_REQUIREMENTS:**
        - [requirement 1]
        - [requirement 2]
    
        **SUCCESS_CRITERIA:**
        - [criteria 1]
        - [criteria 2]
    
        **PRIORITY_ASPECTS:**
        - [most important aspect 1]
        - [most important aspect 2]
        """

        try:
            messages = [
                SystemMessage(
                    content="You are an expert requirements analyst. Extract clear, actionable requirements from prompts with focus on production-ready code quality."
                ),
                HumanMessage(content=extraction_prompt),
            ]

            response = self.llm.invoke(messages)

            requirements = []
            success_criteria = []
            priority_aspects = []

            lines = response.content.split("\n")
            current_section = None

            for line in lines:
                line = line.strip()
                if "**FUNCTIONAL_REQUIREMENTS:**" in line:
                    current_section = "functional"
                elif "**TECHNICAL_REQUIREMENTS:**" in line:
                    current_section = "technical"
                elif "**QUALITY_REQUIREMENTS:**" in line:
                    current_section = "quality"
                elif "**SUCCESS_CRITERIA:**" in line:
                    current_section = "success"
                elif "**PRIORITY_ASPECTS:**" in line:
                    current_section = "priority"
                elif line.startswith("- ") and current_section:
                    item = line[2:].strip()
                    if current_section in ["functional", "technical", "quality"]:
                        requirements.append(f"[{current_section.upper()}] {item}")
                    elif current_section == "success":
                        success_criteria.append(item)
                    elif current_section == "priority":
                        priority_aspects.append(item)

            return EvaluatorPrompt(
                original_prompt=prompt,
                task_description=task_description,
                requirements=(
                    requirements if requirements else [f"Fulfill the request: {prompt}"]
                ),
                success_criteria=(
                    success_criteria
                    if success_criteria
                    else [
                        "Code works as requested",
                        "Follows best practices",
                        "Production-ready quality",
                    ]
                ),
                priority_aspects=(
                    priority_aspects
                    if priority_aspects
                    else ["Correctness", "Code quality", "Production-readiness"]
                ),
            )

        except Exception as e:
            print(f"Warning: Could not extract requirements automatically: {e}")
            return EvaluatorPrompt(
                original_prompt=prompt,
                task_description=task_description,
                requirements=[f"Fulfill the request: {prompt}"],
                success_criteria=[
                    "Code works as requested",
                    "Follows best practices",
                    "Production-ready quality",
                ],
                priority_aspects=[
                    "Correctness",
                    "Code quality",
                    "Production-readiness",
                ],
            )

    def _get_pdf_focused_system_message(self, analysis_type: str) -> str:
        """Get system message for PDF content-based evaluation."""
        return """
        You are an expert software engineer/code evaluator specializing in requirement compliance assessment and technical analysis.
        
        Your PRIMARY goal is to evaluate conversation content that contains interactions between a user and TWO DIFFERENT AI MODELS (Model A and Model B). You must identify which model provided better technical guidance, code quality, and production-readiness.
        
        CRITICAL INSTRUCTIONS:
        1. **IDENTIFY THE TWO MODELS**: From the conversation content, you must identify which responses belong to Model A and which belong to Model B
        2. **COMPLETE CONTENT ANALYSIS**: The entire log content represents a conversation - analyze all of it to understand the full context
        3. **NO ARTIFICIAL DIVISION**: The content has NOT been pre-divided - you must determine what constitutes each model's contribution
        
        EVALUATION PRIORITY ORDER:
        1. **interaction_rating** - How well does the model engage with the problem? Does it explore edge cases, explain its decisions, and allow for user iteration or clarification?
        2. **code_logic** - Is the code logically correct, efficient, and in line with best practices? Are there bugs, performance issues, or flawed reasoning?
        3. **naming_clarity** - Are the names of variables, functions, and classes descriptive, intuitive, and consistent with the task and codebase?
        4. **organization_modularity** - Is the code well-structured and modular? Does it promote readability, reuse, and maintainability?
        5. **interface_design (if applicable)** - Are any user interfaces clear, usable, and appropriate for the task?
        6. **error_handling** - Does the code handle invalid inputs and edge cases gracefully? Is exception handling or validation logic appropriate and secure?
        7. **documentation** - Are the comments and documentation useful, concise, and focused on non-obvious aspects? Avoid comments that merely restate what the code does or contain the model's reasoning process.
        8. **review_readiness** - Is the code ready for a pull request review? Does it reflect a clear, consistent style and adhere to the standards of a production-level contribution?
        
        REQUIRED OUTPUT FORMAT:
        You must provide your response in this EXACT structure with NO additional sections:
        
        ## üéØ Evaluation Result: üèÜ [MODEL_A/MODEL_B] (Winner)
        
        ## ‚úÖ Why [CHOSEN_MODEL] is the Superior Response
        [Write a technical argumentation paragraph of 200-300 words explaining why the chosen model offers the best technical guidance and code quality. Focus on production-readiness, problem-solving approach, and technical depth. Reference specific parts of the conversation.]
        
        ## ‚ùå Why [REJECTED_MODEL] is Inferior
        [Write a technical argumentation paragraph of 100-200 words explaining why the rejected model is inferior using technical concepts. Focus on specific technical shortcomings, gaps in reasoning, or production-readiness issues. Reference specific parts of the conversation.]
        
        **Issue Type:** [technical_inconsistency | tool | code_correctness | setup | production_readiness | other]
        
        ## üìà Technical Assessment Scores
        
        **Scoring Rules:**
        
        ***Scoring guideline***
        - The scores must be assigned according to the BEST RESPONSE AND MODEL CHOSEN. The analysis of the responses in such a way that these must reflect the choice of the model chosen for providing the best solution.
        
        **Interaction Scores:**
        - interaction_rating: [Excellent | Good | Fair  | Poor ]
        - code_logic: [Excellent | Good | Fair  | Poor ]
        - naming_clarity: [Excellent | Good | Fair  | Poor ]
        - organization_modularity: [Excellent | Good | Fair  | Poor ] 
        - interface_design: [Excellent | Good | Fair  | Poor ]
        - error_handling: [Excellent | Good | Fair  | Poor ]
        - documentation: [Excellent | Good | Fair  | Poor ]
        - review_readiness: [Excellent | Good | Fair  | Poor ]
        
        ## üîÑ Next Improvement Prompt
        [Generate a well-scoped follow-up prompt focused on improving the implementation toward production-ready quality. Next prompt should be SIMPLER than the task description. Consider the task context and current code quality gaps. The prompt should build upon previous work without repeating already implemented features. CRITICAL: Review the prompt history provided to avoid suggesting already completed tasks or implementations. Focus on logical next steps that advance toward the task description goal without increasing complexity or requesting tests/documentation updates. The next prompt should be simple. ALWAYS focused on solve problems in the choosen response and oriented to improve: Security Assessment, Performance Analysis, Build & Deploy, Code Quality or Architecture Review.]
        
        EVALUATION CRITERIA DEFINITIONS:
        - **interaction_rating**: How well does the model engage with the problem? Does it explore edge cases, explain decisions, and allow for user iteration?
        - **code_logic**: Is the code logically correct, efficient, and follows best practices? Are there bugs or performance issues?
        - **naming_clarity**: Are variable, function, and class names descriptive, intuitive, and consistent?
        - **organization_modularity**: Is the code well-structured, modular, readable, and maintainable?
        - **interface_design**: Are user interfaces clear, usable, and appropriate for the task?
        - **error_handling**: Does the code handle invalid inputs and edge cases gracefully with appropriate validation?
        - **documentation**: Are comments and documentation useful, concise, and focused on non-obvious aspects?
        - **production_readiness**: Is the code ready for deployment with proper configuration, security, and scalability considerations?
        - **review_readiness**: Is the code ready for production-level pull request review with consistent style?
        
        ISSUE TYPE DEFINITIONS:
        - **technical_inconsistency**: Code has inconsistent patterns, conflicting approaches, or technical contradictions
        - **tool**: Incorrect or inappropriate use of tools, libraries, or frameworks
        - **code_correctness**: Logical errors, bugs, or incorrect implementation that prevents proper functionality
        - **setup**: Problems with configuration, environment setup, or deployment-related issues
        - **production_readiness**: Code lacks necessary features for production deployment (logging, error handling, security, etc.)
        - **other**: Issues that don't fit the above categories but represent clear technical problems
        
        IMPORTANT LIMITATIONS FOR NEXT PROMPT GENERATION:
        - The Interaction Scores should reflect the final decision. It's not necessary to generate scores for each model, instead you must generate an overall score that demonstrates that choice according to the best model selected.
        - CRITICAL: The next prompt NEVER should be related to implement or update TEST or project documentation. It should be ALWAYS related to fix problems in the current response and NEVER should increase the complexity of the general task. 
        - CRITICAL: Always review the provided prompt history to avoid repeating previously completed implementations or tasks. The next prompt should be ALWAYS focused in to improve the current response or fix issues or weaknesses foun in the analyzed response. 
        - CRITICAL: The next prompt should be ALWAYS focused on solve problems in the choosen response and oriented to improve: Security Assessment, Performance Analysis, Build & Deploy, Code Quality or Architecture Review.
        - CRITICAL: Next prompt ALWAYS shoud mention the file or module where the improvement should be implemented.
        - Focus on incremental improvements that logically build upon existing work.
        - Ensure the next prompt advances toward the task description goal without unnecessary complexity.
        
        Focus on how well each implementation serves the original purpose while progressing toward production-ready code that can be deployed in real-world scenarios.
        """

    def _create_pdf_focused_analysis_prompt(
        self, request: PDFBasedAnalysisRequest
    ) -> str:
        """Create analysis prompt focused on complete PDF content evaluation."""
        evaluator_prompt = request.evaluator_prompt
        pdf_content = request.pdf_content

        prompt_parts = [
            "# Complete PDF Conversation Analysis",
            f"\n## Original Evaluator Request:",
            f'"{evaluator_prompt.original_prompt}"',
        ]

        if evaluator_prompt.task_description:
            prompt_parts.extend(
                [
                    f"\n## Task Context:",
                    f'"{evaluator_prompt.task_description}"',
                    "\nThis context should inform your evaluation and guide the next improvement prompt toward production-ready code quality.",
                ]
            )

        # Add prompt history section
        if request.prompt_history and len(request.prompt_history) > 0:
            prompt_parts.extend(
                [
                    f"\n## üìã Previous Prompts History:",
                    "The following prompts have been previously executed in this development session:",
                ]
            )
            for i, prev_prompt in enumerate(request.prompt_history, 1):
                prompt_parts.append(f'{i}. "{prev_prompt}"')

            prompt_parts.extend(
                [
                    "\n**CRITICAL**: When generating the next improvement prompt, you MUST:",
                    "- Review this history to avoid repeating already completed tasks",
                    "- Build upon previous work without duplicating implementations",
                    "- Focus on logical next steps that advance toward the task description goal",
                    "- Ensure progression without unnecessary complexity increases",
                ]
            )

        prompt_parts.extend(
            [
                "\n## Your Mission:",
                "Analyze the COMPLETE conversation content extracted from the PDF to identify TWO DIFFERENT AI MODELS (Model A and Model B) and determine which provided superior technical guidance.",
                "The content has NOT been pre-divided - you must identify what constitutes each model's responses within the conversation.",
                "Provide a structured response following the EXACT format specified in your system message.",
                "\n## COMPLETE CONVERSATION CONTENT TO ANALYZE:",
                "\n### Full Extracted Content:",
                "```",
                pdf_content.extracted_content,
                "```",
            ]
        )

        prompt_parts.extend(
            [
                "\n## ANALYSIS INSTRUCTIONS:",
                "1. **IDENTIFY THE MODELS**: Look for patterns, signatures, or formatting that distinguish Model A from Model B responses",
                "2. **ANALYZE COMPLETE INTERACTION**: Consider the full conversation flow and how each model handled the user's requests",
                "3. **EVALUATE TECHNICAL QUALITY**: Assess code quality, problem-solving approach, and production-readiness for each model",
                "4. **DETERMINE WINNER**: Choose which model provided overall superior technical guidance",
                "\n## IMPORTANT:",
                "Follow the EXACT output format specified in your system message.",
                "Include all required sections in the specified order.",
                "Provide technical justifications based on the complete conversation analysis.",
                "Generate scores according to the specified scoring rules.",
                "Create a meaningful next improvement prompt oriented toward production-ready code quality.",
                "Consider the full conversation context when generating the next improvement prompt.",
                "The next prompt never should be related to implement or update TEST or project documentation.",
                "CRITICAL: Review the prompt history to avoid repeating previously completed implementations.",
            ]
        )

        return "\n".join(prompt_parts)

    def _parse_structured_comparison_result(self, response_content: str) -> Dict:
        """
        Parse the structured LLM response into organized data.
        """
        # Initialize result structure
        result = {
            "chosen_model": "neither",
            "confidence_score": 5.0,
            "original_request": "",
            "task_context": "",
            "content_summary": {},
            "winner_justification": "",
            "loser_critique": "",
            "issue_type": "other",
            "interaction_scores": {},
            "next_prompt": "",
            "raw_response": response_content,
        }

        lines = response_content.split("\n")
        current_section = None
        current_content = []

        for line in lines:
            line_stripped = line.strip()

            # Detect sections
            if "üéØ Evaluation Result:" in line and "üèÜ" in line:
                if "MODEL_A" in line.upper():
                    result["chosen_model"] = "model_a"
                elif "MODEL_B" in line.upper():
                    result["chosen_model"] = "model_b"
                current_section = "evaluation_result"

            elif "üìä Content Analysis Summary" in line:
                current_section = "content_summary"

            elif "‚úÖ Why" in line and "Superior" in line:
                current_section = "winner_justification"
                current_content = []

            elif "‚ùå Why" in line and "Inferior" in line:
                current_section = "loser_critique"
                current_content = []

            elif "**Issue Type:**" in line:
                issue_type_line = line.replace("**Issue Type:**", "").strip()
                for issue_type in [
                    "technical_inconsistency",
                    "tool",
                    "code_correctness",
                    "setup",
                    "production_readiness",
                    "other",
                ]:
                    if issue_type in issue_type_line:
                        result["issue_type"] = issue_type
                        break

            elif "üìà Technical Assessment Scores" in line:
                current_section = "scores"

            elif "üîÑ Next Improvement Prompt" in line:
                current_section = "next_prompt"
                current_content = []

            elif current_section == "content_summary" and line_stripped.startswith(
                "- **"
            ):
                if "PDF Source:" in line:
                    result["content_summary"]["pdf_source"] = line_stripped.split(":")[
                        -1
                    ].strip()
                elif "Extraction Method:" in line:
                    result["content_summary"]["extraction_method"] = (
                        line_stripped.split(":")[-1].strip()
                    )
                elif "Total Content Length:" in line:
                    try:
                        result["content_summary"]["total_length"] = int(
                            line_stripped.split(":")[-1].strip().split()[0]
                        )
                    except:
                        pass
                elif "Log File Generated:" in line:
                    result["content_summary"]["log_file"] = line_stripped.split(":")[
                        -1
                    ].strip()
                elif "Model Identification:" in line:
                    result["content_summary"]["model_identification"] = (
                        line_stripped.split(":")[-1].strip()
                    )

            elif (
                current_section == "winner_justification"
                and line_stripped
                and not line_stripped.startswith("##")
            ):
                current_content.append(line_stripped)

            elif (
                current_section == "loser_critique"
                and line_stripped
                and not line_stripped.startswith("##")
                and not line_stripped.startswith("**Issue Type:**")
            ):
                current_content.append(line_stripped)

            elif (
                current_section == "scores"
                and ":" in line_stripped
                and line_stripped.startswith("- ")
            ):
                try:
                    score_line = line_stripped[2:].strip()  # Remove "- "
                    score_name, score_value = score_line.split(":", 1)
                    score_name = score_name.strip()
                    score_value = score_value.strip().strip("[]")

                    result["interaction_scores"][score_name] = score_value
                except:
                    pass

            elif (
                current_section == "next_prompt"
                and line_stripped
                and not line_stripped.startswith("##")
            ):
                current_content.append(line_stripped)

        # Finalize content sections
        if current_section == "winner_justification":
            result["winner_justification"] = " ".join(current_content).strip()
        elif current_section == "loser_critique":
            result["loser_critique"] = " ".join(current_content).strip()
        elif current_section == "next_prompt":
            result["next_prompt"] = " ".join(current_content).strip()

        return result

    def evaluate_pdf_based_models(
        self,
        evaluator_prompt_text: str,
        pdf_path: str,
        task_description: Optional[str] = None,
        custom_criteria: Optional[List[str]] = None,
        prompt_history: Optional[List[str]] = None,
        user_login: str = "CAMB3LL",
    ) -> Dict:
        """
        Evaluate model responses from complete PDF conversation content.

        Extracts the PDF text, derives requirements from the evaluator prompt,
        sends one system + one human message to the LLM and parses the reply.

        Args:
            evaluator_prompt_text: The specific evaluation prompt
            pdf_path: Path to the PDF file containing conversation
            task_description: General task context for production-oriented improvements
            custom_criteria: Custom evaluation criteria
            prompt_history: List of previously executed prompts to avoid repetition
            user_login: Identifier recorded under ``user_login`` in the result.
                Defaults to the value that used to be hard-coded.

        Returns:
            Dictionary with structured evaluation results: the evaluator prompt
            details, PDF extraction metadata, the prompt history, the parsed
            evaluation, the raw LLM response and the prompt that was sent.
        """

        # Extract PDF content
        print("Extracting content from PDF...")
        pdf_content = self.process_pdf_conversation(pdf_path)
        print(f"‚úÖ PDF content extracted to: {pdf_content.log_file_path}")
        print(
            f"üìä Total content length: {len(pdf_content.extracted_content)} characters"
        )

        # Extract requirements from the evaluator prompt
        print("Extracting requirements from evaluator prompt...")
        evaluator_prompt = self._extract_requirements_from_prompt(
            evaluator_prompt_text, task_description
        )

        # Create PDF-based analysis request
        analysis_request = PDFBasedAnalysisRequest(
            evaluator_prompt=evaluator_prompt,
            pdf_content=pdf_content,
            task_description=task_description,
            analysis_type="pdf_content_focused",
            custom_evaluation_criteria=custom_criteria,
            prompt_history=prompt_history,
        )

        # Generate the analysis prompt
        analysis_prompt = self._create_pdf_focused_analysis_prompt(analysis_request)

        # Get system message
        system_message = self._get_pdf_focused_system_message(
            analysis_request.analysis_type
        )

        # Create LLM messages
        messages = [
            SystemMessage(content=system_message),
            HumanMessage(content=analysis_prompt),
        ]

        # Get response from LLM
        print("Evaluating complete conversation content with LLM...")
        response = self.llm.invoke(messages)

        # Parse the structured response
        parsed_evaluation = self._parse_structured_comparison_result(response.content)

        # Process response
        analysis_result = {
            "evaluation_type": "pdf_complete_content_focused",
            "user_login": user_login,
            # Stamp the result with the moment this evaluation finished instead
            # of a frozen literal; the "YYYY-MM-DD HH:MM:SS" layout is unchanged.
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "evaluator_prompt": {
                "original_text": evaluator_prompt_text,
                "task_description": task_description,
                "extracted_requirements": evaluator_prompt.requirements,
                "success_criteria": evaluator_prompt.success_criteria,
                "priority_aspects": evaluator_prompt.priority_aspects,
            },
            "pdf_analysis": {
                "original_pdf_path": pdf_content.original_pdf_path,
                "log_file_path": pdf_content.log_file_path,
                "extraction_method": pdf_content.extraction_method,
                "extraction_timestamp": pdf_content.extraction_timestamp,
                "content_hash": pdf_content.content_hash,
                "total_content_length": len(pdf_content.extracted_content),
            },
            "prompt_history": prompt_history,
            "parsed_evaluation": parsed_evaluation,
            "raw_response": response.content,
            "raw_prompt_sent": analysis_prompt,
        }

        return analysis_result

    def save_pdf_evaluation_report(self, analysis_result: Dict, output_file: str):
        """
        Render the evaluation results as a report and write it to disk.

        Args:
            analysis_result: Evaluation results to render.
            output_file: Destination path; missing parent directories are
                created before writing. The file is written as UTF-8.
        """
        destination = Path(output_file)
        destination.parent.mkdir(parents=True, exist_ok=True)

        rendered_report = self._generate_pdf_evaluation_report(analysis_result)

        with destination.open("w", encoding="utf-8") as report_handle:
            report_handle.write(rendered_report)

        print(f"PDF evaluation report saved to: {output_file}")

    def _generate_pdf_evaluation_report(self, analysis_result: Dict) -> str:
        """Generate a structured markdown report from PDF evaluation results."""

        parsed_result = analysis_result.get("parsed_evaluation", {})
        pdf_analysis = analysis_result.get("pdf_analysis", {})

        chosen_model = parsed_result.get("chosen_model", "neither")
        if chosen_model == "model_a":
            winner_display = "üèÜ **MODEL A** (Winner)"
        elif chosen_model == "model_b":
            winner_display = "üèÜ **MODEL B** (Winner)"
        else:
            winner_display = "‚ùå **NO CLEAR WINNER**"

        report_parts = [
            "# PDF-Based Complete Conversation Evaluation Report",
            f"\n## üéØ Evaluation Result: {winner_display}",
        ]

        report_parts.extend(
            [
                "\n---",
                "\n## üìù Raw LLM Response",
                "\n```markdown",
                parsed_result.get("raw_response", "No raw response captured"),
                "\n```",
            ]
        )

        return "\n".join(report_parts)

In [None]:
def main():
    """Run one PDF-based evaluation of ``conversation.pdf`` and print a summary.

    Reads the API key from ``OPENAI_API_KEY`` and returns early with a message
    when it is not set. Any exception raised during the evaluation is reported
    together with its traceback instead of propagating.
    """
    openai_key = os.getenv("OPENAI_API_KEY")
    if not openai_key:
        print("‚ùå Please set OPENAI_API_KEY environment variable")
        return

    agent = EnhancedCodeReviewerAgent(api_key=openai_key)
    pdf_path = "conversation.pdf"

    # --- Prompt inputs ---------------------------------------------------

    task_description = "The current HTTP response handling logic within the httpx repository is functional but lacks clarity and maintainability, especially as new features and edge cases are added. This task aims to refactor the existing response handling code to improve code readability, reduce complexity, and ensure consistency across the codebase. This will involve restructuring the logic for parsing and validating HTTP responses, separating concerns into smaller, more modular functions, and maintaining all public function signatures for backward compatibility."

    evaluator_prompt = "Review the `_response_handlers.py` module to ensure that all functions are optimized for performance and maintainability. Specifically, focus on the `parse_header_links` function to enhance its robustness against malformed input while maintaining clarity in its implementation. Consider edge cases that may not have been addressed in the current implementation."

    # The history holds exactly one previously executed prompt.
    first_prompt = "Refactor the HTTP response handling logic in the httpx library to improve maintainability and clarity. Focus on modularizing header parsing, status code validation, and body decoding into separate functions while ensuring backward compatibility. Introduce structured error classes for common HTTP response issues with meaningful error messages. Update internal API calls and test cases to align with the refactored structure, writing new tests for uncovered edge cases like malformed headers or partial response bodies. Ensure the refactored code adheres to repository style and quality standards and does not degrade performance."
    prompt_history = [first_prompt]

    try:
        print("üöÄ Starting Complete PDF Conversation Evaluation")

        evaluation = agent.evaluate_pdf_based_models(
            evaluator_prompt_text=evaluator_prompt,
            pdf_path=pdf_path,
            task_description=task_description,
            prompt_history=prompt_history,
        )

        # Persist the report under a name stamped with the current time.
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_file = f"pdf_complete_evaluation_{stamp}.md"
        agent.save_pdf_evaluation_report(evaluation, report_file)

        # Headline result.
        parsed = evaluation["parsed_evaluation"]
        winner = parsed["chosen_model"]
        print(f"\nüéØ EVALUATION RESULT: {winner.upper()} WINS!")

        follow_up = parsed["next_prompt"]
        if follow_up:
            print("\nüîÑ NEXT IMPROVEMENT PROMPT:")
            print(f"'{follow_up}'")

        # PDF extraction details.
        extraction = evaluation["pdf_analysis"]
        print("\nüìã PDF EXTRACTION DETAILS:")
        print(f"   Log file: {extraction['log_file_path']}")
        print(f"   Method: {extraction['extraction_method']}")
        print(f"   Length: {extraction['total_content_length']} characters")
        print(f"   Time: {extraction['extraction_timestamp']}")

        print(f"\nüìÑ Detailed report saved to: {report_file}")

    except Exception as error:
        print(f"‚ùå Error during evaluation: {error}")
        import traceback

        traceback.print_exc()


# Run the example only when this code executes as the top-level "__main__"
# module (as it does in this notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

INFO:__main__:Processing PDF: conversation.pdf
INFO:__main__:Attempting extraction with pymupdf_robust...


üöÄ Starting Complete PDF Conversation Evaluation
Extracting content from PDF...


INFO:__main__:‚úÖ Extraction successful with pymupdf_robust


‚úÖ PDF content extracted to: conversation_extracted_20250811_201708.log
üìä Total content length: 195461 characters
Extracting requirements from evaluator prompt...


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Evaluating complete conversation content with LLM...


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


PDF evaluation report saved to: pdf_complete_evaluation_20250811_201726.md

üéØ EVALUATION RESULT: MODEL_A WINS!

üîÑ NEXT IMPROVEMENT PROMPT:
'"Review the `_response_handlers_optimized.py` module to further enhance its performance and security. Focus on implementing additional caching strategies for the `BodyDecoder` and `StatusCodeValidator` classes to optimize repeated operations. Additionally, assess the current regex patterns for potential improvements in efficiency and robustness against edge cases. Ensure that all changes maintain backward compatibility and do not introduce breaking changes."'

üìã PDF EXTRACTION DETAILS:
   Log file: conversation_extracted_20250811_201708.log
   Method: pymupdf_robust
   Length: 195461 characters
   Time: 2025-08-11 20:17:09 UTC

üìÑ Detailed report saved to: pdf_complete_evaluation_20250811_201726.md
