In [None]:
!pip install langchain-community langchain-core openai

In [None]:
import os
import json
import hashlib
from typing import Dict, List, Tuple, Optional
from pathlib import Path
from langchain_community.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from dataclasses import dataclass
import difflib
from enum import Enum
from datetime import datetime

In [None]:
class ModelChoice(Enum):
    """Closed set of verdicts a model-vs-model comparison can produce."""

    # Implementation A was selected.
    MODEL_A = "model_a"
    # Implementation B was selected.
    MODEL_B = "model_b"
    # No implementation was selected.
    NEITHER = "neither"
    # Both implementations were judged acceptable.
    BOTH_GOOD = "both_good"

In [None]:
@dataclass
class FileContent:
    """A single source file held in memory together with derived metadata."""

    # Path identifying the file (the directory loader stores it relative to the scanned root).
    path: str
    # Full text of the file.
    content: str
    # Language/type label derived from the file extension.
    file_type: str
    # Digest of ``content`` (the directory loader stores an MD5 hex digest).
    hash: str

@dataclass
class EvaluatorPrompt:
    """The original evaluator prompt plus the requirement lists derived from it.

    The three list fields default to ``None`` (meaning "not extracted"), so they
    are annotated as ``Optional`` to match what callers can actually observe.
    """

    # The evaluator's request, verbatim.
    original_prompt: str
    # Requirement statements derived from the prompt, or None when not extracted.
    requirements: Optional[List[str]] = None
    # Statements describing how success is measured, or None when not extracted.
    success_criteria: Optional[List[str]] = None
    # Aspects considered most important, or None when not extracted.
    priority_aspects: Optional[List[str]] = None

@dataclass
class ModelComparisonResult:
    """Outcome of comparing the implementations produced by two models."""

    # Which implementation (if any) was selected.
    chosen_model: ModelChoice
    # Numeric confidence attached to the verdict.
    confidence_score: float
    # Short explanation of the verdict.
    reasoning: str
    # Strengths and weaknesses recorded for model A.
    pros_model_a: List[str]
    cons_model_a: List[str]
    # Strengths and weaknesses recorded for model B.
    pros_model_b: List[str]
    cons_model_b: List[str]
    # Long-form analysis text.
    detailed_analysis: str

@dataclass
class RefinedMultiModelAnalysisRequest:
    """Everything needed to evaluate two model implementations against a prompt.

    ``custom_evaluation_criteria`` defaults to ``None`` and is therefore
    annotated as ``Optional`` rather than as a plain list.
    """

    # The evaluator's prompt together with its derived requirement lists.
    evaluator_prompt: EvaluatorPrompt
    # Files as they exist before either model's changes.
    current_files: List[FileContent]
    # Files produced by model A.
    model_a_files: List[FileContent]
    # Files produced by model B.
    model_b_files: List[FileContent]
    # Label for the kind of analysis requested.
    analysis_type: str = "requirement_focused"
    # Extra caller-supplied criteria, or None when there are none.
    custom_evaluation_criteria: Optional[List[str]] = None

In [None]:
class CodeReviewerAgent:
    """
    Enhanced agent for performing structured requirement-focused code analysis and comparison.
    Evaluates two AI model implementations against original user requirements with structured output.
    """
    
    def __init__(self, api_key: str, model: str = "gpt-4o-mini", temperature: float = 0):
        """
        Create the chat client and the extension-to-language lookup table.

        Args:
            api_key: API key handed to the chat model client
            model: Name of the chat model to use
            temperature: Sampling temperature handed to the chat model client
        """
        self.llm = ChatOpenAI(model=model, temperature=temperature, api_key=api_key)

        # (extension, language label) pairs; extensions not listed here are
        # skipped by the directory loader and reported as 'text' by _get_file_type.
        known_extensions = [
            ('.py', 'python'),
            ('.js', 'javascript'),
            ('.ts', 'typescript'),
            ('.java', 'java'),
            ('.cpp', 'cpp'),
            ('.c', 'c'),
            ('.cs', 'csharp'),
            ('.go', 'go'),
            ('.rs', 'rust'),
            ('.php', 'php'),
            ('.rb', 'ruby'),
            ('.swift', 'swift'),
            ('.kt', 'kotlin'),
            ('.scala', 'scala'),
            ('.html', 'html'),
            ('.css', 'css'),
            ('.sql', 'sql'),
            ('.sh', 'shell'),
            ('.yaml', 'yaml'),
            ('.yml', 'yaml'),
            ('.json', 'json'),
            ('.xml', 'xml'),
            ('.md', 'markdown'),
        ]
        self.supported_extensions = dict(known_extensions)

    def _calculate_file_hash(self, content: str) -> str:
        """Calculate MD5 hash of file content."""
        return hashlib.md5(content.encode('utf-8')).hexdigest()

    def _get_file_type(self, file_path: str) -> str:
        """Determine file type from extension."""
        ext = Path(file_path).suffix.lower()
        return self.supported_extensions.get(ext, 'text')

    def _get_timestamp(self) -> str:
        """Get current timestamp."""
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S UTC")

    def load_files_from_directory(self, directory_path: str) -> List[FileContent]:
        """
        Load all supported files from a directory.
        
        Args:
            directory_path: Path to the directory containing files
            
        Returns:
            List of FileContent objects
        """
        files = []
        directory = Path(directory_path)
        
        if not directory.exists():
            raise FileNotFoundError(f"Directory not found: {directory_path}")
        
        # Recursively find all files
        for file_path in directory.rglob('*'):
            if file_path.is_file() and file_path.suffix.lower() in self.supported_extensions:
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()
                    
                    # Create relative path from directory
                    relative_path = str(file_path.relative_to(directory))
                    
                    file_content = FileContent(
                        path=relative_path,
                        content=content,
                        file_type=self._get_file_type(str(file_path)),
                        hash=self._calculate_file_hash(content)
                    )
                    files.append(file_content)
                    
                except (UnicodeDecodeError, PermissionError) as e:
                    print(f"Warning: Could not read file {file_path}: {e}")
                    continue
        
        return files

    def _generate_file_diff(self, file_a: FileContent, file_b: FileContent, 
                          label_a: str = "Version A", label_b: str = "Version B") -> str:
        """Generate unified diff between two files."""
        lines_a = file_a.content.splitlines(keepends=True)
        lines_b = file_b.content.splitlines(keepends=True)
        
        diff = difflib.unified_diff(
            lines_a,
            lines_b,
            fromfile=f"{label_a}/{file_a.path}",
            tofile=f"{label_b}/{file_b.path}",
            lineterm=''
        )
        
        return ''.join(diff)

    def _extract_requirements_from_prompt(self, prompt: str) -> EvaluatorPrompt:
        """
        Ask the LLM to break the evaluator prompt into requirement lists.

        The reply is scanned line by line: a line containing one of the
        section markers switches the active section, and every following
        "- " bullet is filed under that section. Functional, technical and
        quality bullets all land in ``requirements`` (tagged with their
        upper-cased section name). Any list that ends up empty, and any
        failure while calling or parsing, falls back to generic defaults.

        Args:
            prompt: Original evaluator prompt

        Returns:
            EvaluatorPrompt with extracted (or default) requirements
        """
        extraction_prompt = f"""
        Analyze the following prompt and extract the key requirements and success criteria:

        PROMPT: "{prompt}"

        Please identify:
        1. Main functional requirements (what the code should do)
        2. Technical requirements (language, frameworks, specific approaches)
        3. Quality requirements (performance, security, maintainability)
        4. Success criteria (how to measure if the implementation is successful)

        Format your response as:
        **FUNCTIONAL_REQUIREMENTS:**
        - [requirement 1]
        - [requirement 2]

        **TECHNICAL_REQUIREMENTS:**
        - [requirement 1]
        - [requirement 2]

        **QUALITY_REQUIREMENTS:**
        - [requirement 1]
        - [requirement 2]

        **SUCCESS_CRITERIA:**
        - [criteria 1]
        - [criteria 2]

        **PRIORITY_ASPECTS:**
        - [most important aspect 1]
        - [most important aspect 2]
        """

        # Markers are tested in this order; the first one found on a line wins.
        section_markers = (
            ("**FUNCTIONAL_REQUIREMENTS:**", "functional"),
            ("**TECHNICAL_REQUIREMENTS:**", "technical"),
            ("**QUALITY_REQUIREMENTS:**", "quality"),
            ("**SUCCESS_CRITERIA:**", "success"),
            ("**PRIORITY_ASPECTS:**", "priority"),
        )

        try:
            response = self.llm.invoke([
                SystemMessage(content="You are an expert requirements analyst. Extract clear, actionable requirements from prompts."),
                HumanMessage(content=extraction_prompt),
            ])

            requirements, success_criteria, priority_aspects = [], [], []
            active_section = None

            for raw_line in response.content.split('\n'):
                text = raw_line.strip()

                matched_section = next(
                    (key for marker, key in section_markers if marker in text),
                    None,
                )
                if matched_section is not None:
                    active_section = matched_section
                    continue

                # Only bullets that follow a recognised section header are kept.
                if not (text.startswith("- ") and active_section):
                    continue

                entry = text[2:].strip()
                if active_section == "success":
                    success_criteria.append(entry)
                elif active_section == "priority":
                    priority_aspects.append(entry)
                else:
                    requirements.append(f"[{active_section.upper()}] {entry}")

            return EvaluatorPrompt(
                original_prompt=prompt,
                requirements=requirements or [f"Fulfill the request: {prompt}"],
                success_criteria=success_criteria or ["Code works as requested", "Follows best practices"],
                priority_aspects=priority_aspects or ["Correctness", "Code quality"]
            )

        except Exception as e:
            # Extraction is best effort: report the problem and use the defaults.
            print(f"Warning: Could not extract requirements automatically: {e}")
            return EvaluatorPrompt(
                original_prompt=prompt,
                requirements=[f"Fulfill the request: {prompt}"],
                success_criteria=["Code works as requested", "Follows best practices"],
                priority_aspects=["Correctness", "Code quality"]
            )

    def _get_requirement_focused_system_message(self, analysis_type: str) -> str:
        """
        Return the system message that dictates the evaluator's output format.

        The message is a fixed string: it fixes the evaluation priority order,
        the exact markdown sections the reply must contain, the score metrics
        and the allowed issue-type values. The section headings it prescribes
        are the ones `_parse_structured_comparison_result` searches for.

        Args:
            analysis_type: Accepted for interface compatibility; the current
                implementation does not read it and always returns the same text.

        Returns:
            The system message text.
        """
        
        # NOTE(review): the "Scoring Rules" below describe a single 0-7 scale
        # (0-3 favouring Model A, 4-7 favouring Model B) while each per-model
        # metric is requested as [1-7] — confirm which rubric is intended.
        return """
        You are an expert code evaluator specializing in requirement compliance assessment and technical analysis.
        
        Your PRIMARY goal is to determine which AI model implementation better fulfills the original evaluator request.
        
        EVALUATION PRIORITY ORDER:
        1. **Requirement Fulfillment** - Does it do what was asked?
        2. **Correctness** - Does the code work as intended?
        3. **Completeness** - Does it address all aspects of the request?
        4. **Code Quality** - Is it well-written and maintainable?
        5. **Best Practices** - Does it follow good coding standards?
        
        REQUIRED OUTPUT FORMAT:
        You must provide your response in this EXACT structure with NO additional sections:
        
        ## 🎯 Evaluation Result: 🏆 [MODEL_A/MODEL_B] (Winner)
        
        ## 📋 Original Evaluator Request
        > "[original request text]"
        
        ## 📊 File Analysis Summary
        - **Current Files:** [number]
        - **Model A Files:** [number] 
        - **Model B Files:** [number]
        - **Total Unique Files:** [number]
        
        ## ✅ Why [CHOSEN_MODEL] is the Superior Implementation
        [Write a technical argumentation paragraph of 200-300 words explaining why the chosen model offers the best option for the implementation requested in the user's prompt. Use technical concepts to justify this choice. Focus on how well it fulfills the original requirements, code quality, best practices, and technical soundness.]
        
        ## ❌ Why [REJECTED_MODEL] is Inferior
        [Write a technical argumentation paragraph of 100-200 words explaining why the rejected model is inferior using technical concepts. Focus on specific technical shortcomings, requirement gaps, or code quality issues.]
        
        **Issue Type:** [technical_inconsistency | tool | code_correctness | setup | other]
        
        ## 📈 Technical Assessment Scores
        
        **Scoring Rules:**
        - Numbers 0-3: Positive score for Model A (0 = best, 2-3 = not good enough or equal quality)
        - Numbers 4-7: Positive score for Model B (7 = best, 4-5 = not good enough or equal quality)
        
        **Model A Scores:**
        - interaction_rating: [1-7]
        - code_logic: [1-7]
        - naming_clarity: [1-7]
        - organization_modularity: [1-7]
        - interface_design: [1-7]
        - error_handling: [1-7]
        - documentation: [1-7]
        - review_readiness: [1-7]
        
        **Model B Scores:**
        - interaction_rating: [1-7]
        - code_logic: [1-7]
        - naming_clarity: [1-7]
        - organization_modularity: [1-7]
        - interface_design: [1-7]
        - error_handling: [1-7]
        - documentation: [1-7]
        - review_readiness: [1-7]
        
        ## 🔄 Next Improvement Prompt
        [Generate a well-scoped follow-up prompt focused on improving the implementation without increasing task difficulty or adding new requirements. The prompt should be oriented toward code review improvements and maintaining the original scope. Think of this as a detailed Jira ticket for a mid-level engineer that focuses on refinement rather than expansion.]
        
        EVALUATION CRITERIA DEFINITIONS:
        - **interaction_rating**: How well does the model engage with the problem? Does it explore edge cases, explain decisions, and allow for user iteration?
        - **code_logic**: Is the code logically correct, efficient, and follows best practices? Are there bugs or performance issues?
        - **naming_clarity**: Are variable, function, and class names descriptive, intuitive, and consistent?
        - **organization_modularity**: Is the code well-structured, modular, readable, and maintainable?
        - **interface_design**: Are user interfaces clear, usable, and appropriate for the task?
        - **error_handling**: Does the code handle invalid inputs and edge cases gracefully with appropriate validation?
        - **documentation**: Are comments and documentation useful, concise, and focused on non-obvious aspects?
        - **review_readiness**: Is the code ready for production-level pull request review with consistent style?
        
        ISSUE TYPE DEFINITIONS:
        - **technical_inconsistency**: Code has inconsistent patterns, conflicting approaches, or technical contradictions
        - **tool**: Incorrect or inappropriate use of tools, libraries, or frameworks
        - **code_correctness**: Logical errors, bugs, or incorrect implementation that prevents proper functionality
        - **setup**: Problems with configuration, environment setup, or deployment-related issues
        - **other**: Issues that don't fit the above categories but represent clear technical problems
        
        Be objective, thorough, and provide clear technical reasoning. Focus on how well each implementation serves the original purpose while maintaining high code quality standards.
        """

    def _create_requirement_focused_analysis_prompt(self, request: RefinedMultiModelAnalysisRequest) -> str:
        """
        Create a refined prompt focused on structured evaluation output.
        """
        evaluator_prompt = request.evaluator_prompt
        
        # Create file mappings
        current_files_map = {f.path: f for f in request.current_files}
        model_a_files_map = {f.path: f for f in request.model_a_files}
        model_b_files_map = {f.path: f for f in request.model_b_files}
        
        # Get all unique file paths
        all_file_paths = set(current_files_map.keys()) | set(model_a_files_map.keys()) | set(model_b_files_map.keys())
        
        prompt_parts = [
            "# Structured Code Implementation Evaluation",
            
            f"\n## Original Evaluator Request:",
            f'"{evaluator_prompt.original_prompt}"',
            
            "\n## Your Mission:",
            "Evaluate two AI model implementations and determine which better fulfills the original request.",
            "Provide a structured response following the EXACT format specified in your system message.",
            
            "\n## File Analysis Data:",
            f"- Current Files: {len(request.current_files)}",
            f"- Model A Files: {len(request.model_a_files)}",
            f"- Model B Files: {len(request.model_b_files)}",
            f"- Total Unique Files: {len(all_file_paths)}",
            
            "\n## Code Implementations to Evaluate:",
        ]
        
        # Show code files
        for file_path in sorted(all_file_paths):
            current_file = current_files_map.get(file_path)
            model_a_file = model_a_files_map.get(file_path)
            model_b_file = model_b_files_map.get(file_path)
            
            prompt_parts.append(f"\n### File: {file_path}")
            
            # Current version
            if current_file:
                prompt_parts.extend([
                    f"\n#### CURRENT VERSION ({current_file.file_type}):",
                    f"```{current_file.file_type}",
                    current_file.content,
                    "```"
                ])
            else:
                prompt_parts.append("\n#### CURRENT VERSION: *File does not exist*")
            
            # Model A version
            if model_a_file:
                prompt_parts.extend([
                    f"\n#### MODEL A IMPLEMENTATION ({model_a_file.file_type}):",
                    f"```{model_a_file.file_type}",
                    model_a_file.content,
                    "```"
                ])
            else:
                prompt_parts.append("\n#### MODEL A IMPLEMENTATION: *File does not exist*")
            
            # Model B version
            if model_b_file:
                prompt_parts.extend([
                    f"\n#### MODEL B IMPLEMENTATION ({model_b_file.file_type}):",
                    f"```{model_b_file.file_type}",
                    model_b_file.content,
                    "```"
                ])
            else:
                prompt_parts.append("\n#### MODEL B IMPLEMENTATION: *File does not exist*")
        
        prompt_parts.extend([
            "\n## IMPORTANT:",
            "Follow the EXACT output format specified in your system message.",
            "Include all required sections in the specified order.",
            "Provide technical justifications for your evaluations.",
            "Generate scores according to the specified scoring rules.",
            "Create a meaningful next improvement prompt."
        ])
        
        return "\n".join(prompt_parts)

    def _parse_structured_comparison_result(self, response_content: str) -> Dict:
        """
        Parse the structured LLM response into organized data.
        """
        
        # Initialize result structure
        result = {
            "chosen_model": "neither",
            "confidence_score": 5.0,
            "original_request": "",
            "file_summary": {},
            "winner_justification": "",
            "loser_critique": "",
            "issue_type": "other",
            "model_a_scores": {},
            "model_b_scores": {},
            "next_prompt": "",
            "raw_response": response_content
        }
        
        lines = response_content.split('\n')
        current_section = None
        current_content = []
        
        for line in lines:
            line_stripped = line.strip()
            
            # Detect sections
            if "🎯 Evaluation Result:" in line and "🏆" in line:
                if "MODEL_A" in line.upper():
                    result["chosen_model"] = "model_a"
                elif "MODEL_B" in line.upper():
                    result["chosen_model"] = "model_b"
                current_section = "evaluation_result"
                
            elif "📋 Original Evaluator Request" in line:
                current_section = "original_request"
                
            elif "📊 File Analysis Summary" in line:
                current_section = "file_summary"
                
            elif "✅ Why" in line and "Superior Implementation" in line:
                current_section = "winner_justification"
                current_content = []
                
            elif "❌ Why" in line and "Inferior" in line:
                current_section = "loser_critique"
                current_content = []
                
            elif "**Issue Type:**" in line:
                issue_type_line = line.replace("**Issue Type:**", "").strip()
                for issue_type in ["technical_inconsistency", "tool", "code_correctness", "setup", "other"]:
                    if issue_type in issue_type_line:
                        result["issue_type"] = issue_type
                        break
                current_section = "issue_type"
                
            elif "📈 Technical Assessment Scores" in line:
                current_section = "scores"
                
            elif "**Model A Scores:**" in line:
                current_section = "model_a_scores"
                
            elif "**Model B Scores:**" in line:
                current_section = "model_b_scores"
                
            elif "🔄 Next Improvement Prompt" in line:
                current_section = "next_prompt"
                current_content = []
                
            # Parse content based on current section
            elif current_section == "original_request" and line_stripped.startswith(">"):
                result["original_request"] = line_stripped[1:].strip().strip('"')
                
            elif current_section == "file_summary" and line_stripped.startswith("- **"):
                if "Current Files:" in line:
                    try:
                        result["file_summary"]["current"] = int(line_stripped.split(":")[-1].strip())
                    except:
                        pass
                elif "Model A Files:" in line:
                    try:
                        result["file_summary"]["model_a"] = int(line_stripped.split(":")[-1].strip())
                    except:
                        pass
                elif "Model B Files:" in line:
                    try:
                        result["file_summary"]["model_b"] = int(line_stripped.split(":")[-1].strip())
                    except:
                        pass
                elif "Total Unique Files:" in line:
                    try:
                        result["file_summary"]["total"] = int(line_stripped.split(":")[-1].strip())
                    except:
                        pass
                        
            elif current_section == "winner_justification" and line_stripped and not line_stripped.startswith("##"):
                current_content.append(line_stripped)
                
            elif current_section == "loser_critique" and line_stripped and not line_stripped.startswith("##") and not line_stripped.startswith("**Issue Type:**"):
                current_content.append(line_stripped)
                
            elif current_section in ["model_a_scores", "model_b_scores"] and ":" in line_stripped and line_stripped.startswith("- "):
                try:
                    score_line = line_stripped[2:].strip()  # Remove "- "
                    score_name, score_value = score_line.split(":", 1)
                    score_name = score_name.strip()
                    score_value = score_value.strip().strip("[]")
                    
                    # Extract numeric score
                    score_num = None
                    for char in score_value:
                        if char.isdigit():
                            score_num = int(char)
                            break
                    
                    if score_num and current_section == "model_a_scores":
                        result["model_a_scores"][score_name] = score_num
                    elif score_num and current_section == "model_b_scores":
                        result["model_b_scores"][score_name] = score_num
                except:
                    pass
                    
            elif current_section == "next_prompt" and line_stripped and not line_stripped.startswith("##"):
                current_content.append(line_stripped)
        
        # Join multi-line content
        if current_section == "winner_justification":
            result["winner_justification"] = " ".join(current_content).strip()
        elif current_section == "loser_critique":
            result["loser_critique"] = " ".join(current_content).strip()
        elif current_section == "next_prompt":
            result["next_prompt"] = " ".join(current_content).strip()
        
        return result

    def _parse_comparison_result(self, response_content: str) -> ModelComparisonResult:
        """
        Parse the LLM response into a structured result (legacy method for backward compatibility).

        Recognised markers: ``**CHOSEN MODEL:**``, ``**CONFIDENCE SCORE:**``,
        ``**PRIMARY REASONING:**``, ``**MODEL A/B OVERALL ASSESSMENT:**``,
        ``✅ Strengths:``, ``❌ Weaknesses:`` and ``**DETAILED ANALYSIS:**``.
        "- " bullets are filed under the most recent strengths/weaknesses
        marker of the most recent model assessment.

        Fixes over the previous implementation:
        - The model being assessed and the list being filled are tracked
          separately. Appending "_pros"/"_cons" to one section string turned
          the weaknesses section into "model_a_pros_cons", so weaknesses were
          never captured, and raised ``TypeError`` when a strengths marker
          appeared before any model assessment header.
        - Only conversion errors are caught when reading the confidence
          score, and values such as "8/10" are read as 8.0.

        Args:
            response_content: Raw text returned by the LLM

        Returns:
            ModelComparisonResult; fields that cannot be found keep their
            defaults and ``detailed_analysis`` holds the full raw response.
        """
        chosen_model = ModelChoice.NEITHER
        confidence_score = 5.0
        reasoning = "Unable to parse reasoning from response"
        pros = {"model_a": [], "model_b": []}
        cons = {"model_a": [], "model_b": []}

        current_model = None  # "model_a" / "model_b" while inside an assessment
        target_list = None    # list that "- " bullets are currently appended to

        for raw_line in response_content.split('\n'):
            line = raw_line.strip()

            if line.startswith("**CHOSEN MODEL:**"):
                model_text = line.replace("**CHOSEN MODEL:**", "").strip().upper()
                if "MODEL_A" in model_text:
                    chosen_model = ModelChoice.MODEL_A
                elif "MODEL_B" in model_text:
                    chosen_model = ModelChoice.MODEL_B
                elif "BOTH_GOOD" in model_text:
                    chosen_model = ModelChoice.BOTH_GOOD
                else:
                    chosen_model = ModelChoice.NEITHER

            elif line.startswith("**CONFIDENCE SCORE:**"):
                score_text = line.replace("**CONFIDENCE SCORE:**", "").strip()
                try:
                    # First token, ignoring an optional "/scale" suffix.
                    confidence_score = float(score_text.split()[0].split("/")[0])
                except (ValueError, IndexError):
                    confidence_score = 5.0

            elif line.startswith("**PRIMARY REASONING:**"):
                reasoning = line.replace("**PRIMARY REASONING:**", "").strip()

            elif "**MODEL A OVERALL ASSESSMENT:**" in line:
                current_model, target_list = "model_a", None
            elif "**MODEL B OVERALL ASSESSMENT:**" in line:
                current_model, target_list = "model_b", None
            elif "✅ Strengths:" in line:
                # No list is selected when no model assessment is active.
                target_list = pros.get(current_model)
            elif "❌ Weaknesses:" in line:
                target_list = cons.get(current_model)
            elif "**DETAILED ANALYSIS:**" in line:
                current_model, target_list = None, None

            elif line.startswith("- ") and target_list is not None:
                target_list.append(line[2:].strip())

        return ModelComparisonResult(
            chosen_model=chosen_model,
            confidence_score=confidence_score,
            reasoning=reasoning,
            pros_model_a=pros["model_a"],
            cons_model_a=cons["model_a"],
            pros_model_b=pros["model_b"],
            cons_model_b=cons["model_b"],
            detailed_analysis=response_content
        )

    def evaluate_models_against_requirements(self, 
                                           evaluator_prompt_text: str,
                                           current_files: List[FileContent],
                                           model_a_files: List[FileContent],
                                           model_b_files: List[FileContent],
                                           custom_criteria: Optional[List[str]] = None) -> Dict:
        """
        Evaluate two model implementations with structured output parsing.

        Steps: extract requirements from the evaluator prompt (one LLM call),
        build the analysis prompt, send it with the fixed system message
        (second LLM call) and parse the structured reply.

        Args:
            evaluator_prompt_text: The original request given to both models
            current_files: Files as they exist before either model's changes
            model_a_files: Files produced by model A
            model_b_files: Files produced by model B
            custom_criteria: Optional extra criteria stored on the analysis
                request; ``None`` means no extra criteria

        Returns:
            Dict with keys ``evaluation_type``, ``evaluator_prompt``,
            ``files_analyzed``, ``parsed_evaluation``, ``raw_response`` and
            ``raw_prompt_sent``.
        """
        # Extract requirements from the evaluator prompt
        print("Extracting requirements from evaluator prompt...")
        evaluator_prompt = self._extract_requirements_from_prompt(evaluator_prompt_text)

        analysis_request = RefinedMultiModelAnalysisRequest(
            evaluator_prompt=evaluator_prompt,
            current_files=current_files,
            model_a_files=model_a_files,
            model_b_files=model_b_files,
            analysis_type="requirement_focused",
            custom_evaluation_criteria=custom_criteria
        )

        analysis_prompt = self._create_requirement_focused_analysis_prompt(analysis_request)
        system_message = self._get_requirement_focused_system_message(analysis_request.analysis_type)

        messages = [
            SystemMessage(content=system_message),
            HumanMessage(content=analysis_prompt)
        ]

        # Get response from LLM
        print("Evaluating model implementations with structured output...")
        response = self.llm.invoke(messages)

        parsed_evaluation = self._parse_structured_comparison_result(response.content)

        # Paths present in any of the three file sets, counted once each.
        unique_paths = {
            f.path
            for file_set in (current_files, model_a_files, model_b_files)
            for f in file_set
        }

        return {
            "evaluation_type": "structured_requirement_focused",
            "evaluator_prompt": {
                "original_text": evaluator_prompt_text,
                "extracted_requirements": evaluator_prompt.requirements,
                "success_criteria": evaluator_prompt.success_criteria,
                "priority_aspects": evaluator_prompt.priority_aspects
            },
            "files_analyzed": {
                "current_files": len(current_files),
                "model_a_files": len(model_a_files),
                "model_b_files": len(model_b_files),
                "total_unique_files": len(unique_paths)
            },
            "parsed_evaluation": parsed_evaluation,
            "raw_response": response.content,
            "raw_prompt_sent": analysis_prompt
        }

    def save_structured_evaluation_report(self, analysis_result: Dict, output_file: str):
        """
        Save structured evaluation results to a file.
        """
        output_path = Path(output_file)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        
        # Generate structured markdown report
        report_content = self._generate_structured_evaluation_report(analysis_result)
        
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(report_content)
        
        print(f"Structured evaluation report saved to: {output_file}")

    def _generate_structured_evaluation_report(self, analysis_result: Dict) -> str:
        """Generate a structured markdown report from evaluation results."""
        
        parsed_result = analysis_result.get("parsed_evaluation", {})
        
        chosen_model = parsed_result.get("chosen_model", "neither")
        if chosen_model == "model_a":
            winner_display = "🏆 **MODEL A** (Winner)"
        elif chosen_model == "model_b":
            winner_display = "🏆 **MODEL B** (Winner)"
        else:
            winner_display = "❌ **NO CLEAR WINNER**"
        
        report_parts = [
            "# Structured Code Evaluation Report",
            f"\n**Evaluation Type:** Requirement-Focused Assessment",
            f"\n**Generated:** {self._get_timestamp()}",
            f"\n**Evaluator:** CAMB3LL",
            
            f"\n## 🎯 Evaluation Result: {winner_display}",
            
            "\n## 📋 Original Evaluator Request",
            f"\n> \"{parsed_result.get('original_request', 'Not captured')}\"",
            
            "\n## 📊 File Analysis Summary",
            f"- **Current Files:** {parsed_result.get('file_summary', {}).get('current', 'N/A')}",
            f"- **Model A Files:** {parsed_result.get('file_summary', {}).get('model_a', 'N/A')}",
            f"- **Model B Files:** {parsed_result.get('file_summary', {}).get('model_b', 'N/A')}",
            f"- **Total Unique Files:** {parsed_result.get('file_summary', {}).get('total', 'N/A')}",
            
            "\n## ✅ Winner Justification",
            f"\n{parsed_result.get('winner_justification', 'No justification provided')}",
            
            "\n## ❌ Rejected Model Critique",
            f"\n{parsed_result.get('loser_critique', 'No critique provided')}",
            f"\n**Issue Type:** {parsed_result.get('issue_type', 'other')}",
            
            "\n## 📈 Technical Assessment Scores",
            "\n### Model A Scores:",
        ]
        
        # Add Model A scores
        model_a_scores = parsed_result.get('model_a_scores', {})
        for metric, score in model_a_scores.items():
            report_parts.append(f"- **{metric}:** {score}/7")
        
        report_parts.append("\n### Model B Scores:")
        
        # Add Model B scores  
        model_b_scores = parsed_result.get('model_b_scores', {})
        for metric, score in model_b_scores.items():
            report_parts.append(f"- **{metric}:** {score}/7")
        
        report_parts.extend([
            "\n## 🔄 Next Improvement Prompt",
            f"\n{parsed_result.get('next_prompt', 'No follow-up prompt provided')}",
            
            "\n---",
            "\n## 📝 Raw LLM Response",
            "\n```markdown",
            parsed_result.get('raw_response', 'No raw response captured'),
            "\n```"
        ])
        
        return "\n".join(report_parts)

    # Legacy methods for backward compatibility
    def save_requirement_evaluation_report(self, analysis_result: Dict, output_file: str):
        """
        Legacy method - redirects to structured evaluation report.
        """
        return self.save_structured_evaluation_report(analysis_result, output_file)

    def _generate_requirement_evaluation_report(self, analysis_result: Dict) -> str:
        """
        Legacy method - redirects to structured evaluation report.
        """
        return self._generate_structured_evaluation_report(analysis_result)

In [None]:
def _mean_numeric_score(scores) -> Optional[float]:
    """Return the mean of the values in *scores* that can be read as numbers.

    Values that ``float()`` rejects (e.g. ``None`` or free-form text) are
    skipped instead of raising, so one malformed score does not abort the
    whole summary. Returns ``None`` when *scores* is not a dict or contains
    no usable value.
    """
    if not isinstance(scores, dict):
        return None

    numeric = []
    for value in scores.values():
        try:
            numeric.append(float(value))
        except (TypeError, ValueError):
            continue

    if not numeric:
        return None
    return sum(numeric) / len(numeric)


def main_structured_evaluation_example() -> None:
    """Example usage for structured requirement-focused evaluation.

    Reads the API key from the ``OPENAI_API_KEY`` environment variable, loads
    files from the ``current_code``, ``model_generated_code_a`` and
    ``model_generated_code_b`` directories, runs
    ``CodeReviewerAgent.evaluate_models_against_requirements`` on them, saves
    the report to ``evaluation_reports/structured_requirement_evaluation.md``
    and prints a short summary to stdout.

    Returns:
        None. If the API key is missing an error is printed and the function
        returns early; any exception raised during evaluation is printed with
        its traceback rather than propagated.
    """

    # Initialize the agent
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        print("Error: OPENAI_API_KEY environment variable not set")
        return

    agent = CodeReviewerAgent(api_key=api_key)

    # Example evaluator prompt
    evaluator_prompt = "Genera un código de Python para imprimir en consola 'Hello World' de la manera más eficiente y siguiendo las mejores prácticas de Python"

    # Example paths
    current_code_dir = "current_code"
    model_a_code_dir = "model_generated_code_a"
    model_b_code_dir = "model_generated_code_b"

    try:
        # Load files from all directories
        print("Loading files for structured requirement evaluation...")
        current_files = agent.load_files_from_directory(current_code_dir)
        model_a_files = agent.load_files_from_directory(model_a_code_dir)
        model_b_files = agent.load_files_from_directory(model_b_code_dir)

        print(f"Loaded {len(current_files)} current, {len(model_a_files)} Model A, {len(model_b_files)} Model B files")

        # Perform structured requirement-focused evaluation
        print(f"\nEvaluating implementations against: '{evaluator_prompt}'")
        results = agent.evaluate_models_against_requirements(
            evaluator_prompt_text=evaluator_prompt,
            current_files=current_files,
            model_a_files=model_a_files,
            model_b_files=model_b_files,
            custom_criteria=["Code efficiency", "Python best practices", "Simplicity"]
        )

        # Save structured report
        agent.save_structured_evaluation_report(results, "evaluation_reports/structured_requirement_evaluation.md")

        # Normalise the parsed fields: a key that is present but None must be
        # treated like a missing key, otherwise the summary below would raise.
        parsed_eval = results.get('parsed_evaluation') or {}
        chosen = parsed_eval.get('chosen_model') or 'unknown'
        # Accept either a plain string or an Enum-like object exposing `.value`.
        winner_label = str(getattr(chosen, 'value', chosen)).upper()
        next_prompt = parsed_eval.get('next_prompt', 'No follow-up prompt generated')
        issue_type = parsed_eval.get('issue_type', 'other')

        # Print structured summary
        print("\n🎯 STRUCTURED EVALUATION RESULT:")
        print(f"   Original Request: '{evaluator_prompt}'")
        print(f"   Winner: {winner_label}")
        print(f"   Issue Type: {issue_type}")
        print(f"   Next Improvement Prompt: {next_prompt}")

        # Print scores summary if available
        model_a_scores = parsed_eval.get('model_a_scores') or {}
        model_b_scores = parsed_eval.get('model_b_scores') or {}

        if model_a_scores or model_b_scores:
            print("\n📈 SCORING SUMMARY:")
            print(f"   Model A Scores: {len(model_a_scores)} metrics evaluated")
            print(f"   Model B Scores: {len(model_b_scores)} metrics evaluated")

            # Show average scores if available (non-numeric scores are ignored)
            for label, scores in (("Model A", model_a_scores), ("Model B", model_b_scores)):
                average = _mean_numeric_score(scores)
                if average is not None:
                    print(f"   {label} Average: {average:.1f}/7")

        # Reported last so it is never followed by an error from the summary.
        print("\nStructured requirement-focused evaluation completed successfully!")

    except Exception as e:
        # Top-level boundary of the example: report the failure, do not re-raise.
        print(f"Error during structured evaluation: {e}")
        import traceback
        traceback.print_exc()

# Run the example when this file/cell is executed as the main module.
if __name__ == "__main__":
    main_structured_evaluation_example()