In [None]:
import os
import json
import tarfile
import zipfile
import hashlib
import re
from typing import Dict, List, Optional, Tuple, Union
from pathlib import Path
from dataclasses import dataclass
from datetime import datetime
import PyPDF2
import logging

In [None]:
@dataclass
class FileContent:
    """Represents a file with its content and metadata.

    In this file, instances are built by
    ``FileExtractionAgent.convert_to_file_content`` from successfully
    extracted text files.
    """

    # Path identifying the file; convert_to_file_content passes the
    # ExtractedFileInfo.original_path value here.
    path: str
    # Text of the file as read with UTF-8 decoding.
    content: str
    # Type label such as "python" or "json" (see supported_extensions).
    file_type: str
    # Hash string copied from the originating ExtractedFileInfo.
    hash: str


@dataclass
class ExtractedFileInfo:
    """Information about an extracted file.

    Instances are created by ``FileExtractionAgent._create_file_info`` after
    an archive member has been written to disk.
    """

    # Member name/path as it appears inside the source archive.
    original_path: str
    # Location on disk where the member was extracted (stored as a string).
    extracted_path: str
    model_source: str  # "A" or "B"
    # Type label derived from the file extension ("text" when unrecognised).
    file_type: str
    # Status text; values used in this file start with "SUCCESS"
    # ("SUCCESS" or "SUCCESS (BINARY)").
    extraction_status: str
    # Size recorded for the extracted file by _create_file_info.
    file_size: int
    # MD5 hex digest of the decoded text; empty string for binary files.
    hash: str


@dataclass
class ExtractionReport:
    """Report of the extraction process.

    Built by ``FileExtractionAgent.generate_extraction_report`` and written to
    JSON by ``FileExtractionAgent._save_report``.
    """

    # Time the report was generated, formatted by _get_timestamp.
    timestamp: str
    # Number of extractions whose status starts with "SUCCESS" (both models).
    total_files_extracted: int
    # Count of successful extractions attributed to model A.
    model_a_files: int
    # Count of successful extractions attributed to model B.
    model_b_files: int
    # original_path of every record whose status does not start with "SUCCESS".
    failed_extractions: List[str]
    # All per-file records, successful and failed, for both models.
    extracted_files: List[ExtractedFileInfo]
    # PDF parsing result dictionary passed through into the report.
    pdf_analysis: Dict

In [None]:
class FileExtractionAgent:
    """
    Enhanced agent for extracting files from AI model responses with selective filtering.
    Only extracts files specifically mentioned in each model's response.
    """

    def __init__(self, base_output_dir: str = "extracted_files"):
        """Initialize the enhanced file extraction agent."""
        self.base_output_dir = Path(base_output_dir)
        self.model_a_dir = self.base_output_dir / "model_A_extracted"
        self.model_b_dir = self.base_output_dir / "model_B_extracted"

        # Supported file extensions (same as evaluator_builder)
        self.supported_extensions = {
            ".py": "python",
            ".js": "javascript",
            ".ts": "typescript",
            ".java": "java",
            ".cpp": "cpp",
            ".c": "c",
            ".cs": "csharp",
            ".go": "go",
            ".rs": "rust",
            ".php": "php",
            ".rb": "ruby",
            ".swift": "swift",
            ".kt": "kotlin",
            ".scala": "scala",
            ".html": "html",
            ".css": "css",
            ".sql": "sql",
            ".sh": "shell",
            ".yaml": "yaml",
            ".yml": "yaml",
            ".json": "json",
            ".xml": "xml",
            ".md": "markdown",
            ".txt": "text",
            ".conf": "config",
            ".ini": "config",
            ".log": "text",
        }

        # Setup logging
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        )
        self.logger = logging.getLogger(__name__)

        # Create output directories
        self._setup_directories()

    def _setup_directories(self):
        """Create necessary output directories."""
        self.model_a_dir.mkdir(parents=True, exist_ok=True)
        self.model_b_dir.mkdir(parents=True, exist_ok=True)
        self.logger.info(
            f"Created output directories: {self.model_a_dir}, {self.model_b_dir}"
        )

    def _calculate_file_hash(self, content: str) -> str:
        """Calculate MD5 hash of file content."""
        return hashlib.md5(content.encode("utf-8")).hexdigest()

    def _get_file_type(self, file_path: str) -> str:
        """Determine file type from extension."""
        ext = Path(file_path).suffix.lower()
        return self.supported_extensions.get(ext, "text")

    def _get_timestamp(self) -> str:
        """Get current timestamp."""
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S UTC")

    def parse_pdf_responses_enhanced(self, pdf_path: str) -> Dict:
        """Enhanced PDF parsing with better model differentiation."""
        pdf_path = Path(pdf_path)
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        self.logger.info(f"Parsing PDF: {pdf_path}")

        # Extract text from PDF
        pdf_text = ""
        try:
            with open(pdf_path, "rb") as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page_num, page in enumerate(pdf_reader.pages):
                    page_text = page.extract_text()
                    pdf_text += f"\n--- PAGE {page_num + 1} ---\n" + page_text
        except Exception as e:
            self.logger.error(f"Error reading PDF: {e}")
            raise

        # Split into logical sections for better analysis
        sections = self._split_into_logical_sections(pdf_text)

        # Analyze each section for model indicators
        model_analysis = self._analyze_sections_for_models(sections)

        return {
            "raw_text": pdf_text,
            "sections": sections,
            "model_analysis": model_analysis,
            "file_references": self._extract_all_file_references(pdf_text),
            "model_sections": self._map_files_to_models(model_analysis),
        }

    def _split_into_logical_sections(self, text: str) -> List[Dict]:
        """Split text into logical sections based on headers, breaks, etc."""
        sections = []
        lines = text.split("\n")
        current_section = {"lines": [], "start_line": 0}

        for i, line in enumerate(lines):
            line_stripped = line.strip()

            # Detect section breaks
            is_section_break = (
                re.match(r"^#{1,6}\s+", line_stripped)  # Markdown headers
                or re.match(r"^[A-Z\s]{5,}$", line_stripped)  # ALL CAPS headers
                or re.match(r"^[\*\-=]{5,}$", line_stripped)  # Separator lines
                or re.match(
                    r"^(RESPONSE|MODEL|ASSISTANT|ANSWER)\s*[AB12]?[:\.]",
                    line_stripped,
                    re.IGNORECASE,
                )
                or re.match(
                    r"^(MODELO|RESPUESTA|ASISTENTE)\s*[AB12]?[:\.]",
                    line_stripped,
                    re.IGNORECASE,
                )
            )

            if is_section_break and len(current_section["lines"]) > 3:
                # End current section
                current_section["end_line"] = i
                current_section["text"] = "\n".join(current_section["lines"])
                sections.append(current_section)

                # Start new section
                current_section = {"lines": [line], "start_line": i}
            else:
                current_section["lines"].append(line)

        # Add final section
        if current_section["lines"]:
            current_section["end_line"] = len(lines)
            current_section["text"] = "\n".join(current_section["lines"])
            sections.append(current_section)

        self.logger.info(f"Split PDF into {len(sections)} logical sections")
        return sections

    def _analyze_sections_for_models(self, sections: List[Dict]) -> Dict:
        """Analyze each section to determine which model it belongs to."""
        model_sections = {"A": [], "B": [], "unknown": []}

        # Enhanced patterns for model detection
        model_a_indicators = [
            r"(?:^|\W)(?:model|assistant|response|option|implementation|solution)\s*a(?:\W|$)",
            r"(?:^|\W)(?:modelo|asistente|respuesta|opción|implementación|solución)\s*a(?:\W|$)",
            r"(?:^|\W)first\s+(?:model|assistant|response|option|implementation)",
            r"(?:^|\W)primer(?:o|a)?\s+(?:modelo|asistente|respuesta|opción)",
            r"(?:^|\W)a[:\.\s]\s*(?:model|assistant|implementation)",
            r"\ba\)\s*(?:model|assistant|implementation|solution)",
            r"approach\s*a\b",
            r"version\s*a\b",
        ]

        model_b_indicators = [
            r"(?:^|\W)(?:model|assistant|response|option|implementation|solution)\s*b(?:\W|$)",
            r"(?:^|\W)(?:modelo|asistente|respuesta|opción|implementación|solución)\s*b(?:\W|$)",
            r"(?:^|\W)second\s+(?:model|assistant|response|option|implementation)",
            r"(?:^|\W)segundo?\s+(?:modelo|asistente|respuesta|opción)",
            r"(?:^|\W)b[:\.\s]\s*(?:model|assistant|implementation)",
            r"\bb\)\s*(?:model|assistant|implementation|solution)",
            r"approach\s*b\b",
            r"version\s*b\b",
        ]

        for section in sections:
            text_lower = section["text"].lower()
            section_id = (
                len(model_sections["A"])
                + len(model_sections["B"])
                + len(model_sections["unknown"])
            )

            # Score each section for model A and B
            score_a = sum(
                len(re.findall(pattern, text_lower, re.IGNORECASE))
                for pattern in model_a_indicators
            )
            score_b = sum(
                len(re.findall(pattern, text_lower, re.IGNORECASE))
                for pattern in model_b_indicators
            )

            # Assign to model based on scores
            if score_a > score_b and score_a > 0:
                model_sections["A"].append(section)
                self.logger.info(
                    f"Section {section_id} assigned to Model A (score: A={score_a}, B={score_b})"
                )
            elif score_b > score_a and score_b > 0:
                model_sections["B"].append(section)
                self.logger.info(
                    f"Section {section_id} assigned to Model B (score: A={score_a}, B={score_b})"
                )
            else:
                model_sections["unknown"].append(section)
                self.logger.debug(
                    f"Section {section_id} unassigned (score: A={score_a}, B={score_b})"
                )

        return model_sections

    def _map_files_to_models(self, model_analysis: Dict) -> Dict:
        """Map specific files mentioned to each model."""
        model_a_content = "\n".join([s["text"] for s in model_analysis["A"]])
        model_b_content = "\n".join([s["text"] for s in model_analysis["B"]])

        model_a_files = self._extract_file_references(model_a_content)
        model_b_files = self._extract_file_references(model_b_content)

        self.logger.info(
            f"Model A files identified: {len(model_a_files)} - {model_a_files[:5]}"
        )
        self.logger.info(
            f"Model B files identified: {len(model_b_files)} - {model_b_files[:5]}"
        )

        return {
            "model_a_content": model_a_content,
            "model_b_content": model_b_content,
            "model_a_files": model_a_files,
            "model_b_files": model_b_files,
        }

    def _extract_file_references(self, text: str) -> List[str]:
        """Extract file references with improved patterns and code block detection."""
        found_files = set()

        # Enhanced file patterns with word boundaries
        file_patterns = [
            r"\b[a-zA-Z_][a-zA-Z0-9_]*\.py\b",
            r"\b[a-zA-Z_][a-zA-Z0-9_]*\.js\b",
            r"\b[a-zA-Z_][a-zA-Z0-9_]*\.html\b",
            r"\b[a-zA-Z_][a-zA-Z0-9_]*\.css\b",
            r"\b[a-zA-Z_][a-zA-Z0-9_]*\.json\b",
            r"\b[a-zA-Z_][a-zA-Z0-9_]*\.md\b",
            r"\b[a-zA-Z_][a-zA-Z0-9_]*\.yaml\b",
            r"\b[a-zA-Z_][a-zA-Z0-9_]*\.yml\b",
            r"\b[a-zA-Z_][a-zA-Z0-9_]*\.txt\b",
            r"\b[a-zA-Z_][a-zA-Z0-9_]*\.conf\b",
            r"\b[a-zA-Z_][a-zA-Z0-9_]*\.ini\b",
            r"\b[a-zA-Z_][a-zA-Z0-9_]*\.log\b",
        ]

        # First, extract from regular text
        for pattern in file_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            found_files.update(matches)

        # Extract from code blocks specifically
        code_blocks = re.findall(r"```[\w]*\n(.*?)\n```", text, re.DOTALL)
        for block in code_blocks:
            for pattern in file_patterns:
                matches = re.findall(pattern, block, re.IGNORECASE)
                found_files.update(matches)

        # Look for file paths in quotes or backticks
        quoted_patterns = [
            r'["`\']([a-zA-Z_][a-zA-Z0-9_/]*\.[a-zA-Z]{1,5})["`\']',
            r"File:\s*([a-zA-Z_][a-zA-Z0-9_/]*\.[a-zA-Z]{1,5})",
            r"filename:\s*([a-zA-Z_][a-zA-Z0-9_/]*\.[a-zA-Z]{1,5})",
        ]

        for pattern in quoted_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            found_files.update(matches)

        # Filter out common false positives
        filtered_files = []
        for file_ref in found_files:
            # Skip overly generic names or obvious false positives
            if not any(
                skip in file_ref.lower()
                for skip in ["example.", "test.", "sample.", "dummy."]
            ):
                filtered_files.append(file_ref)

        self.logger.info(
            f"Found {len(filtered_files)} file references: {filtered_files[:10]}"
        )
        return sorted(list(set(filtered_files)))

    def _extract_all_file_references(self, text: str) -> List[str]:
        """Extract all file references from the entire text."""
        return self._extract_file_references(text)

    def extract_files_selectively(
        self, source_dir: str, target_dir: str, model_id: str, required_files: List[str]
    ) -> List[ExtractedFileInfo]:
        """Extract ONLY the files that are specifically mentioned in the model's response."""
        source_path = Path(source_dir)
        target_path = Path(target_dir)

        if not source_path.exists():
            self.logger.error(f"Source directory not found: {source_path}")
            return []

        if not required_files:
            self.logger.warning(f"No required files specified for Model {model_id}")
            return []

        self.logger.info(
            f"Looking for {len(required_files)} specific files for Model {model_id}"
        )
        self.logger.info(f"Required files: {required_files}")

        target_path.mkdir(parents=True, exist_ok=True)
        extracted_files = []
        files_found = set()

        # Find all archive files
        archive_files = self._find_archive_files(source_path)

        if not archive_files:
            self.logger.error(f"No archive files found in {source_path}")
            return []

        # Search for required files in each archive
        for archive_file in archive_files:
            try:
                found_in_archive = self._extract_specific_files_from_archive(
                    archive_file, target_path, model_id, required_files
                )
                extracted_files.extend(found_in_archive)
                files_found.update([f.original_path for f in found_in_archive])

            except Exception as e:
                self.logger.error(f"Failed to process {archive_file}: {e}")

        # Report on missing files
        missing_files = set(required_files) - files_found
        if missing_files:
            self.logger.warning(
                f"Model {model_id}: Could not find {len(missing_files)} required files: {list(missing_files)[:5]}"
            )
        else:
            self.logger.info(
                f"Model {model_id}: Successfully found all {len(required_files)} required files"
            )

        return extracted_files

    def _find_archive_files(self, source_path: Path) -> List[Path]:
        """Find all archive files in the source directory."""
        archive_extensions = [".tar", ".tar.gz", ".tar.bz2", ".tar.xz", ".zip", ".7z"]
        archive_files = []

        for ext in archive_extensions:
            found = list(source_path.glob(f"*{ext}"))
            archive_files.extend(found)
            if found:
                self.logger.info(f"Found {len(found)} files with extension {ext}")

        # Also check for files without clear extensions that might be archives
        for file_path in source_path.iterdir():
            if file_path.is_file() and not any(
                str(file_path).endswith(ext) for ext in archive_extensions
            ):
                # Try to identify by content
                try:
                    with open(file_path, "rb") as f:
                        header = f.read(10)
                        # Check magic numbers for common archive formats
                        if header.startswith(b"PK"):  # ZIP
                            archive_files.append(file_path)
                            self.logger.info(
                                f"Detected ZIP archive by magic number: {file_path}"
                            )
                        elif header.startswith(b"\x1f\x8b"):  # GZIP
                            archive_files.append(file_path)
                            self.logger.info(
                                f"Detected GZIP archive by magic number: {file_path}"
                            )
                except Exception as e:
                    self.logger.debug(f"Could not read {file_path}: {e}")

        self.logger.info(
            f"Total archive files found in {source_path}: {len(archive_files)}"
        )
        for archive in archive_files:
            self.logger.info(f"  - {archive.name} ({archive.stat().st_size} bytes)")

        return archive_files

    def _extract_specific_files_from_archive(
        self,
        archive_path: Path,
        target_dir: Path,
        model_id: str,
        required_files: List[str],
    ) -> List[ExtractedFileInfo]:
        """Extract only specific files from an archive."""
        extracted_files = []

        self.logger.info(
            f"Searching {archive_path.name} for {len(required_files)} specific files"
        )

        if archive_path.suffix == ".zip":
            with zipfile.ZipFile(archive_path, "r") as zip_ref:
                available_files = zip_ref.namelist()

                for required_file in required_files:
                    # Try exact match first
                    matching_files = self._find_matching_files(
                        required_file, available_files
                    )

                    for match in matching_files:
                        try:
                            # Extract only this specific file
                            extracted_path = target_dir / match
                            extracted_path.parent.mkdir(parents=True, exist_ok=True)

                            zip_ref.extract(match, target_dir)

                            # Create FileInfo
                            file_info = self._create_file_info(
                                extracted_path, match, model_id
                            )
                            if file_info:
                                extracted_files.append(file_info)
                                self.logger.info(f"✅ Extracted: {match}")

                        except Exception as e:
                            self.logger.error(f"Failed to extract {match}: {e}")

        elif archive_path.suffix in [".tar", ".tar.gz", ".tar.bz2"]:
            mode = self._get_tar_mode(archive_path)

            with tarfile.open(archive_path, mode) as tar_ref:
                available_files = [m.name for m in tar_ref.getmembers() if m.isfile()]

                for required_file in required_files:
                    matching_files = self._find_matching_files(
                        required_file, available_files
                    )

                    for match in matching_files:
                        try:
                            # Extract only this specific file
                            extracted_path = target_dir / match
                            extracted_path.parent.mkdir(parents=True, exist_ok=True)

                            member = tar_ref.getmember(match)
                            tar_ref.extract(member, target_dir)

                            # Create FileInfo
                            file_info = self._create_file_info(
                                extracted_path, match, model_id
                            )
                            if file_info:
                                extracted_files.append(file_info)
                                self.logger.info(f"✅ Extracted: {match}")

                        except Exception as e:
                            self.logger.error(f"Failed to extract {match}: {e}")

        return extracted_files

    def _get_tar_mode(self, archive_path: Path) -> str:
        """Get the appropriate mode for opening tar archives."""
        if archive_path.suffix == ".tar.gz":
            return "r:gz"
        elif archive_path.suffix == ".tar.bz2":
            return "r:bz2"
        elif archive_path.suffix == ".tar.xz":
            return "r:xz"
        else:
            return "r"

    def _find_matching_files(
        self, required_file: str, available_files: List[str]
    ) -> List[str]:
        """Find files in archive that match the required file name."""
        matches = []

        # Clean the required file name
        required_clean = required_file.strip().replace("\\", "/")

        for available in available_files:
            available_clean = available.replace("\\", "/")

            # Exact match
            if available_clean == required_clean:
                matches.append(available)
                continue

            # Basename match (file.py matches path/to/file.py)
            if Path(available_clean).name == Path(required_clean).name:
                matches.append(available)
                continue

            # Path ends with required file
            if available_clean.endswith(required_clean):
                matches.append(available)
                continue

        if matches:
            self.logger.debug(
                f"Found {len(matches)} matches for '{required_file}': {matches}"
            )
        else:
            self.logger.debug(f"No matches found for '{required_file}' in archive")

        return matches

    def _create_file_info(
        self, extracted_path: Path, original_path: str, model_id: str
    ) -> Optional[ExtractedFileInfo]:
        """Create FileInfo object for successfully extracted file."""
        try:
            if extracted_path.exists():
                with open(extracted_path, "r", encoding="utf-8") as f:
                    content = f.read()

                return ExtractedFileInfo(
                    original_path=original_path,
                    extracted_path=str(extracted_path),
                    model_source=model_id,
                    file_type=self._get_file_type(original_path),
                    extraction_status="SUCCESS",
                    file_size=len(content),
                    hash=self._calculate_file_hash(content),
                )
            else:
                self.logger.error(f"Extracted file not found: {extracted_path}")
                return None

        except UnicodeDecodeError:
            # Handle binary files
            file_size = extracted_path.stat().st_size if extracted_path.exists() else 0
            return ExtractedFileInfo(
                original_path=original_path,
                extracted_path=str(extracted_path),
                model_source=model_id,
                file_type=self._get_file_type(original_path),
                extraction_status="SUCCESS (BINARY)",
                file_size=file_size,
                hash="",
            )
        except Exception as e:
            self.logger.error(f"Error creating file info for {extracted_path}: {e}")
            return None

    def organize_extracted_files(self, files: List[ExtractedFileInfo]) -> Dict:
        """Organize extracted files and generate summary."""
        model_a_files = [f for f in files if f.model_source == "A"]
        model_b_files = [f for f in files if f.model_source == "B"]

        successful_a = [
            f for f in model_a_files if f.extraction_status.startswith("SUCCESS")
        ]
        successful_b = [
            f for f in model_b_files if f.extraction_status.startswith("SUCCESS")
        ]

        failed_a = [
            f for f in model_a_files if not f.extraction_status.startswith("SUCCESS")
        ]
        failed_b = [
            f for f in model_b_files if not f.extraction_status.startswith("SUCCESS")
        ]

        # Group by file type
        file_types_a = {}
        file_types_b = {}

        for f in successful_a:
            if f.file_type not in file_types_a:
                file_types_a[f.file_type] = []
            file_types_a[f.file_type].append(f)

        for f in successful_b:
            if f.file_type not in file_types_b:
                file_types_b[f.file_type] = []
            file_types_b[f.file_type].append(f)

        return {
            "summary": {
                "total_files": len(files),
                "model_a_files": len(model_a_files),
                "model_b_files": len(model_b_files),
                "successful_extractions": len(successful_a) + len(successful_b),
                "failed_extractions": len(failed_a) + len(failed_b),
            },
            "model_a": {
                "successful": successful_a,
                "failed": failed_a,
                "by_type": file_types_a,
            },
            "model_b": {
                "successful": successful_b,
                "failed": failed_b,
                "by_type": file_types_b,
            },
        }

    def convert_to_file_content(
        self, extracted_files: List[ExtractedFileInfo]
    ) -> List[FileContent]:
        """Convert ExtractedFileInfo to FileContent for compatibility with evaluator_builder."""
        file_contents = []

        for file_info in extracted_files:
            if file_info.extraction_status.startswith("SUCCESS") and file_info.hash:
                try:
                    with open(file_info.extracted_path, "r", encoding="utf-8") as f:
                        content = f.read()

                    file_content = FileContent(
                        path=file_info.original_path,
                        content=content,
                        file_type=file_info.file_type,
                        hash=file_info.hash,
                    )
                    file_contents.append(file_content)

                except Exception as e:
                    self.logger.error(
                        f"Failed to convert {file_info.extracted_path}: {e}"
                    )

        return file_contents

    def generate_extraction_report(
        self, pdf_analysis: Dict, organization: Dict, output_path: Optional[str] = None
    ) -> ExtractionReport:
        """Assemble the extraction report and optionally save it.

        Args:
            pdf_analysis: PDF parsing result, stored on the report as is.
            organization: Grouping produced by ``organize_extracted_files``
                (needs ``summary``, ``model_a`` and ``model_b``).
            output_path: When given, the report is also written there via
                ``_save_report``.

        Returns:
            The populated ``ExtractionReport``.
        """
        # Gather the records as model A successful, model A failed, model B
        # successful, model B failed.
        all_files = []
        for model_key in ("model_a", "model_b"):
            per_model = organization.get(model_key, {})
            for outcome in ("successful", "failed"):
                records = per_model.get(outcome)
                if records:
                    all_files.extend(records)

        failed_extractions = []
        for record in all_files:
            if not record.extraction_status.startswith("SUCCESS"):
                failed_extractions.append(record.original_path)

        report = ExtractionReport(
            timestamp=self._get_timestamp(),
            total_files_extracted=organization["summary"]["successful_extractions"],
            model_a_files=len(organization["model_a"]["successful"]),
            model_b_files=len(organization["model_b"]["successful"]),
            failed_extractions=failed_extractions,
            extracted_files=all_files,
            pdf_analysis=pdf_analysis,
        )

        # Persisting is optional.
        if output_path:
            self._save_report(report, output_path)

        return report

    def _save_report(self, report: ExtractionReport, output_path: str):
        """Save extraction report to file."""
        report_data = {
            "timestamp": report.timestamp,
            "summary": {
                "total_files_extracted": report.total_files_extracted,
                "model_a_files": report.model_a_files,
                "model_b_files": report.model_b_files,
                "failed_extractions_count": len(report.failed_extractions),
            },
            "failed_extractions": report.failed_extractions,
            "pdf_analysis": report.pdf_analysis,
            "extracted_files": [
                {
                    "original_path": f.original_path,
                    "extracted_path": f.extracted_path,
                    "model_source": f.model_source,
                    "file_type": f.file_type,
                    "extraction_status": f.extraction_status,
                    "file_size": f.file_size,
                    "hash": f.hash,
                }
                for f in report.extracted_files
            ],
        }

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(report_data, f, indent=2, ensure_ascii=False)

        self.logger.info(f"Extraction report saved to: {output_path}")

    def run_selective_extraction(
        self, pdf_path: str, model_a_archives_dir: str, model_b_archives_dir: str
    ) -> Tuple[List[FileContent], List[FileContent], ExtractionReport]:
        """Run extraction process with selective file filtering based on PDF analysis."""
        self.logger.info("Starting selective extraction process...")

        # Enhanced PDF parsing
        pdf_analysis = self.parse_pdf_responses_enhanced(pdf_path)

        # Get the files specifically mentioned for each model
        model_sections = pdf_analysis["model_sections"]
        model_a_required_files = model_sections["model_a_files"]
        model_b_required_files = model_sections["model_b_files"]

        self.logger.info(
            f"Model A requires {len(model_a_required_files)} specific files"
        )
        self.logger.info(
            f"Model B requires {len(model_b_required_files)} specific files"
        )

        if not model_a_required_files and not model_b_required_files:
            self.logger.warning(
                "No specific files identified for either model - falling back to all file extraction"
            )
            # Fallback to extracting all files mentioned in PDF
            all_files = pdf_analysis["file_references"]
            model_a_required_files = all_files[
                : len(all_files) // 2
            ]  # Split arbitrarily
            model_b_required_files = all_files[len(all_files) // 2 :]

        # Selective extraction for each model
        model_a_extracted = self.extract_files_selectively(
            model_a_archives_dir, str(self.model_a_dir), "A", model_a_required_files
        )

        model_b_extracted = self.extract_files_selectively(
            model_b_archives_dir, str(self.model_b_dir), "B", model_b_required_files
        )

        # Generate comprehensive report
        all_extracted = model_a_extracted + model_b_extracted
        organization = self.organize_extracted_files(all_extracted)

        # Enhanced report with selective extraction info
        enhanced_pdf_analysis = pdf_analysis.copy()
        enhanced_pdf_analysis["selective_extraction"] = {
            "model_a_required": model_a_required_files,
            "model_b_required": model_b_required_files,
            "model_a_found": len(model_a_extracted),
            "model_b_found": len(model_b_extracted),
        }

        report = self.generate_extraction_report(
            enhanced_pdf_analysis,
            organization,
            str(self.base_output_dir / "selective_extraction_report.json"),
        )

        # Convert to FileContent
        model_a_files = self.convert_to_file_content(model_a_extracted)
        model_b_files = self.convert_to_file_content(model_b_extracted)

        self.logger.info(f"Selective extraction complete:")
        self.logger.info(
            f"  Model A: {len(model_a_files)}/{len(model_a_required_files)} files"
        )
        self.logger.info(
            f"  Model B: {len(model_b_files)}/{len(model_b_required_files)} files"
        )

        return model_a_files, model_b_files, report

    def debug_extraction_setup(self) -> Dict:
        """Debug function to check extraction setup and environment."""
        debug_info = {
            "working_director": str(Path.cwd()),
            "base_output_dir": str(self.base_output_dir),
            "model_a_dir": str(self.model_a_dir),
            "model_b_dir": str(self.model_b_dir),
            "directories_exist": {
                "base_output": self.base_output_dir.exists(),
                "model_a": self.model_a_dir.exists(),
                "model_b": self.model_b_dir.exists(),
            },
            "available_files_and_dirs": [],
        }

        # List current directory contents
        try:
            for item in Path.cwd().iterdir():
                item_info = {
                    "name": item.name,
                    "type": "directory" if item.is_dir() else "file",
                    "path": str(item),
                }
                if item.is_dir():
                    try:
                        contents = [f.name for f in item.iterdir()]
                        item_info["contents"] = contents[:10]  # First 10 items
                    except:
                        item_info["contents"] = ["<access denied>"]
                debug_info["available_files_and_dirs"].append(item_info)
        except Exception as e:
            debug_info["directory_listing_error"] = str(e)

        return debug_info

In [None]:
def main_selective_extraction():
    """Run the selective extraction with the default paths and print a summary.

    Uses ``conversation.pdf`` together with the ``files_model_A`` and
    ``files_model_B`` folders and writes into ``extracted_files``.

    Returns:
        ``(model_a_files, model_b_files, report)`` on success, or
        ``(None, None, None)`` after printing the error and its traceback.
    """
    extractor = FileExtractionAgent(base_output_dir="extracted_files")

    try:
        outcome = extractor.run_selective_extraction(
            pdf_path="conversation.pdf",
            model_a_archives_dir="files_model_A",
            model_b_archives_dir="files_model_B",
        )
        model_a_files, model_b_files, report = outcome

        # Summary of what was extracted.
        print(f"\n🎯 SELECTIVE EXTRACTION COMPLETED!")
        print(f"📁 Model A Files: {len(model_a_files)}")
        print(f"📁 Model B Files: {len(model_b_files)}")
        print(f"📊 Total Extractions: {report.total_files_extracted}")
        print(f"❌ Failed Extractions: {len(report.failed_extractions)}")
        print(f"📋 Report saved to: extracted_files/selective_extraction_report.json")

        # The files are now ready for use with evaluator_builder
        print(f"\n✅ Files ready for evaluation with CodeReviewerAgent!")

        return model_a_files, model_b_files, report

    except Exception as e:
        # Report the failure instead of propagating it.
        print(f"❌ Extraction failed: {e}")
        import traceback

        traceback.print_exc()
        return None, None, None

In [None]:
# Run the selective extraction only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main_selective_extraction()