# 01: PDF Preprocessing Agent Development

**Objective**: Develop and test the PDF preprocessing agent that extracts and normalizes text from scientific PDFs.

**Outputs**:
- Normalized text by page/section
- Document structure map
- Metadata (title, authors, journal)

**Tools Used**:
- PyMuPDF for PDF extraction
- Text normalization utilities
- Structure analysis

## Setup

In [None]:
import sys
sys.path.append('../src')

import os
from pathlib import Path
import json
from pprint import pprint

# Import our tools
from tools.pdf_tools import (
    extract_text_from_pdf,
    extract_text_blocks_from_pdf,
    get_pdf_metadata,
    extract_section_text
)
from tools.text_tools import (
    normalize_scientific_text,
    extract_sentences,
    extract_numbers_from_text
)

# Import base agent
from agents.base_agent import BaseAgent

# Directory layout, expressed relative to the notebook's working directory
PROJECT_ROOT = Path('..')
DATA_DIR = PROJECT_ROOT.joinpath('data')
SAMPLE_PDFS_DIR = DATA_DIR.joinpath('sample_pdfs')

print("✅ Imports successful")

## Load API Key

In [None]:
from dotenv import load_dotenv

# Load environment variables (via python-dotenv's load_dotenv)
load_dotenv()

# The key is read from the GOOGLE_API_KEY environment variable; None if unset
API_KEY = os.environ.get('GOOGLE_API_KEY')

# Report whether a non-empty key was found
print(
    "✅ API Key loaded"
    if API_KEY
    else "❌ API Key not found. Please set GOOGLE_API_KEY in .env file"
)

## Test PDF Extraction Tools

In [None]:
# List available sample PDFs.
# `pdfs` is always bound (empty when nothing is found) so that code which
# inspects it afterwards never depends on which branch ran here. The listing is
# sorted so that "the first PDF" is the same file on every run instead of
# depending on the order in which the filesystem happens to list entries.
pdfs = []
if SAMPLE_PDFS_DIR.is_dir():
    pdfs = sorted(SAMPLE_PDFS_DIR.glob('*.pdf'))
    print(f"Found {len(pdfs)} PDF(s) in sample directory:")
    for pdf in pdfs:
        print(f"  - {pdf.name}")
else:
    print(f"⚠️  Sample PDF directory not found: {SAMPLE_PDFS_DIR}")
    print("Please create the directory and add sample PDFs")

In [None]:
# Placeholder path; replace it with any scientific PDF if no samples are listed
test_pdf_path = "path/to/your/test.pdf"  # UPDATE THIS

# When a non-empty `pdfs` listing exists at notebook level, take its first entry
if not globals().get('pdfs'):
    print("Please specify test_pdf_path manually")
else:
    test_pdf_path = str(pdfs[0])
    print(f"Using: {test_pdf_path}")

In [None]:
# Run the plain-text extractor on the selected PDF
pdf_data = extract_text_from_pdf(test_pdf_path)

# Summary: page count, overall text length, then the metadata mapping
print(
    f"Pages: {pdf_data['page_count']}",
    f"Total characters: {len(pdf_data['full_text'])}",
    "\nMetadata:",
    sep="\n",
)
pprint(pdf_data['metadata'])

# Preview the beginning of the extracted text
print("\nFirst 500 characters:", pdf_data['full_text'][:500], sep="\n")

In [None]:
# Run the block-level extractor on the selected PDF
pdf_blocks = extract_text_blocks_from_pdf(test_pdf_path)

# Count blocks across every entry of the 'blocks' mapping
block_total = 0
for page_blocks in pdf_blocks['blocks'].values():
    block_total += len(page_blocks)
print(f"Extracted {block_total} text blocks")

# Show up to three blocks stored under key 0, truncated to 200 characters
print("\nSample blocks from page 0:")
for idx, text_block in enumerate(pdf_blocks['blocks'].get(0, [])[:3]):
    print(f"\nBlock {idx}:")
    if len(text_block) > 200:
        print(text_block[:200] + "...")
    else:
        print(text_block)

## Test Text Normalization

In [None]:
# Take the first 500 characters of the first page, or a stub when no pages exist
if pdf_data['page_texts']:
    sample_text = pdf_data['page_texts'][0][:500]
else:
    sample_text = "Sample text"

print("Original:", sample_text, sep="\n")
print(f"\n{'=' * 80}\n")

# Show the same sample after normalization for a side-by-side comparison
normalized = normalize_scientific_text(sample_text)
print("Normalized:", normalized, sep="\n")

## Create Preprocessing Agent

In [None]:
class PreprocessingAgent(BaseAgent):
    """
    Agent for preprocessing PDF files.

    Extracts text with the project's PDF tools, normalizes it, and asks the
    model (through ``call_model`` / ``parse_json_response``) for document
    metadata and section structure. If the model call or the parsing of its
    reply fails, the structure falls back to the PDF's own metadata.
    """

    # Default number of normalized-text characters sent to the model.
    DEFAULT_EXCERPT_CHARS = 2000

    def __init__(self, **kwargs):
        """Forward all keyword arguments to the base agent under a fixed name."""
        super().__init__(agent_name="PreprocessingAgent", **kwargs)

    def get_system_prompt(self) -> str:
        """Return the system prompt describing the expected JSON structure."""
        return """
You are a PDF preprocessing specialist for scientific literature.

Your task is to analyze extracted PDF text and identify key structural elements.

Given PDF text, identify:
1. Document metadata (title, authors, journal, year)
2. Section boundaries (Abstract, Methods, Results, Discussion, etc.)
3. Any extraction quality issues

Return a JSON object with:
{
  "title": "Full paper title",
  "authors": "Author list in format: LastName1 I1, LastName2 I2, ...",
  "journal": "Journal name",
  "year": Publication year (integer),
  "sections": {
    "abstract": "Text of abstract section",
    "methods": "Text of methods section",
    "results": "Text of results section",
    "discussion": "Text of discussion section"
  },
  "quality_notes": ["Any issues with extraction quality"]
}

If a section cannot be identified, use empty string.
"""

    @staticmethod
    def _build_structure_prompt(normalized_text: str, excerpt_chars: int) -> str:
        """Build the user prompt from the leading excerpt of the normalized text."""
        return f"""
Analyze this scientific paper and extract the key metadata and sections.

First {excerpt_chars} characters:
{normalized_text[:excerpt_chars]}

Full text length: {len(normalized_text)} characters

Provide the structured metadata and sections in JSON format.
"""

    @staticmethod
    def _fallback_structure(metadata) -> dict:
        """Build a minimal structure from PDF metadata when the model path fails."""
        # Tolerate a missing mapping and None values so the fallback itself
        # cannot raise and always yields strings for title/authors.
        metadata = metadata or {}
        return {
            "title": metadata.get('title') or '',
            "authors": metadata.get('author') or '',
            "journal": "",
            "year": None,
            "sections": {},
            "quality_notes": ["LLM structure extraction failed"]
        }

    def process(self, pdf_path: str, *, excerpt_chars: int = DEFAULT_EXCERPT_CHARS) -> dict:
        """
        Process a PDF file.

        Args:
            pdf_path: Path to PDF file
            excerpt_chars: Number of leading characters of the normalized text
                to include in the model prompt (default 2000).

        Returns:
            Dict with keys ``file_path``, ``raw_data``, ``blocks_data``,
            ``normalized_text``, ``structure``, ``page_count`` and
            ``total_chars``.

        Raises:
            ValueError: If ``excerpt_chars`` is negative.
        """
        if excerpt_chars < 0:
            # A negative value would silently slice from the end of the text.
            raise ValueError("excerpt_chars must be non-negative")

        self.logger.info(f"Processing PDF: {pdf_path}")

        # Extract text
        pdf_data = extract_text_from_pdf(pdf_path)
        pdf_blocks = extract_text_blocks_from_pdf(pdf_path)

        # Normalize text
        normalized_text = normalize_scientific_text(pdf_data['full_text'])

        # Use LLM to identify structure.
        # NOTE(review): the system prompt asks for full section text, but only
        # the leading excerpt is sent, so sections that start beyond it cannot
        # be returned; pass a larger excerpt_chars if they are needed.
        prompt = self._build_structure_prompt(normalized_text, excerpt_chars)

        try:
            response = self.call_model(prompt, json_mode=True)
            structure = self.parse_json_response(response)
            if not isinstance(structure, dict):
                # The prompt requests a JSON object; anything else is treated
                # as a failed extraction so downstream key lookups stay safe.
                raise ValueError(
                    f"expected a JSON object, got {type(structure).__name__}"
                )
        except Exception as e:
            # Deliberately broad: any failure in the model call or parsing
            # degrades to the metadata-only fallback instead of aborting.
            self.logger.warning(f"LLM structure extraction failed: {e}")
            structure = self._fallback_structure(pdf_data['metadata'])

        # Combine all data
        result = {
            "file_path": pdf_path,
            "raw_data": pdf_data,
            "blocks_data": pdf_blocks,
            "normalized_text": normalized_text,
            "structure": structure,
            "page_count": pdf_data['page_count'],
            "total_chars": len(normalized_text)
        }

        self.logger.info("Preprocessing complete")
        return result

print("✅ PreprocessingAgent defined")

## Test Preprocessing Agent

In [None]:
# Create the agent instance; no constructor overrides are passed here
preprocessing_agent = PreprocessingAgent()

print("✅ Agent initialized")

In [None]:
# Run the full preprocessing pipeline on the selected PDF
result = preprocessing_agent.process(test_pdf_path)

# Headline numbers first, then the structure returned for the document
print(
    "Processing Results:",
    f"  Pages: {result['page_count']}",
    f"  Characters: {result['total_chars']:,}",
    "\nExtracted Structure:",
    sep="\n",
)
pprint(result['structure'])

## Save Preprocessed Data

In [None]:
# Destination for the preprocessed output consumed by other agents
output_path = DATA_DIR.joinpath('outputs', 'preprocessed_example.json')
output_path.parent.mkdir(parents=True, exist_ok=True)

# Keep only the compact fields; the raw and block-level data are left out
save_data = {
    field: result[field]
    for field in (
        "file_path",
        "normalized_text",
        "structure",
        "page_count",
        "total_chars",
    )
}

with output_path.open('w') as f:
    json.dump(save_data, f, indent=2)

print(f"✅ Saved to: {output_path}")

## Next Steps

1. **Validate extraction quality**: Check if structure is correctly identified
2. **Test on multiple PDFs**: Ensure robustness across different formats
3. **Refine prompts**: Improve metadata extraction accuracy
4. **Add error handling**: Handle malformed PDFs gracefully
5. **Move to production**: Refactor into `src/agents/preprocessing_agent.py`

**Proceed to**: `02_gap_extraction.ipynb`