# Complete Paper Processing Pipeline Test

This notebook walks through each step of the paperprocessor pipeline to verify everything works correctly.

## Pipeline Overview
1. **PDF to Images**: Convert PDF to page images and create ProcessedDocument
2. **OCR to Markdown**: Extract markdown text from each page using Mistral OCR
3. **Extract Metadata**: Get title and authors from the document
4. **Extract Structure**: Identify headers and structural elements
5. **Format Headers**: Convert headers to the appropriate Markdown heading levels (`#`, `##`, ...)
6. **Rewrite Sections**: Clean up and rewrite content sections


## Setup and Constants

First, we'll define all the constants and imports needed for this test.


In [None]:
### IMPORTS ###
import os
import sys
import json
import asyncio
import logging
from pathlib import Path
from typing import Optional

# Put the project root on sys.path so the `paperprocessor` imports below resolve.
# When the working directory is notebooks/, the root is taken to be two levels up.
current_dir = Path.cwd()
if current_dir.name == "notebooks":
    project_root = current_dir.parent.parent
else:
    # Otherwise pick the nearest directory (the working directory or one of its
    # ancestors) named "papersummarizer". If none exists, fall back to the
    # working directory itself: walking up until the parents run out would
    # silently put the filesystem root on sys.path.
    project_root = next(
        (
            candidate
            for candidate in (current_dir, *current_dir.parents)
            if candidate.name == "papersummarizer"
        ),
        current_dir,
    )

# Insert at position 0 so this checkout takes precedence over other entries.
sys.path.insert(0, str(project_root))
print(f"Added to Python path: {project_root}")

# Import our paperprocessor modules
from paperprocessor.models import ProcessedDocument, ProcessedPage
from paperprocessor.internals.pdf_to_image import convert_pdf_to_images
from paperprocessor.internals.mistral_ocr import extract_markdown_from_pages
from paperprocessor.internals.metadata_extractor import extract_metadata
from paperprocessor.internals.structure_extractor import extract_structure
from paperprocessor.internals.header_formatter import format_headers
from paperprocessor.internals.section_rewriter import rewrite_sections

### CONSTANTS ###
# Absolute path of the PDF that is fed through the pipeline in this test.
INPUT_PDF_PATH = "/Users/Focus/Downloads/2212.14024v2.pdf"

# Prefix for output file names: the running file's stem when __file__ is
# defined, otherwise a fixed notebook name.
if "__file__" in globals():
    NOTEBOOK_NAME = Path(__file__).stem
else:
    NOTEBOOK_NAME = "2025.09.08-test_complete_pipeline"

# Relative directory for generated files; created on first run if missing.
OUTPUT_DIR = Path("outputs")
OUTPUT_DIR.mkdir(exist_ok=True)

# INFO-level logging with a terse "LEVEL: message" layout.
logger = logging.getLogger(__name__)
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

# Echo the effective configuration, one item per line.
print(
    f"📄 Input PDF: {INPUT_PDF_PATH}",
    f"📁 Output directory: {OUTPUT_DIR}",
    f"📝 Notebook name: {NOTEBOOK_NAME}",
    "✅ Setup complete!",
    sep="\n",
)


## Ready to Start

Let's begin the pipeline test!


In [None]:
# Marker cell: prints a single line before the pipeline steps begin.
ready_banner = "Ready to start pipeline!"
print(ready_banner)


## Load Input PDF

Next, we'll load the PDF file that we want to process through our pipeline.


In [None]:
# Read the raw PDF bytes that the following steps operate on.
print("📂 Loading PDF file...")

# isfile() rather than exists(): a directory at this path would pass an
# existence check and then fail inside open() with a less helpful error.
if not os.path.isfile(INPUT_PDF_PATH):
    raise FileNotFoundError(f"PDF file not found at: {INPUT_PDF_PATH}")

with open(INPUT_PDF_PATH, "rb") as f:
    pdf_contents = f.read()

# Report the size both in MB (1 MB = 1024 * 1024 bytes) and in exact bytes.
pdf_size_mb = len(pdf_contents) / (1024 * 1024)
print("✅ PDF loaded successfully!")
print(f"📊 File size: {pdf_size_mb:.2f} MB")
print(f"📊 File size: {len(pdf_contents):,} bytes")


## Step 1: PDF to Images and Create Document

Convert PDF to images and create the ProcessedDocument object.


In [None]:
import base64
import io

# Convert PDF to images
images = await convert_pdf_to_images(pdf_contents)
print(f"Converted PDF to {len(images)} page images")

# Create ProcessedDocument
pdf_base64 = base64.b64encode(pdf_contents).decode('utf-8')

pages = []
for i, image in enumerate(images):
    page_num = i + 1
    
    # Convert PIL Image to base64
    buffered = io.BytesIO()
    image.save(buffered, format="PNG")
    img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
    
    page = ProcessedPage(page_number=page_num, img_base64=img_base64)
    pages.append(page)

document = ProcessedDocument(pdf_base64=pdf_base64, pages=pages)
print(f"Created document with {len(document.pages)} pages")


## Step 2: OCR - Extract Markdown

Run OCR on each page to extract markdown text.


In [None]:
# Run OCR on all pages
await extract_markdown_from_pages(document)

# Show sample OCR output
for page in document.pages[:2]:  # First 2 pages
    if page.ocr_markdown:
        print(f"\nPage {page.page_number} OCR (first 200 chars):")
        print(page.ocr_markdown[:200] + "...")

ocr_count = sum(1 for p in document.pages if p.ocr_markdown)
print(f"\nOCR completed on {ocr_count}/{len(document.pages)} pages")


## Step 3: Extract Metadata

Extract title and authors from the document.


In [None]:
# Extract metadata
await extract_metadata(document)

print(f"Title: {document.title}")
print(f"Authors: {document.authors}")


## Step 4: Extract Structure

Find headers and structural elements in the document.


In [None]:
# Extract structure
await extract_structure(document)

print(f"Found {len(document.headers)} headers")

# Show first few headers
for i, header in enumerate(document.headers):
    print(f"{i+1}. Level {header.level}: {header.text[:50]}...")


## Step 5: Format Headers

Convert headers to the appropriate Markdown heading levels (`#`, `##`, ...).


In [None]:
# Format headers
await format_headers(document)

print("Headers formatted to markdown levels")

# Show formatted headers
for i, header in enumerate(document.headers[:5]):
    level_indicator = "#" * header.level
    print(f"{i+1}. {level_indicator} {header.text[:50]}...")


## Step 6: Rewrite Sections

Final step - rewrite and clean up the content.


In [None]:
# Rewrite sections
await rewrite_sections(document)

if document.final_markdown:
    print(f"Final markdown generated ({len(document.final_markdown)} chars)")
    print(f"\nFirst 300 chars of final output:")
    print(document.final_markdown[:300] + "...")
    
    # Save final output
    output_file = OUTPUT_DIR / f"{NOTEBOOK_NAME}_final_output.md"
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(document.final_markdown)
    print(f"\nSaved final output to: {output_file}")
else:
    print("No final markdown generated")

print(f"\nPipeline complete! Document has:")
print(f"- Title: {document.title}")
print(f"- {len(document.pages)} pages")
print(f"- {len(document.headers)} headers")
print(f"- Final markdown: {'Yes' if document.final_markdown else 'No'}")
