# PDF Extraction with MarkItDown

This notebook demonstrates how to extract structured content from PDFs using the `PdfService`.

## Setup

In [None]:
import sys
from pathlib import Path
import json

# Add parent directory to path
sys.path.append(str(Path.cwd().parent))

from src.services.pdf_service import PdfService
from src.models.paper import Paper

from IPython.display import Markdown

## Example 1: Extract from Downloaded Paper

Let's load a paper from its metadata file and extract the PDF content.

In [None]:
# Initialize the PDF service
pdf_service = PdfService()

# Find metadata files in the downloads folder.
# Path.glob yields matches in arbitrary, filesystem-dependent order, so the
# list is sorted to make "the first paper" the same file on every run.
download_dir = Path("downloads")
metadata_files = sorted(download_dir.glob("*.json"))

if not metadata_files:
    print("No metadata files found in downloads folder!")
    print("Please download a paper first using the arxiv_example.ipynb notebook")
else:
    # Load the first paper's metadata (first by sorted path)
    metadata_path = metadata_files[0]

    # Decode as UTF-8 explicitly instead of relying on the platform default
    # encoding. NOTE(review): assumes the metadata was written as UTF-8/ASCII
    # JSON -- confirm against the code that writes these files.
    with open(metadata_path, 'r', encoding="utf-8") as f:
        metadata = json.load(f)

    paper = Paper.from_dict(metadata)

    print(f"Found {len(metadata_files)} paper(s) in downloads folder")
    print(f"Using: {paper.title}")
    print(f"PDF path: {paper.pdf_path}\n")

    # Extract content; the paper object (built from the metadata) is passed whole
    content = pdf_service.extract(paper)

    print(f"Content length: {len(content.markdown)} characters")

    # Render the first 2000 characters of the extracted markdown
    display(Markdown(content.markdown[:2000]))

## Example 2: Save Extracted Content

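Save the extracted markdown content to a text file. This cell reuses `pdf_service`, `paper`, and `metadata_files` from Example 1, so run that cell first.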

In [None]:
# Only proceed when the earlier search found at least one metadata file;
# `paper` is only defined in that case.
if metadata_files:
    # Run the extraction again so this cell produces its own `content`
    content = pdf_service.extract(paper)

    # Hand the content to the service's save(), passing the paper's PDF
    # filename and the "extracted_content" output directory
    saved_path = pdf_service.save(content, filename=paper.pdf_filename, output_dir="extracted_content")

    # Report where the output went and how much markdown was extracted
    print(f"Extracted and saved to: {saved_path}")
    markdown_length = len(content.markdown)
    print(f"Content length: {markdown_length} characters")