In [None]:
# %% ULTRA-SIMPLIFIED: Let Docling do EVERYTHING
from docling.document_converter import DocumentConverter
from pathlib import Path

def _format_caption_coverage(with_captions: int, total: int) -> str:
    """Render ``with_captions/total (pct%)``, or ``0/0 (n/a)`` when total is 0."""
    if total == 0:
        # A document without pictures/tables has no meaningful percentage.
        return "0/0 (n/a)"
    return f"{with_captions}/{total} ({with_captions / total * 100:.1f}%)"


def parse_pdf_simple(pdf_path: Path, output_dir: Path):
    """
    Convert a PDF with Docling's default pipeline and save Markdown + JSON.

    Writes ``<stem>.md`` and ``<stem>.json`` into ``output_dir`` (created if
    missing), then prints item counts and caption coverage for pictures and
    tables.

    Args:
        pdf_path: Path of the PDF to convert.
        output_dir: Directory that receives the exported files.

    Returns:
        The object returned by ``DocumentConverter().convert(...)``.
    """
    # Local import: the cell that defines this function never imports json,
    # which made the original fail with NameError after the conversion.
    import json

    # Create the target directory before the (potentially very long)
    # conversion so a bad output path cannot waste the whole run.
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Converting {pdf_path.name}...")
    converter = DocumentConverter()
    result = converter.convert(str(pdf_path))
    doc = result.document

    # Export to Markdown
    md_path = output_dir / f"{pdf_path.stem}.md"
    with open(md_path, 'w', encoding='utf-8') as f:
        f.write(doc.export_to_markdown())

    print(f"✓ Saved to: {md_path}")

    # Also save as JSON for structured access
    json_path = output_dir / f"{pdf_path.stem}.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(doc.export_to_dict(), f, indent=2)

    print(f"✓ Saved to: {json_path}")

    # Stats
    n_pictures = len(doc.pictures)
    n_tables = len(doc.tables)
    print("\nStatistics:")
    print(f"  Pictures: {n_pictures}")
    print(f"  Tables: {n_tables}")
    print(f"  Text items: {len(doc.texts)}")

    # Count items whose caption text is non-empty
    pics_with_captions = sum(1 for pic in doc.pictures if pic.caption_text(doc=doc))
    tables_with_captions = sum(1 for tbl in doc.tables if tbl.caption_text(doc=doc))

    print("\nCaptions:")
    print(f"  Pictures with captions: {_format_caption_coverage(pics_with_captions, n_pictures)}")
    print(f"  Tables with captions: {_format_caption_coverage(tables_with_captions, n_tables)}")

    return result

# Convert the Financial Toolbox PDF and keep the conversion result around
# for later inspection.
result = parse_pdf_simple(Path("./data/fintbx.pdf"), Path("./data/outputs/simple"))


2025-10-17 18:10:24,278 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-17 18:10:24,366 - INFO - Going to convert document batch...
2025-10-17 18:10:24,368 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 4f2edc0f7d9bb60b38ebfecf9a2609f5
2025-10-17 18:10:24,378 - INFO - Loading plugin 'docling_defaults'
2025-10-17 18:10:24,380 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-10-17 18:10:24,391 - INFO - Loading plugin 'docling_defaults'
2025-10-17 18:10:24,394 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']


Converting fintbx.pdf...


2025-10-17 18:10:25,289 - INFO - Auto OCR model selected ocrmac.
2025-10-17 18:10:25,318 - INFO - Accelerator device: 'mps'
2025-10-17 18:10:26,927 - INFO - Accelerator device: 'mps'
2025-10-17 18:10:27,669 - INFO - Processing document fintbx.pdf
2025-10-17 18:48:33,408 - INFO - Finished converting document fintbx.pdf in 2289.09 sec.


✓ Saved to: /Users/RiyanshiKedia/Documents/GitHub/project-aurelia/DATA/outputs/simple/fintbx.md


NameError: name 'json' is not defined

In [None]:
# %% PHASE A FINAL - Ready for Phase B Chunking
# Converts one PDF with Docling, flattens the document items into a list of
# block dicts (type, text, page, bbox, section path, ...), back-fills missing
# figure/table captions from a nearby heading, and writes one JSON object per
# line to docling_blocks.jsonl.
from docling.document_converter import DocumentConverter
from docling_core.types.doc import TextItem, TableItem, PictureItem
from pathlib import Path
import json

# Parse
print("="*60)
print("PHASE A: PARSE & NORMALIZE")
print("="*60)

pdf_path = Path("/Users/RiyanshiKedia/Documents/GitHub/project-aurelia/DATA/fintbx_ex.pdf")
output_dir = Path("../DATA/outputs/phase_a/artifacts")
output_dir.mkdir(parents=True, exist_ok=True)
blocks_path = output_dir / "docling_blocks.jsonl"

# `ingest_plan` is not defined in this cell; it has to exist in the kernel
# already. Look it up once, before the conversion, so a missing plan fails
# immediately instead of after the document has been converted.
# NOTE(review): confirm the plan's hash refers to the same file as pdf_path.
pdf_hash = ingest_plan["pdf_info"]["pdf_hash"]

# Convert
converter = DocumentConverter()
result = converter.convert(str(pdf_path))
doc = result.document

# Extract blocks
blocks = []
block_id = 0
section_stack = []  # heading texts forming the current section path

for item, level in doc.iterate_items():

    # Metadata: taken from the first provenance entry only, when present
    page, bbox = None, None
    if hasattr(item, 'prov') and item.prov:
        prov = item.prov[0]
        page = getattr(prov, 'page_no', None)
        if hasattr(prov, 'bbox'):
            b = prov.bbox
            bbox = {"x0": float(b.l), "y0": float(b.t), "x1": float(b.r), "y1": float(b.b)}

    # Type and text; items that are not picture/table/text are skipped
    if isinstance(item, PictureItem):
        item_type, text = 'figure', '[Figure]'
    elif isinstance(item, TableItem):
        item_type, text = 'table', item.export_to_markdown(doc=doc)
    elif isinstance(item, TextItem):
        # Classify by substring match on the lower-cased label string
        label = str(item.label).lower() if hasattr(item, 'label') else ''
        item_type = ('heading' if 'section' in label or 'title' in label else
                    'code' if 'code' in label else
                    'equation' if 'formula' in label else
                    'list' if 'list' in label else 'paragraph')
        text = item.text or ""
    else:
        continue

    # Build block. section_path is computed before a heading updates the
    # stack, so a heading is filed under the section that precedes it.
    block = {
        "block_id": block_id,
        "type": item_type,
        "text": text,
        "page": page,
        "bbox": bbox,
        "section_path": " > ".join(section_stack) if section_stack else "root",
        "pdf_hash": pdf_hash
    }

    # Type-specific fields
    if item_type == "heading":
        # `level` is the second value yielded by doc.iterate_items().
        # NOTE(review): confirm it is the heading depth and not the item's
        # nesting depth in the document tree.
        block["heading_level"] = level
        # Headings of 3 characters or fewer do not change the section path
        if text and len(text) > 3:
            if level <= len(section_stack):
                section_stack = section_stack[:level-1]
            section_stack.append(text)
    elif item_type == "figure":
        block["caption"] = item.caption_text(doc=doc)
        block["figure_id"] = str(item.self_ref)
    elif item_type == "table":
        block["caption"] = item.caption_text(doc=doc)
        block["table_id"] = str(item.self_ref)
    elif item_type == "code":
        block["code_language"] = getattr(item, 'code_language', 'unknown')
    elif item_type == "equation":
        block["latex"] = getattr(item, 'latex', None)

    blocks.append(block)
    block_id += 1

# Augment missing captions: for a figure/table without a caption, use the text
# of the closest preceding heading on the same page.
for i, block in enumerate(blocks):
    if block['type'] not in ['figure', 'table'] or block.get('caption'):
        continue
    # The range stop is exclusive, so at most the 19 preceding blocks are scanned
    for j in range(i-1, max(i-20, -1), -1):
        prev = blocks[j]
        if prev['page'] != block['page']:
            continue
        if prev['type'] == 'heading':
            block['caption'] = prev['text']
            break

# Save JSONL. ensure_ascii=False emits raw non-ASCII characters, so the file
# encoding is pinned to UTF-8 instead of relying on the platform default.
with open(blocks_path, 'w', encoding='utf-8') as f:
    for block in blocks:
        f.write(json.dumps(block, ensure_ascii=False) + '\n')

# Stats
fig_tab = [b for b in blocks if b['type'] in ['figure', 'table']]
with_caps = [b for b in fig_tab if b.get('caption')]
# Guard the percentage: a document without figures/tables would divide by zero
caption_pct = f"{len(with_caps)/len(fig_tab)*100:.1f}%" if fig_tab else "n/a"

print(f"\n✓ Extracted {len(blocks)} blocks")
print(f"✓ Captions: {len(with_caps)}/{len(fig_tab)} ({caption_pct})")
print(f"✓ Saved: {blocks_path}")
print("\nREADY FOR PHASE B: CHUNKING EXPERIMENTS")

✓ Converted to JSON: /Users/RiyanshiKedia/Documents/GitHub/project-aurelia/DATA/outputs/simple/fintbx_structured.json
  Sections: 10203
  Total words: 813913


In [None]:
# %% PHASE A FINAL - Ready for Phase B Chunking
# Converts one PDF with Docling, flattens the document items into a list of
# block dicts (type, text, page, bbox, section path, ...), back-fills missing
# figure/table captions from a nearby heading, and writes one JSON object per
# line to docling_blocks.jsonl.
# NOTE(review): this cell repeats the extraction logic of the preceding
# "PHASE A" cell with different paths; consider one shared function.
from docling.document_converter import DocumentConverter
from docling_core.types.doc import TextItem, TableItem, PictureItem
from pathlib import Path
import json

# Parse
print("="*60)
print("PHASE A: PARSE & NORMALIZE")
print("="*60)

pdf_path = Path("/Users/RiyanshiKedia/Documents/GitHub/project-aurelia/DATA/fintbx.pdf")
output_dir = Path("outputs/phase_a/artifacts")
output_dir.mkdir(parents=True, exist_ok=True)
blocks_path = output_dir / "docling_blocks.jsonl"

# `ingest_plan` is not defined in this cell; it has to exist in the kernel
# already. Look it up once, before the conversion, so a missing plan fails
# immediately instead of after the document has been converted.
# NOTE(review): confirm the plan's hash refers to the same file as pdf_path.
pdf_hash = ingest_plan["pdf_info"]["pdf_hash"]

# Convert
converter = DocumentConverter()
result = converter.convert(str(pdf_path))
doc = result.document

# Extract blocks
blocks = []
block_id = 0
section_stack = []  # heading texts forming the current section path

for item, level in doc.iterate_items():

    # Metadata: taken from the first provenance entry only, when present
    page, bbox = None, None
    if hasattr(item, 'prov') and item.prov:
        prov = item.prov[0]
        page = getattr(prov, 'page_no', None)
        if hasattr(prov, 'bbox'):
            b = prov.bbox
            bbox = {"x0": float(b.l), "y0": float(b.t), "x1": float(b.r), "y1": float(b.b)}

    # Type and text; items that are not picture/table/text are skipped
    if isinstance(item, PictureItem):
        item_type, text = 'figure', '[Figure]'
    elif isinstance(item, TableItem):
        item_type, text = 'table', item.export_to_markdown(doc=doc)
    elif isinstance(item, TextItem):
        # Classify by substring match on the lower-cased label string
        label = str(item.label).lower() if hasattr(item, 'label') else ''
        item_type = ('heading' if 'section' in label or 'title' in label else
                    'code' if 'code' in label else
                    'equation' if 'formula' in label else
                    'list' if 'list' in label else 'paragraph')
        text = item.text or ""
    else:
        continue

    # Build block. section_path is computed before a heading updates the
    # stack, so a heading is filed under the section that precedes it.
    block = {
        "block_id": block_id,
        "type": item_type,
        "text": text,
        "page": page,
        "bbox": bbox,
        "section_path": " > ".join(section_stack) if section_stack else "root",
        "pdf_hash": pdf_hash
    }

    # Type-specific fields
    if item_type == "heading":
        # `level` is the second value yielded by doc.iterate_items().
        # NOTE(review): confirm it is the heading depth and not the item's
        # nesting depth in the document tree.
        block["heading_level"] = level
        # Headings of 3 characters or fewer do not change the section path
        if text and len(text) > 3:
            if level <= len(section_stack):
                section_stack = section_stack[:level-1]
            section_stack.append(text)
    elif item_type == "figure":
        block["caption"] = item.caption_text(doc=doc)
        block["figure_id"] = str(item.self_ref)
    elif item_type == "table":
        block["caption"] = item.caption_text(doc=doc)
        block["table_id"] = str(item.self_ref)
    elif item_type == "code":
        block["code_language"] = getattr(item, 'code_language', 'unknown')
    elif item_type == "equation":
        block["latex"] = getattr(item, 'latex', None)

    blocks.append(block)
    block_id += 1

# Augment missing captions: for a figure/table without a caption, use the text
# of the closest preceding heading on the same page.
for i, block in enumerate(blocks):
    if block['type'] not in ['figure', 'table'] or block.get('caption'):
        continue
    # The range stop is exclusive, so at most the 19 preceding blocks are scanned
    for j in range(i-1, max(i-20, -1), -1):
        prev = blocks[j]
        if prev['page'] != block['page']:
            continue
        if prev['type'] == 'heading':
            block['caption'] = prev['text']
            break

# Save JSONL. ensure_ascii=False emits raw non-ASCII characters, so the file
# encoding is pinned to UTF-8 instead of relying on the platform default.
with open(blocks_path, 'w', encoding='utf-8') as f:
    for block in blocks:
        f.write(json.dumps(block, ensure_ascii=False) + '\n')

# Stats
fig_tab = [b for b in blocks if b['type'] in ['figure', 'table']]
with_caps = [b for b in fig_tab if b.get('caption')]
# Guard the percentage: a document without figures/tables would divide by zero
caption_pct = f"{len(with_caps)/len(fig_tab)*100:.1f}%" if fig_tab else "n/a"

print(f"\n✓ Extracted {len(blocks)} blocks")
print(f"✓ Captions: {len(with_caps)}/{len(fig_tab)} ({caption_pct})")
print(f"✓ Saved: {blocks_path}")
print("\nREADY FOR PHASE B: CHUNKING EXPERIMENTS")

2025-10-17 19:19:12,548 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-17 19:19:12,621 - INFO - Going to convert document batch...
2025-10-17 19:19:12,627 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 4f2edc0f7d9bb60b38ebfecf9a2609f5
2025-10-17 19:19:12,642 - INFO - Auto OCR model selected ocrmac.
2025-10-17 19:19:12,643 - INFO - Accelerator device: 'mps'


Converting fintbx_ex.pdf...


2025-10-17 19:19:13,893 - INFO - Accelerator device: 'mps'
2025-10-17 19:19:14,389 - INFO - Processing document fintbx_ex.pdf
2025-10-17 19:19:38,790 - INFO - Finished converting document fintbx_ex.pdf in 26.25 sec.


Extracting blocks with metadata...
✓ Extracted 907 blocks
✓ Saved JSONL: outputs/phase_a/artifacts/docling_blocks.jsonl

STATISTICS:
Total blocks: 907
Figures: 8
Tables: 13
Captions: 1/21 (4.8%)

Sample blocks with metadata:

  Block 0:
    Type: heading
    Page: 1
    BBox: {'x0': 54.0, 'y0': 748.764, 'x1': 266.256, 'y1': 686.778}
    Section: root
    Text: Financial Toolbox™ User's Guide...

  Block 1:
    Type: figure
    Page: 1
    BBox: {'x0': 112.21453857421875, 'y0': 595.7596588134766, 'x1': 494.5715026855469, 'y1': 257.2275390625}
    Section: Financial Toolbox™ User's Guide
    Caption: 
    Text: [Figure]...

  Block 2:
    Type: heading
    Page: 1
    BBox: {'x0': 143.50362630178907, 'y0': 193.67892024425078, 'x1': 486.9023893226224, 'y1': 119.43735835460234}
    Section: Financial Toolbox™ User's Guide
    Text: MATLAB®...

  Block 3:
    Type: figure
    Page: 1
    BBox: {'x0': 436.68817138671875, 'y0': 66.8349609375, 'x1': 584.9988403320312, 'y1': 36.52667236328125}
