## Simple Code and Library Exploration

In [96]:
from docling.document_converter import DocumentConverter

# Create the converter (this is your main tool)
converter = DocumentConverter()

# Convert a PDF file (replace 'sample.pdf' with your PDF path)
result = converter.convert("SampleFiles/PO100000.pdf")

# Get the text content as markdown
markdown_content = result.document.export_to_markdown()

# See what we extracted!
print(markdown_content)

## Complete Contract: System Upgrade and Professional Services Agreement SYSTEM UPGRADE AND PROFESSIONAL SERVICES AGREEMENT

This System Upgrade and Professional Services Agreement (this "Agreement") is entered into by and between Jeddah Airport Operations , whose billing address is Unit 7647 Box 5938, DPO AE 37442 (hereinafter, "Customer"), and SkyBridge IT Solutions , a company with an office at [Vendor Address not provided in sources, would need to be added ] (hereinafter, "Vendor"), as of this [Day] day of [Month], 2025 (the "Effective Date").

WHEREAS , Customer desires to engage Vendor to provide services for the upgrade of an existing CCTV system to an IP-based solution; and WHEREAS , Vendor represents that it has the necessary expertise and capacity to perform such services; NOW, THEREFORE , in consideration of the mutual promises, covenants, and conditions contained herein, the parties do hereby agree as follows:

## 1. DEFINITIONS

- 1.1. "Agreement" refers to this System Upg

## Basic Info Extraction

In [95]:
doc = result.document

print(f"Pages: {len(doc.pages)}")
print(f"Pages content: {doc.pages}")
print(f"Text length: {len(doc.export_to_text())}")
print(f"Tables: {len(doc.tables)}")

Parameter `strict_text` has been deprecated and will be ignored.


Pages: 0
Pages content: {}
Text length: 14293
Tables: 0


In [97]:
doc = result.document

# print(doc.pages)

basic_info = {
    "page_count": len(doc.pages),
    "text_length": len(doc.export_to_text()),
    "has_tables": len(doc.tables),
    "has_pictures": len(doc.pictures),

    # Content structure
    "total_text_elements": len(doc.texts),
    "section_headers": len([t for t in doc.texts if t.label.value == 'section_header']),
    "paragraphs": len([t for t in doc.texts if t.label.value == 'text']),
    "list_items": len([t for t in doc.texts if t.label.value == 'list_item']),
    "has_code": len([t for t in doc.texts if t.label.value == 'code']) > 0,

    # Complexity indicators
    # "content_density": len(doc.export_to_text()) // len(doc.pages),  # chars per page
}

print(basic_info)

Parameter `strict_text` has been deprecated and will be ignored.


{'page_count': 6, 'text_length': 14244, 'has_tables': 0, 'has_pictures': 0, 'total_text_elements': 73, 'section_headers': 20, 'paragraphs': 12, 'list_items': 41, 'has_code': False}


## Metadata Extraction

In [98]:
metadata_attrs = ['name', 'title', 'author', 'subject', 'creator', 'producer',
                 'creation_date', 'modification_date', 'origin', 'version']

print("Available metadata:")
available_metadata = {}
for attr in metadata_attrs:
    value = getattr(doc, attr, None)
    if value and str(value).strip():
        available_metadata[attr] = value
        print(f"  {attr}: {value}")
    else:
        print(f"  {attr}: None")

# Separate dictionaries
content_info = {
    # Your existing content analysis
}

metadata_info = available_metadata  # Only populated fields

Available metadata:
  name: PO100000
  title: None
  author: None
  subject: None
  creator: None
  producer: None
  creation_date: None
  modification_date: None
  origin: mimetype='application/pdf' binary_hash=7178378074092617384 filename='PO100000.pdf' uri=None
  version: 1.5.0


## Extract the header titles

In [99]:
# Extract all section headers
headers = []
for text in doc.texts:
    if text.label.value == 'section_header':
        header_info = {
            "text": text.text,
            "level": getattr(text, 'level', 1),  # Header level (1=# 2=## etc.)
            "page": text.prov[0].page_no if text.prov else None
        }
        headers.append(header_info)

print("Document Headers:")
for i, header in enumerate(headers):
    print(f"{i+1}. Level {header['level']}: {header['text']} (page {header['page']})")

Document Headers:
1. Level 1: Complete Contract: System Upgrade and Professional Services Agreement SYSTEM UPGRADE AND PROFESSIONAL SERVICES AGREEMENT (page 1)
2. Level 1: 1. DEFINITIONS (page 1)
3. Level 1: 2. SCOPE OF SERVICES AND DELIVERABLES (page 1)
4. Level 1: 3. TERM OF AGREEMENT (page 2)
5. Level 1: 4. CHARGES AND PAYMENT (page 2)
6. Level 1: 5. DELIVERY AND ACCEPTANCE (page 2)
7. Level 1: 6. VENDOR'S OBLIGATIONS (page 2)
8. Level 1: 7. CUSTOMER'S OBLIGATIONS (page 3)
9. Level 1: 8. WARRANTIES (page 3)
10. Level 1: 9. LIABILITY (page 3)
11. Level 1: 10. INDEMNIFICATION (page 4)
12. Level 1: 11. CONFIDENTIALITY (page 4)
13. Level 1: 12. FORCE MAJEURE (page 4)
14. Level 1: 13. ASSIGNMENT (page 5)
15. Level 1: 14. NOTICES (page 5)
16. Level 1: 15. ENTIRE AGREEMENT (page 5)
17. Level 1: 16. GOVERNING LAW AND JURISDICTION (page 5)
18. Level 1: 17. MISCELLANEOUS (page 6)
19. Level 1: FOR JEDDAH AIRPORT OPERATIONS (CUSTOMER): (page 6)
20. Level 1: FOR SKYBRIDGE IT SOLUTIONS (VENDOR): 

## Remap the Parent & Child relationship

In [104]:
def build_semantic_hierarchy(doc):
    current_section = None
    hierarchy = {}

    for text in doc.texts:
        if text.label.value == 'section_header':
            current_section = text.text
            hierarchy[current_section] = []
        elif current_section and text.label.value in ['text', 'list_item']:
            hierarchy[current_section].append(text.text)

    return hierarchy

In [105]:
hierarchy = build_semantic_hierarchy(doc)

# Print the structure
print("DOCUMENT HIERARCHY:")
print("="*50)
for section, content in hierarchy.items():
    print(f"\n📋 SECTION: {section}")
    print(f"   Content items: {len(content)}")

    # Show first 2 items as sample
    for i, item in enumerate(content[:2]):
        print(f"   {i+1}. {item[:100]}...")

    if len(content) > 2:
        print(f"   ... and {len(content) - 2} more items")

DOCUMENT HIERARCHY:

📋 SECTION: Complete Contract: System Upgrade and Professional Services Agreement SYSTEM UPGRADE AND PROFESSIONAL SERVICES AGREEMENT
   Content items: 2
   1. This System Upgrade and Professional Services Agreement (this "Agreement") is entered into by and be...
   2. WHEREAS , Customer desires to engage Vendor to provide services for the upgrade of an existing CCTV ...

📋 SECTION: 1. DEFINITIONS
   Content items: 7
   1. 1.1. "Agreement" refers to this System Upgrade and Professional Services Agreement, including any ex...
   2. 1.2. "Customer" means Jeddah Airport Operations [Customized from PO100000]....
   ... and 5 more items

📋 SECTION: 2. SCOPE OF SERVICES AND DELIVERABLES
   Content items: 2
   1. 2.1. Vendor shall provide the Customer with the following Services and deliver the specified Equipme...
   2. [Customized from PO100000]. * Installation, configuration, and testing of the entire system [Customi...

📋 SECTION: 3. TERM OF AGREEMENT
   Content items: 

## Export after Semantic Hierarchy

In [106]:
def export_to_semantic_markdown(doc):
    """Export with proper semantic structure"""
    hierarchy = build_semantic_hierarchy(doc)
    markdown = ""

    for section, content in hierarchy.items():
        # Add section header
        markdown += f"# {section}\n\n"

        # Add section content
        for item in content:
            markdown += f"{item}\n\n"

        markdown += "---\n\n"  # Section separator

    return markdown

def export_to_semantic_dict(doc):
    """Export as structured dictionary"""
    hierarchy = build_semantic_hierarchy(doc)

    return {
        "document_structure": hierarchy,
        "metadata": {
            "total_sections": len(hierarchy),
            "content_distribution": {section: len(content) for section, content in hierarchy.items()}
        }
    }

In [108]:
# Instead of doc.export_to_markdown()
semantic_markdown = export_to_semantic_markdown(doc)

# Instead of doc.export_to_dict()
semantic_dict = export_to_semantic_dict(doc)

print("First 500 chars of semantic markdown:")
print(semantic_markdown)

First 500 chars of semantic markdown:
# Complete Contract: System Upgrade and Professional Services Agreement SYSTEM UPGRADE AND PROFESSIONAL SERVICES AGREEMENT

This System Upgrade and Professional Services Agreement (this "Agreement") is entered into by and between Jeddah Airport Operations , whose billing address is Unit 7647 Box 5938, DPO AE 37442 (hereinafter, "Customer"), and SkyBridge IT Solutions , a company with an office at [Vendor Address not provided in sources, would need to be added ] (hereinafter, "Vendor"), as of this [Day] day of [Month], 2025 (the "Effective Date").

WHEREAS , Customer desires to engage Vendor to provide services for the upgrade of an existing CCTV system to an IP-based solution; and WHEREAS , Vendor represents that it has the necessary expertise and capacity to perform such services; NOW, THEREFORE , in consideration of the mutual promises, covenants, and conditions contained herein, the parties do hereby agree as follows:

---

# 1. DEFINITIONS

1.1