# IMPORTS

In [1]:
import json
import logging
from pathlib import Path
import yaml
import logging
import time
import os
from glob import glob

from rich.console import Console
from rich.panel import Panel

In [2]:
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend

from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)

from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.pipeline.vlm_pipeline import VlmPipeline

from docling.datamodel.base_models import InputFormat
from docling.datamodel import vlm_model_specs
from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
    PdfPipelineOptions,
)

from docling_core.types.doc import ImageRefMode, PictureItem, TableItem

from docling_core.transforms.serializer.html import HTMLDocSerializer
from docling_core.transforms.serializer.markdown import MarkdownDocSerializer

from docling.chunking import HybridChunker


# LOGGER

In [3]:
_log = logging.getLogger(__name__)

# Helper Functions

##### Input Files

In [4]:
def get_single_input_file(folder_path, filename, extension=[".docx", ".pdf"]):
    """
    Fetch a single input file by name.
    
    Args:
        folder_path (str): Path to folder (e.g., "Data/DOCX Files").
        filename (str): File name without extension (e.g., "Sample-1").
        extension (list[str]): List of file extensions (default [".docx", ".pdf"]).

    Returns:
        str: Full path to the file if exists, else None.
    """
    for ext in extension:
        file_path = os.path.join(folder_path, filename + ext)
        if os.path.exists(file_path):
            return file_path
    return None

##### Output Files

In [5]:
def get_output_path(input_file, base_output="Output"):
    parent_folder = os.path.basename(os.path.dirname(input_file))
    file_stem = os.path.splitext(os.path.basename(input_file))[0]
    
    output_dir = Path(base_output) / parent_folder / file_stem
    output_dir.mkdir(parents=True, exist_ok=True)
    
    return output_dir


# Simple Conversion

In [6]:
def SimpleConversion():
    source = get_single_input_file("Data/PDF Files", "Master Approval Letter")

    converter = DocumentConverter()
    doc = converter.convert(source).document

    # print(doc.export_to_markdown())

    # Export to json
    json_output = json.dumps(doc.export_to_dict(), indent=4)
    print(json_output)

SimpleConversion()




{
    "schema_name": "DoclingDocument",
    "version": "1.5.0",
    "name": "Master Approval Letter",
    "origin": {
        "mimetype": "application/pdf",
        "binary_hash": 17509862723772035765,
        "filename": "Master Approval Letter.pdf"
    },
    "furniture": {
        "self_ref": "#/furniture",
        "children": [],
        "content_layer": "furniture",
        "name": "_root_",
        "label": "unspecified"
    },
    "body": {
        "self_ref": "#/body",
        "children": [
            {
                "$ref": "#/texts/0"
            },
            {
                "$ref": "#/texts/1"
            },
            {
                "$ref": "#/groups/0"
            },
            {
                "$ref": "#/groups/1"
            },
            {
                "$ref": "#/pictures/0"
            },
            {
                "$ref": "#/pictures/1"
            },
            {
                "$ref": "#/texts/9"
            },
            {
                "$r

# Multi-Format Conversion

In [7]:
def MultiFormat():
    input_paths = [
        get_single_input_file("Data/DOCX Files", "Sample-1"),
        get_single_input_file("Data/DOCX Files", "Sample-2")
    ]

    # Filter out None values (e.g., if file not found or user canceled selection)
    input_paths = [p for p in input_paths if p is not None]

    if not input_paths:
        raise FileNotFoundError("No valid input files were selected!")

    doc_converter = DocumentConverter(
        allowed_formats=[InputFormat.DOCX, InputFormat.PDF],
        format_options={
            InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline),
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline,
                backend=PyPdfiumDocumentBackend
            ),
        },
    )

    conv_results = doc_converter.convert_all(input_paths)

    for res in conv_results:
        file_ext = res.input.file.suffix.lower().lstrip('.')
        
        if file_ext == 'docx':
            base_output = "DOCX Files"
        elif file_ext == 'pdf':
            base_output = "PDF Files"
        else:
            base_output = "Output"

        out_path = get_output_path(res.input.file, base_output="Output/" + base_output)

        print(
            f"Document {res.input.file.name} converted."
            f"\nSaved markdown output to: {out_path!s}"
        )
        
        _log.debug(res.document._export_to_indented_text(max_text_len=16))
        
        json_path = out_path / f"{res.input.file.stem}.json"
        with json_path.open("w") as fp:
            fp.write(json.dumps(res.document.export_to_dict(), indent=4))

MultiFormat()

Document Sample-1.docx converted.
Saved markdown output to: Output\DOCX Files\Sample-1
Document Sample-2.docx converted.
Saved markdown output to: Output\DOCX Files\Sample-2


# Figure Export

In [8]:
def FigureExport():
    IMAGE_RESOLUTION_SCALE = 2.0
    logging.basicConfig(level=logging.INFO)

    input_doc_path = get_single_input_file("Data/PDF Files", "Master Approval Letter")
    output_dir = get_output_path(input_doc_path, base_output="Output")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
    # This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images.
    # scale=1 correspond of a standard 72 DPI image
    # The PdfPipelineOptions.generate_* are the selectors for the document elements which will be enriched
    # with the image field
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline) # Will try with PdfPipelineOptions()
        }
    )

    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem

    # Save page images
    for page_no, page in conv_res.document.pages.items():
        page_no = page.page_no
        page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
        with page_image_filename.open("wb") as fp:
            page.image.pil_image.save(fp, format="PNG")

    # Save images of figures and tables
    table_counter = 0
    picture_counter = 0
    for element, _level in conv_res.document.iterate_items():
        if isinstance(element, TableItem):
            table_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-table-{table_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")

        if isinstance(element, PictureItem):
            picture_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-picture-{picture_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")

    # Right now removed saving as REFERENCED as it was creating RECURSIVE FOLDER
    # https://docling-project.github.io/docling/examples/export_figures/
    
    # Save markdown with embedded pictures
    md_filename = output_dir / f"{doc_filename}-with-images.md"
    conv_res.document.save_as_markdown(
        md_filename,
        image_mode=ImageRefMode.EMBEDDED
    )

    # Save json with embedded pictures
    json_filename = output_dir / f"{doc_filename}-with-images.json"
    conv_res.document.save_as_json(
        json_filename,
        image_mode=ImageRefMode.EMBEDDED
    )

    # Save HTML with embedded pictures
    html_filename = output_dir / f"{doc_filename}-with-images.html"
    conv_res.document.save_as_html(
        html_filename,
        image_mode=ImageRefMode.EMBEDDED
    )

    end_time = time.time() - start_time

    _log.info(f"Document converted and figures exported in {end_time:.2f} seconds.")

FigureExport()

INFO:docling.datamodel.document:detected formats: [<InputFormat.PDF: 'pdf'>]
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash 45e823ad9aa4b6fa53c56667a4a8e97c
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.pipeline.base_pipeline:Processing document Master Approval Letter.pdf
INFO:docling.document_converter:Finished converting document Master Approval Letter.pdf in 44.81 sec.
INFO:__main__:Document converted and figures exported in 45.52 seconds.


# Multimodal Export

# Serialization

In [9]:
console = Console(width=210)  # for preventing Markdown table wrapped rendering
DOC_SOURCE = get_single_input_file("Data/DOCX Files", "Sample-3")

def print_in_console(text):
    console.print(Panel(text))

converter = DocumentConverter()
doc = converter.convert(source=DOC_SOURCE).document

# Serialize to HTML
# serializer = HTMLDocSerializer(doc=doc)

# Serialize to Markdown
serializer = MarkdownDocSerializer(doc=doc)
ser_result = serializer.serialize()
ser_text = ser_result.text

# we here only print an excerpt to keep the output brief:
print_in_console(ser_text)

INFO:docling.datamodel.document:detected formats: [<InputFormat.DOCX: 'docx'>]
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for SimplePipeline with options hash 4cc01982ae99b46a2a63fcda46c47c35
INFO:docling.pipeline.base_pipeline:Processing document Sample-3.docx


INFO:docling.document_converter:Finished converting document Sample-3.docx in 0.30 sec.


# Hybrid Chunking
### Can also be used with other Tokenizers like OpenAI, Hugging Face etc.
#### https://docling-project.github.io/docling/examples/advanced_chunking_and_serialization/

In [10]:
doc = DocumentConverter().convert(source=DOC_SOURCE).document

chunker = HybridChunker()
chunk_iter = chunker.chunk(dl_doc=doc)

for i, chunk in enumerate(chunk_iter):
    print(f"=== {i} ===")
    print(f"chunk.text:\n{f'{chunk.text[:300]}…'!r}")

    enriched_text = chunker.contextualize(chunk=chunk)
    print(f"chunker.contextualize(chunk):\n{f'{enriched_text[:300]}…'!r}")

    print()

INFO:docling.datamodel.document:detected formats: [<InputFormat.DOCX: 'docx'>]
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for SimplePipeline with options hash 4cc01982ae99b46a2a63fcda46c47c35
INFO:docling.pipeline.base_pipeline:Processing document Sample-3.docx


INFO:docling.document_converter:Finished converting document Sample-3.docx in 0.34 sec.


=== 0 ===
chunk.text:
'*Prepared by: Business Analytics Team\nDate: October 15, 2024*…'
chunker.contextualize(chunk):
'Quarterly Business Performance Analysis\nQ3 2024 Executive Summary\n*Prepared by: Business Analytics Team\nDate: October 15, 2024*…'

=== 1 ===
chunk.text:
'[Executive Summary](.)\n[Key Highlights](.)\n[Financial Performance](.)\n[Revenue Analysis](.)\n[Margin Analysis](.)\n[Market Analysis](.)\n[Geographic Expansion](.)\n[Competitive Landscape](.)\n[Operational Metrics](.)\n[Efficiency Indicators](.)\n[Quality Metrics](.)\n[Future Outlook](.)\n[Strategic Initiativ…'
chunker.contextualize(chunk):
'Quarterly Business Performance Analysis\nTable of Contents\n[Executive Summary](.)\n[Key Highlights](.)\n[Financial Performance](.)\n[Revenue Analysis](.)\n[Margin Analysis](.)\n[Market Analysis](.)\n[Geographic Expansion](.)\n[Competitive Landscape](.)\n[Operational Metrics](.)\n[Efficiency Indicators](.)\n[Qu…'

=== 2 ===
chunk.text:
'The third quarter of 2024 demonstrated 