In [1]:
from docling_core.transforms.serializer.markdown import MarkdownDocSerializer, MarkdownPictureSerializer
from docling_core.transforms.serializer.base import BaseDocSerializer
from docling_core.types.doc.base import ImageRefMode
from docling_core.types.doc.document import PictureClassificationData, PictureDescriptionData, PictureItem, PictureMoleculeData

## Example doc 1

In [2]:
from pathlib import Path
from docling_core.types.doc.document import DoclingDocument


# Load the first example document from dummy_doc.yaml under ../test/data/doc.
dummy_doc_path = Path("..", "test", "data", "doc", "dummy_doc.yaml")
doc = DoclingDocument.load_from_yaml(dummy_doc_path)

### 1a: Default serializer

The default Markdown serializer opts for a caption-based serialization of pictures:

In [3]:
# Only the document is passed, so every other serializer setting keeps its default.
serializer = MarkdownDocSerializer(doc=doc)
result = serializer.serialize()
text = result.text
print(text)

# DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis

Figure 1: Four examples of complex page layouts across different document categories

<!-- image -->


### 1b: Custom serializer

We can instead plug in a custom picture serializer that produces an annotation-based serialization:

In [4]:
from typing import Optional
from docling_core.transforms.serializer.base import SerializationResult


class AnnotationMarkdownPictureSerializer(MarkdownPictureSerializer):
    """Markdown picture serializer that emits a picture's annotations followed by its image part."""

    def serialize(
        self,
        *,
        item: PictureItem,
        doc: DoclingDocument,
        doc_serializer: BaseDocSerializer,
        image_mode: Optional[ImageRefMode] = None,
        image_placeholder: Optional[str] = None,
        **kwargs,
    ) -> SerializationResult:
        """Serialize one picture as annotation text plus its image part.

        Args:
            item: Picture whose ``annotations`` are rendered.
            doc: Document forwarded to ``_serialize_image_part``.
            doc_serializer: Serializer whose ``post_process`` is applied to the
                joined annotation text.
            image_mode: Image reference mode; ``ImageRefMode.PLACEHOLDER`` is
                used when ``None``.
            image_placeholder: Placeholder text; ``"<!-- image -->"`` is used
                when ``None``.
            **kwargs: Accepted but not used by this implementation.

        Returns:
            A ``SerializationResult`` whose text consists of the non-empty
            annotation and image parts, separated by a blank line.
        """
        resolved_mode = ImageRefMode.PLACEHOLDER if image_mode is None else image_mode
        resolved_placeholder = (
            "<!-- image -->" if image_placeholder is None else image_placeholder
        )

        # Translate each supported annotation type into one line of text.
        annotation_lines: list[str] = []
        for ann in item.annotations:
            if isinstance(ann, PictureClassificationData):
                # Only the first predicted class is reported; skip if there is none.
                if not ann.predicted_classes:
                    continue
                top_class = ann.predicted_classes[0].class_name
                if top_class is not None:
                    annotation_lines.append(f"Picture type: {top_class}")
            elif isinstance(ann, PictureMoleculeData):
                annotation_lines.append(f"SMILES: {ann.smi}")
            elif isinstance(ann, PictureDescriptionData):
                annotation_lines.append(f"Description: {ann.text}")

        # The annotation text is post-processed before the image part is produced.
        annotation_block = doc_serializer.post_process(
            text="\n\n".join(annotation_lines)
        )
        image_block = self._serialize_image_part(
            item=item,
            doc=doc,
            image_mode=resolved_mode,
            image_placeholder=resolved_placeholder,
        ).text

        # Annotations come first, then the image; empty parts are dropped.
        sections = [block for block in (annotation_block, image_block) if block]
        return SerializationResult(text="\n\n".join(sections))

In [5]:
# Plug in the annotation-based picture serializer defined above.
picture_serializer = AnnotationMarkdownPictureSerializer()
serializer = MarkdownDocSerializer(
    doc=doc,
    picture_serializer=picture_serializer,
    include_formatting=True,
    include_hyperlinks=True,
)
result = serializer.serialize()
text = result.text
print(text)


# DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis

Picture type: illustration

Description: ...

SMILES: ...

<!-- image -->


## Example doc 2

We now look at another document containing various item types, some of which also carry formatting information.

In [6]:
from examples.utils import construct_test_doc


# Rebinds `doc` to the second example document; the cells below serialize this one.
doc = construct_test_doc()

### 2a: Default serializer

The Markdown serializer by default ignores formatting:

In [7]:
# Baseline run on the second document: only `doc` is passed.
serializer = MarkdownDocSerializer(doc=doc)
serialized = serializer.serialize()
text = serialized.text
print(text)

# Title of the Document

## 1. Introduction

This is the caption of table 1.

| Product   |   Years |   Years |
|-----------|---------|---------|
| Product   |    2016 |    2017 |
| Apple     |   49823 |  695944 |

- item 1 of neighboring list
- item 2 of neighboring list
    - item 1 of sub list
    - Here a code snippet: `<p>Hello world</p>` (to be displayed inline)
    - Here a formula: $E=mc^2$ (to be displayed inline)

Here a code block:

```
print("Hello world")
```

Here a formula block:

$$E=mc^2$$

<!-- missing-key-value-item -->

<!-- missing-form-item -->

Some formatting chops: **bold** *italic* underline ~~strikethrough~~ [hyperlink](.) &amp; [~~***everything at the same time.***~~](https://github.com/DS4SD/docling)


### 2b: Reconfigured serializer

We can instead reconfigure the serializer, e.g. restricting the serialized labels via the `labels` parameter so that tables are excluded:

In [8]:
from docling_core.transforms.serializer.common import _DEFAULT_LABELS
from docling_core.types.doc.document import DOCUMENT_TOKENS_EXPORT_LABELS, ContentLayer
from docling_core.types.doc.labels import DocItemLabel


# Serialize with the DOCUMENT_TOKENS_EXPORT_LABELS set minus the table label.
labels_without_tables = DOCUMENT_TOKENS_EXPORT_LABELS - {DocItemLabel.TABLE}
serializer = MarkdownDocSerializer(doc=doc, labels=labels_without_tables)
result = serializer.serialize()
text = result.text
print(text)

# Title of the Document

## 1. Introduction

This is the caption of table 1.

- item 1 of neighboring list
- item 2 of neighboring list
    - item 1 of sub list
    - Here a code snippet: `<p>Hello world</p>` (to be displayed inline)
    - Here a formula: $E=mc^2$ (to be displayed inline)

Here a code block:

```
print("Hello world")
```

Here a formula block:

$$E=mc^2$$

<!-- missing-key-value-item -->

<!-- missing-form-item -->

Some formatting chops: **bold** *italic* underline ~~strikethrough~~ [hyperlink](.) &amp; [~~***everything at the same time.***~~](https://github.com/DS4SD/docling)


In [9]:
from docling_core.transforms.serializer.markdown import MarkdownListSerializer


# Swap in a list serializer configured with indent=2.
list_serializer = MarkdownListSerializer(indent=2)
serializer = MarkdownDocSerializer(doc=doc, list_serializer=list_serializer)
result = serializer.serialize()
text = result.text
print(text)

# Title of the Document

## 1. Introduction

This is the caption of table 1.

| Product   |   Years |   Years |
|-----------|---------|---------|
| Product   |    2016 |    2017 |
| Apple     |   49823 |  695944 |

- item 1 of neighboring list
- item 2 of neighboring list
  - item 1 of sub list
  - Here a code snippet: `<p>Hello world</p>` (to be displayed inline)
  - Here a formula: $E=mc^2$ (to be displayed inline)

Here a code block:

```
print("Hello world")
```

Here a formula block:

$$E=mc^2$$

<!-- missing-key-value-item -->

<!-- missing-form-item -->

Some formatting chops: **bold** *italic* underline ~~strikethrough~~ [hyperlink](.) &amp; [~~***everything at the same time.***~~](https://github.com/DS4SD/docling)


In [10]:
# Rebinds `doc` to a document loaded from a JSON file in the working directory.
report_json_path = Path("2408.09869v5.json")
doc = DoclingDocument.load_from_json(report_json_path)

In [11]:
# Pass `pages=[1]` to the serializer.
# NOTE(review): presumably limits serialization to the listed page numbers — confirm.
selected_pages = [1]
serializer = MarkdownDocSerializer(doc=doc, pages=selected_pages)
result = serializer.serialize()
text = result.text
print(text)

<!-- image -->

## Docling Technical Report

Version 1.0

Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar

AI4K Group, IBM Research R¨ uschlikon, Switzerland

## Abstract

This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models.

## 1 Introduction

Converting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variabi

In [12]:
# `start`/`stop` and the image mode are set explicitly for this run.
# NOTE(review): start/stop presumably bound the range of serialized items — confirm.
serializer_options = dict(start=1, stop=10, image_mode=ImageRefMode.PLACEHOLDER)
serializer = MarkdownDocSerializer(doc=doc, **serializer_options)
result = serializer.serialize()
text = result.text
print(text)

<!-- image -->

## Docling Technical Report

Version 1.0

Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar

AI4K Group, IBM Research R¨ uschlikon, Switzerland

## Abstract

This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models.


In [13]:
# Serialize the document with only `doc` passed (no `pages` restriction).
serializer = MarkdownDocSerializer(doc=doc)
result = serializer.serialize()
text = result.text
print(text)

<!-- image -->

## Docling Technical Report

Version 1.0

Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar

AI4K Group, IBM Research R¨ uschlikon, Switzerland

## Abstract

This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models.

## 1 Introduction

Converting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variabi