# 1. Setup Asyncio

In [1]:
import nest_asyncio

# Patch asyncio so an already-running event loop can be re-entered.
# Jupyter runs its own loop, and libraries that start another one inside it
# would otherwise fail with "This event loop is already running".
nest_asyncio.apply()

# 2. Setup the Qdrant vector database

In [2]:
import qdrant_client

# Name of the Qdrant collection used by the vector store further down.
collection_name = "chat_with_docs_chonkie"

# Client for a Qdrant server addressed as localhost:6333.
# NOTE(review): assumes a Qdrant instance is already listening there — confirm before running.
client = qdrant_client.QdrantClient(
    host="localhost",
    port=6333,
)



# 3. Read the documents

In [42]:
from llama_index.core import SimpleDirectoryReader

# Folder scanned for input files (relative to the notebook's working directory).
input_dir_path = "./docs"

# Restrict loading to .pdf files; recursive=True also descends into sub-folders.
loader = SimpleDirectoryReader(
    input_dir=input_dir_path,
    required_exts=[".pdf"],
    recursive=True
)

# Judging by the `page_label` metadata in the output below, this appears to
# yield one Document per PDF page.
docs = loader.load_data()

In [43]:
docs

[Document(id_='34662115-a2bd-4594-8f6a-7b7affa99e20', embedding=None, metadata={'page_label': '1', 'file_name': 'Rational AI - Full Deck.pdf', 'file_path': '/Users/fc/experiments/rag-project/docs/Rational AI - Full Deck.pdf', 'file_type': 'application/pdf', 'file_size': 7979133, 'creation_date': '2025-06-13', 'last_modified_date': '2025-06-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Your Data,Your AI\nEnsuring a safe LLM adoption with Rational AI', path=None, url=None, mimetype=None), image_resource=None, audio_resource=None, video_resource=None, text_template='{metadata_str}\n\n{content}'),
 Document(id_='8b8cb9b5-a8f4-45b6-9799

In [44]:
type(docs), len(docs)

(list, 80)

## 4. Use Chonkie to chunk the documents

In [None]:
from chonkie import SemanticChunker
from llama_index.core.schema import Document
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Single source of truth for the embedding model, so the chunker and the
# embedder cannot silently drift apart when one of them is edited.
EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5"

# Chonkie splits each page into semantically coherent chunks.
semantic_chunker = SemanticChunker(
    embedding_model=EMBED_MODEL_NAME,
    threshold=0.5,
    chunk_size=512,
    min_sentences=1
)

# NOTE(review): trust_remote_code=True allows model-repo code to run locally;
# confirm it is actually required for this model.
embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME,
                                   trust_remote_code=True)

# Registered globally so later LlamaIndex components use the same embedder.
Settings.embed_model = embed_model

all_chunks = []
for doc in docs:
    page_text = doc.text or ""
    if not page_text.strip():
        # Nothing to chunk or embed on a page without extractable text.
        continue

    chunk_texts = [chunk.text for chunk in semantic_chunker.chunk(page_text)]
    if not chunk_texts:
        continue

    # Embed all chunks of the page in one batched call instead of one model
    # invocation per chunk; the result order matches `chunk_texts`.
    chunk_embeddings = Settings.embed_model.get_text_embedding_batch(chunk_texts)

    for chunk_text, chunk_embedding in zip(chunk_texts, chunk_embeddings):
        all_chunks.append(
            Document(
                text=chunk_text,
                # Copy so a chunk never shares one mutable dict with its page.
                metadata=dict(doc.metadata),
                embedding=chunk_embedding
            )
        )

In [9]:
len(all_chunks)

144

In [10]:
all_chunks[:10]

[Document(id_='72c08a5c-8e50-428a-9e2a-7117d142ac4d', embedding=[0.04567744955420494, 0.023911356925964355, -0.0002776296460069716, -0.031402792781591415, -0.024951230734586716, -0.010465634986758232, 0.028284667059779167, 0.014046593569219112, 0.01609227806329727, 0.0352574847638607, -0.007125934585928917, 0.03838617727160454, -0.0002155945694539696, 0.008623963221907616, -0.02769974246621132, 0.046651773154735565, -0.006880850065499544, 0.017693324014544487, -0.013728069141507149, -0.01454898715019226, 0.033459123224020004, 0.04205651953816414, -0.07957055419683456, -0.03467193618416786, -0.019749199971556664, 0.017816336825489998, 0.02426242083311081, -0.007875252515077591, 0.1063794419169426, 0.03253454342484474, -0.005120623391121626, -0.016434285789728165, 0.0007517744670622051, -0.041246671229600906, -0.07258458435535431, -0.023251408711075783, 0.0029126584995537996, -0.004442622419446707, -0.06817472726106644, -0.06902074068784714, 0.019993798807263374, -0.0020357242319732904, 

## 5. Create Qdrant Collection

In [None]:
# import numpy as np
# from qdrant_client.models import VectorParams, Distance
# from qdrant_client.models import PointStruct


# # Create the collection if it doesn't exist
# client.recreate_collection(
#     collection_name=collection_name,
#     vectors_config=VectorParams(
#         size=np.array(all_chunks[0].embedding).shape[0],  # dimension of your embedding
#         distance=Distance.COSINE                # or Distance.DOT, Distance.EUCLID
#     )
# )

# points = []
# for i, chunk in enumerate(all_chunks):
#     if chunk.embedding is not None:
#         points.append(
#             PointStruct(
#                 id=i,
#                 vector=chunk.embedding,
#                 payload={"text": chunk.text}
#             )
#         )

# client.upsert(
#     collection_name=collection_name,
#     points=points
# )

# 5. Load the embedding model and index data

In [11]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import VectorStoreIndex, StorageContext

def create_index(documents):
    """Build a VectorStoreIndex over `documents`, stored in Qdrant.

    Relies on the notebook-level `client` and `collection_name` to decide
    where the vectors are stored.

    Args:
        documents: Documents handed to `VectorStoreIndex.from_documents`.

    Returns:
        The resulting VectorStoreIndex.
    """
    qdrant_store = QdrantVectorStore(client=client,
                                     collection_name=collection_name)

    return VectorStoreIndex.from_documents(
        documents,
        storage_context=StorageContext.from_defaults(vector_store=qdrant_store),
    )

In [12]:
from llama_index.core import Settings

# Index every chunk produced by the Chonkie chunking cell (`all_chunks`).
index = create_index(all_chunks)

In [13]:
type(index)

llama_index.core.indices.vector_store.base.VectorStoreIndex

## 6. Load the LLM

In [14]:
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings


# LLM served through Ollama; the repr printed below shows it targets
# http://localhost:11434. request_timeout is presumably in seconds — TODO confirm.
llm = Ollama(model="llama3.2:1b", request_timeout=120.0)

# Registered on the global Settings object, next to the embedding model.
Settings.llm = llm

In [15]:
type(Settings), Settings.llm, Settings.embed_model

(llama_index.core.settings._Settings,
 Ollama(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x11d0cdac0>, system_prompt=None, messages_to_prompt=<function messages_to_prompt at 0x10cb4bc10>, completion_to_prompt=<function default_completion_to_prompt at 0x10d012040>, output_parser=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>, query_wrapper_prompt=None, base_url='http://localhost:11434', model='llama3.2:1b', temperature=None, context_window=-1, request_timeout=120.0, prompt_key='prompt', json_mode=False, additional_kwargs={}, is_function_calling_model=True, keep_alive=None, thinking=None),
 HuggingFaceEmbedding(model_name='BAAI/bge-large-en-v1.5', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x11d0cdac0>, num_workers=None, embeddings_cache=None, max_length=512, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None, show_progress_bar=False))

# 7. Define the prompt template

In [16]:
from llama_index.core import PromptTemplate

# Custom question-answering prompt. {context_str} and {query_str} are the
# placeholders of the template. The leading spaces on each line sit inside the
# string literal, so they are part of the prompt text itself.
template = """Context information is below:
              ---------------------
              {context_str}
              ---------------------
              Given the context information above I want you to think
              step by step to answer the query in a crisp manner,
              incase you don't know the answer say 'I don't know!'
            
              Query: {query_str}
        
              Answer:"""

# Wrapped in a PromptTemplate so it can be handed to
# query_engine.update_prompts() later in the notebook.
qa_prompt_tmpl = PromptTemplate(template)

In [17]:
## 8. Query Qdrant directly with your own embedding


In [18]:
# query = "What exactly is DSPy?"

# # Use the same embedding model as for chunking
# query_embedding = semantic_chunker.chunk([query])[0]

# search_result = client.search(
#     collection_name=collection_name,
#     query_vector=query_embedding.tolist(),
#     limit=5
# )

# # Gather the top results' texts
# top_chunks = [hit.payload["text"] for hit in search_result]

# # Optionally, synthesize an answer using your LLM
# context_str = "\n\n".join(top_chunks)
# prompt = template.format(context_str=context_str, query_str=query)

# response = llm.complete(prompt)
# print(response)

# 8. Reranking

Here, we use a cross-encoder to re-rank the document chunks. Also, we limit the output to the top 3 most relevant chunks based on the model’s scoring.

In [19]:
from llama_index.core.postprocessor import SentenceTransformerRerank

# Cross-encoder reranker; top_n=3 limits its output to the three best-scored nodes.
rerank = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-2-v2", 
    top_n=3
)

In [20]:
rerank

SentenceTransformerRerank(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x3e49a9220>, model='cross-encoder/ms-marco-MiniLM-L-2-v2', top_n=3, device='mps', keep_retrieval_score=False, trust_remote_code=False)

# 9. Query the document

In [32]:
# Retrieve the 10 most similar nodes, then pass them through the reranker.
query_engine = index.as_query_engine(similarity_top_k=10,
                                     node_postprocessors=[rerank])

# Swap the response synthesizer's text-QA prompt for the custom template.
query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
)

# Alternative questions are kept commented out; only the active line is run.
# response = query_engine.query("What exactly is DSPy?")
# response = query_engine.query("How is DSPy pronounced?")
response = query_engine.query("What is the github repo for docling?")
# response = query_engine.query("Which is the RationalAI product that addresses customer care?")
# response = query_engine.query("What is the point with the Rational control room?")

In [33]:
from IPython.display import Markdown, display

display(Markdown(str(response)))

The GitHub repository for Docling is located at `github.com/DS4SD/docling`.

In [34]:
response.metadata

{'dbc90c67-754e-4d06-ad65-9b4a5449c797': {'page_label': '5',
  'file_name': 'docling.pdf',
  'file_path': '/Users/fc/experiments/rag-project/docs/docling.pdf',
  'file_type': 'application/pdf',
  'file_size': 5566575,
  'creation_date': '2025-06-13',
  'last_modified_date': '2025-06-13'},
 '5aa9aecf-6d98-4cf6-8009-b5bacc4234e5': {'page_label': '2',
  'file_name': 'docling.pdf',
  'file_path': '/Users/fc/experiments/rag-project/docs/docling.pdf',
  'file_type': 'application/pdf',
  'file_size': 5566575,
  'creation_date': '2025-06-13',
  'last_modified_date': '2025-06-13'},
 '9883ca17-2ada-462e-b1ff-2826de7bf8fe': {'page_label': '3',
  'file_name': 'docling.pdf',
  'file_path': '/Users/fc/experiments/rag-project/docs/docling.pdf',
  'file_type': 'application/pdf',
  'file_size': 5566575,
  'creation_date': '2025-06-13',
  'last_modified_date': '2025-06-13'}}

In [35]:
response.response

'The GitHub repository for Docling is located at `github.com/DS4SD/docling`.'

## Debug

In [58]:
from IPython.display import Markdown, display
import re

def highlight(text, query, color):
    """Wrap every case-insensitive occurrence of `query` in `text` in a <mark> tag.

    Args:
        text: The string to search.
        query: Literal substring to highlight; regex metacharacters are escaped.
        color: CSS color used for the mark's background-color.

    Returns:
        `text` with each match wrapped in
        ``<mark style='background-color:{color};'>...</mark>``. The matched
        text keeps its original casing. An empty `query` returns `text`
        unchanged.
    """
    if not query:
        # An empty pattern matches between every character and would scatter
        # empty <mark> tags through the whole text.
        return text

    pattern = re.compile(re.escape(query), re.IGNORECASE)

    # A callable replacement (rather than a template string) re-emits exactly
    # what was matched, preserving its casing, and keeps backslashes in
    # `query` or `color` from being parsed as regex template escapes.
    return pattern.sub(
        lambda match: f"<mark style='background-color:{color};'>{match.group(0)}</mark>",
        text,
    )

def display_sources_with_highlight(response, docs, query, highlight_color="#ffff00"):
    """Render the pages cited by `response`, with `query` highlighted.

    For each metadata entry of `response`, the first item of `docs` whose
    `file_name` and `page_label` both match is displayed as Markdown.
    Entries without a page label, or without a matching document, are skipped.

    Args:
        response: Object whose `metadata` maps ids to per-source metadata dicts.
        docs: Documents to search; each exposes `metadata` and `text`.
        query: Text to highlight inside the displayed page.
        highlight_color: CSS background color for the highlight.
    """
    for source_meta in response.metadata.values():
        file_name = source_meta.get("file_name")
        page_label = source_meta.get("page_label")
        if not page_label:
            continue

        # A page label alone is ambiguous across files, so match on both keys.
        matching_doc = None
        for candidate in docs:
            same_file = candidate.metadata.get("file_name") == file_name
            if same_file and candidate.metadata.get("page_label") == page_label:
                matching_doc = candidate
                break

        if not matching_doc:
            continue

        marked_text = highlight(matching_doc.text, query, highlight_color)
        display(Markdown(f"### Source Document (page_label: {page_label})\n\n{marked_text}"))

# Example usage:
# Example usage: highlight the repository URL in the pages cited by `response`.
word = "github.com/DS4SD/docling"
display_sources_with_highlight(response, docs, word)

### Source Document (page_label: 5)

torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future
version of this report.
Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our
test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution
(TTS), computed throughput in pages per second, and the peak memory used (resident set size) for
both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.
CPU Thread
budget
native backend pypdfium backend
TTS Pages/s Mem TTS Pages/s Mem
Apple M3 Max
(16 cores)
4 177 s 1.27 6.20 GB 103 s 2.18 2.56 GB16 167 s 1.34 92 s 2.45
Intel(R) Xeon
E5-2690
(16 cores)
4 375 s 0.60 6.16 GB 239 s 0.94 2.42 GB16 244 s 0.92 143 s 1.57
5 Applications
Thanks to the high-quality, richly structured document conversion achieved by Docling, its out-
put qualifies for numerous downstream applications. For example, Docling can provide a base
for detailed enterprise document search, passage retrieval or classification use-cases, or support
knowledge extraction pipelines, allowing specific treatment of different structures in the document,
such as tables, figures, section structure or references. For popular generative AI application pat-
terns, such as retrieval-augmented generation (RAG), we providequackling, an open-source package
which capitalizes on Docling’s feature-rich document output to enable document-native optimized
vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIn-
dex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build
document-derived datasets. With its powerful table structure recognition, it provides significant ben-
efit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open
IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal
training datasets.
6 Future work and contributions
Docling is designed to allow easy extension of the model library and pipelines. In the future, we
plan to extend Docling with several more models, such as a figure-classifier model, an equation-
recognition model, a code-recognition model and more. This will help improve the quality of con-
version for specific types of content, as well as augment extracted document metadata with ad-
ditional information. Further investment into testing and optimizing GPU acceleration as well as
improving the Docling-native PDF backend are on our roadmap, too.
We encourage everyone to propose or implement additional features and models, and will
gladly take your inputs and contributions under review. The codebase of Docling is open for use
and contribution, under the MIT license agreement and in alignment with our contributing guidelines
included in the Docling repository. If you use Docling in your projects, please consider citing this
technical report.
References
[1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/
JaidedAI/EasyOCR, 2024. Version: 1.7.0.
[2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. V oznesensky, B. Bao, P. Bell, D. Berard,
E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison,
W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. La-
zos, M. Lezcano, Y . Liang, J. Liang, Y . Lu, C. Luk, B. Maher, Y . Pan, C. Puhrsch, M. Reso,
M. Saroufim, M. Y . Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang,
X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster
5

### Source Document (page_label: 2)

Here is what Docling delivers today:
• Converts PDF documents to JSON or Markdown format, stable and lightning fast
• Understands detailed page layout, reading order, locates figures and recovers table struc-
tures
• Extracts metadata from the document, such as title, authors, references and language
• Optionally applies OCR, e.g. for scanned PDFs
• Can be configured to be optimal for batch-mode (i.e high throughput, low time-to-solution)
or interactive mode (compromise on efficiency, low time-to-solution)
• Can leverage different accelerators (GPU, MPS, etc).
2 Getting Started
To use Docling, you can simply install thedocling package from PyPI. Documentation and examples
are available in our GitHub repository at <mark style='background-color:#ffff00;'>github.com/DS4SD/docling</mark>. All required model assets1 are
downloaded to a local huggingface datasets cache on first use, unless you choose to pre-install the
model assets in advance.
Docling provides an easy code interface to convert PDF documents from file system, URLs or binary
streams, and retrieve the output in either JSON or Markdown format. For convenience, separate
methods are offered to convert single documents or batches of documents. A basic usage example
is illustrated below. Further examples are available in the Doclign code repository.
from docling . document_converter import DocumentConverter
source = " https :// arxiv . org / pdf /2206.01062 " # PDF path or URL
converter = DocumentConverter ()
result = converter . convert_single ( source )
print ( result . render_as_markdown ()) # output : "## DocLayNet : A Large
Human - Annotated Dataset for Document - Layout Analysis [...]"
Optionally, you can configure custom pipeline features and runtime options, such as turning on or
off features (e.g. OCR, table structure recognition), enforcing limits on the input document size, and
defining the budget of CPU threads. Advanced usage examples and options are documented in the
README file. Docling also provides a Dockerfile to demonstrate how to install and run it inside a
container.
3 Processing pipeline
Docling implements a linear pipeline of operations, which execute sequentially on each given docu-
ment (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic
text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap
image of each page to support downstream operations. Then, the standard model pipeline applies a
sequence of AI models independently on every page in the document to extract features and content,
such as layout and table structures. Finally, the results from all pages are aggregated and passed
through a post-processing stage, which augments metadata, detects the document language, infers
reading-order and eventually assembles a typed document object which can be serialized to JSON
or Markdown.
3.1 PDF backends
Two basic requirements to process PDF documents in our pipeline are a) to retrieve all text content
and their geometric coordinates on each page and b) to render the visual representation of each
page as it would appear in a PDF viewer. Both these requirements are encapsulated in Docling’s
PDF backend interface. While there are several open-source PDF parsing libraries available for
python, we faced major obstacles with all of them for different reasons, among which were restrictive
1see huggingface.co/ds4sd/docling-models/
2

### Source Document (page_label: 3)

LayoutAnalysis
Serialize as JSONor Markdown
{;}
Parse PDF pages
 Table Structure
OCR
Model Pipeline
Assemble results,Apply document post-processing
Figure 1: Sketch of Docling’s default processing pipeline. The inner part of the model pipeline is
easily customizable and extensible.
licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells
across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14].
We therefore decided to provide multiple backend choices, and additionally open-source a custom-
built PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate
package named docling-parse and powers the default PDF backend in Docling. As an alternative,
we provide a PDF backend relying onpypdfium, which may be a safe backup choice in certain cases,
e.g. if issues are seen with particular font encodings.
3.2 AI models
As part of Docling, we initially release two highly capable AI models to the open-source community,
which have been developed and published recently by our team. The first model is a layout analysis
model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9],
a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on
huggingface) and a separate package for the inference code as docling-ibm-models. Both models
are also powering the open-access deepsearch-experience, our cloud-native service for knowledge
exploration tasks.
Layout Analysis Model
Our layout analysis model is an object-detector which predicts the bounding-boxes and classes of
various elements on the image of a given page. Its architecture is derived from RT-DETR [16] and
re-trained on DocLayNet [13], our popular human-annotated dataset for document-layout analysis,
among other proprietary datasets. For inference, our implementation relies on the onnxruntime [5].
The Docling pipeline feeds page images at 72 dpi resolution, which can be processed on a single
CPU with sub-second latency. All predicted bounding-box proposals for document elements are
post-processed to remove overlapping proposals based on confidence and size, and then intersected
with the text tokens in the PDF to group them into meaningful and complete units such as paragraphs,
section titles, list items, captions, figures or tables.
Table Structure Recognition
The TableFormer model [12], first published in 2022 and since refined with a custom structure token
language [9], is a vision-transformer model for table structure recovery. It can predict the logical
row and column structure of a given table based on an input image, and determine which table
cells belong to column headers, row headers or the table body. Compared to earlier approaches,
TableFormer handles many characteristics of tables, such as partial or no borderlines, empty cells,
rows or columns, cell spans and hierarchy both on column-heading or row-heading level, tables with
inconsistent indentation or alignment and other complexities. For inference, our implementation
relies on PyTorch[2].
3

Let's instead look up the first loaded document whose text contains the query string:

In [59]:
# Find the first document containing a specific text
def find_document_with_text(text, documents=None):
    """Return the first document whose text contains `text`.

    The match is a case-sensitive substring test.

    Args:
        text: Substring to look for.
        documents: Iterable of objects with a `.text` attribute. Defaults to
            the notebook-level `docs`, so existing one-argument calls still work.

    Returns:
        A tuple ``(index, document)`` for the first match, or ``(None, None)``
        when no document contains `text`.
    """
    if documents is None:
        documents = docs
    for position, candidate in enumerate(documents):
        if text in candidate.text:
            return position, candidate
    return None, None

# NOTE: this rebinds `doc`, which is also the loop variable of the chunking cell.
i, doc = find_document_with_text(word)

In [60]:
doc

Document(id_='cfb14440-dc56-4483-8129-fd83728fc6c7', embedding=None, metadata={'page_label': '2', 'file_name': 'docling.pdf', 'file_path': '/Users/fc/experiments/rag-project/docs/docling.pdf', 'file_type': 'application/pdf', 'file_size': 5566575, 'creation_date': '2025-06-13', 'last_modified_date': '2025-06-13'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Here is what Docling delivers today:\n• Converts PDF documents to JSON or Markdown format, stable and lightning fast\n• Understands detailed page layout, reading order, locates figures and recovers table struc-\ntures\n• Extracts metadata from the document, such as title, authors, re

In [63]:
Markdown(highlight(doc.text, word, "#ffff00"))

Here is what Docling delivers today:
• Converts PDF documents to JSON or Markdown format, stable and lightning fast
• Understands detailed page layout, reading order, locates figures and recovers table struc-
tures
• Extracts metadata from the document, such as title, authors, references and language
• Optionally applies OCR, e.g. for scanned PDFs
• Can be configured to be optimal for batch-mode (i.e high throughput, low time-to-solution)
or interactive mode (compromise on efficiency, low time-to-solution)
• Can leverage different accelerators (GPU, MPS, etc).
2 Getting Started
To use Docling, you can simply install thedocling package from PyPI. Documentation and examples
are available in our GitHub repository at <mark style='background-color:#ffff00;'>github.com/DS4SD/docling</mark>. All required model assets1 are
downloaded to a local huggingface datasets cache on first use, unless you choose to pre-install the
model assets in advance.
Docling provides an easy code interface to convert PDF documents from file system, URLs or binary
streams, and retrieve the output in either JSON or Markdown format. For convenience, separate
methods are offered to convert single documents or batches of documents. A basic usage example
is illustrated below. Further examples are available in the Doclign code repository.
from docling . document_converter import DocumentConverter
source = " https :// arxiv . org / pdf /2206.01062 " # PDF path or URL
converter = DocumentConverter ()
result = converter . convert_single ( source )
print ( result . render_as_markdown ()) # output : "## DocLayNet : A Large
Human - Annotated Dataset for Document - Layout Analysis [...]"
Optionally, you can configure custom pipeline features and runtime options, such as turning on or
off features (e.g. OCR, table structure recognition), enforcing limits on the input document size, and
defining the budget of CPU threads. Advanced usage examples and options are documented in the
README file. Docling also provides a Dockerfile to demonstrate how to install and run it inside a
container.
3 Processing pipeline
Docling implements a linear pipeline of operations, which execute sequentially on each given docu-
ment (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic
text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap
image of each page to support downstream operations. Then, the standard model pipeline applies a
sequence of AI models independently on every page in the document to extract features and content,
such as layout and table structures. Finally, the results from all pages are aggregated and passed
through a post-processing stage, which augments metadata, detects the document language, infers
reading-order and eventually assembles a typed document object which can be serialized to JSON
or Markdown.
3.1 PDF backends
Two basic requirements to process PDF documents in our pipeline are a) to retrieve all text content
and their geometric coordinates on each page and b) to render the visual representation of each
page as it would appear in a PDF viewer. Both these requirements are encapsulated in Docling’s
PDF backend interface. While there are several open-source PDF parsing libraries available for
python, we faced major obstacles with all of them for different reasons, among which were restrictive
1see huggingface.co/ds4sd/docling-models/
2