# 1. Setup Asyncio

In [6]:
import nest_asyncio

# NOTE(review): nest_asyncio.apply() is understood to patch asyncio so that an
# event loop can be re-entered; presumably needed because the notebook kernel
# already runs its own loop — TODO confirm which later cell depends on it.
nest_asyncio.apply()

# 2. Setup the Qdrant vector database

In [7]:
import qdrant_client

# Name of the Qdrant collection used by the vector store created later on.
collection_name = "chat_with_docs_docling"

# Client for the Qdrant instance addressed at localhost:6333.
client = qdrant_client.QdrantClient(host="localhost", port=6333)

# 3. Read the documents

In [98]:
import os
from glob import glob
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode

# Directory that holds the source PDFs.
pdf_dir = "./docs"

# Sort the matches so the conversion order (and therefore the order of
# `md_files`) is the same on every run; glob gives no ordering guarantee.
pdf_files = sorted(glob(os.path.join(pdf_dir, "*.pdf")))
md_files = []

# Enable table-structure recognition for PDFs.
pipeline_options = PdfPipelineOptions(do_table_structure=True)
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE  # use more accurate TableFormer model

doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

# Convert each PDF to Markdown using Docling's Python API.
# The converter built above is named `doc_converter`; the previous code called
# an undefined `converter`, which fails with NameError on a fresh kernel.
for pdf_path in pdf_files:
    result = doc_converter.convert(pdf_path)
    md = result.document.export_to_markdown()
    md_files.append(md)



In [99]:
from llama_index.core.schema import Document

# Wrap every converted Markdown string in a LlamaIndex Document (text only,
# no metadata is attached here).
docs = []
for markdown_text in md_files:
    docs.append(Document(text=markdown_text))

In [100]:
docs

[Document(id_='d6e79c5a-f314-4d63-84a5-44567e9b6deb', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='<!-- image -->\n\n## Docling Technical Report\n\nVersion 1.0\n\nChristoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar\n\nAI4K Group, IBM Research R¨ uschlikon, Switzerland\n\n## Abstract\n\nThis technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs 

In [101]:
type(docs), len(docs)

(list, 3)

In [102]:
# Debug: dump the text content and the metadata of every document to check
# what was extracted from the PDF files.
for doc in docs:
    print(doc.get_content(), doc.metadata, "-----", sep="\n")

<!-- image -->

## Docling Technical Report

Version 1.0

Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar

AI4K Group, IBM Research R¨ uschlikon, Switzerland

## Abstract

This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models.

## 1 Introduction

Converting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variabi

## 4. Use Chonkie to chunk the documents

In [29]:
from chonkie import SemanticChunker
from llama_index.core.schema import Document
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


# The chunker and the LlamaIndex embedder are configured with the same model name.
embedding_model_name = "BAAI/bge-large-en-v1.5"

semantic_chunker = SemanticChunker(
    embedding_model=embedding_model_name,
    threshold=0.5,
    chunk_size=512,
    min_sentences=1,
)

# Register the embedder as the global LlamaIndex embedding model.
embed_model = HuggingFaceEmbedding(model_name=embedding_model_name, trust_remote_code=True)
Settings.embed_model = embed_model

# Chunk every document, embed each chunk's text with the registered embedder,
# and wrap the result in a Document that carries the parent document's metadata.
all_chunks = []
for doc in docs:
    all_chunks.extend(
        Document(
            text=piece.text,
            metadata=doc.metadata,
            embedding=Settings.embed_model.get_text_embedding(piece.text),
        )
        for piece in semantic_chunker.chunk(doc.text)
    )

In [30]:
len(all_chunks)

102

In [31]:
all_chunks[:10]

[Document(id_='3a2f2a6a-6e11-46e0-89fa-b46773b9ec15', embedding=[0.03874894231557846, -0.033250775188207626, 0.004479407332837582, -0.0009298397344537079, 0.0032200419809669256, -0.015122611075639725, 0.010351400822401047, -0.020331185311079025, -0.01733342558145523, 0.04987389221787453, -0.010139093734323978, -0.006230470724403858, 0.012250307947397232, -0.020232191309332848, -0.01017018873244524, 0.014157791621983051, 0.008904274553060532, -0.018197093158960342, -0.04590492323040962, 0.013514004647731781, -0.010511751286685467, 0.012334516271948814, -0.05663621425628662, -0.01408647745847702, -0.02684773877263069, 0.0276507455855608, 0.029467083513736725, 0.0015230486169457436, 0.10512250661849976, 0.04112725704908371, -0.010294654406607151, -0.04328908026218414, 0.02199608087539673, -0.0071905991062521935, -0.018187014386057854, -0.025023018941283226, 0.021822739392518997, 0.005542535334825516, 0.010610105469822884, -0.03241569921374321, 0.009761345572769642, -0.04360151290893555, 0

## 5. Create Qdrant Collection

In [None]:
# import numpy as np
# from qdrant_client.models import VectorParams, Distance
# from qdrant_client.models import PointStruct


# # Create the collection if it doesn't exist
# client.recreate_collection(
#     collection_name=collection_name,
#     vectors_config=VectorParams(
#         size=np.array(all_chunks[0].embedding).shape[0],  # dimension of your embedding
#         distance=Distance.COSINE                # or Distance.DOT, Distance.EUCLID
#     )
# )

# points = []
# for i, chunk in enumerate(all_chunks):
#     if chunk.embedding is not None:
#         points.append(
#             PointStruct(
#                 id=i,
#                 vector=chunk.embedding,
#                 payload={"text": chunk.text}
#             )
#         )

# client.upsert(
#     collection_name=collection_name,
#     points=points
# )

# 5. Load the embedding model and index data

In [32]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import VectorStoreIndex, StorageContext

def create_index(documents):
    """Build a VectorStoreIndex over ``documents`` stored in Qdrant.

    The vector store is created from the module-level ``client`` and
    ``collection_name``.

    Args:
        documents: Documents handed to ``VectorStoreIndex.from_documents``.

    Returns:
        The index returned by ``VectorStoreIndex.from_documents``.
    """
    qdrant_store = QdrantVectorStore(client=client, collection_name=collection_name)
    return VectorStoreIndex.from_documents(
        documents,
        storage_context=StorageContext.from_defaults(vector_store=qdrant_store),
    )

In [33]:
from llama_index.core import Settings

index = create_index(all_chunks)

In [34]:
type(index)

llama_index.core.indices.vector_store.base.VectorStoreIndex

## 6. Load the LLM

In [67]:
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings


# Ollama-served model, registered below as the global LlamaIndex LLM.
llm = Ollama(
    model="llama3.2:1b",
    request_timeout=120.0,
    temperature=0.0,
)
Settings.llm = llm

In [68]:
type(Settings), Settings.llm, Settings.embed_model

(llama_index.core.settings._Settings,
 Ollama(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x4128020d0>, system_prompt=None, messages_to_prompt=<function messages_to_prompt at 0x127a50c10>, completion_to_prompt=<function default_completion_to_prompt at 0x127ed6040>, output_parser=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>, query_wrapper_prompt=None, base_url='http://localhost:11434', model='llama3.2:1b', temperature=0.0, context_window=-1, request_timeout=120.0, prompt_key='prompt', json_mode=False, additional_kwargs={}, is_function_calling_model=True, keep_alive=None, thinking=None),
 HuggingFaceEmbedding(model_name='BAAI/bge-large-en-v1.5', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x4128020d0>, num_workers=None, embeddings_cache=None, max_length=512, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None, show_progress_bar=False))

# 7. Define the prompt template

In [69]:
from llama_index.core import PromptTemplate

# Question-answering prompt. It exposes two placeholders, {context_str} and
# {query_str}; the indentation inside the literal is part of the prompt text.
template = """Context information is below:
              ---------------------
              {context_str}
              ---------------------
              Given the context information above I want you to think
              step by step to answer the query in a crisp manner,
              incase you don't know the answer say 'I don't know!'
            
              Query: {query_str}
        
              Answer:"""

# Wrap the raw string so it can be passed to the query engine's update_prompts call.
qa_prompt_tmpl = PromptTemplate(template)

In [None]:
## 8. Query Qdrant directly with your own embedding


In [None]:
# query = "What exactly is DSPy?"

# # Use the same embedding model as for chunking
# query_embedding = semantic_chunker.chunk([query])[0]

# search_result = client.search(
#     collection_name=collection_name,
#     query_vector=query_embedding.tolist(),
#     limit=5
# )

# # Gather the top results' texts
# top_chunks = [hit.payload["text"] for hit in search_result]

# # Optionally, synthesize an answer using your LLM
# context_str = "\n\n".join(top_chunks)
# prompt = template.format(context_str=context_str, query_str=query)

# response = llm.complete(prompt)
# print(response)

# 8. Reranking

Here, we use a cross-encoder to re-rank the document chunks. Also, we limit the output to the top 3 most relevant chunks based on the model’s scoring.

In [70]:
from llama_index.core.postprocessor import SentenceTransformerRerank

# Cross-encoder reranker; only the 3 best-scoring chunks are kept.
rerank = SentenceTransformerRerank(model="cross-encoder/ms-marco-MiniLM-L-2-v2", top_n=3)

In [71]:
rerank

SentenceTransformerRerank(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x3e5c21df0>, model='cross-encoder/ms-marco-MiniLM-L-2-v2', top_n=3, device='mps', keep_retrieval_score=False, trust_remote_code=False)

# 9. Query the document

In [92]:
# Fetch the 10 most similar chunks and pass them through the reranker.
query_engine = index.as_query_engine(
    similarity_top_k=10,
    node_postprocessors=[rerank],
)

# Replace the response synthesizer's QA prompt with the custom template.
query_engine.update_prompts({"response_synthesizer:text_qa_template": qa_prompt_tmpl})

# Other questions tried against the index (assign one to `question` to run it):
#   "What exactly is DSPy?"
#   "How is DSPy pronounced?"
#   "What is the github repo for docling?"
#   "Which is the RationalAI product that addresses customer care?"
#   "What is the point with the Rational control room?"
question = "What is the TTS for docling with pypdfium backend, when running on an Apple M3 Max using 4 threads for the test dataset of 225 pages?"
response = query_engine.query(question)

In [93]:
from IPython.display import Markdown, display

display(Markdown(str(response)))

To determine the Time-to-Solution (TTS) for Docling with the pypdfium backend, when running on an Apple M3 Max using 4 threads for the test dataset of 225 pages, we need to follow these steps:

1. First, let's establish the reference numbers for the processing speed of Docling and the resource budget it requires.
2. We know that all tests in this section are run with default options on our standard test set distributed with Docling, which consists of three papers from arXiv and two IBM Redbooks, with a total of 225 pages.
3. Measurements were taken using both available PDF backends on two different hardware systems: one MacBook Pro M3 Max, and one bare-metal server running Ubuntu 20.04 LTS on an Intel Xeon E5-2690 CPU.

Given that we fixed the thread budget (through setting OMP NUM THREADS environment variable) once to 4 (Docling default) and once to 16 (equal to full core count on the test hardware), let's assume we are running with 4 threads for this specific test case.

Now, let's look at Table 1:

| CPU                         | Thread budget   | native backend   | native backend   | native backend   | native backend   | pypdfium backend   | pypdfium backend   | pypdfium backend   |
|-----------------------------|-----------------|------------------|------------------|------------------|--------------------|--------------------|--------------------|
|                             |                 | TTS              | Pages/s          | Mem              | TTS                | Pages/s            | Mem                |
| Apple M3 Max                | 4               | 177 s 167 s      | 1.27 1.34        | 6.20 GB          | 103 s 92 s         | 2.18 2.45          | 2.56 GB            |

Since we are running on an Apple M3 Max with 4 threads, the TTS value for pypdfium backend is 177 s 167 s.

Therefore, the Time-to-Solution (TTS) for Docling with the pypdfium backend, when running on an Apple M3 Max using 4 threads for the test dataset of 225 pages, is 177 seconds.

In [94]:
response.metadata

{'68e1a3c1-4255-4a33-b373-b52195725c43': {},
 'bdf910fb-7668-4faa-8ab6-57ad870626c0': {},
 'd5b4aeb1-4793-43af-88fc-7d86d4e79506': {}}

In [95]:
response.response

"To determine the Time-to-Solution (TTS) for Docling with the pypdfium backend, when running on an Apple M3 Max using 4 threads for the test dataset of 225 pages, we need to follow these steps:\n\n1. First, let's establish the reference numbers for the processing speed of Docling and the resource budget it requires.\n2. We know that all tests in this section are run with default options on our standard test set distributed with Docling, which consists of three papers from arXiv and two IBM Redbooks, with a total of 225 pages.\n3. Measurements were taken using both available PDF backends on two different hardware systems: one MacBook Pro M3 Max, and one bare-metal server running Ubuntu 20.04 LTS on an Intel Xeon E5-2690 CPU.\n\nGiven that we fixed the thread budget (through setting OMP NUM THREADS environment variable) once to 4 (Docling default) and once to 16 (equal to full core count on the test hardware), let's assume we are running with 4 threads for this specific test case.\n\nNow

## Debug

In [96]:
docs[0].text

'<!-- image -->\n\n## Docling Technical Report\n\nVersion 1.0\n\nChristoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar\n\nAI4K Group, IBM Research R¨ uschlikon, Switzerland\n\n## Abstract\n\nThis technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models.\n\n## 1 Introduction\n\nConverting PDF documents back into a machine-processable format has been a major challenge for decades due to t

In [97]:
from IPython.display import Markdown, display
import re

def highlight(text, query, color):
    """Wrap every case-insensitive occurrence of ``query`` in ``text`` with a <mark> tag.

    Args:
        text: The text to search.
        query: The literal string to look for (regex metacharacters are escaped).
        color: CSS colour used for the mark's background.

    Returns:
        ``text`` with each match wrapped in
        ``<mark style='background-color:{color};'>...</mark>``. The matched
        text keeps its original casing. If ``query`` is empty, ``text`` is
        returned unchanged.
    """
    # An empty pattern matches at every position and would scatter empty
    # <mark> tags through the whole text.
    if not query:
        return text

    pattern = re.compile(re.escape(query), re.IGNORECASE)

    # A callable replacement is used for two reasons: it re-inserts the text
    # that actually matched (not the query's casing), and its return value is
    # taken literally, so backslashes in the query or colour are not parsed as
    # regex template escapes.
    def _mark(match):
        return f"<mark style='background-color:{color};'>{match.group(0)}</mark>"

    return pattern.sub(_mark, text)

def display_sources_with_highlight(response, docs, query, highlight_color="#ffff00"):
    """
    Render the source documents referenced by ``response`` with ``query`` highlighted.

    For every entry in ``response.metadata`` that has a ``page_label``, the first
    document in ``docs`` whose ``file_name`` and ``page_label`` metadata both match
    is displayed as Markdown, with occurrences of ``query`` marked via ``highlight``.
    Entries without a ``page_label``, and entries with no matching document, are
    skipped silently.
    """
    for source_meta in response.metadata.values():
        file_name = source_meta.get("file_name")
        page_label = source_meta.get("page_label")
        if not page_label:
            continue

        # The page label alone is ambiguous across files, so match on both keys.
        matched_doc = None
        for candidate in docs:
            meta = candidate.metadata
            if meta.get("file_name") == file_name and meta.get("page_label") == page_label:
                matched_doc = candidate
                break
        if not matched_doc:
            continue

        marked_text = highlight(matched_doc.text, query, highlight_color)
        display(Markdown(f"### Source Document (page_label: {page_label})\n\n{marked_text}"))

# Example usage:
word = "TTS"
display_sources_with_highlight(response, docs, word)

Let's try retrieving the docs containing the query instead

In [83]:
# Write an inline function to find the first document containing a specific text
def find_document_with_text(text, documents=None):
    """Find the first document whose ``.text`` contains ``text``.

    The check is a plain, case-sensitive substring test.

    Args:
        text: Substring to look for.
        documents: Iterable of objects with a ``.text`` attribute. When omitted
            (``None``), the module-level ``docs`` list is searched, which keeps
            existing single-argument calls working.

    Returns:
        A tuple ``(index, document)`` for the first match, or ``(None, None)``
        when no document contains ``text``.
    """
    if documents is None:
        documents = docs
    for position, candidate in enumerate(documents):
        if text in candidate.text:
            return position, candidate
    return None, None

i, doc = find_document_with_text(word)

In [None]:
doc

In [None]:
Markdown(highlight(doc.text, word, "#ffff00"))