# 1. Setup Asyncio

In [1]:
import nest_asyncio

nest_asyncio.apply()

# 2. Setup the Qdrant vector database

Let's now connect to out qdrant database to store the collection of documents we will use for RAG. 
We will use the `qdrant_client` library to interact with the Qdrant database.

In [2]:
import qdrant_client

collection_name = "chat_with_docs_docling"

client = qdrant_client.QdrantClient(
    host="localhost",
    port=6333,
)



# 3. Read the documents

We are now reading the documents using the docling library. For each document in the `docs` folder, we extract images and tables, in addition to its text.

In [3]:
import os
from glob import glob
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem


pdf_dir = "./docs"
pdf_files = glob(os.path.join(pdf_dir, "*.pdf"))
md_files = []

IMAGE_RESOLUTION_SCALE = 2.0
output_dir = Path("processed_docs")
output_dir.mkdir(parents=True, exist_ok=True)

# Pipeline options for PDF conversion
pipeline_options = PdfPipelineOptions(do_table_structure=True)
# Table options
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE  # use more accurate TableFormer model
# Image options
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
pipeline_options.generate_picture_images = True
pipeline_options.generate_page_images = True        # This is needed to generate table images

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

def replace_artifact_images(md_path, doc_filename):
    import re

    """
    Replace image links to _artifacts with links to your own saved images.
    """
    md_text = Path(md_path).read_text()
    # Replace artifact image links with your own pictures/picture-{n}.png
    # Example: ![](docling_artifacts/image_000000_abc123.png) -> ![](docling/pictures/picture-1.png)
    artifact_img_regex = re.compile(r'!\[.*?\]\((?:.*?_artifacts/)?image_\d+_[a-f0-9]+\.png\)')
    picture_idx = 1

    def replacer(match):
        nonlocal picture_idx
        new_link = f"![]({doc_filename}/pictures/picture-{picture_idx}.png)"
        picture_idx += 1
        return new_link

    return artifact_img_regex.sub(replacer, md_text)

# Convert each PDF to Markdown using Docling's Python API
ddocs = []
for pdf_path in pdf_files:
    result = converter.convert(pdf_path)
    ddocs.append(result.document)

    # Save images of figures and tables
    doc_filename = result.input.file.stem
    table_counter = 0
    picture_counter = 0

    # Need to create document directories for images and tables
    (output_dir / doc_filename / "tables").mkdir(parents=True, exist_ok=True)
    (output_dir / doc_filename / "pictures").mkdir(parents=True, exist_ok=True)

    for element, _level in result.document.iterate_items():
        if isinstance(element, TableItem):
            table_counter += 1
            element_image_filename = (
                output_dir / doc_filename / "tables" / f"table-{table_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(result.document).save(fp, "PNG")

        if isinstance(element, PictureItem):
            picture_counter += 1
            element_image_filename = (
                output_dir / doc_filename / "pictures" / f"picture-{picture_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(result.document).save(fp, "PNG")

    # Save markdown with externally referenced pictures
    md_filename = output_dir / f"{doc_filename}.md"
    result.document.save_as_markdown(md_filename, image_mode=ImageRefMode.REFERENCED)

    # Replace artifact image links with your own pictures/picture-{n}.png
    md_text = replace_artifact_images(md_filename, doc_filename)
    md_filename.write_text(md_text)

    # Cleanup up artifacts directory if it exists
    artifacts_dir = output_dir / f"{doc_filename}_artifacts"
    if artifacts_dir.exists():
        import shutil
        shutil.rmtree(artifacts_dir)

  from .autonotebook import tqdm as notebook_tqdm


## 3.1 Inspecting Docling Documents

In [66]:
ddoc = ddocs[0]  # Use the first document for indexing
ddoc.origin.filename

'docling.pdf'

Does this document has pictures in it?

In [67]:
len(ddoc.pictures)

13

Let's inspect one to find its provenance wrt the original document

In [41]:
image = ddoc.pictures[1]
provenance = image.prov[0]
provenance.dict()

/var/folders/8w/7z6k0z3n1jd9jqhxwh808hbm0000gn/T/ipykernel_88970/3201237041.py:3: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  provenance.dict()


{'page_no': 3,
 'bbox': {'l': 110.07231140136719,
  't': 719.2913360595703,
  'r': 500.7577209472656,
  'b': 581.2926177978516,
  'coord_origin': <CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>},
 'charspan': (0, 0)}

## 3.2 Bonus: Extracting images using their provenance

In [69]:
import fitz  # PyMuPDF

def save_image_from_provenance(pdf_path, provenance, output_path):
    """
    Extracts and saves an image from a PDF using provenance info.
    provenance should have 'page_ix' (0-based) and 'bbox' ([x0, y0, x1, y1])
    """
    page_ix = provenance.page_no-1  # Convert to 0-based index
    doc = fitz.open(pdf_path)
    page = doc[page_ix]

    _bbox = provenance.bbox.to_top_left_origin(page_height=page.rect.height)
    bbox = [_bbox.l, _bbox.t, _bbox.r, _bbox.b]  # Ensure bbox is a list of floats

    clip = fitz.Rect(*bbox)
    pix = page.get_pixmap(clip=clip)
    pix.save(output_path)
    doc.close()



# Example usage:
save_image_from_provenance(pdf_path, provenance, "img_from_provenance.png")

# 4. Performing RAG


## 4.1 Loading documents

We are now using the markdown version of the documents obtained through docling stored in the `processed_docs` dir

In [74]:
from llama_index.core import SimpleDirectoryReader

input_dir_path = "./processed_docs"

loader = SimpleDirectoryReader(
    input_dir=input_dir_path,
    required_exts=[".md"],
    recursive=True
)

docs = loader.load_data()
len(docs)

2

Let's have a peek into the content retrieved from the PDF files

In [73]:
for doc in docs:
    print(f"======== Document: {doc.metadata.get('file_name')} ========\n")
    print(f"Content: {doc.get_content()[:100]}") # Print first 100 characters of content
    print(f"Metadata: {doc.metadata}")
    print("\n")


Content: ![](docling/pictures/picture-1.png)

## Docling Technical Report

Version 1.0

Christoph Auer Maksym
Metadata: {'file_path': '/Users/fc/experiments/rag-project/processed_docs/docling.md', 'file_name': 'docling.md', 'file_size': 49283, 'creation_date': '2025-07-22', 'last_modified_date': '2025-07-22'}



Content: ## DSPY: COMPILING DECLARATIVE LANGUAGE MODEL CALLS INTO SELF-IMPROVING PIPELINES

Omar Khattab, 1 A
Metadata: {'file_path': '/Users/fc/experiments/rag-project/processed_docs/dspy.md', 'file_name': 'dspy.md', 'file_size': 115012, 'creation_date': '2025-07-22', 'last_modified_date': '2025-07-22'}




## 4.2 Use Chonkie to chunk the documents

In [12]:
from chonkie import SemanticChunker
from llama_index.core.schema import Document
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


semantic_chunker = SemanticChunker(
    embedding_model="BAAI/bge-large-en-v1.5",
    threshold=0.5,
    chunk_size=512,
    min_sentences=1
)

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5",
                                   trust_remote_code=True)

Settings.embed_model = embed_model

all_chunks = []
for doc in docs:
    chunks = semantic_chunker.chunk(doc.text)
    for chunk in chunks:
        # Use LlamaIndex's embedding model to embed the chunk text
        chunk_embedding = Settings.embed_model.get_text_embedding(chunk.text)
        all_chunks.append(
            Document(
                text=chunk.text,
                metadata=doc.metadata,
                embedding=chunk_embedding
            )
        )

In [13]:
len(all_chunks)

90

In [51]:
type(all_chunks[0])

llama_index.core.schema.Document

Chunks are llama_index `Document`s with their own metadata and embeddings. Those are the actual documents we will index in Qdrant. 

## 4.3 Create Qdrant collection and index data

In [15]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import VectorStoreIndex, StorageContext

def create_index(documents):

    vector_store = QdrantVectorStore(client=client,
                                     collection_name=collection_name)
    
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    
    index = VectorStoreIndex.from_documents(documents,
                                            storage_context=storage_context)
    
    return index

In [16]:
from llama_index.core import Settings

index = create_index(all_chunks)

In [17]:
type(index)

llama_index.core.indices.vector_store.base.VectorStoreIndex

## 4.4 Load the LLM

Now, it's time to define the LLM model we will use for querying the index. We are using Ollama as the LLM provider, but you can replace it with any other LLM provider supported by LlamaIndex.

Please, make sure to have available the intended model locally. To do so, you can use the pull command. In a separate terminal, execute:
```bash
ollama pull llama3.2
```
and wait for the model to download. Once ready, continue with the next cell!

In [53]:
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings


llm = Ollama(model="llama3.2", request_timeout=120.0, temperature=0.0)

Settings.llm = llm

In [54]:
type(Settings), Settings.llm, Settings.embed_model

(llama_index.core.settings._Settings,
 Ollama(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x43010ff70>, system_prompt=None, messages_to_prompt=<function messages_to_prompt at 0x4352e40d0>, completion_to_prompt=<function default_completion_to_prompt at 0x3f8043e50>, output_parser=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>, query_wrapper_prompt=None, base_url='http://localhost:11434', model='llama3.2', temperature=0.0, context_window=-1, request_timeout=120.0, prompt_key='prompt', json_mode=False, additional_kwargs={}, is_function_calling_model=True, keep_alive=None, thinking=None),
 HuggingFaceEmbedding(model_name='BAAI/bge-large-en-v1.5', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x43010ff70>, num_workers=None, embeddings_cache=None, max_length=512, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None, show_progress_bar=False))

## 4.5 Define the prompt template

In [28]:
from llama_index.core import PromptTemplate

template = """Context information is below:
              ---------------------
              {context_str}
              ---------------------
              Given the context information above I want you to think
              step by step to answer the query in a crisp manner,
              incase you don't know the answer say 'I don't know!'
            
              Query: {query_str}
        
              Answer:"""

qa_prompt_tmpl = PromptTemplate(template)

## 4.6 Define Reranker

Here, we use a cross-encoder to re-rank the document chunks. Also, we limit the output to the top 3 most relevant chunks based on the model’s scoring.

In [29]:
from llama_index.core.postprocessor import SentenceTransformerRerank

rerank = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-2-v2", 
    top_n=3
)

In [30]:
rerank

SentenceTransformerRerank(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x3ecc84eb0>, model='cross-encoder/ms-marco-MiniLM-L-2-v2', top_n=3, device='mps', keep_retrieval_score=False, trust_remote_code=False)

## 4.7 Query the index

It's time to query the index! Let's ask a question about the content of the documents we indexed.

In [31]:
query_engine = index.as_query_engine(similarity_top_k=10, node_postprocessors=[rerank])

query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
)

Question:

In [55]:
# Example queries
# response = query_engine.query("What exactly is DSPy?")
# response = query_engine.query("How is DSPy pronounced?")
# response = query_engine.query("What is the github repo for docling?")
response = query_engine.query("What is the TTS for docling with pypdfium backend, when running on an Apple M3 Max using 4 threads for the test dataset of 225 pages?")

Answer:

In [56]:
from IPython.display import Markdown, display

display(Markdown(str(response)))

According to Table 1 in the document, the TTS (time-to-solution) for Docling with the pypdfium backend, when running on an Apple M3 Max using 4 threads for the test dataset of 225 pages is 103 seconds.

In [57]:
response.metadata

{'37adfe9d-ffb7-46f4-9a1e-f231866acee8': {'file_path': '/Users/fc/experiments/rag-project/processed_docs/docling.md',
  'file_name': 'docling.md',
  'file_size': 49283,
  'creation_date': '2025-07-22',
  'last_modified_date': '2025-07-22'},
 'd5f9a120-de15-4d3a-939f-0a0e40b9c2d8': {'file_path': '/Users/fc/experiments/rag-project/processed_docs/docling.md',
  'file_name': 'docling.md',
  'file_size': 49283,
  'creation_date': '2025-07-22',
  'last_modified_date': '2025-07-22'},
 '6de32255-ea98-4f3a-8f4d-0196f693f4ba': {'file_path': '/Users/fc/experiments/rag-project/processed_docs/docling.md',
  'file_name': 'docling.md',
  'file_size': 49283,
  'creation_date': '2025-07-22',
  'last_modified_date': '2025-07-22'}}

## Bonus: visualize relevant text in documents

We are now defining an helper function to visualize a query text in the documents under RAG.

This can help us understand how the model is interpreting the content and which (parts of) the documents are being exploited in the LLM response.

In [60]:
from IPython.display import Markdown, display
import re

def highlight(text, query, color):
        # Case-insensitive highlight
        pattern = re.compile(re.escape(query), re.IGNORECASE)
        return pattern.sub(f"<mark style='background-color:{color};'>{query}</mark>", text)

def display_sources_with_highlight(response, docs, query, highlight_color="#ffff00"):
    """
    Display source documents for the response, highlighting the query in the text.
    Avoid displaying the same document multiple times.
    """
    shown = set()
    for source in response.metadata.values():
        source_document = source.get("file_name")
        if source_document and source_document not in shown:
            d = next((doc for doc in docs if doc.metadata.get("file_name") == source_document), None)
            if d:
                highlighted = highlight(d.text, query, highlight_color)
                display(Markdown(f"# ==================== \n\n{highlighted}\n\n"))
                shown.add(source_document)

Usage:

In [None]:
word = "TTS"
display_sources_with_highlight(response, docs, word)