In [1]:
from typing import List

import lancedb
from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter
from dotenv import load_dotenv
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load key=value pairs from a local .env file into the process environment.
load_dotenv()

True

In [3]:
# Token budget per chunk; the HybridChunker below is configured with this same limit.
# NOTE(review): 8191 may exceed the input length the e5-large-v2 embedder used
# later accepts (its model card mentions truncation at 512 tokens) - confirm.
MAX_TOKENS = 8191

In [4]:
# Tokenizer handed to the HybridChunker so it can measure chunk sizes in tokens.
# AutoTokenizer is imported in the notebook's import cell.
# NOTE(review): the embeddings further down are produced by intfloat/e5-large-v2,
# while token counts here come from the Llama 3 tokenizer - confirm the
# mismatch is intended.
tokenizer = AutoTokenizer.from_pretrained(
    "NousResearch/Meta-Llama-3-8B",
    use_fast=True,
)

### Extract the data

In [5]:
# Fetch the PDF from arXiv and convert it into a Docling document.
converter = DocumentConverter()
result = converter.convert(source="https://arxiv.org/pdf/2408.09869")

2026-01-06 22:27:09,644 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-06 22:27:09,666 - INFO - Going to convert document batch...
2026-01-06 22:27:09,667 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7
2026-01-06 22:27:09,675 - INFO - Loading plugin 'docling_defaults'
2026-01-06 22:27:09,676 - INFO - Registered picture descriptions: ['vlm', 'api']
2026-01-06 22:27:09,685 - INFO - Loading plugin 'docling_defaults'
2026-01-06 22:27:09,688 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2026-01-06 22:27:09,698 - INFO - rapidocr cannot be used because onnxruntime is not installed.
2026-01-06 22:27:09,699 - INFO - easyocr cannot be used because it is not installed.
2026-01-06 22:27:09,889 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2026-01-06 22:27:09,901 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-06 22:27:09,905 [RapidOCR] device_con

### Apply hybrid chunking

In [6]:
# Tokenizer-aware chunker. The token limit comes from the MAX_TOKENS constant
# defined in the configuration cell instead of repeating the literal 8191.
# merge_peers=True asks Docling to merge undersized neighbouring chunks that
# share the same headings (see the Docling HybridChunker documentation).
chunker = HybridChunker(
    tokenizer=tokenizer,
    max_tokens=MAX_TOKENS,
    merge_peers=True,
)

In [7]:
# Chunk the converted document, then materialise the iterator so the chunks
# can be counted and reused by later cells.
chunk_iter = chunker.chunk(dl_doc=result.document)
chunks = [*chunk_iter]

In [8]:
# Number of chunks the chunker produced for this document.
len(chunks)

21

In [16]:
# Preview only the first two chunks; displaying the full list floods the output.
chunks[:2]

[DocChunk(text='Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar\nAI4K Group, IBM Research R¨ uschlikon, Switzerland', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[DocItem(self_ref='#/texts/3', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, meta=None, label=<DocItemLabel.TEXT: 'text'>, prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=113.643, t=481.532, r=498.359, b=439.849, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 295))]), DocItem(self_ref='#/texts/4', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, meta=None, label=<DocItemLabel.TEXT: 'text'>, prov=[ProvenanceItem(page_no=1, bbox=Boundi

### Create a LanceDB database and table

#### Create a LanceDB database

In [9]:
# Open a connection to the LanceDB database at the relative path data/lancedb.
db = lancedb.connect(uri="data/lancedb")

In [10]:
# Embedding function: the E5 model from HuggingFace, obtained through
# LanceDB's embedding registry.
func = (
    get_registry()
    .get("huggingface")
    .create(name="intfloat/e5-large-v2", device="cpu")  # run explicitly on CPU
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


#### Define a simplified metadata schema

In [11]:
class ChunkMetadata(LanceModel):
    """Simplified metadata stored next to each text chunk.

    All three fields are nullable. They are deliberately declared in
    alphabetical order; the original author notes this as a requirement of
    the Pydantic implementation.
    NOTE(review): confirm whether this ordering is still required.
    """

    # Name of the source file the chunk originates from.
    filename: str | None
    # Page numbers the chunk's content appears on.
    page_numbers: List[int] | None
    # Heading used as the chunk's title.
    title: str | None

#### Define the main Schema

In [12]:
class Chunks(LanceModel):
    """Row schema of the LanceDB table: chunk text, its embedding and metadata."""
    # Chunk text; declared as the source field of the embedding function `func`.
    text: str = func.SourceField()
    # Embedding vector; its dimensionality is taken from func.ndims().
    vector: Vector(func.ndims()) = func.VectorField()  # type: ignore
    # Nested metadata record (filename, page numbers, title).
    metadata: ChunkMetadata

In [13]:
# Create the "docling" table from the Chunks schema; mode="overwrite" is
# passed so an existing table of the same name is replaced on re-run.
table = db.create_table(
    name="docling",
    schema=Chunks,
    mode="overwrite",
)

#### Prepare the processed chunks for the table

In [14]:
# Build one record per chunk: the text to embed plus the simplified metadata
# whose keys mirror the ChunkMetadata fields.
processed_chunks = [
    {
        "text": chunk.text,
        "metadata": {
            "filename": chunk.meta.origin.filename,
            # Unique page numbers in ascending order; None when there are none.
            "page_numbers": sorted(
                {
                    prov.page_no
                    for item in chunk.meta.doc_items
                    for prov in item.prov
                }
            )
            or None,
            # The first heading serves as the title when headings exist.
            "title": None if not chunk.meta.headings else chunk.meta.headings[0],
        },
    }
    for chunk in chunks
]

In [15]:
# Preview only the first two records instead of dumping the entire list.
processed_chunks[:2]

[{'text': 'Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar\nAI4K Group, IBM Research R¨ uschlikon, Switzerland',
  'metadata': {'filename': '2408.09869v5.pdf',
   'page_numbers': [1],
   'title': 'Version 1.0'}},
 {'text': 'This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models.',
  'metadata': {'filename': '2408.09869v5.pdf',
   'page_numbers': [1],
   'title': 'Abstract'}

### Add the chunks to the table (automatically embeds the text)

In [17]:
# Insert the records; the `text` source field is embedded into `vector` on add.
table.add(data=processed_chunks)

AddResult(version=2)

### Load the table

In [19]:
# Show only the first rows; rendering the whole table is unnecessary for a check.
table.to_pandas().head(10)

Unnamed: 0,text,vector,metadata
0,Christoph Auer Maksym Lysak Ahmed Nassar Miche...,"[-0.6758256, -1.1487217, 0.9755337, -0.0216534...","{'filename': '2408.09869v5.pdf', 'page_numbers..."
1,"This technical report introduces Docling , an ...","[0.33233875, -1.03356, 0.2582115, -0.7198179, ...","{'filename': '2408.09869v5.pdf', 'page_numbers..."
2,Converting PDF documents back into a machine-p...,"[0.1541398, -1.1874291, 0.32860306, -0.7412702...","{'filename': '2408.09869v5.pdf', 'page_numbers..."
3,"To use Docling, you can simply install the doc...","[-0.009093603, -0.9743467, -0.06449735, -0.457...","{'filename': '2408.09869v5.pdf', 'page_numbers..."
4,Docling implements a linear pipeline of operat...,"[0.32482517, -1.6535829, 0.22764255, -0.704372...","{'filename': '2408.09869v5.pdf', 'page_numbers..."
5,Two basic requirements to process PDF document...,"[0.0025835217, -1.0926759, 0.1679451, -0.21024...","{'filename': '2408.09869v5.pdf', 'page_numbers..."
6,"As part of Docling, we initially release two h...","[0.27916622, -1.1041205, 0.1643831, -0.6611637...","{'filename': '2408.09869v5.pdf', 'page_numbers..."
7,Our layout analysis model is an object-detecto...,"[0.24132873, -1.7730395, 0.23474853, -0.698244...","{'filename': '2408.09869v5.pdf', 'page_numbers..."
8,"The TableFormer model [12], first published in...","[0.05610736, -0.89408, 0.06974798, -0.50746804...","{'filename': '2408.09869v5.pdf', 'page_numbers..."
9,"Docling provides optional support for OCR, for...","[0.24128692, -1.195347, 0.6988983, -0.7061388,...","{'filename': '2408.09869v5.pdf', 'page_numbers..."
