In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import logging
from docling.datamodel.pipeline_options_vlm_model import ResponseFormat
from pathlib import Path

from archaeo_super_prompt.dataset.load import MagohDataset
from archaeo_super_prompt.pdf_to_text.stream_ocr import process_documents, converter, ollama_vlm_options

# 🦆📃 PDF complete ingestion with Docling preprocessing

We try out the young Docling framework and its use of VLLM from the HuggingFace repositories to achieve these tasks:

- document OCR with Italian language analysis (VLLM)
- document chunking with these features:
    - layout-aware
    - smart tokenization

All of this is made possible by incorporating several open ML models into the Docling pipeline.

In [None]:
# Parameters handed to the dataset loader below.
MAX_SAMPLES_FETCHED = 300
SEED = 0.5

# NOTE(review): the meaning of the third positional argument (True) is not
# visible from this notebook - confirm against MagohDataset's signature.
dataset = MagohDataset(MAX_SAMPLES_FETCHED, SEED, True)

# Records processed in this notebook; the list is de-duplicated into a set
# before asking the dataset for the matching files.
_selected_ids = [35983, 31298]
selected_ids = {*_selected_ids}
inputs = dataset.get_files_for_batch(selected_ids)

In [None]:
# Display the "filepath" entries of the selected batch as a plain list.
inputs["filepath"].tolist()

In [None]:
def main(filepaths=None, timeout_per_page=60 * 3):
    """Convert a batch of PDF files with the Docling pipeline backed by a VLM.

    The VLM options target the ``granite3.2-vision:latest`` model through
    ``ollama_vlm_options`` and request a Markdown response.

    Args:
        filepaths: Iterable of paths (``str`` or ``Path``) of the files to
            convert. When ``None`` (the default), the paths are taken from
            the ``filepath`` column of the notebook-level ``inputs`` table,
            which is what this function always did before the parameter
            existed.
        timeout_per_page: Value passed both as ``allowed_timeout`` of the VLM
            options and as the last argument of ``process_documents``.
            Defaults to ``60 * 3`` (presumably seconds, i.e. three minutes
            per page - confirm against ``process_documents``).

    Returns:
        The value returned by ``process_documents`` for the whole batch.
    """
    logging.basicConfig(level=logging.INFO)

    # Fall back to the batch selected earlier in the notebook so that a bare
    # ``main()`` call keeps working.
    if filepaths is None:
        filepaths = inputs["filepath"].tolist()
    pdf_paths = [Path(p) for p in filepaths]

    # Example using the Granite Vision model with Ollama:
    vlm_options = ollama_vlm_options(
        model="granite3.2-vision:latest",
        prompt="OCR the full page for markdown-based processing.",
        # DocTags is only supported by Docling's vllm for now, so Markdown is
        # requested instead.
        response_format=ResponseFormat.MARKDOWN,
        allowed_timeout=timeout_per_page,
    )
    doc_converter = converter(vlm_options)
    return process_documents(pdf_paths, doc_converter, timeout_per_page)

In [None]:
# Run the conversion and keep its return value for the inspection cells.
result = main()

## Inspect the results

We export the results to Markdown in order to display them.

In [None]:
from IPython.display import Markdown
# Render the Markdown export of the first conversion result in the cell output.
Markdown(result[0].document.export_to_markdown())

In [None]:
from docling.datamodel.base_models import ConversionStatus


def is_successful(conv_res):
    """Tell whether a conversion result counts as a success.

    Returns ``True`` when ``conv_res.status`` equals
    ``ConversionStatus.SUCCESS`` or ``ConversionStatus.PARTIAL_SUCCESS``,
    and ``False`` for any other status.
    """
    accepted_statuses = (
        ConversionStatus.SUCCESS,
        ConversionStatus.PARTIAL_SUCCESS,
    )
    return conv_res.status in accepted_statuses


def filename(conv_res):
    """Return the ``stem`` of the input file attached to a conversion result."""
    source_file = conv_res.input.file
    return source_file.stem

# One summary entry per conversion result: success flag and input file stem.
[{"good": is_successful(r), "name": filename(r)} for r in result]

In [None]:
from docling.chunking import HierarchicalChunker

# Chunk the document of the second conversion result with Docling's
# HierarchicalChunker (default settings).
chunker = HierarchicalChunker()
chunk_iter = chunker.chunk(dl_doc=result[1].document)
# Materialize the iterator so the chunks can be enumerated in the next cell.
chunks = list(chunk_iter)

In [None]:
def _approx_token_count(text):
    """Estimate the token count of ``text`` as its number of whitespace-separated words.

    This is a cheap stand-in for a real tokenizer (e.g. a
    ``tokenizer.count_tokens(text)`` call).
    """
    # ``split()`` without a separator splits on any run of whitespace
    # (spaces, tabs, newlines) and never yields empty strings. Unlike
    # ``split(" ")``, empty text therefore counts as 0, repeated spaces do
    # not inflate the count, and words separated by newlines are counted
    # individually.
    return len(text.split())


# Print every chunk twice: its raw text, then the contextualized text
# produced by the chunker, each with its approximate token count.
for i, chunk in enumerate(chunks):
    print(f"=== {i} ===")
    txt_tokens = _approx_token_count(chunk.text)
    print(f"chunk.text ({txt_tokens} tokens):\n{chunk.text!r}")

    ser_txt = chunker.contextualize(chunk=chunk)
    ser_tokens = _approx_token_count(ser_txt)
    print(f"chunker.contextualize(chunk) ({ser_tokens} tokens):\n{ser_txt!r}")

    print()

In [None]:
from IPython.display import Markdown
# Render the Markdown export of the second conversion result (the document
# that was chunked above) in the cell output.
Markdown(result[1].document.export_to_markdown())