In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import logging

from archaeo_super_prompt.dataset.load import MagohDataset
from archaeo_super_prompt.modeling.pdf_to_text import VLLM_Preprocessing

# 🦆📃 PDF complete ingestion with Docling preprocessing

We try the young Docling framework, and its use of VLLMs from the HuggingFace repositories, to achieve these tasks:

- document OCR with Italian language analysis (VLLM)
- document chunking with these features:
    - layout-aware
    - smart tokenization

All of this is made possible by incorporating several open ML models into the Docling pipeline.

In [None]:
# Sampling parameters handed to the dataset loader.
MAX_SAMPLES_FETCHED = 300
# NOTE(review): a fractional seed is unusual — confirm that MagohDataset
# really expects a float here.
SEED = 0.5

# Record ids whose files are requested for this notebook's batch.
_selected_ids = [33872, 35983, 31298]
selected_ids = {*_selected_ids}

# NOTE(review): the meaning of the third positional argument (True) is not
# visible in this notebook — check MagohDataset's signature.
dataset = MagohDataset(MAX_SAMPLES_FETCHED, SEED, True)
inputs = dataset.get_files_for_batch(selected_ids)

In [None]:
# Show the "filepath" column of the fetched batch as a plain Python list.
inputs["filepath"].tolist()

In [None]:
def main():
    """Run the VLLM preprocessing on the notebook-level ``inputs`` batch.

    Sets up INFO-level logging, builds a ``VLLM_Preprocessing`` transformer
    with the options below and applies it to ``inputs``.

    Returns:
        Whatever ``VLLM_Preprocessing.transform`` returns for ``inputs``.
    """
    logging.basicConfig(level=logging.INFO)

    # Three minutes if the value is in seconds — TODO confirm the unit
    # expected by ``allowed_timeout``.
    per_page_timeout = 3 * 60

    # Example using the Granite Vision model with Ollama.
    converter_options = {
        "model": "granite3.2-vision:latest",
        "prompt": "OCR this part of Italian document for markdown-based processing.",
        "embedding_model_hf_id": "nomic-ai/nomic-embed-text-v1.5",
        "max_chunk_size": 512,
        "allowed_timeout": per_page_timeout,
    }
    preprocessor = VLLM_Preprocessing(**converter_options)
    return preprocessor.transform(inputs)

In [None]:
# Run the preprocessing and keep its output in `result` for inspection.
result = main()

## Inspect the results

We inspect the table of contextualized chunks, along with the metadata that was kept for them.

In [None]:
# Bare expression: the notebook renders its value as the cell output.
result

## Notes

- the granite-vision model misses a lot of content for some samples (e.g. sample `31031`)
- it is better to set a maximum chunk size in the chunker; otherwise, depending on the tokenizer, it will merge all the chunks
- the granite-vision model seems to read the whole PDF as table elements and prints, for each extracted chunk, the coordinates of the text box. This might not be ideal for processing by an embedding model or an LLM, unless they ignore this information.