In [8]:
from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter
from transformers import AutoTokenizer

In [3]:
# Convert the source PDF with Docling; `result` is consumed by the cells below.
# NOTE: the path is absolute and specific to one machine.
pdf_path = "/Users/alessandro/Development/generalRAG/data/original/Fact-Sheets.pdf"
converter = DocumentConverter()
result = converter.convert(pdf_path)

In [None]:
# Inspect the `pictures` attribute of the converted document (shown as the cell output).
result.document.pictures

In [10]:
# Load the tokenizer, then build the HybridChunker that uses it
# (chunks capped at 400 tokens, peer merging enabled).
tokenizer_model_id = "Alibaba-NLP/gte-Qwen2-7B-instruct"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_id)
chunker = HybridChunker(
    tokenizer=tokenizer,
    max_tokens=400,
    merge_peers=True,
)


In [11]:
# Chunk the converted document and materialize the result as a list
# so later cells can iterate over it more than once.
chunk_iterable = chunker.chunk(dl_doc=result.document)
document_chunks = list(chunk_iterable)

In [19]:
def _normalize_filename(filename):
    """Return ``filename`` as a list of strings, or None if it is neither a str nor a list of str."""
    if isinstance(filename, str):
        return [filename]
    if isinstance(filename, list) and all(isinstance(f, str) for f in filename):
        return filename
    return None


def _collect_provenance(meta):
    """Flatten ``item.prov`` over ``meta.doc_items``; None if the structure is missing or malformed."""
    try:
        return [prov for item in meta.doc_items for prov in item.prov]
    except Exception:
        # Best-effort: a missing/None ``doc_items`` or ``prov`` means "no provenance".
        return None


def _extract_pages(provs):
    """Return the sorted, de-duplicated ``page_no`` values of ``provs``, or None if there are none."""
    if provs is None:
        return None
    try:
        page_numbers = {prov.page_no for prov in provs if hasattr(prov, "page_no")}
        return sorted(page_numbers) or None
    except Exception:
        # e.g. unhashable or mutually unorderable page numbers
        return None


def _extract_geometry(provs):
    """Return ``(bounding_boxes, charspans)`` built from ``provs``; each is a list of dicts or None."""
    if provs is None:
        return None, None

    bboxes = []
    charspans = []
    try:
        for prov in provs:
            bbox = getattr(prov, "bbox", None)
            charspan = getattr(prov, "charspan", None)
            if bbox:
                bboxes.append(
                    {
                        "left": bbox.l,
                        "top": bbox.t,
                        "right": bbox.r,
                        "bottom": bbox.b,
                        "origin": bbox.coord_origin.name,
                    }
                )
            if charspan:
                charspans.append({"start": charspan[0], "end": charspan[1]})
    except Exception:
        # A single malformed entry invalidates both lists rather than
        # returning partially filled results.
        return None, None

    return bboxes or None, charspans or None


def extract_chunk_metadata(chunk) -> dict:
    """Flatten a chunk into ``{"chunk": <text>, "metadata": {...}}``.

    Args:
        chunk: Object exposing ``text`` and ``meta``. The following are read
            from ``meta`` when present: ``headings``, ``origin`` (with
            ``filename`` and ``mimetype``) and ``doc_items`` (each with a
            ``prov`` iterable whose entries may carry ``page_no``, ``bbox``
            and ``charspan``).

    Returns:
        A dict with the chunk text under ``"chunk"`` and, under ``"metadata"``,
        the keys ``headings``, ``filename`` (list of str), ``mimetype``,
        ``pages`` (sorted unique page numbers), ``bounding_boxes`` and
        ``charspans`` (lists of dicts). Any value that is absent or cannot be
        extracted is None.

    Raises:
        AttributeError: If ``chunk`` has no ``meta`` or no ``text`` attribute.
    """
    meta = chunk.meta
    # Resolve origin once with a default so a meta without ``origin`` (or with
    # ``origin`` set to None) yields None for filename *and* mimetype instead
    # of raising on the mimetype lookup.
    origin = getattr(meta, "origin", None)

    metadata = {
        "headings": getattr(meta, "headings", None),
        "filename": _normalize_filename(getattr(origin, "filename", None)),
        "mimetype": getattr(origin, "mimetype", None),
    }

    # Walk the provenance entries once; pages and geometry are then extracted
    # independently so a malformed bbox does not discard the page numbers.
    provs = _collect_provenance(meta)
    metadata["pages"] = _extract_pages(provs)
    metadata["bounding_boxes"], metadata["charspans"] = _extract_geometry(provs)

    return {"chunk": chunk.text, "metadata": metadata}


In [20]:
# Apply the metadata extractor to every chunk, preserving chunk order.
all_extracted = list(map(extract_chunk_metadata, document_chunks))

In [21]:
# Show the extracted records as the cell output (the entire list is rendered).
all_extracted

[{'chunk': 'ENGIE Brasil Energia is an investment platform in energy infrastructure, active in the areas of generation, commercialization, trading and transmission as well as natural gas transportation, through the intermediary of Transportadora Associada de Gás - TAG, jointly with other partners. As the largest 100% renewable energy generator in the Brazilian private sector, implements and operates projects from renewable sources such as hydroelectric, wind farms, photovoltaic and biomass plants together with small hydroelectric plants. The Company operates with transparency, financial discipline, respect for the environment, support for communities and focus on operational efficiency as drivers of long-term growth.\nThe market cap, as of December 31, 2024, was R$ 29.0 billion, and the own installed capacity totaled 9,556 MW , which comprises a generating complex of 115 plants , of which 11 are hydroelectric power plants and 104 fired from complementary sources: two biomass-fired plan