In [4]:
from __future__ import annotations

import argparse
import os
import tempfile
from pathlib import Path
from typing import List, Tuple

from docling.document_converter import DocumentConverter
from PyPDF2 import PdfMerger

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def convert_directory(
    directory: Path | str,
    theme: str,
    objectif: str,
    output_markdown: Path | str = "dossier_documentaire.md",
) -> None:
    """Parcourt *directory* et consolide les documents en un document Markdown
    consolidé.
    """
    EXTENSIONS = {
        "pdf",
        "docx", "xlsx", "pptx",
        "md",
        "adoc", "asciidoc",
        "html", "xhtml",
        "csv",
        "png", "jpeg", "jpg", "tiff", "bmp",
    }

    directory = Path(directory)

    doc_paths = sorted(
        p for p in directory.rglob("*")
        if p.is_file() and p.suffix.lstrip(".").lower() in EXTENSIONS
    )
    
    output_markdown = Path(output_markdown)

    if not directory.is_dir():
        raise NotADirectoryError(f"{directory} n'est pas un répertoire valide")

    doc_paths: List[Path] = sorted(directory.glob("*.pdf"))
    if not doc_paths:
        raise FileNotFoundError("Aucun fichier PDF trouvé dans le répertoire")

    converter = DocumentConverter()

    # Accumulate markdown blocks, index entries and temp‑PDFs to merge.
    markdown_blocks: List[str] = ["# Dossier documentaire\n", f"**Thème : {theme}**\n", f"**Objectif du dossier : {objectif}**\n" ]
    index_entries: List[str] = ["\n## Index des documents \n"]

    current_global_page = 1

    

    # First pass – convert each PDF, capture markdown pages and produce a temp PDF.
    all_docs_pages: List[Tuple[str, List[str]]] = []
    for doc_path in doc_paths:
        result = converter.convert(str(doc_path))

        pages_md: List[str] = [
            result.document.export_to_markdown(page_no=i)
            for i in range(len(result.document.pages)+1)
        ]
        all_docs_pages.append((doc_path.name, pages_md))

        # Index
        index_entries.append(f"p. {current_global_page} \t : \t {doc_path.name} \n")
        current_global_page += len(pages_md)

    # Add index to markdown.
    markdown_blocks.extend(index_entries)

    # Second pass – build content section.
    for file_name, pages_md in all_docs_pages:
        markdown_blocks.append("\n\n---\n\n")  # Saut de page dans le markdown
        markdown_blocks.append(f"*Début du document : {file_name}*\n")
        for page_number, page_md in enumerate(pages_md, start=0):
            markdown_blocks.append(f"*Début de la page {page_number} du doc : {file_name}*\n")
            markdown_blocks.append(page_md.strip())
            markdown_blocks.append(f"\n*Fin de la page {page_number} du doc : {file_name}**\n")
            markdown_blocks.append("\n\n---\n\n")  # Saut de page dans le markdown
        markdown_blocks.append(f"*Fin du document : {file_name}*\n")

    # Écriture du markdown consolidé.
    output_markdown.write_text("\n".join(markdown_blocks), encoding="utf-8")
    print(f"✅ Markdown écrit → {output_markdown.resolve()}")


In [31]:
convert_directory(directory="../data/", theme="Recueil juridique CNIL",objectif="Veille juridique", output_markdown="../data/Result.md")



✅ Markdown écrit → /Users/alih/Projets/markitab/data/Result.md


In [18]:
%pip install -qU docling transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [19]:
DOC_SOURCE = "../data/result.md"

doc = DocumentConverter().convert(source=DOC_SOURCE).document

In [20]:
from docling.chunking import HybridChunker

chunker = HybridChunker()
chunk_iter = chunker.chunk(dl_doc=doc)

Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors


In [21]:
for i, chunk in enumerate(chunk_iter):
    print(f"=== {i} ===")
    print(f"chunk.text:\n{f'{chunk.text[:300]}…'!r}")

    enriched_text = chunker.serialize(chunk=chunk)
    print(f"chunker.serialize(chunk):\n{f'{enriched_text[:300]}…'!r}")

    print()

=== 0 ===
chunk.text:
'Thème\xa0: Recueil juridique CNIL\nObjectif du dossier : Veille juridique…'
chunker.serialize(chunk):
'Dossier documentaire\nThème\xa0: Recueil juridique CNIL\nObjectif du dossier : Veille juridique…'

=== 1 ===
chunk.text:
'p. 1 \t : \t Délibération 03-008 du 27 février 2003 - Légifrance.pdf\np. 5 \t : \t Délibération 04-020 du 08 avril 2004 - Légifrance.pdf\np. 9 \t : \t Délibération 2004-100 du 09 décembre 2004 - Légifrance.pdf\np. 12 \t : \t Délibération SAN-2023-003 du 16 mars 2023 - Légifrance.pdf\np. 26 \t : \t L…'
chunker.serialize(chunk):
'Dossier documentaire\nIndex des documents\np. 1 \t : \t Délibération 03-008 du 27 février 2003 - Légifrance.pdf\np. 5 \t : \t Délibération 04-020 du 08 avril 2004 - Légifrance.pdf\np. 9 \t : \t Délibération 2004-100 du 09 décembre 2004 - Légifrance.pdf\np. 12 \t : \t Délibération SAN-2023-003 du 16 …'

=== 2 ===
chunk.text:
'Nature de la délibération : Avis Etat juridique : En vigueur\nDate d

  enriched_text = chunker.serialize(chunk=chunk)
