### imports

In [1]:
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from transformers import AutoTokenizer
from pathlib import Path

### Process all documents from a directory and save the resulting chunks to a .txt file

In [2]:
def append_chunks_to_file(chunks, chunker, output_path: str, document_name: str, chunk_offset: int = 0) -> int:
    """Append one document's chunks to a text file and return the next chunk number.

    The section written for a document consists of a ``#``-framed banner naming
    the source document, followed by one ``=``-framed header plus the
    contextualized text for every chunk.

    All text is rendered before the file is opened, so if ``chunker.contextualize``
    raises for any chunk nothing is written for this document. The caller can
    then skip the document without leaving a half-written section (and without
    chunk numbers being reused by the next document).

    Args:
        chunks: Iterable of chunks to append (a list or any other iterable,
            including a generator).
        chunker: Object exposing ``contextualize(chunk=...)`` that returns the
            text to write for a chunk (a HybridChunker in this notebook).
        output_path: Path to the output file; opened in append mode.
        document_name: Name of the source document (written as metadata).
        chunk_offset: Number assigned to the first chunk, so numbering can
            continue across documents.

    Returns:
        ``chunk_offset`` plus the number of chunks written, i.e. the offset to
        pass for the next document.
    """
    # Materialize once so generators work and can be counted afterwards.
    chunk_list = list(chunks)

    banner = '#' * 60
    rule = '=' * 60

    # Document separator.
    pieces = [
        f"\n{banner}\n",
        f"# SOURCE DOCUMENT: {document_name}\n",
        f"{banner}\n\n",
    ]

    for chunk_number, chunk in enumerate(chunk_list, start=chunk_offset):
        pieces.append(f"{rule}\n")
        pieces.append(f"CHUNK {chunk_number}\n")
        pieces.append(f"Source: {document_name}\n")
        pieces.append(f"{rule}\n")
        # contextualize() is used instead of the raw chunk text so the written
        # text carries whatever context the chunker attaches to the chunk.
        pieces.append(chunker.contextualize(chunk=chunk))
        pieces.append("\n\n")

    # Single write, only reached when every chunk rendered successfully.
    with open(output_path, 'a', encoding='utf-8') as f:
        f.write("".join(pieces))

    return chunk_offset + len(chunk_list)

def process_multiple_documents(documents_dir: str, output_file: str, max_tokens: int = 512):
    """Chunk every file in a directory and save all chunks to a single text file.

    Every regular file directly inside ``documents_dir`` (non-recursive) is
    handed to a Docling ``DocumentConverter``, chunked with one shared
    ``HybridChunker``, and appended to ``output_file`` through
    ``append_chunks_to_file``. No file-type filtering is done here; format
    handling is left to Docling. A document that raises during conversion,
    chunking, or writing is reported and skipped, and processing continues
    with the next one.

    Args:
        documents_dir: Directory containing the documents to process.
        output_file: Path of the single output file for all chunks. It is
            overwritten; missing parent directories are created. If it lives
            inside ``documents_dir`` it is not treated as an input document.
        max_tokens: Maximum tokens per chunk, passed to ``HybridChunker``.

    Returns:
        None. Progress and a final summary are printed to stdout. When the
        directory contains no input files the function returns before loading
        the tokenizer and without touching ``output_file``.
    """

    print("=" * 60)
    print("BATCH HYBRID CHUNKING - MULTIPLE DOCUMENTS")
    print("=" * 60)

    documents_path = Path(documents_dir)
    output_path = Path(output_file)
    resolved_output = output_path.resolve()

    # Regular files only, sorted for a consistent order. The output file is
    # excluded so that re-running with an output located inside the input
    # directory does not feed the previous run's chunks back in as a document.
    all_files = sorted(
        f for f in documents_path.iterdir()
        if f.is_file() and f.resolve() != resolved_output
    )

    if not all_files:
        print(f"\n✗ No files found in {documents_dir}")
        return

    print(f"\nFound {len(all_files)} documents to process")
    print(f"Output file: {output_file}")
    print(f"Max tokens per chunk: {max_tokens}\n")

    # Initialize tokenizer once (reuse for all documents)
    print("Initializing tokenizer...")
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Create chunker once (reuse for all documents)
    chunker = HybridChunker(
        tokenizer=tokenizer,
        max_tokens=max_tokens,
        merge_peers=True
    )

    # Create the converter once as well; it does not depend on the file being
    # processed, so there is no reason to rebuild it on every iteration.
    converter = DocumentConverter()

    # Start from a fresh output file, creating its directory if needed.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(f"{'#'*60}\n")
        f.write("# HYBRID CHUNKS - ALL DOCUMENTS\n")
        # NOTE(review): this records the current working directory, not a
        # timestamp, despite the "Generated" label - confirm that is intended.
        f.write(f"# Generated: {Path().absolute()}\n")
        f.write(f"# Total documents: {len(all_files)}\n")
        f.write(f"# Max tokens per chunk: {max_tokens}\n")
        f.write(f"{'#'*60}\n\n")

    total_chunks = 0
    successful_docs = 0
    failed_docs = []

    # Process each document
    for file_path in all_files:
        # Deliberately broad: one bad document must not abort the whole batch.
        # The failure is reported here and listed again in the final summary.
        try:
            print(f"\n📄 Processing: {file_path.name}")

            # Convert document
            print("   Converting document...")
            result = converter.convert(str(file_path))
            doc = result.document

            # Generate chunks
            print("   Generating chunks...")
            chunks = list(chunker.chunk(dl_doc=doc))

            # Append to output file; the return value is the running total,
            # which doubles as the offset for the next document's numbering.
            print(f"   Appending {len(chunks)} chunks to output file...")
            total_chunks = append_chunks_to_file(
                chunks=chunks,
                chunker=chunker,
                output_path=output_file,
                document_name=file_path.name,
                chunk_offset=total_chunks
            )

            successful_docs += 1
            print(f"   ✓ Success! Total chunks so far: {total_chunks}")

        except Exception as e:
            print(f"   ✗ Error processing {file_path.name}: {e}")
            failed_docs.append(file_path.name)

    # Final summary
    print("\n" + "=" * 60)
    print("PROCESSING COMPLETE")
    print("=" * 60)
    print(f"✓ Successfully processed: {successful_docs}/{len(all_files)} documents")
    print(f"✓ Total chunks generated: {total_chunks}")
    print(f"✓ Output file: {output_file}")

    if failed_docs:
        print(f"\n✗ Failed documents ({len(failed_docs)}):")
        for failed_name in failed_docs:
            print(f"   - {failed_name}")

    print("\n" + "=" * 60)
    print("KEY BENEFITS OF HYBRID CHUNKING")
    print("=" * 60)
    print("✓ Respects document structure (sections, paragraphs)")
    print("✓ Token-aware (fits embedding model limits)")
    print("✓ Semantic coherence (doesn't split mid-sentence)")
    print("✓ Metadata preservation (headings, document context)")
    print("✓ Ready for RAG (optimized chunk sizes)")
    print("✓ All chunks in one file (easy to process for embeddings)")

### usage

In [None]:
# Example run: chunk everything found in the raw-documents folder.
# Both paths are relative to the directory the notebook is executed from.
raw_documents_dir = "../documents/raw"
output_chunks_file = "../documents/processed/all_chunks.txt"

# Every file in the input directory is handed to Docling; all resulting
# chunks end up in the single output file configured above.
process_multiple_documents(raw_documents_dir, output_chunks_file, max_tokens=512)