In [2]:
from langchain.chat_models import init_chat_model
from langchain_openai import OpenAIEmbeddings
from docling.document_converter import DocumentConverter
from pathlib import Path
from dotenv import load_dotenv
# Read key/value pairs from a .env file into the process environment
# (python-dotenv); the call returns True when at least one variable was set.
# NOTE(review): presumably this supplies the OpenAI credentials needed by the
# model/embedding clients created in the next cell — confirm.
load_dotenv()

True

In [3]:
# Chat model selected through LangChain's "provider:model" identifier string.
# NOTE(review): neither `model` nor `embeddings` is referenced by the
# document-conversion cells visible in this section — presumably used later.
model = init_chat_model("openai:gpt-4o-mini")
# Embedding client configured for OpenAI's text-embedding-3-small model.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [6]:
def process_document(
    file_path: str,
    converter: DocumentConverter,
    output_dir: str = "../documents/processed",
) -> dict:
    """Convert one document to Markdown, save it, and return a summary record.

    Args:
        file_path: Path of the document to convert.
        converter: Object exposing ``convert(path)`` whose result provides
            ``.document.export_to_markdown()`` (a Docling ``DocumentConverter``).
        output_dir: Directory that receives ``output_<stem>.md``. It is created
            (including parents) when missing. Defaults to the location the
            notebook has always written to.

    Returns:
        On success, a dict with keys ``file``, ``format``, ``status``
        (``'Success'``), ``markdown_length``, ``preview`` (first 200 characters
        with newlines replaced by spaces) and ``output_file``.
        On any exception, a dict with keys ``file``, ``format``, ``status``
        (``'Failed'``) and ``error`` (the exception message). Failures are
        reported rather than raised so a batch run can continue.
    """
    source = Path(file_path)

    try:
        print(f"\n📄 Processing: {source.name}")

        # Convert document
        result = converter.convert(file_path)

        # Export to markdown
        markdown = result.document.export_to_markdown()

        # Get document info
        doc_info = {
            'file': source.name,
            'format': source.suffix,
            'status': 'Success',
            'markdown_length': len(markdown),
            'preview': markdown[:200].replace('\n', ' ')
        }

        # Save output. Create the target directory first: without this, a fresh
        # checkout lacking the folder would turn every document into a 'Failed'
        # record at open().
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        # NOTE: the name is derived from the stem only, so inputs that differ
        # only by extension (e.g. notes.pdf / notes.docx) overwrite each other.
        # as_posix() keeps the default path string identical to the previous
        # hard-coded "../documents/processed/output_<stem>.md".
        output_file = (target_dir / f"output_{source.stem}.md").as_posix()
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(markdown)

        doc_info['output_file'] = output_file

        print("   ✓ Converted successfully")
        print(f"   ✓ Output: {output_file}")

        return doc_info

    except Exception as e:
        # Deliberately broad: one unreadable document must not abort the batch.
        print(f"   ✗ Error: {e}")
        return {
            'file': source.name,
            'format': source.suffix,
            'status': 'Failed',
            'error': str(e)
        }

In [10]:
def loader(raw_documents_dir: str) -> list:
    """Convert every file directly inside a directory and print a summary.

    Each regular file in ``raw_documents_dir`` (sub-directories are not
    descended into) is passed to ``process_document`` with one shared
    ``DocumentConverter``. A per-file report and a success count are printed.

    Args:
        raw_documents_dir: Directory containing the source documents.

    Returns:
        The list of per-document records produced by ``process_document``,
        in processing order, so callers can inspect or tabulate the outcome
        instead of re-parsing the printed summary.
    """
    banner = "=" * 60
    print(banner)
    print("Multi-Format Document Processing with Docling")
    print(banner)

    # List of documents to process. iterdir() yields entries in arbitrary,
    # filesystem-dependent order, so sort to make runs reproducible.
    directory = Path(raw_documents_dir)
    documents = sorted(str(entry) for entry in directory.iterdir() if entry.is_file())

    # Initialize converter once (reusable)
    converter = DocumentConverter()

    # Process all documents
    results = [process_document(doc_path, converter) for doc_path in documents]

    # Summary
    print("\n" + banner)
    print("CONVERSION SUMMARY")
    print(banner)

    for record in results:
        succeeded = record['status'] == 'Success'
        status_icon = "✓" if succeeded else "✗"
        print(f"{status_icon} {record['file']} ({record['format']})")
        if succeeded:
            print(f"   Length: {record['markdown_length']} chars")
            print(f"   Preview: {record['preview']}...")
        else:
            print(f"   Error: {record.get('error', 'Unknown')}")
        print()

    success_count = sum(1 for record in results if record['status'] == 'Success')
    print(f"Converted {success_count}/{len(results)} documents successfully")

    return results

In [11]:
# Convert every file in ../documents/raw; process_document writes each result
# as Markdown under ../documents/processed/.
# NOTE(review): the recorded run below took roughly 2.5 minutes on CPU
# (18:33:23 -> 18:35:58), so expect this cell to be slow on a full re-run.
loader("../documents/raw")

2025-10-29 18:33:23,707 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]


Multi-Format Document Processing with Docling

ðŸ“„ Processing: client-review-globalfinance.pdf


2025-10-29 18:33:26,575 - INFO - Going to convert document batch...
2025-10-29 18:33:26,578 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 4f2edc0f7d9bb60b38ebfecf9a2609f5
2025-10-29 18:33:26,630 - INFO - Loading plugin 'docling_defaults'
2025-10-29 18:33:26,659 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-10-29 18:33:26,689 - INFO - Loading plugin 'docling_defaults'
2025-10-29 18:33:26,834 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-10-29 18:33:27,218 - INFO - rapidocr cannot be used because onnxruntime is not installed.
2025-10-29 18:33:27,219 - INFO - easyocr cannot be used because it is not installed.
2025-10-29 18:33:30,359 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2025-10-29 18:33:30,393 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-10-29 18:33:30,565 [RapidOCR] download_file.py:68: Initiating download: https://www.modelscope.cn/models/RapidAI/Rapid

   âœ“ Converted successfully
   âœ“ Output: ../documents/processed/output_client-review-globalfinance.md

ðŸ“„ Processing: company-overview.md


2025-10-29 18:34:16,316 - INFO - Finished converting document company-overview.md in 0.31 sec.
2025-10-29 18:34:16,342 - INFO - detected formats: [<InputFormat.MD: 'md'>]
2025-10-29 18:34:16,343 - INFO - Going to convert document batch...
2025-10-29 18:34:16,344 - INFO - Processing document implementation-playbook.md


   âœ“ Converted successfully
   âœ“ Output: ../documents/processed/output_company-overview.md

ðŸ“„ Processing: implementation-playbook.md


2025-10-29 18:34:17,004 - INFO - Finished converting document implementation-playbook.md in 0.68 sec.
2025-10-29 18:34:17,047 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-29 18:34:17,050 - INFO - Going to convert document batch...
2025-10-29 18:34:17,053 - INFO - Processing document meeting-notes-2025-01-08.docx


   âœ“ Converted successfully
   âœ“ Output: ../documents/processed/output_implementation-playbook.md

ðŸ“„ Processing: meeting-notes-2025-01-08.docx


2025-10-29 18:34:46,669 - INFO - Finished converting document meeting-notes-2025-01-08.docx in 29.64 sec.
2025-10-29 18:34:46,688 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-29 18:34:46,691 - INFO - Going to convert document batch...
2025-10-29 18:34:46,692 - INFO - Processing document meeting-notes-2025-01-15.docx


   âœ“ Converted successfully
   âœ“ Output: ../documents/processed/output_meeting-notes-2025-01-08.md

ðŸ“„ Processing: meeting-notes-2025-01-15.docx


2025-10-29 18:35:06,139 - INFO - Finished converting document meeting-notes-2025-01-15.docx in 19.45 sec.
2025-10-29 18:35:06,168 - INFO - detected formats: [<InputFormat.MD: 'md'>]
2025-10-29 18:35:06,170 - INFO - Going to convert document batch...
2025-10-29 18:35:06,170 - INFO - Processing document mission-and-goals.md


   âœ“ Converted successfully
   âœ“ Output: ../documents/processed/output_meeting-notes-2025-01-15.md

ðŸ“„ Processing: mission-and-goals.md


2025-10-29 18:35:06,749 - INFO - Finished converting document mission-and-goals.md in 0.59 sec.
2025-10-29 18:35:06,785 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-29 18:35:06,787 - INFO - Going to convert document batch...
2025-10-29 18:35:06,788 - INFO - Processing document q4-2024-business-review.pdf


   âœ“ Converted successfully
   âœ“ Output: ../documents/processed/output_mission-and-goals.md

ðŸ“„ Processing: q4-2024-business-review.pdf


2025-10-29 18:35:28,160 - INFO - Finished converting document q4-2024-business-review.pdf in 21.39 sec.
2025-10-29 18:35:28,192 - INFO - detected formats: [<InputFormat.MD: 'md'>]
2025-10-29 18:35:28,193 - INFO - Going to convert document batch...
2025-10-29 18:35:28,194 - INFO - Processing document team-handbook.md


   âœ“ Converted successfully
   âœ“ Output: ../documents/processed/output_q4-2024-business-review.md

ðŸ“„ Processing: team-handbook.md


2025-10-29 18:35:28,703 - INFO - Finished converting document team-handbook.md in 0.53 sec.
2025-10-29 18:35:28,739 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-29 18:35:28,743 - INFO - Going to convert document batch...
2025-10-29 18:35:28,744 - INFO - Processing document technical-architecture-guide.pdf


   âœ“ Converted successfully
   âœ“ Output: ../documents/processed/output_team-handbook.md

ðŸ“„ Processing: technical-architecture-guide.pdf


2025-10-29 18:35:58,597 - INFO - Finished converting document technical-architecture-guide.pdf in 29.88 sec.


   âœ“ Converted successfully
   âœ“ Output: ../documents/processed/output_technical-architecture-guide.md

CONVERSION SUMMARY
âœ“ client-review-globalfinance.pdf (.pdf)
   Length: 9936 chars
   Preview: Meeng:  Quarterly Business Review  Client:  GlobalFinance Corp  Date:  January 12, 2025  Time:  11:00 AM - 12:30 PM EST  Locaon:  GlobalFinance HQ / Virtual (Teams)  Client Aendees:  Richard Marnez (V...

âœ“ company-overview.md (.md)
   Length: 3102 chars
   Preview: # NeuralFlow AI - Company Overview  ## About Us  NeuralFlow AI is a cutting-edge AI automation agency founded in 2023, specializing in intelligent workflow automation, natural language processing solu...

âœ“ implementation-playbook.md (.md)
   Length: 9672 chars
   Preview: # NeuralFlow AI - Client Implementation Playbook  ## Introduction  This playbook outlines our proven methodology for implementing AI automation solutions for clients. Following this framework ensures ...

âœ“ meeting-notes-2025-01-08.docx (.docx)
   L