# ColPali-BAML Quick Start Guide

This notebook demonstrates the ColPali-BAML vision processing pipeline for document analysis and structured data extraction.

In [None]:
import os
import sys
from pathlib import Path

# Add the package to Python path.
# Re-running this cell must not pile up duplicate entries, so any existing
# occurrence of the root is dropped before it is placed back at the front of
# the search path (front = highest import precedence, as before).
PACKAGE_ROOT = '/app'
sys.path[:] = [PACKAGE_ROOT] + [entry for entry in sys.path if entry != PACKAGE_ROOT]

# Import ColPali engine components
from tatforge.core.pipeline import DocumentProcessingPipeline
from tatforge.core.document_adapter import PDFAdapter
from tatforge.vision.colpali_client import ColPaliClient
from tatforge.storage.qdrant_client import QdrantClient

print("ColPali-BAML imports successful!")

## 1. Initialize the Processing Pipeline

Instantiate the main document processing pipeline together with its supporting components: the PDF adapter, the ColPali client, and the Qdrant client.

In [None]:
# Initialize pipeline components
# Every component is constructed with no arguments, so each one runs with
# whatever defaults its class defines.
# NOTE(review): the pipeline is not handed the adapter or the clients created
# below — presumably it builds or locates its own; confirm against
# DocumentProcessingPipeline before relying on these instances being shared.
pipeline = DocumentProcessingPipeline()
pdf_adapter = PDFAdapter()
colpali_client = ColPaliClient()
# QdrantClient here is the class imported from tatforge.storage.qdrant_client.
# NOTE(review): constructing it may try to reach a Qdrant server — TODO confirm.
qdrant_client = QdrantClient()

print("Pipeline components initialized!")

## 2. Process Sample Documents

Locate the sample PDFs in the `/app/pdfs/` directory and list the first few; these are the documents available for processing.

In [None]:
# List available test PDFs
MAX_LISTED = 5  # cap the preview so a large directory does not flood the output

pdf_dir = Path('/app/pdfs')
# glob() yields matches in arbitrary, filesystem-dependent order. Sorting makes
# the printed listing (and any later indexing into pdf_files) reproducible.
# A missing directory simply produces an empty list.
pdf_files = sorted(pdf_dir.glob('*.pdf'))

print(f"Found {len(pdf_files)} test PDFs:")
for position, pdf_file in enumerate(pdf_files[:MAX_LISTED], start=1):
    print(f"  {position}. {pdf_file.name}")

## 3. Test BAML Integration

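Verify that the BAML Python runtime (`baml_py`) is installed and importable, which is a prerequisite for using the BAML schema system and its extraction capabilities.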

In [None]:
# Test BAML integration
import importlib.metadata


def baml_version(module, dist_name="baml-py"):
    """Return a printable version for an imported module.

    Args:
        module: The imported module object to inspect.
        dist_name: Name of the installed distribution to look up when the
            module does not expose ``__version__``.

    Returns:
        ``module.__version__`` when the attribute exists; otherwise the
        version recorded in the installed distribution's metadata; otherwise
        the string ``"unknown"``.
    """
    version = getattr(module, "__version__", None)
    if version is not None:
        return version
    try:
        return importlib.metadata.version(dist_name)
    except importlib.metadata.PackageNotFoundError:
        return "unknown"


# Only the import itself is guarded: a missing package is reported instead of
# aborting the notebook, and a module without __version__ no longer raises an
# uncaught AttributeError.
try:
    import baml_py as baml
except ImportError as e:
    print(f"BAML import failed: {e}")
else:
    print(f"BAML version: {baml_version(baml)}")
    print("BAML integration successful!")