### imports

In [None]:
from pathlib import Path
from docling.document_converter import DocumentConverter
from langchain_core.documents import Document
from docling.chunking import HybridChunker
from transformers import AutoTokenizer

### Process all documents from a directory and collect the resulting chunks in a list of LangChain documents

In [None]:

def _file_to_langchain_documents(file_path, converter, chunker, index_offset):
    """Convert one file into LangChain Documents, one per Docling chunk.

    Args:
        file_path: ``pathlib.Path`` of the file to convert.
        converter: Docling ``DocumentConverter`` shared across files.
        chunker: ``HybridChunker`` shared across files.
        index_offset: Number of chunks already produced by earlier files;
            added to the per-document index to form the global ``chunk_index``.

    Returns:
        List of LangChain Document objects for this file only.

    Raises:
        Whatever the converter or chunker raises; the caller decides how to
        report the failure.
    """
    print("   Converting document...")
    doc = converter.convert(str(file_path)).document

    print("   Generating chunks...")
    chunks = list(chunker.chunk(dl_doc=doc))

    print(f"   Creating {len(chunks)} LangChain Document objects...")
    return [
        Document(
            # contextualize() is used so headings/metadata stay in the chunk text
            page_content=chunker.contextualize(chunk=chunk),
            metadata={
                "source": str(file_path),
                "source_name": file_path.name,
                "chunk_index": index_offset + i,
                "document_chunk_index": i,
                "total_chunks_in_document": len(chunks),
            },
        )
        for i, chunk in enumerate(chunks)
    ]


def process_documents_to_langchain(documents_dir: str, max_tokens: int = 512) -> list:
    """Process multiple documents and return a list of LangChain Document objects.

    Docling automatically handles all supported file formats (.pdf, .md, .docx, .html, .txt, etc.)

    Every regular file directly inside ``documents_dir`` is converted, chunked
    and wrapped into LangChain Documents. A file that fails is reported and
    skipped; none of its chunks are added to the result, so ``chunk_index``
    values stay unique and contiguous across the returned list.

    Args:
        documents_dir: Directory containing documents to process
        max_tokens: Maximum tokens per chunk

    Returns:
        List of LangChain Document objects with page_content and metadata.
        Empty list when the directory contains no files.

    Raises:
        FileNotFoundError / NotADirectoryError: if ``documents_dir`` cannot be
            listed (raised by ``Path.iterdir``).
    """

    print("=" * 60)
    print("BATCH HYBRID CHUNKING - TO LANGCHAIN DOCUMENTS")
    print("=" * 60)

    # Get all files from directory (excluding directories), sorted for consistent ordering
    documents_path = Path(documents_dir)
    all_files = sorted(f for f in documents_path.iterdir() if f.is_file())

    if not all_files:
        print(f"\n✗ No files found in {documents_dir}")
        return []

    print(f"\nFound {len(all_files)} documents to process")
    print(f"Max tokens per chunk: {max_tokens}\n")

    # Initialize tokenizer once (reuse for all documents)
    print("Initializing tokenizer...")
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Create chunker once (reuse for all documents)
    chunker = HybridChunker(
        tokenizer=tokenizer,
        max_tokens=max_tokens,
        merge_peers=True,
    )

    # Create the converter once as well: it does not depend on the file, so
    # building a new one per document only repeats its setup work.
    converter = DocumentConverter()

    langchain_documents = []
    successful_docs = 0
    failed_docs = []

    # Process each document
    for file_path in all_files:
        print(f"\n📄 Processing: {file_path.name}")
        try:
            # Build this file's documents separately so a mid-file failure
            # cannot leave a partial set of chunks in the result.
            file_documents = _file_to_langchain_documents(
                file_path,
                converter,
                chunker,
                index_offset=len(langchain_documents),
            )
        except Exception as e:
            # Best-effort batch: report the failure and continue with the next file.
            print(f"   ✗ Error processing {file_path.name}: {e}")
            failed_docs.append(file_path.name)
            continue

        langchain_documents.extend(file_documents)
        successful_docs += 1
        print(f"   ✓ Success! Total chunks so far: {len(langchain_documents)}")

    # Final summary
    print("\n" + "=" * 60)
    print("PROCESSING COMPLETE")
    print("=" * 60)
    print(f"✓ Successfully processed: {successful_docs}/{len(all_files)} documents")
    print(f"✓ Total LangChain Documents created: {len(langchain_documents)}")

    if failed_docs:
        print(f"\n✗ Failed documents ({len(failed_docs)}):")
        for failed_name in failed_docs:
            print(f"   - {failed_name}")

    print("\n" + "=" * 60)
    print("LANGCHAIN DOCUMENTS READY")
    print("=" * 60)
    print("✓ Each chunk is a LangChain Document object")
    print("✓ page_content: Contextualized chunk text with headings")
    print("✓ metadata: source, source_name, chunk_index, etc.")
    print("✓ Ready for vector store ingestion (Chroma, FAISS, Pinecone, etc.)")

    return langchain_documents

### usage

In [None]:
# Directory holding the raw source files (relative path).
raw_documents_dir = "../documents/raw"

# Chunk every file in that directory with the default token limit;
# later cells ingest `all_chunks` into the vector stores.
all_chunks = process_documents_to_langchain(raw_documents_dir)

### Vector storage -> Postgres/pgvector

In [None]:
# import basics
import os
from dotenv import load_dotenv

from langchain_postgres import PGVector
from langchain_openai import OpenAIEmbeddings

# Load environment variables from .env file
load_dotenv()

# Imported here so this cell stays self-contained.
from urllib.parse import quote_plus

# PostgreSQL configuration, read from the environment / .env file
POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_DB = os.getenv("POSTGRES_DB")
POSTGRES_PORT = os.getenv("POSTGRES_PORT")
# Host defaults to the previously hard-coded "localhost"; override with POSTGRES_HOST.
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "localhost")

# Fail early with a clear message instead of building a URL that contains "None".
_missing_settings = [
    name
    for name, value in (
        ("POSTGRES_USER", POSTGRES_USER),
        ("POSTGRES_PASSWORD", POSTGRES_PASSWORD),
        ("POSTGRES_DB", POSTGRES_DB),
        ("POSTGRES_PORT", POSTGRES_PORT),
    )
    if value is None
]
if _missing_settings:
    raise RuntimeError(
        "Missing PostgreSQL settings in the environment: " + ", ".join(_missing_settings)
    )

# initiate embeddings model
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Connection string.
# User and password are URL-escaped so characters such as "@", ":" or "/" in a
# password cannot be mistaken for URL delimiters.
CONNECTION_STRING = (
    f"postgresql+psycopg://{quote_plus(POSTGRES_USER)}:{quote_plus(POSTGRES_PASSWORD)}"
    f"@{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}"
)

# Initialize vector store
vectorstore = PGVector(
    connection=CONNECTION_STRING,
    embeddings=embeddings,
    # Logical collection name used to group these documents.
    # NOTE(review): in langchain-postgres this is a label, not a table name - confirm against its docs.
    collection_name="my_documents",
    use_jsonb=True,
)

### Ingestion

In [None]:
# Add documents.
# Each chunk gets a deterministic id built from its source path and its index
# within that document, so re-running this cell reuses the same ids instead of
# inserting the same chunks again under fresh random ids.
# NOTE(review): relies on PGVector upserting rows whose id already exists - confirm
# against the installed langchain-postgres version.
chunk_ids = [
    f"{chunk.metadata['source']}:{chunk.metadata['document_chunk_index']}"
    for chunk in all_chunks
]

if all_chunks:
    vectorstore.add_documents(all_chunks, ids=chunk_ids)
else:
    # Nothing was produced by the chunking step; skip the embedding/insert call.
    print("No chunks to ingest - skipping vector store insert.")

### Querying it

In [None]:
# Example question to run against the pgvector store.
query = "What is the Q1 2025 revenue target?"

# Fetch the five most similar chunks.
results = vectorstore.similarity_search(query, k=5)

# Print each hit (text followed by its metadata) under a separator rule.
separator = "=" * 60
print("Retrieved Document:")
for hit in results:
    print(separator)
    print(f"* {hit.page_content} [{hit.metadata}]")

In [None]:
def retrieve_context(query: str, k: int = 5):
    """Retrieve information to help answer a query.

    Runs a similarity search against the module-level ``vectorstore`` and
    formats the hits as a single text block.

    Args:
        query: Question or search text to look up.
        k: Number of chunks to retrieve. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        A ``(serialized, retrieved_docs)`` tuple: ``serialized`` is one string
        with a "Source: <metadata>" / "Content: <text>" entry per hit, entries
        separated by a blank line; ``retrieved_docs`` is the list of documents
        returned by the search.
    """
    retrieved_docs = vectorstore.similarity_search(query, k=k)
    serialized = "\n\n".join(
        f"Source: {doc.metadata}\nContent: {doc.page_content}"
        for doc in retrieved_docs
    )
    return serialized, retrieved_docs

In [None]:
# Same example question as above, this time through the retrieval helper.
query = "What is the Q1 2025 revenue target?"

# The helper returns the formatted context string and the raw documents.
serialized, retrieved_docs = retrieve_context(query=query)

In [None]:
# Show the formatted context string returned by retrieve_context.
print(serialized)

### Vector store -> Supabase

In [None]:
# import basics
import os
from dotenv import load_dotenv

# import langchain
from langchain_community.vectorstores import SupabaseVectorStore
from langchain_openai import OpenAIEmbeddings

# import supabase
from supabase.client import Client, create_client

# Read settings from the local .env file into the process environment.
load_dotenv()

# Supabase client, configured from SUPABASE_URL / SUPABASE_SERVICE_KEY.
supabase_url = os.getenv("SUPABASE_URL")
supabase_key = os.getenv("SUPABASE_SERVICE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

# Embedding model used to vectorize the chunks.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Options passed through to the Supabase vector store.
supabase_store_options = {
    "client": supabase,
    "table_name": "documents",
    "query_name": "match_documents",
    "chunk_size": 1000,
}

# Embed all chunks and store them in the vector store.
vector_store = SupabaseVectorStore.from_documents(
    all_chunks,
    embeddings,
    **supabase_store_options,
)

### Querying supabase

In [None]:
import os
from supabase import Client, create_client
from langchain_openai import OpenAIEmbeddings

# Embedding model used to vectorize incoming queries.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Supabase client, configured from SUPABASE_URL / SUPABASE_SERVICE_KEY.
supabase_url = os.getenv("SUPABASE_URL")
supabase_key = os.getenv("SUPABASE_SERVICE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

def query_vector_store(
    query: str,
    top_k: int = 5,
    match_threshold: float = 0.0,
    metadata_filter=None,
):
    """Query the Supabase vector store and print the top_k similar documents.

    The query is embedded with the module-level ``embeddings`` model and sent
    to the ``match_documents`` RPC through the module-level ``supabase``
    client. Each returned row is printed; nothing is returned.

    NOTE(review): assumes the ``match_documents`` SQL function accepts
    ``query_embedding``, ``match_count``, ``match_threshold`` and ``filter``,
    and returns rows with ``id``, ``similarity``, ``metadata`` and ``content``
    - confirm against the function defined in the database.

    Args:
        query: The input query string
        top_k: Number of top similar documents to retrieve
        match_threshold: Value sent as ``match_threshold``. Defaults to 0.0,
            the value that was previously hard-coded.
        metadata_filter: Optional dict sent as the jsonb ``filter`` argument.
            ``None`` (the default) sends an empty filter, as before.
    """
    # 1. Embed the query
    query_embedding = embeddings.embed_query(query)

    # 2. Query the Supabase vector store
    rpc_params = {
        "query_embedding": query_embedding,
        "match_count": top_k,
        "match_threshold": match_threshold,
        "filter": metadata_filter if metadata_filter is not None else {},
    }
    resp = supabase.rpc("match_documents", rpc_params).execute()

    # 3. Print the rows returned by the function; treat a missing payload as no matches
    matches = resp.data or []
    for m in matches:
        # similarity is included in the returned row (see function)
        print("=" * 60)
        print(f"Document ID: {m['id']}\nSimilarity: {m['similarity']}\nMetadata: {m['metadata']}\nContent: {m['content']}")

In [None]:
# Alternative example questions (uncomment one to try it):
# query = "What is the Q1 2025 revenue target?"
# query = "When was NeuralFlow AI founded"
query = "What ROI did GlobalFinance achieve?"

# Print the best matches from the Supabase vector store using the default top_k.
query_vector_store(query=query)