In [None]:
! pip install -r requirements.txt

In [2]:
import os
from dotenv import load_dotenv

def _get_env_from_colab_or_os(key):
    """Look up ``key`` in the process environment.

    Args:
        key: Name of the environment variable to read.

    Returns:
        The variable's value, or ``None`` when it is not set.

    Note:
        Despite the function name, only the OS environment is consulted
        here; there is no Colab-specific lookup in this implementation.
    """
    return os.environ.get(key)

# Load variables from a local .env file (if any) before reading them below.
load_dotenv()

# Read through the helper defined above; the value is None when the variable
# is not set. Presumably the Hugging Face access token -- TODO confirm usage.
HF_TOKEN = _get_env_from_colab_or_os(key="HF_TOKEN")

In [None]:
from docling.document_converter import DocumentConverter

# --------------------------------------------------------------
# Basic document extraction
# --------------------------------------------------------------
# Path of the document to convert. The Markdown export is used here; the
# original PDF ("Constitución 1978.pdf") can be substituted instead.
SOURCE_PATH = "Constitucion_1978.md"
# Number of characters of the Markdown export to show as a sanity check.
PREVIEW_CHARS = 2_000

converter = DocumentConverter()
result = converter.convert(SOURCE_PATH)

document = result.document
markdown_output = document.export_to_markdown()
json_output = document.export_to_dict()

# Show only a preview: printing the whole document floods the notebook output.
print(markdown_output[:PREVIEW_CHARS])

### CHUNKING (Docling-Hybrid)

In [None]:
# This is to use the hybrid chunking method from docling
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
from transformers import AutoTokenizer

from docling.chunking import HybridChunker

EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
MAX_TOKENS = 128

# Wrap the Hugging Face tokenizer for docling, passing MAX_TOKENS as its
# max_tokens setting.
# NOTE(review): this tokenizer comes from EMBED_MODEL_ID, while the embeddings
# created later in the notebook use "text-embedding-3-small" -- confirm the
# mismatch is intended.
hf_tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)
tokenizer = HuggingFaceTokenizer(tokenizer=hf_tokenizer, max_tokens=MAX_TOKENS)

chunker = HybridChunker(tokenizer=tokenizer, merge_peers=True)

# Materialise the chunk iterator so the chunks can be reused by later cells.
chunk_iter = chunker.chunk(dl_doc=result.document)
chunks = list(chunk_iter)
len(chunks)

In [None]:
# Replace each chunk's metadata object with a small dict holding only the
# headings and the source filename.
for chunk in chunks:
    # Re-running this cell must be a no-op: once the metadata has been
    # replaced by a dict it no longer has `.headings` / `.origin` attributes.
    if isinstance(chunk.meta, dict):
        continue

    # `origin` may be absent on the metadata; fall back to None instead of
    # raising AttributeError.
    origin = getattr(chunk.meta, "origin", None)
    simplified_meta = {
        # Headings can be None for chunks that sit under no heading;
        # normalise to a list so later `', '.join(...)` calls do not fail.
        "headings": list(chunk.meta.headings or []),
        "filename": getattr(origin, "filename", None),
    }
    chunk.meta = simplified_meta

In [None]:
# Print every chunk's text and metadata, separated by a divider line
divider = "-" * 100
for i, chunk in enumerate(chunks):
    print(f"Chunk {i}: {chunk.text}", f"Metadata: {chunk.meta}", divider, sep="\n")

In [None]:
# Prepend the metadata to the text so that it is embedded as well
from langchain.schema import Document

# One Document per chunk. The headings and filename are written into the
# text itself so they become part of what gets embedded; the metadata dict is
# also attached unchanged.
docs = [
    Document(
        page_content=(
            # `or []` guards against chunks whose headings value is None,
            # which would otherwise make str.join raise TypeError.
            f"Headings: {', '.join(chunk.meta['headings'] or [])}\n"
            f"Filename: {chunk.meta['filename']}\n\n"
            f"Content: {chunk.text}"
        ),
        metadata=chunk.meta,
    )
    for chunk in chunks
]

In [None]:
from langchain_community.vectorstores import SupabaseVectorStore
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings
from supabase import create_client
from dotenv import load_dotenv
load_dotenv()

TABLE_NAME = "documents"

# Credentials are read from the environment (after loading .env) instead of
# relying on names left over from an earlier kernel session.
# NOTE(review): the environment variable names below are assumed to match the
# Python names; SUPABASE_SERVICE_KEY is accepted as an alternative for the key.
SUPABASE_URL = _get_env_from_colab_or_os("SUPABASE_URL")
SUPABASE_KEY = (
    _get_env_from_colab_or_os("SUPABASE_KEY")
    or _get_env_from_colab_or_os("SUPABASE_SERVICE_KEY")
)
OPENAI_API_KEY = _get_env_from_colab_or_os("OPENAI_API_KEY")

# Fail early with a readable message rather than deep inside a client call.
_missing_settings = [
    name
    for name, value in (
        ("SUPABASE_URL", SUPABASE_URL),
        ("SUPABASE_KEY", SUPABASE_KEY),
        ("OPENAI_API_KEY", OPENAI_API_KEY),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

supabase_client = create_client(SUPABASE_URL, SUPABASE_KEY)

# Create embedding model
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    api_key=OPENAI_API_KEY,
)

### Adding the chunks to Supabase

In [None]:
def _chunk_content_and_metadata(chunk):
    """Build the text to embed and the metadata dict for one chunk.

    Accepts chunks whose ``.meta`` is already a dict with ``headings`` and
    ``filename`` keys, as well as chunks whose ``.meta`` is an object exposing
    ``.headings`` and ``.origin.filename``.

    Returns:
        A ``(page_content, metadata)`` tuple.
    """
    meta = chunk.meta
    if isinstance(meta, dict):
        headings = meta.get("headings")
        filename = meta.get("filename")
        metadata = meta
    else:
        headings = getattr(meta, "headings", None)
        filename = getattr(getattr(meta, "origin", None), "filename", None)
        metadata = {"headings": list(headings or []), "filename": filename}

    # `headings or []` keeps str.join from raising when headings is None.
    page_content = (
        f"Headings: {', '.join(headings or [])}\n"
        f"Filename: {filename}\n\n"
        f"Content: {chunk.text}"
    )
    return page_content, metadata


def add_documents(chunks=None, texts=None, embeddings=None, supabase_client=None, table_name=None):
    """
    Add documents to Supabase vector store.

    Args:
        chunks: List of chunk objects with .text and .meta attributes (for complex documents)
        texts: List of strings (for simple text documents); a single string is
            treated as a one-element list
        embeddings: Embedding model instance
        supabase_client: Initialized Supabase client
        table_name: Name of the table in Supabase

    Raises:
        ValueError: If both or neither of ``chunks`` / ``texts`` are given, or
            if ``embeddings``, ``supabase_client`` or ``table_name`` is missing.

    Note: Provide either chunks OR texts, not both.
    """
    if chunks is not None and texts is not None:
        raise ValueError("Provide either 'chunks' or 'texts', not both")

    if chunks is None and texts is None:
        raise ValueError("Must provide either 'chunks' or 'texts'")

    # Validate the backend arguments up front so a forgotten argument gives a
    # clear error instead of failing somewhere inside the vector store.
    missing = [
        name
        for name, value in (
            ("embeddings", embeddings),
            ("supabase_client", supabase_client),
            ("table_name", table_name),
        )
        if value is None
    ]
    if missing:
        raise ValueError(f"Missing required argument(s): {', '.join(missing)}")

    # Create documents from chunks (complex documents with metadata)
    if chunks is not None:
        docs = []
        for chunk in chunks:
            page_content, metadata = _chunk_content_and_metadata(chunk)
            docs.append(Document(page_content=page_content, metadata=metadata))

    # Create documents from simple texts
    else:
        # A bare string would otherwise be iterated character by character.
        if isinstance(texts, str):
            texts = [texts]
        docs = [
            Document(page_content=text, metadata={"content_type": "text"})
            for text in texts
        ]

    # Nothing to upload: skip the embedding / insert round trip entirely.
    if not docs:
        print("No documents to add to Supabase")
        return

    # Push to Supabase
    SupabaseVectorStore.from_documents(
        documents=docs,
        embedding=embeddings,
        client=supabase_client,
        table_name=table_name,
        query_name="match_documents",
    )

    print(f"Successfully added {len(docs)} documents to Supabase")


In [None]:
# Upload the docling chunks using the client, embedding model and table name
# configured in the earlier cells. (Calling add_documents() with no arguments
# always raises ValueError.)
add_documents(
    chunks=chunks,
    embeddings=embeddings,
    supabase_client=supabase_client,
    table_name=TABLE_NAME,
)