In [25]:
import os
import subprocess
import tempfile
from pathlib import Path

from dotenv import load_dotenv
from langchain.schema.document import Document
from langchain_text_splitters import MarkdownHeaderTextSplitter
from openai import OpenAI
from supabase import create_client

# Populate os.environ from a .env file (if one is found) so that the
# SUPABASE_URL, SUPABASE_KEY and OPENAI_API_KEY lookups in load_document can succeed.
load_dotenv()


def load_document(filename: str) -> None:
    """
    Load a PDF document into a Supabase vector database.

    The PDF is converted to Markdown with the ``marker_single`` command-line
    tool, the Markdown is split on its level 1-3 headers, an OpenAI embedding
    is created for each chunk, and every chunk is inserted as one row of the
    'embeddings' table (columns: 'name', 'content', 'embedding').

    Args:
        filename: Path to the PDF file to ingest. It is also used, together
            with the chunk index, to build each row's 'name' ("<filename>:<idx>").

    Raises:
        FileNotFoundError: If the ``marker_single`` executable cannot be found,
            or the expected Markdown output file was not produced.
        subprocess.CalledProcessError: If ``marker_single`` exits with a
            non-zero status.
    """
    # Build both clients up front so that client construction errors show up
    # before the slow OCR step rather than after it.
    supabase_client = create_client(os.environ.get("SUPABASE_URL"), os.environ.get("SUPABASE_KEY"))
    openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    with tempfile.TemporaryDirectory() as tmp_dirname:
        # Parse the PDF and convert it to Markdown. The command is passed as an
        # argument list (no shell), so quotes, spaces or `$` in the file name
        # cannot break or inject into the command line; check=True turns a
        # failed conversion into an exception instead of silently continuing.
        subprocess.run(
            ["marker_single", filename, tmp_dirname, "--batch_multiplier", "1", "--ocr_all_pages"],
            check=True,
        )

        # The Markdown is read from "<output dir>/<pdf stem>/<pdf stem>.md".
        document_stem = Path(filename).stem
        markdown_path = Path(tmp_dirname) / document_stem / f"{document_stem}.md"

        # Read explicitly as UTF-8 so accented characters do not depend on the
        # platform's default encoding. Only the text is needed afterwards, so
        # the temporary directory can be removed before any network call.
        doc = markdown_path.read_text(encoding="utf-8")

    # Split the resulting Markdown into chunks, keeping the header lines
    # inside each chunk's content.
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ], strip_headers=False)
    chunks = markdown_splitter.split_text(doc)

    # Embed every chunk and upload it; the chunk index makes each row's
    # 'name' distinct within the document.
    for idx, chunk in enumerate(chunks):
        response = openai_client.embeddings.create(
            input=chunk.page_content,
            model="text-embedding-3-small"
        )

        supabase_client.table("embeddings").insert({
            "name": f"{filename}:{idx}",
            "content": chunk.page_content,
            "embedding": response.data[0].embedding
        }).execute()

In [27]:
# Ingest one PDF; the relative path is resolved against the kernel's current working directory.
load_document("../data/Acordo Partilha Assinado.pdf")

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.87s/it]
Recognizing Text: 100%|██████████| 2/2 [00:33<00:00, 16.82s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:02<00:00,  2.73s/it]
Finding reading order: 100%|██████████| 1/1 [00:04<00:00,  4.18s/it]


Saved markdown to the /var/folders/b4/t373qrvd4m76swgs_nb9vf9r0000gn/T/tmpaw4n_ez4/Acordo Partilha Assinado folder
