In [27]:
import os
import tempfile

from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain.schema.document import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv
from openai import OpenAI
from pathlib import Path
from supabase import create_client

load_dotenv()


def load_document(filename: str, document_type: str):
    """
    Function use to load a PDF document onto a Supabase vector database.
    It will convert it into Markdown, split it by its headers, create an embedding for each chunk.
    Finally it will upload each embedded chunk to the 'embeddings' table.
    """
    document_types = [
        "certidao_registo_predial",
        "caderneta_predial",
        "licenca_utilizacao",
        "certidao_isencao",
        "certidao_infraestruturas",
        "ficha_tecnica_habitacao",
        "certificado_energetico",
        "planta_imovel",
        "documento_kyc",
        "documento_preferencia"
    ]

    if document_type not in document_types:
        raise Exception("Invalid document type")
    
    with tempfile.TemporaryDirectory() as tmp_dirname:
        # Parse the PDF and convert it to Markdown
        os.system(f"""marker_single "{filename}" "{tmp_dirname}" --batch_multiplier 1""")

        # Split the resulting Markdown into chunks
        resulting_folder_name = Path(filename).stem
        with open(f"{tmp_dirname}/{resulting_folder_name}/{resulting_folder_name}.md", "r") as f:
            doc = f.read()

            markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[
                ("#", "Header 1"),
                ("##", "Header 2"),
                ("###", "Header 3"),
            ], strip_headers=False)
            chunks = markdown_splitter.split_text(doc)
            
            # Instantiate a Supabase and an OpenAI client
            supabase_client = create_client(os.environ.get("SUPABASE_URL"), os.environ.get("SUPABASE_KEY"))
            openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

            # Add an incrementing identifier to each chunk
            chunk_idx = 0
            for chunk in chunks:
                # And split each chunk into semanticaly similar sub-chunks  
                text_splitter = SemanticChunker(OpenAIEmbeddings(api_key=os.environ.get("OPENAI_API_KEY")), breakpoint_threshold_type="percentile") 
                semantic_chunks = text_splitter.create_documents([chunk.page_content])
                for semantic_chunk in semantic_chunks:
                    # Generate embeddings from OpenAI
                    response = openai_client.embeddings.create(
                        input=semantic_chunk.page_content,
                        model="text-embedding-3-small"
                    )

                    # Upload to table
                    supabase_client.table("embeddings").insert({
                        "content": semantic_chunk.page_content,
                        "embedding": response.data[0].embedding,
                        "metadata": {
                            "name": resulting_folder_name,
                            "chunk_idx": chunk_idx,
                            **chunk.metadata
                        },
                        "document_type": document_type
                    }).execute()

                    # Increment chunk index
                    chunk_idx = chunk_idx + 1

In [14]:
import os
import tempfile

from langchain_text_splitters import MarkdownHeaderTextSplitter
from pathlib import Path

filename = "../data/certificado_energetico/4355_8432.pdf"

with tempfile.TemporaryDirectory() as tmp_dirname:
    # Parse the PDF and convert it to Markdown
    os.system(f"""marker_single "{filename}" "{tmp_dirname}" --batch_multiplier 1 --ocr_all_pages""")

    # Split the resulting Markdown into chunks
    resulting_folder_name = Path(filename).stem
    with open(f"{tmp_dirname}/{resulting_folder_name}/{resulting_folder_name}.md", "r") as f:
        doc = f.read()

        markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ], strip_headers=False)
        chunks = markdown_splitter.split_text(doc)

chunks

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 3/3 [00:23<00:00,  7.81s/it]
Recognizing Text: 100%|██████████| 24/24 [13:26<00:00, 33.60s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:15<00:00,  7.91s/it]
Finding reading order: 100%|██████████| 2/2 [00:45<00:00, 22.97s/it]


Saved markdown to the /tmp/tmp_sfwqxcp/4355_8432 folder


[Document(page_content='![0_image_0.png](0_image_0.png)  \n![0_image_1.png](0_image_1.png)  \nSCE107727024 Válido até 15/07/2025 Edifício de Habitação IDENTIFICAÇÃO POSTAL\nMorada ESTRADA DAS PEDRAS LAVRADAS, 28, Localidade SOBRAL DE SÃO MIGUEL Freguesia SOBRAL DE S. MIGUEL\nConcelho COVILHÃ\nGPS 40.212883, -7.741674 IDENTIFICAÇÃO PREDIAL/FISCAL\nConservatória do Registo Predial de COVILHÃ\nNº de Inscrição na Conservatória 1714 Artigo Matricial nº 637 INFORMAÇÃO ADICIONAL\nÁrea útil de Pavimento 100,44 m²\nFração Autónoma Este certificado apresenta a classificação energética deste edificio ou fração. Esta classificação é calculada comparando o desempenho energético deste edifício nas condições atuais, com o desempenho que este obteria nas condições mínimas (com base em valores de referência) a que estão obrigados os edifícios novos. Obtenha mais informação sobre a certificação energética no site da ADENE em www.adene.pt INDICADORES DE DESEMPENHO\nDeterminam a classe energética do edifí

In [17]:
for i in chunks:
    if "SCE107727024" in i.page_content and "Edifício de Habitação" in i.page_content:
        print(i.page_content)
        print("--------------------------------------------------")
# print(chunks[3].page_content)

![0_image_0.png](0_image_0.png)  
![0_image_1.png](0_image_1.png)  
SCE107727024 Válido até 15/07/2025 Edifício de Habitação IDENTIFICAÇÃO POSTAL
Morada ESTRADA DAS PEDRAS LAVRADAS, 28, Localidade SOBRAL DE SÃO MIGUEL Freguesia SOBRAL DE S. MIGUEL
Concelho COVILHÃ
GPS 40.212883, -7.741674 IDENTIFICAÇÃO PREDIAL/FISCAL
Conservatória do Registo Predial de COVILHÃ
Nº de Inscrição na Conservatória 1714 Artigo Matricial nº 637 INFORMAÇÃO ADICIONAL
Área útil de Pavimento 100,44 m²
Fração Autónoma Este certificado apresenta a classificação energética deste edificio ou fração. Esta classificação é calculada comparando o desempenho energético deste edifício nas condições atuais, com o desempenho que este obteria nas condições mínimas (com base em valores de referência) a que estão obrigados os edifícios novos. Obtenha mais informação sobre a certificação energética no site da ADENE em www.adene.pt INDICADORES DE DESEMPENHO
Determinam a classe energética do edifício e a eficiência na utilização d

In [None]:
load_document(filename="../data/4355_8432.pdf", document_type="certificado_energetico")

In [29]:
for document_type_folder in Path("../data").glob('*/'):
    print(document_type_folder.name)

escritura
certificado_energetico
caderneta_predial
ficha_tecnica_habitacao


In [30]:
from pathlib import Path
from tqdm.auto import tqdm

for document_type_folder in Path("../data").glob('*/'):
    if document_type_folder.name == "escritura":
        continue
    
    for document_path in tqdm(Path(document_type_folder).glob("*.pdf")):
        load_document(filename=document_path, document_type=document_type_folder.name)

0it [00:00, ?it/s]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 2/2 [00:23<00:00, 11.59s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:15<00:00,  7.59s/it]
Finding reading order: 100%|██████████| 2/2 [00:31<00:00, 15.62s/it]


Saved markdown to the /tmp/tmp7t5mrb18/ID92ff3b00-0000-0500-0000-000002081286 folder


1it [02:23, 143.22s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 3/3 [00:22<00:00,  7.55s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:18<00:00,  9.34s/it]
Finding reading order: 100%|██████████| 2/2 [00:34<00:00, 17.22s/it]


Saved markdown to the /tmp/tmpy9aws1zz/SCE_BlocoA_final folder


2it [04:35, 136.69s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 2/2 [00:19<00:00,  9.54s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:14<00:00,  7.31s/it]
Finding reading order: 100%|██████████| 2/2 [00:33<00:00, 16.78s/it]


Saved markdown to the /tmp/tmp4f2lqgjt/ADENE_certificado_energético_comércio_e_serviços folder


3it [06:32, 127.58s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 2/2 [00:18<00:00,  9.30s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:14<00:00,  7.33s/it]
Finding reading order: 100%|██████████| 2/2 [00:35<00:00, 17.55s/it]


Saved markdown to the /tmp/tmpl2c5y970/CE-Rua-Arco-da-Graca-83-RC folder


4it [08:31, 124.45s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 2/2 [00:16<00:00,  8.26s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:14<00:00,  7.37s/it]
Finding reading order: 100%|██████████| 2/2 [00:29<00:00, 14.93s/it]


Saved markdown to the /tmp/tmpb46z7wbz/ADENE_SCE_SCE0000276598643 folder


5it [10:27, 121.28s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 3/3 [00:20<00:00,  7.00s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:18<00:00,  9.16s/it]
Finding reading order: 100%|██████████| 2/2 [00:41<00:00, 20.81s/it]


Saved markdown to the /tmp/tmp6t8dfk0a/4355_8432 folder


6it [12:47, 127.64s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 5/5 [00:46<00:00,  9.27s/it]
Detecting bboxes: 100%|██████████| 4/4 [00:37<00:00,  9.32s/it]
Finding reading order: 100%|██████████| 4/4 [01:18<00:00, 19.63s/it]


Saved markdown to the /tmp/tmp98ys_nay/CertificadoEnergetico folder


7it [16:42, 162.68s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 3/3 [00:24<00:00,  8.07s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:20<00:00, 10.32s/it]
Finding reading order: 100%|██████████| 2/2 [00:43<00:00, 21.75s/it]


Saved markdown to the /tmp/tmp8bbwtjtt/ADENE_certificado_energético_habitação folder


8it [19:07, 157.15s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 2/2 [00:21<00:00, 10.86s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:15<00:00,  7.78s/it]
Finding reading order: 100%|██████████| 2/2 [00:31<00:00, 15.94s/it]


Saved markdown to the /tmp/tmpog9rxdpi/certificado_energetico_deleg_queluz folder


9it [21:06, 145.25s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 2/2 [00:20<00:00, 10.09s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:16<00:00,  8.24s/it]
Finding reading order: 100%|██████████| 2/2 [00:32<00:00, 16.37s/it]


Saved markdown to the /tmp/tmpcn376akt/ADENE_SCE_SCE0000147296433-Certificado-Energetico-ID010617-1 folder


10it [23:06, 137.32s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 2/2 [00:16<00:00,  8.40s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:15<00:00,  7.69s/it]
Finding reading order: 100%|██████████| 2/2 [00:28<00:00, 14.20s/it]


Saved markdown to the /tmp/tmpgn6lk4it/certificado-comercio folder


11it [24:54, 128.42s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 3/3 [00:23<00:00,  7.75s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:19<00:00,  9.66s/it]
Finding reading order: 100%|██████████| 2/2 [00:37<00:00, 18.52s/it]


Saved markdown to the /tmp/tmph_4ra2e8/ADENE_SCE_SCE0000138299335 folder


12it [27:04, 128.88s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 3/3 [00:22<00:00,  7.52s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:19<00:00,  9.53s/it]
Finding reading order: 100%|██████████| 2/2 [00:40<00:00, 20.12s/it]


Saved markdown to the /tmp/tmp0j4ejf_s/bdf10200ae4052c8f7016f304e35afc0-certificado-energetico folder


13it [29:16, 130.03s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 2/2 [00:17<00:00,  8.89s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:15<00:00,  7.54s/it]
Finding reading order: 100%|██████████| 2/2 [00:29<00:00, 14.79s/it]


Saved markdown to the /tmp/tmpdbz2g6w8/CE folder


14it [31:11, 125.27s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 3/3 [00:24<00:00,  8.10s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:18<00:00,  9.33s/it]
Finding reading order: 100%|██████████| 2/2 [00:39<00:00, 19.91s/it]


Saved markdown to the /tmp/tmpj9zjkh5m/ADENE_SCE_CE_IPC_ESAC_LabReprodAnimal folder


15it [33:33, 130.28s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 2/2 [00:19<00:00,  9.98s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:15<00:00,  7.62s/it]
Finding reading order: 100%|██████████| 2/2 [00:34<00:00, 17.22s/it]


Saved markdown to the /tmp/tmpn93qv8r7/certificado-energe-tico-plot-23 folder


16it [35:36, 128.07s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 2/2 [00:18<00:00,  9.38s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:14<00:00,  7.12s/it]
Finding reading order: 100%|██████████| 2/2 [00:25<00:00, 13.00s/it]


Saved markdown to the /tmp/tmpydskeskg/file_1 (1) folder


17it [37:27, 123.09s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 2/2 [00:21<00:00, 10.52s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:21<00:00, 10.64s/it]
Finding reading order: 100%|██████████| 2/2 [00:34<00:00, 17.02s/it]


Saved markdown to the /tmp/tmpel1rfyz7/8-4-certificado-energetico-m2e-at-ahv.jsMAXHQ folder


18it [39:37, 132.10s/it]
0it [00:00, ?it/s]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.22s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.08s/it]
Finding reading order: 100%|██████████| 1/1 [00:08<00:00,  8.19s/it]


Saved markdown to the /tmp/tmpi2vdf5ay/29_CP 101608 U-1128 folder


1it [00:44, 44.95s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.36s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.30s/it]
Finding reading order: 100%|██████████| 1/1 [00:06<00:00,  6.96s/it]


Saved markdown to the /tmp/tmpkkvlhw0d/cpu atualizada folder


2it [01:27, 43.42s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.40s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:05<00:00,  5.72s/it]
Finding reading order: 100%|██████████| 1/1 [00:06<00:00,  6.88s/it]


Saved markdown to the /tmp/tmp_dr7cgfu/caderneta predial urbana-fração B folder


3it [02:09, 42.69s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 1/1 [00:09<00:00,  9.04s/it]
Recognizing Text: 100%|██████████| 1/1 [00:10<00:00, 10.31s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.95s/it]
Finding reading order: 100%|██████████| 1/1 [00:14<00:00, 14.19s/it]


Saved markdown to the /tmp/tmpgckp1f9g/023754_1_6847_Caderneta-Predio-31 folder


4it [03:15, 51.99s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.79s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:05<00:00,  5.19s/it]
Finding reading order: 100%|██████████| 1/1 [00:10<00:00, 10.22s/it]


Saved markdown to the /tmp/tmp83evqe55/1451477103 folder


5it [04:01, 49.80s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.68s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.61s/it]
Finding reading order: 100%|██████████| 1/1 [00:07<00:00,  7.33s/it]


Saved markdown to the /tmp/tmph_74kpa2/025942_1_4670_Caderneta-Predio-2929-B-verba-2 folder


6it [04:48, 48.80s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.60s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.67s/it]
Finding reading order: 100%|██████████| 1/1 [00:07<00:00,  7.02s/it]


Saved markdown to the /tmp/tmpskgob0sn/O6-FYW_W-NBNFIDB3UNCHIPL4T081LJZ folder


7it [05:32, 47.21s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.57s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.43s/it]
Finding reading order: 100%|██████████| 1/1 [00:06<00:00,  6.91s/it]


Saved markdown to the /tmp/tmpir6bah4m/caderneta predial urbana folder


8it [06:12, 45.16s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 6/6 [00:54<00:00,  9.03s/it]
Detecting bboxes: 100%|██████████| 4/4 [00:43<00:00, 10.87s/it]
Finding reading order: 100%|██████████| 4/4 [01:22<00:00, 20.64s/it]


Saved markdown to the /tmp/tmpwx82bjmi/PDF20230428150227331 folder


9it [09:49, 98.78s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.32s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:02<00:00,  2.01s/it]
Finding reading order: 100%|██████████| 1/1 [00:03<00:00,  3.56s/it]


Saved markdown to the /tmp/tmpw_4g0boi/Anexo_2_CadernetaPredial folder


10it [10:21, 78.29s/it]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 3/3 [00:20<00:00,  6.83s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:17<00:00,  8.53s/it]
Finding reading order: 100%|██████████| 2/2 [00:34<00:00, 17.08s/it]


Saved markdown to the /tmp/tmpkaw085hr/Cadernetas_Prediais folder


11it [11:59, 65.45s/it]
0it [00:00, ?it/s]

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 3/3 [00:24<00:00,  8.25s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:18<00:00,  9.09s/it]
Finding reading order: 100%|██████████| 2/2 [00:37<00:00, 18.65s/it]


Saved markdown to the /tmp/tmp3lcbbckn/ficha_tecnica_da_habitacao folder


1it [02:24, 144.79s/it]


../data/ficha_tecnica_habitacao/ficha_tecnica_da_habitacao.pdf


In [3]:
load_document(filename="../data/cpu atualizada.pdf", document_type="caderneta_predial")
load_document(filename="../data/Escritura.pdf", document_type="licenca_utilizacao")
load_document(filename="../data/Acordo Partilha Assinado.pdf", document_type="licenca_utilizacao")

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.58s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.88s/it]
Finding reading order: 100%|██████████| 1/1 [00:06<00:00,  6.87s/it]


Saved markdown to the /tmp/tmpmx253oww/cpu atualizada folder


In [None]:
types = [
    "certificado_registo_predial",
    "caderneta_predial",
    "licenca_utilizacao",
    "certidao_isencao",
    "certificado_infraestruturas",
    "ficha_tecnica_habitacao",
    "certificado_energetico",
    "planta_imovel",
    "documento_kyc",
    "documento_preferencia"
]

In [None]:
Quais eram os titulares da caderneta predial que fiz?

In [6]:
from langchain.schema.document import Document

document = Document(
    page_content="Hello, world!",
    metadata={"source": "https://example.com"}
)
document

Document(metadata={'source': 'https://example.com'}, page_content='Hello, world!')

In [7]:
from langchain_openai import OpenAIEmbeddings

embeddings_model = OpenAIEmbeddings()
embeddings_model.embed_documents([document])

TypeError: argument 'text': 'Document' object cannot be converted to 'PyString'

In [9]:
filename = "../data/Escritura.pdf"

# Parse the PDF and convert it to Markdown
os.system(f"""marker_single "{filename}" "../dataTest" --batch_multiplier 1""")

Loaded detection model vikp/surya_det3 on device mps with dtype torch.float16
Loaded detection model vikp/surya_layout3 on device mps with dtype torch.float16
Loaded reading order model vikp/surya_order on device mps with dtype torch.float16
Loaded recognition model vikp/surya_rec2 on device mps with dtype torch.float16
Loaded texify model to mps with torch.float16 dtype


Detecting bboxes: 100%|██████████| 3/3 [00:05<00:00,  2.00s/it]
Recognizing Text: 100%|██████████| 10/10 [02:26<00:00, 14.67s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:04<00:00,  2.18s/it]
Finding reading order: 100%|██████████| 2/2 [00:24<00:00, 12.30s/it]


Saved markdown to the ../dataTest/Escritura folder


0

In [10]:
with open(f"../dataTest/Escritura/Escritura.md", "r") as f:
    doc = f.read()

In [5]:
print(doc)

# Cartório Notarial De Palmela

Telef.:212 350 031 / 212 330 288 - Fax 212 332 542 Av. Rainha D. Leonor, 4 Loja E - 2950 - 204 PALMELA
NOTÁRIO

![0_image_0.png](0_image_0.png)

Licenciado Jerónimo Monteiro Lourenço O Signatário, Ajudante do Cartório Notarial de Palmela
-  Que a fotocópia apensa a esta Certidão está conforme o original que restituí o qual tem / não tem aposto o respectivo selo branco.

- Que foi extraída neste Cartório da escritura exarada de folhas per
-- a folhas ort do livro de notas para escrituras diversas número Que foi extraída neste Cartório do Testamento exarado de folhas _
_ a folhas _
do livro de Testamentos número_
Que fiz extrair do Bilhete de Identidade número emitido em de de pelos
- Que fiz extraír do Passaporte número -
de_
_ por de de de maine
...

- Que me foi presente para conferir. ----------------------
- Que fiz extraír do documento. --------------------------
- Que ocupa - Ou 3 l - folhas que têm aposto o respectivo selo branco deste Cartório, es

# Header chunking

In [12]:
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
], strip_headers=False)

chunks = markdown_splitter.split_text(doc)
chunks

[Document(metadata={'Header 1': 'Cartório Notarial De Palmela'}, page_content='# Cartório Notarial De Palmela  \nTelef.:212 350 031 / 212 330 288 - Fax 212 332 542 Av. Rainha D. Leonor, 4 Loja E - 2950 - 204 PALMELA\nNOTÁRIO\nLicenciado Jerónimo Monteiro Lourenço O Signatário, Ajudante do Cartório Notarial de Palmela\n-  Que a fotocópia apensa a esta Certidão está conforme o original que restituí o qual tem / não tem aposto o respectivo selo branco.  \n- Que foi extraída neste Cartório da escritura exarada de folhas per\n-- a folhas ort do livro de notas para escrituras diversas número Que foi extraída neste Cartório do Testamento exarado de folhas _\n_ a folhas _\ndo livro de Testamentos número_\nQue fiz extrair do Bilhete de Identidade número emitido em de de pelos\n- Que fiz extraír do Passaporte número -\nde_\n_ por de de de maine\n..  \n- Que me foi presente para conferir. ----------------------\n- Que fiz extraír do documento. -------------------------\n- Que ocupa - Ou 3 l - fol

TypeError: SemanticChunker.__init__() got an unexpected keyword argument 'chunk_overlap'

In [33]:
print(semantic_chunks[0].page_content)

# Cartório Notarial De Palmela  
Telef.:212 350 031 / 212 330 288 - Fax 212 332 542 Av. Rainha D. Leonor, 4 Loja E - 2950 - 204 PALMELA
NOTÁRIO
Licenciado Jerónimo Monteiro Lourenço O Signatário, Ajudante do Cartório Notarial de Palmela
-  Que a fotocópia apensa a esta Certidão está conforme o original que restituí o qual tem / não tem aposto o respectivo selo branco. - Que foi extraída neste Cartório da escritura exarada de folhas per
-- a folhas ort do livro de notas para escrituras diversas número Que foi extraída neste Cartório do Testamento exarado de folhas _
_ a folhas _
do livro de Testamentos número_
Que fiz extrair do Bilhete de Identidade número emitido em de de pelos
- Que fiz extraír do Passaporte número -
de_
_ por de de de maine
..


# Semantic Chunking

In [37]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

text_splitter = SemanticChunker(OpenAIEmbeddings(api_key=os.environ.get("OPENAI_API_KEY")), breakpoint_threshold_type='percentile', ) # chose which embeddings and breakpoint type and threshold to use
chunks = text_splitter.create_documents([doc])
chunks

[Document(page_content='# Cartório Notarial De Palmela\n\nTelef.:212 350 031 / 212 330 288 - Fax 212 332 542 Av. Rainha D. Leonor, 4 Loja E - 2950 - 204 PALMELA\nNOTÁRIO\nLicenciado Jerónimo Monteiro Lourenço O Signatário, Ajudante do Cartório Notarial de Palmela\n-  Que a fotocópia apensa a esta Certidão está conforme o original que restituí o qual tem / não tem aposto o respectivo selo branco. - Que foi extraída neste Cartório da escritura exarada de folhas per\n-- a folhas ort do livro de notas para escrituras diversas número Que foi extraída neste Cartório do Testamento exarado de folhas _\n_ a folhas _\ndo livro de Testamentos número_\nQue fiz extrair do Bilhete de Identidade número emitido em de de pelos\n- Que fiz extraír do Passaporte número -\nde_\n_ por de de de maine\n..'),
 Document(page_content='- Que me foi presente para conferir. ----------------------\n- Que fiz extraír do documento. -------------------------\n- Que ocupa - Ou 3 l - folhas que têm aposto o respectivo se

# Header + Semantic Chunking?

In [39]:
chunks[0].id

# Basic chunking with overlap

In [32]:
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter, MarkdownTextSplitter

text_splitter = MarkdownTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=40,
    chunk_overlap=0
    # length_function=len,
    # is_separator_regex=False,
    # separator="ch"
)
chunks = text_splitter.create_documents([doc])
chunks

[Document(page_content='da - Sede: Rua Abel Salazar, 7C e 7D,'),
 Document(page_content='loja 8, piso 0  2905-290 Almada  Tel:'),
 Document(page_content='215863163   tipyfamilymo@century21.pt'),
 Document(page_content='www.century21.pt/tipy/familymc=Licer'),
 Document(page_content='![0_image_0.png](0_image_0.png)'),
 Document(page_content='# Century 21'),
 Document(page_content='![0_Image_1.Png](0_Image_1.Png)'),
 Document(page_content='Ipv Family MC'),
 Document(page_content='# Contrato De Colaboração E Partilha De'),
 Document(page_content='Comissão'),
 Document(page_content='Entre PRIMEIRA CONTRAENTE:'),
 Document(page_content='FGM&C Lda., com sede na Rua Abel'),
 Document(page_content='Salazar, 7C e 7D loja 8 piso 0,'),
 Document(page_content='2805-290 Almada, com o capital social'),
 Document(page_content='de €5.000, com o NIPC: 516095188, com o'),
 Document(page_content='código de acesso à certidão comercial'),
 Document(page_content='n\tº 3140-5783-5469, detentora da'),
 Documen

In [28]:
len(chunks)

1

In [18]:
print(chunks[0].page_content)

da - Sede: Rua Abel Salazar, 7C e 7D, loja 8, piso 0  2905-290 Almada  Tel: 215863163   tipyfamilymo@century21.pt     www.century21.pt/tipy/familymc=Licer

![0_image_0.png](0_image_0.png)

# Century 21

![0_Image_1.Png](0_Image_1.Png)

Ipv Family MC

# Contrato De Colaboração E Partilha De Comissão


In [17]:
print(chunks[1].page_content)

Entre PRIMEIRA CONTRAENTE:


In [16]:
print(chunks[2].page_content)

FGM&C Lda., com sede na Rua Abel Salazar, 7C e 7D loja 8 piso 0, 2805-290 Almada, com o capital social de €5.000, com o NIPC: 516095188, com o código de acesso à certidão comercial n	º 3140-5783-5469, detentora da Licença AMI nº 18194 emitida pelo Instituto dos Mercados Públicos, do Imobiliário e Construção (IMPIC),neste ato representada pela Procuradora Carla Martins, conforme procuração autenticada com o número de registo na Ordem dos Advogados n.º 21535L/3525, adiante designada como Mediadora.: E


In [19]:
print(chunks[3].page_content)

e segunda contraente: 
Obvio e Positivo Loc. Mediação Imobiliária, Lda., com sede na Rua Prof Frencisco Gentil nº 20 - Telheras - Inthuse com capital social
€ 10000 com o NIPC: 514 707 356 detentora da Licença AMI nº 1 6962 emitida pelo Instituto dos Mercados Públicos, do Imobiliário e Construção (IMPIC), neste ato representada pela Por Poulo Costa. I von un dorayante designada como Segunda Outorgante.


In [22]:
print(chunks[4].page_content)

As Contraentes manifestam que é vontade das mesmas subscrever o presente contrato de colaboração que se regerá pelas seguintes cláusulas: -

## Primeira

As partes trocam habitualmente informação confidencial de produtos imobiliários, apresentando os mesmos aos seus clientes, com o objetivo de levar a bom termo operações de caráter imobiliário. -

## Segunda


In [21]:
print(doc)

da - Sede: Rua Abel Salazar, 7C e 7D, loja 8, piso 0  2905-290 Almada  Tel: 215863163   tipyfamilymo@century21.pt     www.century21.pt/tipy/familymc=Licer

![0_image_0.png](0_image_0.png)

# Century 21

![0_Image_1.Png](0_Image_1.Png)

Ipv Family MC

# Contrato De Colaboração E Partilha De Comissão

Entre PRIMEIRA CONTRAENTE:
FGM&C Lda., com sede na Rua Abel Salazar, 7C e 7D loja 8 piso 0, 2805-290 Almada, com o capital social de €5.000, com o NIPC: 516095188, com o código de acesso à certidão comercial n	º 3140-5783-5469, detentora da Licença AMI nº 18194 emitida pelo Instituto dos Mercados Públicos, do Imobiliário e Construção (IMPIC),neste ato representada pela Procuradora Carla Martins, conforme procuração autenticada com o número de registo na Ordem dos Advogados n.º 21535L/3525, adiante designada como Mediadora.: E
e segunda contraente: 
Obvio e Positivo Loc. Mediação Imobiliária, Lda., com sede na Rua Prof Frencisco Gentil nº 20 - Telheras - Inthuse com capital social
€ 10000 co

In [None]:
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain.schema.document import Document

In [36]:
load_document("../data/Acordo Partilha Assinado.pdf")
load_document("../data/cpu atualizada.pdf")
load_document("../data/Escritura.pdf")

Loaded detection model vikp/surya_det3 on device mps with dtype torch.float16
Loaded detection model vikp/surya_layout3 on device mps with dtype torch.float16
Loaded reading order model vikp/surya_order on device mps with dtype torch.float16
Loaded recognition model vikp/surya_rec2 on device mps with dtype torch.float16
Loaded texify model to mps with torch.float16 dtype


Detecting bboxes: 100%|██████████| 1/1 [00:01<00:00,  1.11s/it]
Recognizing Text: 100%|██████████| 2/2 [00:36<00:00, 18.26s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:01<00:00,  1.18s/it]
Finding reading order: 100%|██████████| 1/1 [00:03<00:00,  3.74s/it]


Saved markdown to the /var/folders/b4/t373qrvd4m76swgs_nb9vf9r0000gn/T/tmp3qzey8im/Acordo Partilha Assinado folder
Loaded detection model vikp/surya_det3 on device mps with dtype torch.float16
Loaded detection model vikp/surya_layout3 on device mps with dtype torch.float16
Loaded reading order model vikp/surya_order on device mps with dtype torch.float16
Loaded recognition model vikp/surya_rec2 on device mps with dtype torch.float16
Loaded texify model to mps with torch.float16 dtype


Detecting bboxes: 100%|██████████| 1/1 [00:01<00:00,  1.07s/it]
Recognizing Text: 100%|██████████| 3/3 [00:36<00:00, 12.14s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:01<00:00,  1.33s/it]
Finding reading order: 100%|██████████| 1/1 [00:04<00:00,  4.10s/it]


Saved markdown to the /var/folders/b4/t373qrvd4m76swgs_nb9vf9r0000gn/T/tmpx58_l1ty/cpu atualizada folder
Loaded detection model vikp/surya_det3 on device mps with dtype torch.float16
Loaded detection model vikp/surya_layout3 on device mps with dtype torch.float16
Loaded reading order model vikp/surya_order on device mps with dtype torch.float16
Loaded recognition model vikp/surya_rec2 on device mps with dtype torch.float16
Loaded texify model to mps with torch.float16 dtype


Detecting bboxes: 100%|██████████| 3/3 [00:05<00:00,  1.77s/it]
Recognizing Text: 100%|██████████| 10/10 [02:18<00:00, 13.86s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:04<00:00,  2.25s/it]
Finding reading order: 100%|██████████| 2/2 [00:21<00:00, 10.68s/it]


Saved markdown to the /var/folders/b4/t373qrvd4m76swgs_nb9vf9r0000gn/T/tmp8svcrv4a/Escritura folder
