In [None]:
%pip install -qU rapidocr-onnxruntime langchain langchain_community langchain_core pymupdf rich

In [26]:
import os
from typing import List

from langchain.schema import Document
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders.parsers import RapidOCRBlobParser
from rich import print
from rich.markup import escape
from rich.progress import track

In [36]:
# RICH'S PRINT COLORS
# Hex colour codes interpolated into rich markup tags (e.g. "[bold {blue}]").
# Plain string literals: there is nothing to interpolate, so no f-prefix.
yellow = "#fde047"
orange = "#f97316"
red = "#ef4444"
blue = "#3b82f6"
cyan = "#06b6d4"
emerald = "#34d399"
violet = "#a855f7"
pink = "#ec4899"
gray = "#64748b"

In [27]:
# Directory that is searched for PDF files. The path is relative, so it is
# resolved against the kernel's current working directory.
PDF_DIR = "../../../notificaciones"

In [28]:
def directory_loader(directory_path: str) -> List[List[Document]]:
    """Load every PDF found under a directory, showing a progress bar.

    The directory is walked recursively. Each PDF is loaded page by page with
    ``PyMuPDFLoader``; images are handed to ``RapidOCRBlobParser`` with
    ``images_inner_format="text"``. The metadata of every page is then reduced
    to two keys, ``source`` and ``filename``.

    Args:
        directory_path: Directory to search (recursively) for PDF files.

    Returns:
        One entry per PDF file, ordered by sorted file path. Each entry is the
        list of page ``Document`` objects loaded from that file. An empty list
        is returned when the directory contains no PDF.

    Raises:
        ValueError: If ``directory_path`` does not exist or is not a directory.
    """
    # isdir (not exists): a path to a regular file would otherwise pass the
    # check and silently produce an empty result.
    if not os.path.isdir(directory_path):
        raise ValueError(
            f"directory_loader() >>> DIRECTORY {directory_path} DOESN'T EXIST "
            "OR IS NOT A DIRECTORY."
        )

    # Collect the path of every PDF below the directory. The extension check
    # is case-insensitive (".PDF" is common), and the paths are sorted because
    # os.walk gives no ordering guarantee.
    pdf_paths = sorted(
        os.path.join(parent_dir_path, filename)
        for parent_dir_path, _, files in os.walk(directory_path)
        for filename in files
        if filename.lower().endswith(".pdf")
    )
    if not pdf_paths:
        return []

    # Build the OCR parser once and share it across files instead of
    # re-creating it for every PDF.
    images_parser = RapidOCRBlobParser()

    loaded_docs: List[List[Document]] = []

    # Load each PDF file: file --> list of page Documents.
    for file_path in track(pdf_paths, description="LOADING FILES"):
        loaded_file = PyMuPDFLoader(
            file_path,
            mode="page",
            images_inner_format="text",
            images_parser=images_parser,
        ).load()

        # Keep only the metadata that is used downstream. Other fields the
        # loader provides and that are dropped here: file_path, format,
        # producer, creator, creationdate, author, subject, keywords, moddate,
        # trapped, modDate, creationDate, page, total_pages.
        for page in loaded_file:
            page.metadata = {
                "source": page.metadata.get("source", file_path),
                # "filename" carries the PDF title; fall back to the file's
                # base name when the title is missing or empty.
                "filename": page.metadata.get("title")
                or os.path.basename(file_path),
            }

        loaded_docs.append(loaded_file)

    return loaded_docs

In [None]:
# Load every PDF under PDF_DIR. Each element of `docs` is the list of page
# Documents that came from one PDF file.
docs = directory_loader(PDF_DIR)

In [30]:
# Inspect the first loaded file (the list of page Documents for one PDF).
docs[0]

[Document(metadata={'source': '../../../notificaciones/RES 11-06-2024 -  SE CONCEDE RECURSO EN RELACION.pdf', 'filename': 'Despacho CIV 1950/2024 - SE CONCEDE RECURSO EN RELACION'}, page_content='#38617791#415584608#20240611104415348\nPoder Judicial de la Nación\nJUZGADO CIVIL 58\n1950/2024\nPROVINCIA ART SA c/ INTEGRITY SEGUROS ARGENTINA \nS.A s/DILIGENCIAS PRELIMINARES\nBuenos Aires,     de  \n.- \njunio de 2024\nIGM\nConcédese en relación el recurso de  apelación interpuesto\ncontra el  pronunciamiento de fs. 16.\nOportunamente, de estar en condiciones,  elévense los autos al\nTribunal de Alzada, en la forma  de estilo.\n \nDigitally signed by MARIA DI\nFILIPPO\nDate: 2024.06.11 13:06:18 ART\nSignature Not Verified')]

In [None]:
# Pretty-print every page of every loaded file using rich markup.
for index, doc in enumerate(docs):
    for i, page in enumerate(doc):
        # Escape the extracted text so that square brackets in the PDF content
        # are printed literally instead of being parsed as rich markup tags.
        # Pulling the values out first also avoids reusing double quotes
        # inside the f-string, which is a SyntaxError before Python 3.12.
        filename = escape(page.metadata["filename"])
        content = escape(page.page_content)
        print(
            f"[bold {blue}]> DOC N°:[/] {index}\n"
            f"[bold {orange}]> PAGE N°:[/] {i}\n\n"
            f"[bold {emerald}]> METADATA:[/]\n{filename}\n\n"
            f"[bold {yellow}]> CONTENT:[/]\n{content}"
        )