In [None]:
# Install the notebook's third-party dependencies (-q: quiet output, -U: upgrade packages that are already installed).
%pip install -qU rapidocr-onnxruntime langchain langchain_community langchain_core pymupdf

In [47]:
import os
from langchain.schema import Document
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders.parsers import RapidOCRBlobParser
from tqdm import tqdm
from typing import List

In [49]:
# Directory searched for PDF files; the relative path is resolved against the notebook's working directory.
PDF_DIR = "../../../notificaciones"

In [64]:
def _find_pdf_paths(directory_path: str) -> List[str]:
    """Return the sorted paths of all PDF files found recursively under a directory."""
    pdf_paths: List[str] = []
    for parent_dir_path, _, files in os.walk(directory_path):
        for filename in files:
            # Compare case-insensitively so files such as "SCAN.PDF" are not skipped.
            if filename.lower().endswith(".pdf"):
                pdf_paths.append(os.path.join(parent_dir_path, filename))
    # os.walk yields entries in arbitrary order; sort so results are reproducible.
    return sorted(pdf_paths)


def directory_loader(directory_path: str) -> List[List[Document]]:
    """Load every PDF found under a directory, showing a progress bar.

    The directory is walked recursively. Each PDF is loaded page by page with
    ``PyMuPDFLoader`` (``mode="page"``); images are handed to
    ``RapidOCRBlobParser`` and inlined as text (``images_inner_format="text"``).

    Args:
        directory_path: Root directory to search for PDF files.

    Returns:
        One entry per PDF file, in sorted path order. Each entry is the list of
        page-level ``Document`` objects of that file, with the unneeded PDF
        metadata keys removed.

    Raises:
        ValueError: If ``directory_path`` does not exist or is not a directory.
    """
    if not os.path.exists(directory_path):
        raise ValueError(
            f"directory_loader() >>> DIRECTORY {directory_path} DOESN'T EXIST."
        )
    # A regular file passes the existence check but os.walk would silently
    # yield nothing for it, so reject it explicitly.
    if not os.path.isdir(directory_path):
        raise ValueError(
            f"directory_loader() >>> {directory_path} IS NOT A DIRECTORY."
        )

    # Metadata keys that are not needed downstream and are stripped from every page.
    unwanted_metadata_keys = (
        "file_path",
        "format",
        "producer",
        "creator",
        "creationdate",
        "author",
        "subject",
        "keywords",
        "moddate",
        "trapped",
        "modDate",
        "creationDate",
        "page",
        "total_pages",
    )

    # Build the image parser once and reuse it for every file instead of
    # constructing a new one per PDF.
    images_parser = RapidOCRBlobParser()

    loaded_docs: List[List[Document]] = []

    # Load each PDF file: file --> list of page Documents.
    for file_path in tqdm(_find_pdf_paths(directory_path), desc="LOADING FILES"):
        loaded_file = PyMuPDFLoader(
            file_path,
            mode="page",
            images_inner_format="text",
            images_parser=images_parser,
        ).load()

        for page in loaded_file:
            for key in unwanted_metadata_keys:
                # pop() with a default tolerates PDFs that lack some of these
                # keys; `del` would raise KeyError and abort the whole load.
                page.metadata.pop(key, None)

        # Keep the pages grouped per file (callers iterate docs -> pages).
        loaded_docs.append(loaded_file)

    return loaded_docs

In [None]:
# Load every PDF found under PDF_DIR; each entry of `docs` is the list of pages loaded from one file.
docs = directory_loader(PDF_DIR)

In [None]:
# Show the first entry of `docs` as the cell output.
docs[0]

In [None]:
# Print every loaded page for visual inspection: `index` numbers the entries
# of `docs`, `i` numbers the pages inside each entry.
for index, doc in enumerate(docs):
    for i, page in enumerate(doc):
        # Adjacent f-string literals are concatenated into one message,
        # one literal per section of the output.
        print(
            f"> DOC N°: {index}\n"
            f"> PAGE N°: {i}\n\n"
            f"> METADATA:\n{page.metadata}\n\n"
            f"> CONTENT:\n{page.page_content}\n\n"
            f"{'-'*20}\n"
        )