In [47]:
%pip install -qU langchain langchain_community langchain_core pymupdf4llm

Note: you may need to restart the kernel to use updated packages.


In [73]:
import os
import re
from typing import List

from langchain.schema import Document
from pymupdf4llm import to_markdown
from rich import print
from rich.markup import escape
from rich.progress import track
from tqdm import tqdm

In [74]:
# Hex colour codes shared by this notebook's rich-markup prints and its
# tqdm progress bars (passed as `colour=`).
YELLOW = "#fde047"
ORANGE = "#f97316"
RED = "#ef4444"
BLUE = "#3b82f6"
CYAN = "#06b6d4"
EMERALD = "#34d399"
VIOLET = "#a855f7"
PINK = "#ec4899"
GRAY = "#64748b"
WHITE = "#cccccc"

In [75]:
# Directory that is searched (recursively) for the PDF files to load.
# The path is relative, so it is resolved against the kernel's working directory.
PDF_DIR = "../../../notificaciones"

In [86]:
def directory_loader(directory_path: str) -> List[Document]:
    """Load every PDF found under ``directory_path`` as a Markdown ``Document``.

    The directory tree is walked recursively. Each PDF is converted to
    Markdown with ``to_markdown``; every ``-----`` run is removed, runs of
    consecutive newlines are collapsed into a single newline, and leading /
    trailing whitespace is stripped. A tqdm progress bar is shown while the
    files are being converted.

    Args:
        directory_path: Directory to search (recursively) for PDF files.

    Returns:
        One ``Document`` per PDF, ordered by file path so that indices are
        reproducible across runs. ``metadata`` holds ``file_name`` and
        ``file_path``; ``page_content`` holds the cleaned Markdown text.

    Raises:
        ValueError: If ``directory_path`` does not exist or is not a directory.
    """
    # `isdir` (not `exists`): a path to a regular file would otherwise pass the
    # check and silently produce an empty result, because os.walk yields nothing.
    if not os.path.isdir(directory_path):
        raise ValueError(
            f"directory_loader() >>> DIRECTORY {directory_path} "
            "DOESN'T EXIST OR IS NOT A DIRECTORY."
        )

    # Collect name + full path of every PDF in the tree. The extension check is
    # case-insensitive so files named `*.PDF` are not skipped.
    pdf_files_info = [
        {"file_name": filename, "file_path": os.path.join(parent_dir_path, filename)}
        for parent_dir_path, _, files in os.walk(directory_path)
        for filename in files
        if filename.lower().endswith(".pdf")
    ]
    # os.walk order depends on the filesystem; sort for a deterministic result.
    pdf_files_info.sort(key=lambda info: info["file_path"])

    # Convert each PDF file into a single Document.
    loaded_docs: List[Document] = []
    for file_info in tqdm(
        pdf_files_info,
        desc="LOADING PDF FILES",
        total=len(pdf_files_info),
        colour=EMERALD,
    ):
        md_text = to_markdown(file_info["file_path"], show_progress=False)
        md_text = md_text.replace("-----", "")
        # Collapse any run of 2+ newlines in one pass. Chained str.replace calls
        # ("\n\n" first, then "\n\n\n") leave blank lines behind for longer runs.
        md_text = re.sub(r"\n{2,}", "\n", md_text).strip()
        loaded_docs.append(Document(metadata=file_info, page_content=md_text))

    return loaded_docs

In [87]:
# Load every PDF under PDF_DIR into a list of Documents (one per file).
pymupdf4llm_docs = directory_loader(PDF_DIR)

LOADING PDF FILES: 100%|[38;2;52;211;153m██████████[0m| 6/6 [00:00<00:00, 15.59it/s]


In [88]:
# Sanity check: number of Documents returned by the loader.
len(pymupdf4llm_docs)

6

In [None]:
# Pretty-print every loaded document: its index, source file name and content.
for index, doc in enumerate(pymupdf4llm_docs):
    # Escape the dynamic text so square brackets in file names or in the
    # Markdown content (e.g. links) are printed literally instead of being
    # parsed as rich markup tags. Pulling the values out of the f-strings also
    # avoids nesting double quotes inside a double-quoted f-string, which is a
    # SyntaxError on Python versions before 3.12.
    file_name = escape(doc.metadata["file_name"])
    content = escape(doc.page_content)
    print(
        f"[bold {BLUE}]> DOC N°:[/] [bold {WHITE}]{index}[/]\n",
        f"[bold {EMERALD}]> FILENAME:[/] [bold {WHITE}]{file_name}[/]\n\n",
        f"[bold {YELLOW}]> CONTENT:[/]\n[{WHITE}]{content}[/]",
    )

---
---
---

In [62]:
%pip install -qU pytesseract pdf2image poppler-utils

Note: you may need to restart the kernel to use updated packages.


In [63]:
import pytesseract
from pdf2image import convert_from_path
from tqdm import tqdm

# Path to the PDF file that will be OCR-processed.
pdf_path = "../../../notificaciones/RES 04-04-2024 - DILIGENCIA PRELIMINAR.pdf"
# pdf_path = "../../../COLEGA DATA/MÉTODO DE LA DEMANDA Y SU CONTESTACIÓN/1_EL_CASO_Y_SU_SOLUCIÓN.pdf"

# Convert the PDF into a list of page images.
pages = convert_from_path(pdf_path)

# Run Tesseract (Spanish language model) on each page image and wrap the
# extracted text in one Document per page, tagged with its source path and
# zero-based page index.
documents = []
progress = tqdm(
    pages,
    desc="EXTRACTING TEXT FROM PDFs",
    total=len(pages),
    colour=EMERALD,
)
for page_num, page in enumerate(progress):
    extracted_text = pytesseract.image_to_string(page, lang="spa")
    page_metadata = {"source": pdf_path, "page": page_num}
    documents.append(
        Document(page_content=extracted_text, metadata=page_metadata)
    )

EXTRACTING TEXT FROM PDFs: 100%|[38;2;52;211;153m██████████[0m| 2/2 [00:03<00:00,  1.85s/it]


In [64]:
# Sanity check: number of per-page Documents produced by the OCR loop.
len(documents)

2

In [None]:
# Print the OCR text of every page. `print` here is rich's print, which parses
# square-bracketed text as markup, so the text is escaped to show it literally.
for doc in documents:
    print(escape(doc.page_content))

---
---
---