In [None]:
%pip install -qU langchain_community langchain_core pymupdf pymupdf4llm rapidocr-onnxruntime

In [2]:
# GENERAL IMPORTS
import os
import re
import numpy as np
from langchain_core.documents import Document
from pathlib import Path
from pdf2image import convert_from_path
from rich import print
from tqdm import tqdm
from typing import List, Dict

# RICH'S PRINT COLORS
# Hex colour codes. Further down they are interpolated into rich markup tags
# (e.g. f"[{RED}]...[/]") and passed to tqdm through its `colour` argument.
YELLOW = "#fde047"
ORANGE = "#f97316"
RED = "#ef4444"
BLUE = "#3b82f6"
CYAN = "#06b6d4"
EMERALD = "#34d399"
VIOLET = "#a855f7"
PINK = "#ec4899"
GRAY = "#64748b"
WHITE = "#cccccc"
GREEN = "#3fb618"

# PATHS
# ROOT_DIR is a relative path, so it is resolved against the kernel's current
# working directory.
# NOTE(review): PDF_FILE_1 / PDF_FILE_2 are not referenced in the cells visible
# here — confirm they are still needed.
ROOT_DIR = Path("../../../../COLEGA DATA")
PDF_DIR = ROOT_DIR / "notificaciones"
PDF_DIR_2 = ROOT_DIR / "MÉTODO DE LA DEMANDA Y SU CONTESTACIÓN" / "CAPS"
PDF_FILE_1 = PDF_DIR / "RES 04-04-2024 - DILIGENCIA PRELIMINAR.pdf"
PDF_FILE_2 = PDF_DIR_2 / "1_EL_CASO_Y_SU_SOLUCIÓN.pdf"

In [3]:
# Set TOKENIZERS_PARALLELISM to "false" in this process's environment.
# NOTE(review): presumably meant to silence the Hugging Face `tokenizers`
# parallelism/fork warning — confirm it is still required by this notebook.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
def search_dir(dir_path: str, file_ext: str) -> List[Dict[str, str]]:
    """Find the files with a given extension directly inside a directory.

    The search is not recursive. The extension is matched case-insensitively
    (so "pdf" also finds "SCAN.PDF") and the result is sorted by filename, so
    the order does not depend on how the filesystem happens to list entries.

    Args:
        dir_path: Directory to search.
        file_ext: Wanted extension, with or without the leading dot.

    Returns:
        One dict per matching file with the keys "filename" (the bare name)
        and "filepath" (the path as a string).

    Raises:
        ValueError: If the directory does not exist, is empty, or contains no
            file with the requested extension.
    """
    dir_path = Path(dir_path)

    if not dir_path.is_dir():
        raise ValueError(f"search_dir() => DIRECTORY ({dir_path}) DOESN'T EXIST.")

    if not any(dir_path.iterdir()):
        raise ValueError(f"search_dir() => DIRECTORY ({dir_path}) IS EMPTY.")

    if not file_ext.startswith("."):
        file_ext = f".{file_ext}"

    # SEARCH FOR WANTED FILES
    # Compare lower-cased names instead of globbing so that upper-case
    # extensions are found on case-sensitive filesystems too.
    wanted_suffix = file_ext.lower()
    matches = sorted(
        (
            entry
            for entry in dir_path.iterdir()
            if entry.is_file() and entry.name.lower().endswith(wanted_suffix)
        ),
        key=lambda entry: entry.name,
    )
    files_info: List[Dict[str, str]] = [
        {"filename": entry.name, "filepath": str(entry)} for entry in matches
    ]

    # CHECK IF FILES WERE FOUND
    if not files_info:
        raise ValueError(
            f"search_dir() => NO FILES WITH EXTENSION ({file_ext}) WERE FOUND IN DIRECTORY ({dir_path})."
        )

    return files_info

In [5]:
def text_cleaner(text: str) -> str:
    """Normalise whitespace in extracted text and strip hash symbols.

    Steps, in order: non-breaking spaces become regular spaces, every "#" is
    removed, runs of spaces/tabs collapse to a single space, spaces next to a
    line break are dropped, three or more consecutive newlines are capped at
    two, and leading/trailing whitespace is trimmed.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text. If cleaning fails, the error is printed and the text
        is returned as it was when the failure happened (best effort).
    """
    try:
        # Replace non-breaking spaces with regular spaces
        text = text.replace("\xa0", " ")
        # Remove hash symbols first, so the gaps they leave are collapsed below
        text = text.replace("#", "")
        # Collapse horizontal whitespace only. A plain r"\s+" would also
        # swallow newlines and leave the newline rule below with nothing to do.
        text = re.sub(r"[^\S\n]+", " ", text)
        # Drop the (at most one) space left on either side of a line break
        text = re.sub(r" ?\n ?", "\n", text)
        # Cap consecutive newlines at two (one blank line)
        text = re.sub(r"\n{3,}", "\n\n", text)
        # Trim leading and trailing whitespace
        return text.strip()
    except Exception as e:
        print(f"An error occurred while cleaning the text: {e}")
        return text

In [6]:
def is_text_corrupt(text: str, min_valid_ratio: float = 0.7) -> bool:
    """Heuristically flag extracted text as corrupt or wrongly decoded.

    Text is considered corrupt when it is empty/whitespace-only, or when the
    share of alphabetic and whitespace characters falls below
    ``min_valid_ratio``. Digits, punctuation and the Unicode replacement
    character are neither alphabetic nor whitespace, so they all count
    against the ratio.

    Args:
        text: The extracted text to inspect.
        min_valid_ratio: Minimum fraction of alphabetic/whitespace characters
            for the text to be accepted. Defaults to 0.7.

    Returns:
        True if the text looks corrupt, False otherwise.
    """
    if not text.strip():
        return True

    # Count alphabetic characters and whitespace against the total length.
    # total_chars cannot be zero here: empty text returned above.
    total_chars = len(text)
    valid_chars = sum(c.isalpha() or c.isspace() for c in text)

    # Too few alphabetic/whitespace characters => mark as corrupt
    return (valid_chars / total_chars) < min_valid_ratio

# PyMuPDF4llm

In [None]:
# pymupdf4llm is also listed in the notebook's first install cell.
# NOTE(review): presumably repeated so this section can be run on its own — confirm.
%pip install -qU pymupdf4llm

In [8]:
from pymupdf4llm import to_markdown

In [9]:
def pdf4llm_directory_loader(dir_path: str, file_ext: str) -> List[Document]:
    """Convert every matching file of a directory into one Markdown Document.

    Args:
        dir_path: Directory handed to search_dir().
        file_ext: File extension handed to search_dir().

    Returns:
        One Document per file found. Its page_content is the output of
        to_markdown() for that file and its metadata is the search_dir() entry
        ("filename" and "filepath").
    """
    # LOCATE THE FILES FIRST; search_dir() RAISES IF NOTHING USABLE IS FOUND
    pdf_entries: List[Dict[str, str]] = search_dir(dir_path, file_ext)

    progress = tqdm(
        pdf_entries, desc="LOADING PDF FILES", total=len(pdf_entries), colour=EMERALD
    )

    # FILE --> DOCUMENT, ONE PER FILE
    return [
        Document(
            metadata=entry,
            page_content=to_markdown(entry["filepath"], show_progress=False),
        )
        for entry in progress
    ]

In [None]:
# Convert every "pdf" file found in PDF_DIR into one Markdown Document per file.
pdf4llm_docs = pdf4llm_directory_loader(PDF_DIR, "pdf")

In [None]:
# Print each filename coloured by the corruption heuristic:
# red when is_text_corrupt() flags the content, green otherwise.
for doc in pdf4llm_docs:
    status_colour = RED if is_text_corrupt(doc.page_content) else GREEN
    print(f"[{status_colour}]{doc.metadata['filename']}[/]")

In [None]:
# Load the second corpus (PDF_DIR_2) with the same Markdown loader.
test_docs = pdf4llm_directory_loader(PDF_DIR_2, "pdf")

# Filename coloured red when is_text_corrupt() flags the content, green otherwise.
for index, doc in enumerate(test_docs):
    if is_text_corrupt(doc.page_content):
        print(f"[{RED}]{doc.metadata['filename']}[/]")
    else:
        print(f"[{GREEN}]{doc.metadata['filename']}[/]")

# Preview the first 500 characters of every document.
# The dictionary key uses single quotes inside the double-quoted f-string:
# reusing the outer quote type is only valid syntax from Python 3.12 on.
# NOTE(review): page_content is interpolated into rich markup unescaped, so
# bracketed text in a document may be interpreted as markup — consider
# rich.markup.escape() if previews look wrong.
for index, doc in enumerate(test_docs):
    print(
        f"[bold {BLUE}]> DOC N°:[/] [bold {WHITE}]{index}[/]\n",
        f"[bold {EMERALD}]> FILENAME:[/] [bold {WHITE}]{doc.metadata['filename']}[/]\n\n",
        f"[bold {YELLOW}]> CONTENT:[/]\n[{WHITE}]{doc.page_content[:500]}[/]",
    )

---
---

# PyMuPDFLoader + RapidOCR

In [14]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders.parsers import RapidOCRBlobParser

In [16]:
def pymupdf_directory_loader(dir_path: str, file_ext: str) -> List[List[Document]]:
    """Load every matching file of a directory with PyMuPDFLoader and RapidOCR.

    Each file is loaded with mode="page"; images are handed to a
    RapidOCRBlobParser and inserted with images_inner_format="text".

    Args:
        dir_path: Directory handed to search_dir().
        file_ext: File extension handed to search_dir().

    Returns:
        A nested list: one inner list per file, in search_dir() order, holding
        whatever loader.load() returned for that file (with mode="page" this is
        presumably one Document per page — confirm against the loader docs).
    """

    # SEARCH IN THE GIVEN DIRECTORY FOR EACH PDF FILE IN IT AND GETS ITS PATH
    loaded_docs: List[List[Document]] = []
    files_info: List[Dict[str, str]] = search_dir(dir_path, file_ext)

    # LOADS EACH PDF FILE: FILE --> LIST[DOCUMENT]
    for f in tqdm(
        files_info, desc="LOADING PDF FILES", total=len(files_info), colour=EMERALD
    ):
        loader = PyMuPDFLoader(
            file_path=f["filepath"],
            mode="page",
            images_inner_format="text",
            images_parser=RapidOCRBlobParser(),
        )
        # append (not extend) on purpose: pages stay grouped by source file
        loaded_docs.append(loader.load())

    return loaded_docs

In [None]:
# Load every "pdf" file in PDF_DIR with the RapidOCR-backed loader.
# The result holds one entry per file; each entry is the list returned by loader.load().
rapidocr_docs = pymupdf_directory_loader(PDF_DIR, "pdf")

In [None]:
# Print every page of every file loaded with the RapidOCR-backed loader.
# The loader's result is bound to `rapidocr_docs` in the previous cell, so that
# is the name iterated here. `doc` is the list of pages of one file.
# The dictionary key uses single quotes inside the double-quoted f-string:
# reusing the outer quote type is only valid syntax from Python 3.12 on.
# NOTE(review): the FILENAME line shows the "title" metadata entry — confirm
# this is the intended value rather than the file's name.
for index, doc in enumerate(rapidocr_docs):
    for page in doc:
        print(
            f"[bold {BLUE}]> DOC N°:[/] [bold {WHITE}]{index}[/]\n",
            f"[bold {EMERALD}]> FILENAME:[/] [bold {WHITE}]{page.metadata['title']}[/]\n\n",
            f"[bold {YELLOW}]> CONTENT:[/]\n[{WHITE}]{page.page_content}[/]",
        )

---

# PyMuPDFLoader + Tesseract

In [None]:
%pip install -qU Pillow

In [19]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders.parsers import TesseractBlobParser

In [21]:
def directory_loader(dir_path: str, file_ext: str) -> List[List[Document]]:
    """Load every matching file of a directory with PyMuPDFLoader and Tesseract.

    Each file is loaded with mode="page"; images are handed to a
    TesseractBlobParser and inserted with images_inner_format="text".

    Args:
        dir_path: Directory handed to search_dir().
        file_ext: File extension handed to search_dir().

    Returns:
        A nested list: one inner list per file, in search_dir() order, holding
        whatever loader.load() returned for that file (with mode="page" this is
        presumably one Document per page — confirm against the loader docs).
    """

    # SEARCH IN THE GIVEN DIRECTORY FOR EACH PDF FILE IN IT AND GETS ITS PATH
    loaded_docs: List[List[Document]] = []
    files_info: List[Dict[str, str]] = search_dir(dir_path, file_ext)

    # LOADS EACH PDF FILE: FILE --> LIST[DOCUMENT]
    for f in tqdm(
        files_info, desc="LOADING PDF FILES", total=len(files_info), colour=EMERALD
    ):
        # NOTE(review): TesseractBlobParser is built with its default settings;
        # if its default OCR language is English, non-English documents may be
        # recognised poorly — confirm and pass the language explicitly if needed.
        loader = PyMuPDFLoader(
            file_path=f["filepath"],
            mode="page",
            images_inner_format="text",
            images_parser=TesseractBlobParser(),
        )
        # append (not extend) on purpose: pages stay grouped by source file
        loaded_docs.append(loader.load())

    return loaded_docs

In [None]:
# Load every "pdf" file in PDF_DIR with the Tesseract-backed loader.
# The result holds one entry per file; each entry is the list returned by loader.load().
tesseract_docs = directory_loader(PDF_DIR, "pdf")