In [None]:
# Install the notebook's third-party dependencies into the kernel's environment.
# NOTE(review): -U upgrades to the newest releases and no versions are pinned, so a
# later re-run may resolve different package versions — consider pinning.
%pip install -qU langchain_community langchain_core pymupdf rapidocr-onnxruntime

In [3]:
# GENERAL IMPORTS
import os
import re
from pathlib import Path
from typing import List, Dict

from langchain_core.documents import Document
from rich import print
from rich.markup import escape
from tqdm import tqdm

# PATHS
# Every location below is derived from ROOT_DIR, which is relative to the
# notebook's working directory.
ROOT_DIR = Path("../../../../COLEGA DATA")
PDF_DIR = ROOT_DIR.joinpath("notificaciones")
PDF_DIR_2 = ROOT_DIR.joinpath("MÉTODO DE LA DEMANDA Y SU CONTESTACIÓN", "CAPS")
PDF_FILE_1 = PDF_DIR.joinpath("RES 04-04-2024 - DILIGENCIA PRELIMINAR.pdf")
PDF_FILE_2 = PDF_DIR_2.joinpath("1_EL_CASO_Y_SU_SOLUCIÓN.pdf")

# RICH'S PRINT COLORS
# Hex codes that get interpolated into rich markup tags, e.g. f"[bold {BLUE}]...[/]".
# -- warm tones
YELLOW = "#fde047"
ORANGE = "#f97316"
RED = "#ef4444"
PINK = "#ec4899"
# -- cool tones
BLUE = "#3b82f6"
CYAN = "#06b6d4"
VIOLET = "#a855f7"
EMERALD = "#34d399"
GREEN = "#3fb618"
# -- neutrals
GRAY = "#64748b"
WHITE = "#cccccc"

In [4]:
# Set TOKENIZERS_PARALLELISM to "false" for this process before any loader runs.
# NOTE(review): presumably this silences the Hugging Face tokenizers parallelism
# warning — confirm it is still needed by the libraries used in this notebook.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [5]:
def search_dir(dir_path: str, file_ext: str) -> List[Dict[str, str]]:
    """
    List the files with a given extension located directly inside a directory.

    The search is not recursive. Matches are returned sorted by path, because
    ``Path.glob`` yields entries in an arbitrary, filesystem-dependent order and
    callers number the loaded files by their position in this list.

    Args:
        dir_path: Directory to scan. It is wrapped in ``Path``, so a string or a
            ``Path`` both work.
        file_ext: Wanted extension, with or without the leading dot
            (``"pdf"`` and ``".pdf"`` are equivalent).

    Returns:
        One dict per matching file with the keys ``"filename"`` (base name) and
        ``"filepath"`` (path as a string), sorted by path.

    Raises:
        ValueError: If ``dir_path`` is not an existing directory, if the
            directory is empty, or if it holds no file with the extension.
    """
    directory = Path(dir_path)

    if not directory.is_dir():
        raise ValueError(f"search_dir() => DIRECTORY ({directory}) DOESN'T EXIST.")

    if not any(directory.iterdir()):
        raise ValueError(f"search_dir() => DIRECTORY ({directory}) IS EMPTY.")

    # Accept the extension with or without its leading dot.
    if not file_ext.startswith("."):
        file_ext = f".{file_ext}"

    # Sorting makes the result (and every index derived from it) reproducible.
    # is_file() drops directories whose name happens to end with the extension.
    files_info: List[Dict[str, str]] = [
        {"filename": match.name, "filepath": str(match)}
        for match in sorted(directory.glob(f"*{file_ext}"))
        if match.is_file()
    ]

    if not files_info:
        raise ValueError(
            f"search_dir() => NO FILES WITH EXTENSION ({file_ext}) WERE FOUND IN DIRECTORY ({directory})."
        )

    return files_info


def text_cleaner(text: str) -> str:
    """
    Normalize extracted text.

    Steps, in order:
      1. non-breaking spaces become regular spaces;
      2. runs of three or more ``-`` are removed;
      3. runs of two or more ``#`` are removed;
      4. runs of spaces collapse to a single space;
      5. runs of three or more line breaks collapse to a blank line;
      6. leading and trailing whitespace is trimmed.

    The symbol runs are removed BEFORE whitespace is normalized, so the gaps
    they leave behind (e.g. ``"a --- b"`` or a ``##`` line between paragraphs)
    are collapsed too instead of surviving as double spaces or extra blank lines.

    Args:
        text: Text to clean.

    Returns:
        The cleaned text. Cleaning is best-effort: if it fails (for instance
        because ``text`` is not a string) a message is printed and the input
        is returned unchanged.
    """
    try:
        # Non-breaking space -> regular space
        cleaned = text.replace("\xa0", " ")
        # Drop separator runs first so whitespace normalization sees the result
        cleaned = re.sub(r"-{3,}", "", cleaned)
        cleaned = re.sub(r"#{2,}", "", cleaned)
        # Multiple spaces -> single space
        cleaned = re.sub(r" +", " ", cleaned)
        # Three or more line breaks -> one blank line
        cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
        # Trim leading and trailing whitespace
        return cleaned.strip()

    except Exception as e:
        print(f"An error occurred while cleaning the text: {e}")
        return text


def is_text_corrupt(text: str, min_valid_ratio: float = 0.7) -> bool:
    """
    Heuristically flag extracted text as corrupt or wrongly decoded.

    Text is considered corrupt when it is empty / whitespace-only, or when the
    share of characters that are alphabetic or whitespace is below
    ``min_valid_ratio``. Digits and punctuation count against the ratio.

    Args:
        text: Extracted text to inspect.
        min_valid_ratio: Minimum fraction (0-1) of alphabetic-or-whitespace
            characters for the text to be accepted. Defaults to 0.7.

    Returns:
        True if the text looks corrupt, False otherwise.
    """
    if not text.strip():
        return True

    # Replacement characters (U+FFFD) and BOMs need no separate count: they are
    # neither alphabetic nor whitespace, so they already lower this ratio.
    valid_chars = sum(1 for char in text if char.isalpha() or char.isspace())

    return (valid_chars / len(text)) < min_valid_ratio

In [6]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders.parsers import RapidOCRBlobParser

In [6]:
def directory_loader(dir_path: str, file_ext: str) -> List[List[Document]]:
    """
    Load every matching file of a directory with PyMuPDFLoader and clean its text.

    The files are located with ``search_dir(dir_path, file_ext)``. Each one is
    loaded with ``mode="page"`` and RapidOCR as the image parser, and every
    returned Document has its ``page_content`` replaced by
    ``text_cleaner(page_content)``.

    Args:
        dir_path: Directory that holds the files.
        file_ext: Extension to look for, with or without the leading dot.

    Returns:
        A list with one entry per file, in the order ``search_dir`` returned
        them. Each entry is the list of Documents the loader produced for that
        file (presumably one per page, given ``mode="page"``), so the result is
        nested: iterate files first, then pages.

    Raises:
        ValueError: Propagated from ``search_dir`` when the directory is
            missing, empty, or holds no file with the extension.
    """
    # SEARCH THE DIRECTORY FOR THE WANTED FILES AND GET THEIR PATHS
    files_info: List[Dict[str, str]] = search_dir(dir_path, file_ext)
    loaded_docs: List[List[Document]] = []

    # LOAD EACH FILE: FILE --> LIST[DOCUMENT]
    for file_info in tqdm(
        files_info, desc="LOADING PDF FILES", total=len(files_info), colour=EMERALD
    ):
        loader = PyMuPDFLoader(
            file_path=file_info["filepath"],
            mode="page",
            images_inner_format="text",
            images_parser=RapidOCRBlobParser(),
        )
        file_pages = loader.load()

        # Clean the text in place, on the Documents the loader returned.
        for page in file_pages:
            page.page_content = text_cleaner(page.page_content)

        # append (not extend): keep one sub-list per file so callers can tell
        # which file a page came from.
        loaded_docs.append(file_pages)

    return loaded_docs

In [None]:
# Load every PDF found in PDF_DIR. directory_loader() appends one list of Documents
# per file, so rapidocr_docs is a list of lists (files first, then pages).
rapidocr_docs = directory_loader(PDF_DIR, "pdf")

In [None]:
# Print every loaded page, tagged with the index of the file it belongs to.
# Interpolated values go through rich's escape() so bracketed text in the title or
# in the extracted content is printed literally instead of being parsed as markup.
# Single quotes inside the f-strings keep the cell valid on Python < 3.12 as well.
# NOTE(review): the FILENAME label shows metadata['title'], which may differ from
# the file's actual name — confirm this is the intended field.
for index, doc in enumerate(rapidocr_docs):
    for page in doc:
        print(
            f"[bold {BLUE}]> DOC N°:[/] [bold {WHITE}]{index}[/]\n",
            f"[bold {EMERALD}]> FILENAME:[/] [bold {WHITE}]{escape(str(page.metadata['title']))}[/]\n\n",
            f"[bold {YELLOW}]> CONTENT:[/]\n[{WHITE}]{escape(page.page_content)}[/]",
        )

---

    TESTING WITH BROKEN PDFS

In [None]:
# SINGLE FILE
# Load PDF_FILE_2 on its own, with the same loader arguments directory_loader() uses.
# Unlike directory_loader(), the page text is NOT passed through text_cleaner() here,
# so the cells below inspect the loader's raw output.
loader = PyMuPDFLoader(
    file_path=PDF_FILE_2,
    mode="page",
    images_inner_format="text",
    images_parser=RapidOCRBlobParser(),
)
loaded_file = loader.load()

In [None]:
# Display the loader's result as the cell output.
# NOTE(review): this shows every Document in full; consider a slice for large files.
loaded_file

In [None]:
# Print each page's title coloured by the corruption heuristic:
# red = is_text_corrupt() flagged the page text, green = it passed.
# Titles and content go through rich's escape() so bracketed text is printed
# literally instead of being parsed as markup tags.
for doc in loaded_file:
    title_colour = RED if is_text_corrupt(doc.page_content) else GREEN
    print(f"[{title_colour}]{escape(str(doc.metadata['title']))}[/]")

# Show the last (up to) five pages in full, labelled with their negative index in
# loaded_file. Starting the count at -len(tail_pages) keeps the labels correct
# even when the file has fewer than five pages.
tail_pages = loaded_file[-5:]
for index, doc in enumerate(tail_pages, start=-len(tail_pages)):
    print(
        f"[bold {BLUE}]> DOC N°:[/] [bold {WHITE}]{index}[/]\n",
        f"[bold {EMERALD}]> FILENAME:[/] [bold {WHITE}]{escape(str(doc.metadata['title']))}[/]\n\n",
        f"[bold {YELLOW}]> CONTENT len({len(doc.page_content)}):[/]\n[{WHITE}]{escape(doc.page_content)}[/]",
    )