In [None]:
%pip install -qU langchain-community langchain-core pymupdf pymupdf4llm

In [3]:
# GENERAL IMPORTS
import os
import re
from pathlib import Path
from typing import List, Dict

import numpy as np
from langchain_core.documents import Document
from pdf2image import convert_from_path
from rich import print
from rich.markup import escape
from tqdm import tqdm

# RICH'S PRINT COLORS
# Hex color strings interpolated into rich markup tags, e.g. f"[{RED}]text[/]".
YELLOW = "#fde047"
ORANGE = "#f97316"
RED = "#ef4444"
BLUE = "#3b82f6"
CYAN = "#06b6d4"
EMERALD = "#34d399"
VIOLET = "#a855f7"
PINK = "#ec4899"
GRAY = "#64748b"
WHITE = "#cccccc"
GREEN = "#3fb618"

# PATHS
# Every path is relative, so it resolves against the kernel's working directory.
ROOT_DIR = Path("../../../../COLEGA DATA")
# Directories handed to directory_loader() in the cells below.
PDF_DIR = ROOT_DIR / "notificaciones"
PDF_DIR_2 = ROOT_DIR / "MÉTODO DE LA DEMANDA Y SU CONTESTACIÓN" / "CAPS"
# Single-file paths; no cell visible here references them.
PDF_FILE_1 = PDF_DIR / "RES 04-04-2024 - DILIGENCIA PRELIMINAR.pdf"
PDF_FILE_2 = PDF_DIR_2 / "1_EL_CASO_Y_SU_SOLUCIÓN.pdf"

In [4]:
# Sets the TOKENIZERS_PARALLELISM environment variable to "false" for this process.
# NOTE(review): presumably meant to silence the Hugging Face tokenizers fork
# warning; no visible cell uses a tokenizer — confirm this is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [5]:
def search_dir(dir_path: str, file_ext: str) -> List[Dict[str, str]]:
    """
    List the files with a given extension located directly inside a directory.

    The scan is not recursive: only entries of `dir_path` itself are matched.

    Args:
        dir_path: Directory to scan.
        file_ext: Extension to match, with or without the leading dot.

    Returns:
        One dict per matching file, with the keys "filename" (base name) and
        "filepath" (the path as a string).

    Raises:
        ValueError: If the directory does not exist, is empty, or contains no
            file with the requested extension.
    """
    directory = Path(dir_path)

    if not directory.is_dir():
        raise ValueError(f"search_dir() => DIRECTORY ({directory}) DOESN'T EXIST.")

    # The iterator yields nothing only when the directory has no entries at all
    if next(directory.iterdir(), None) is None:
        raise ValueError(f"search_dir() => DIRECTORY ({directory}) IS EMPTY.")

    # Normalize the extension so "pdf" and ".pdf" behave the same
    suffix = file_ext if file_ext.startswith(".") else f".{file_ext}"

    # Collect regular files only (a directory named "x.pdf" is skipped)
    matches: List[Dict[str, str]] = []
    for entry in directory.glob(f"*{suffix}"):
        if entry.is_file():
            matches.append({"filename": entry.name, "filepath": str(entry)})

    if len(matches) == 0:
        raise ValueError(
            f"search_dir() => NO FILES WITH EXTENSION ({suffix}) WERE FOUND IN DIRECTORY ({directory})."
        )

    return matches


def text_cleaner(text: str) -> str:
    """
    Normalize extracted text so it contains single spaces and at most one blank
    line between paragraphs.

    Steps, in order:
      1. Non-breaking spaces (\\xa0) become regular spaces.
      2. Runs of three or more "-" (horizontal rules) are removed.
      3. Runs of two or more "#" are removed (a single "#" is kept).
      4. Runs of spaces collapse to one space.
      5. Runs of three or more newlines collapse to a blank line.
      6. Each paragraph is stripped, empty paragraphs are dropped, and the
         whole text is stripped.

    Symbols are removed BEFORE whitespace is normalized: removing them last
    leaves the gaps they occupied behind (e.g. "a\\n\\n-----\\n\\nb" would keep
    four newlines, and "a --- b" would keep a double space).

    Args:
        text: Text to clean.

    Returns:
        The cleaned text. A non-string input is reported and returned
        unchanged, so the function never raises.
    """
    if not isinstance(text, str):
        # Best effort, like the original blanket except: report and pass through
        print(
            f"An error occurred while cleaning the text: expected str, got {type(text).__name__}"
        )
        return text

    # From non-breaking space character to a regular space
    text = text.replace("\xa0", " ")
    # From >=3 - symbols to none
    text = re.sub(r"-{3,}", "", text)
    # From >=2 hash symbols to none
    text = re.sub(r"#{2,}", "", text)
    # From multiple spaces to a single space (after removals, which leave gaps)
    text = re.sub(r" +", " ", text)
    # From >=3 line breaks to double line breaks
    text = re.sub(r"\n{3,}", "\n\n", text)

    # Trim every paragraph and drop the ones that ended up empty, so that a
    # whitespace-only paragraph cannot reintroduce extra blank lines
    paragraphs = (paragraph.strip() for paragraph in text.split("\n\n"))
    text = "\n\n".join(paragraph for paragraph in paragraphs if paragraph)

    return text.strip()


def is_text_corrupt(text: str, min_valid_ratio: float = 0.7) -> bool:
    """
    Heuristically decide whether extracted text is corrupt or badly decoded.

    The text is flagged when it is missing/blank, or when the share of
    characters that are alphabetic or whitespace falls below
    `min_valid_ratio`. Digits, punctuation and symbols all count against the
    ratio, so number-heavy but legitimate text can also be flagged.

    Args:
        text: Extracted text to check. None is treated like blank text.
        min_valid_ratio: Minimum fraction (0-1) of alphabetic/whitespace
            characters for the text to be considered healthy. Defaults to 0.7.

    Returns:
        True if the text looks corrupt, False otherwise.
    """
    # Nothing extracted at all is treated as corrupt
    if text is None or not text.strip():
        return True

    # Letters (any alphabet) and whitespace are the only characters counted as valid
    valid_chars = sum(1 for char in text if char.isalpha() or char.isspace())

    # len(text) > 0 is guaranteed by the blank check above
    return (valid_chars / len(text)) < min_valid_ratio

In [6]:
from pymupdf4llm import to_markdown

In [7]:
def directory_loader(dir_path: str, file_ext: str) -> List[Document]:
    """
    Load every matching file of a directory as one cleaned Document.

    Files are located with search_dir(), converted to Markdown with
    pymupdf4llm's to_markdown(), cleaned with text_cleaner() and wrapped in a
    Document whose metadata is the file's search_dir() entry.

    Args:
        dir_path: Directory to scan.
        file_ext: Extension of the files to load, with or without the dot.

    Returns:
        One Document per file found, in the order search_dir() returned them.
    """
    # Locate the files first; search_dir() raises if nothing usable is found
    found_files: List[Dict[str, str]] = search_dir(dir_path, file_ext)
    documents: List[Document] = []

    progress = tqdm(
        found_files, desc="LOADING PDF FILES", total=len(found_files), colour=EMERALD
    )

    # FILE --> MARKDOWN --> CLEANED TEXT --> DOCUMENT
    for file_info in progress:
        markdown_text = to_markdown(file_info["filepath"], show_progress=False)
        documents.append(
            Document(metadata=file_info, page_content=text_cleaner(markdown_text))
        )

    return documents

In [9]:
# Load every *.pdf directly inside PDF_DIR (no recursion) as one cleaned Document per file.
pdf4llm_docs = directory_loader(PDF_DIR, "pdf")

LOADING PDF FILES: 100%|[38;2;52;211;153m██████████[0m| 6/6 [00:00<00:00, 25.48it/s]


In [None]:
# Print each filename colored by the corruption heuristic:
# red = flagged by is_text_corrupt(), green = looks fine.
for doc in pdf4llm_docs:
    status_color = RED if is_text_corrupt(doc.page_content) else GREEN
    print(f"[{status_color}]{doc.metadata['filename']}[/]")

In [None]:
# Dump every loaded document: position, filename, content length and content.
for index, doc in enumerate(pdf4llm_docs):
    print(
        f"[bold {BLUE}]> DOC N°:[/] [bold {WHITE}]{index}[/]\n",
        # Single quotes inside the f-string: reusing the outer quote type is a
        # SyntaxError on Python < 3.12
        f"[bold {EMERALD}]> FILENAME:[/] [bold {WHITE}]{doc.metadata['filename']}[/]\n\n",
        # escape() keeps square brackets in the extracted text from being
        # parsed as rich markup tags
        f"[bold {YELLOW}]> CONTENT len({len(doc.page_content)}):[/]\n[{WHITE}]{escape(doc.page_content)}[/]",
    )

---

    TESTING WITH BROKEN PDFS

In [None]:
# Same loader pointed at PDF_DIR_2, the directory this section uses to test broken PDFs.
test_docs = directory_loader(PDF_DIR_2, "pdf")

In [None]:
# Print each filename colored by the corruption heuristic:
# red = flagged by is_text_corrupt(), green = looks fine.
for doc in test_docs:
    status_color = RED if is_text_corrupt(doc.page_content) else GREEN
    print(f"[{status_color}]{doc.metadata['filename']}[/]")

# One entry per document: position, filename, content length and content.
# NOTE(review): the trailing "..." suggests a truncated preview was intended,
# but the full page_content is printed — confirm and slice if so.
for index, doc in enumerate(test_docs):
    print(
        f"[bold {BLUE}]> DOC N°:[/] [bold {WHITE}]{index}[/]    ",
        # Single quotes inside the f-string: reusing the outer quote type is a
        # SyntaxError on Python < 3.12
        f"[bold {EMERALD}]> FILENAME:[/] [bold {WHITE}]{doc.metadata['filename']}[/]    ",
        # escape() keeps square brackets in the extracted text from being
        # parsed as rich markup tags
        f"[bold {YELLOW}]> CONTENT:[/] len({len(doc.page_content)}) >>> [{WHITE}]{escape(doc.page_content)}...[/]",
    )