# GENERAL SETUP


In [None]:
%pip install -qU langchain_community langchain_core pdf2image

In [2]:
# GENERAL IMPORTS
import os
import re
from pathlib import Path
from typing import Dict, List

import numpy as np
from langchain_core.documents import Document
from pdf2image import convert_from_path
from rich import print
from rich.markup import escape
from tqdm import tqdm

In [3]:
# RICH'S PRINT COLORS
# Hex colour codes used as style names inside rich markup tags, e.g.
# f"[{RED}]...[/]", by the print cells of this notebook. EMERALD is also
# passed to tqdm as the progress-bar `colour`.
YELLOW = "#fde047"
ORANGE = "#f97316"
RED = "#ef4444"
BLUE = "#3b82f6"
CYAN = "#06b6d4"
EMERALD = "#34d399"
VIOLET = "#a855f7"
PINK = "#ec4899"
GRAY = "#64748b"
WHITE = "#cccccc"
GREEN = "#3fb618"

In [11]:
# GENERAL VARIABLES
# All locations derive from ROOT_DIR, which is a relative path and therefore
# resolved against the working directory the kernel was started in.
ROOT_DIR = Path("../../../../COLEGA DATA")
# Directories handed to the *_directory_loader functions below.
PDF_DIR = ROOT_DIR / "notificaciones"
PDF_DIR_2 = ROOT_DIR / "MÉTODO DE LA DEMANDA Y SU CONTESTACIÓN" / "CAPS"
# Single files; PDF_FILE_1 feeds the SuryaOCR and PyMuPDFLoader cells.
PDF_FILE_1 = PDF_DIR / "RES 04-04-2024 - DILIGENCIA PRELIMINAR.pdf"
PDF_FILE_2 = PDF_DIR_2 / "1_EL_CASO_Y_SU_SOLUCIÓN.pdf"

In [5]:
def search_dir(dir_path: "str | Path", file_ext: str) -> List[Dict[str, str]]:
    """
    FILE'S SEARCH IN A GIVEN DIRECTORY

    Lists the regular files located directly inside ``dir_path`` (no
    recursion) whose name ends with ``file_ext``. Results are sorted by path
    so the order - and therefore every document index printed later - is
    reproducible across runs and filesystems.

    Args:
        dir_path: Directory to scan, as a string or a ``pathlib.Path``.
        file_ext: Wanted extension, with or without the leading dot.

    Returns:
        One ``{"filename": ..., "filepath": ...}`` dict per matching file.

    Raises:
        ValueError: If ``dir_path`` is not a directory, is empty, or contains
            no file with the requested extension.
    """
    dir_path = Path(dir_path)

    if not dir_path.is_dir():
        raise ValueError(f"search_dir() => DIRECTORY ({dir_path}) DOESN'T EXIST.")

    if not any(dir_path.iterdir()):
        raise ValueError(f"search_dir() => DIRECTORY ({dir_path}) IS EMPTY.")

    if not file_ext.startswith("."):
        file_ext = f".{file_ext}"

    # SEARCH FOR WANTED FILES
    # glob() yields entries in arbitrary filesystem order, hence the sort;
    # is_file() drops directories whose name happens to match the pattern.
    files_info: List[Dict[str, str]] = [
        {"filename": f.name, "filepath": str(f)}
        for f in sorted(dir_path.glob(f"*{file_ext}"))
        if f.is_file()
    ]

    # CHECK IF FILES WERE FOUND
    if not files_info:
        raise ValueError(
            f"search_dir() => NO FILES WITH EXTENSION ({file_ext}) WERE FOUND IN DIRECTORY ({dir_path})."
        )

    return files_info

In [6]:
def text_cleaner(text: str) -> str:
    """
    Cleans text by replacing non-breaking spaces, removing hash symbols and
    normalizing spaces and newlines.

    Line breaks are preserved: only horizontal whitespace is collapsed, and
    runs of three or more newlines are reduced to a single blank line.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text. If ``text`` is not a string, a message is printed
        and the input is returned unchanged (best-effort behaviour).
    """
    try:
        # Replace non-breaking spaces with regular spaces
        text = text.replace("\xa0", " ")
        # Remove hash symbols first, so the gaps they leave are collapsed below
        text = re.sub(r"#", "", text)
        # Collapse horizontal whitespace only; matching "\n" here would erase
        # every line break and make the newline normalization a no-op
        text = re.sub(r"[^\S\n]+", " ", text)
        # Drop spaces hugging a newline so whitespace-only lines count as blank
        text = re.sub(r" *\n *", "\n", text)
        # Normalize newlines: at most one blank line between paragraphs
        text = re.sub(r"\n{3,}", "\n\n", text)
        # Trim leading and trailing whitespace
        return text.strip()
    except (AttributeError, TypeError) as e:
        # Raised when `text` is not a str (e.g. None or bytes)
        print(f"An error occurred while cleaning the text: {e}")
        return text

In [7]:
def is_text_corrupt(text: str, min_valid_ratio: float = 0.7) -> bool:
    """
    Check whether extracted text looks corrupt or wrongly decoded.

    The heuristic measures the share of characters that are alphabetic or
    whitespace; digits, punctuation and any other symbol count against it.

    Args:
        text: Extracted text to inspect.
        min_valid_ratio: Minimum share (0-1) of alphabetic/whitespace
            characters for the text to be considered healthy.

    Returns:
        True if the text is missing, blank, or its valid-character share is
        below ``min_valid_ratio``; False otherwise.
    """
    # Missing or whitespace-only text carries no usable content
    if not text or not text.strip():
        return True

    # Count alphabetic characters and whitespace
    valid_chars = sum(c.isalpha() or c.isspace() for c in text)

    # Too few letters/spaces relative to the total length => corrupt
    return (valid_chars / len(text)) < min_valid_ratio

---
---

---


In [88]:
def pdf4llm_directory_loader(dir_path: str, file_ext: str) -> List[Document]:
    """
    LOADS PDF DOCUMENTS FROM A GIVEN DIRECTORY

    Every file reported by ``search_dir`` is passed to ``to_markdown`` and
    wrapped in one ``Document`` whose metadata is the file's
    ``{"filename", "filepath"}`` dict.

    Args:
        dir_path: Directory to scan.
        file_ext: Extension of the files to load.

    Returns:
        One ``Document`` per file found.
    """
    # NOTE(review): `to_markdown` is not imported in any visible cell -
    # presumably `pymupdf4llm.to_markdown`; confirm and add it to the imports
    # cell, otherwise this fails with NameError on a fresh kernel.
    found_files: List[Dict[str, str]] = search_dir(dir_path, file_ext)

    progress = tqdm(
        found_files, desc="LOADING PDF FILES", total=len(found_files), colour=EMERALD
    )

    # FILE --> DOCUMENT, ONE PER PDF
    return [
        Document(
            metadata=info,
            page_content=to_markdown(info["filepath"], show_progress=False),
        )
        for info in progress
    ]

In [None]:
# Load every PDF found in PDF_DIR: one Document per file.
pdf4llm_docs = pdf4llm_directory_loader(PDF_DIR, "pdf")

In [None]:
# Print each filename in red when its extracted text looks corrupt, in green
# otherwise. The filename is escaped so that brackets in it are shown
# literally instead of being parsed as rich markup.
for doc in pdf4llm_docs:
    color = RED if is_text_corrupt(doc.page_content) else GREEN
    print(f"[{color}]{escape(doc.metadata['filename'])}[/]")

In [None]:
# Same extraction over the second directory, followed by two reports.
test_docs = pdf4llm_directory_loader(PDF_DIR_2, "pdf")

# Report 1: red = text looks corrupt, green = text looks fine.
for doc in test_docs:
    color = RED if is_text_corrupt(doc.page_content) else GREEN
    print(f"[{color}]{escape(doc.metadata['filename'])}[/]")

# Report 2: first 500 characters of every document.
# Extracted text and filenames are escaped so that bracketed fragments are
# printed literally instead of being interpreted (or rejected) as rich markup.
# Inner quotes are single so the f-strings also parse before Python 3.12.
for index, doc in enumerate(test_docs):
    print(
        f"[bold {BLUE}]> DOC N°:[/] [bold {WHITE}]{index}[/]\n",
        f"[bold {EMERALD}]> FILENAME:[/] [bold {WHITE}]{escape(doc.metadata['filename'])}[/]\n\n",
        f"[bold {YELLOW}]> CONTENT:[/]\n[{WHITE}]{escape(doc.page_content[:500])}[/]",
    )

---
---

---


# PyTesseract


In [None]:
%pip install -qU pytesseract

In [179]:
from pytesseract import image_to_string

In [181]:
def pytess_directory_loader(dir_path: str, file_ext: str) -> List[Document]:
    """
    LOADS PDF DOCUMENTS FROM A GIVEN DIRECTORY

    Each file reported by ``search_dir`` is converted to page images with
    ``convert_from_path``; every image goes through ``image_to_string`` with
    ``lang="spa"`` and the page texts are joined with newlines.

    Args:
        dir_path: Directory to scan.
        file_ext: Extension of the files to load.

    Returns:
        One ``Document`` per file, with the file's info dict as metadata.
    """
    found_files: List[Dict[str, str]] = search_dir(dir_path, file_ext)
    documents: List[Document] = []

    progress = tqdm(
        found_files,
        desc="LOADING PDF FILES",
        total=len(found_files),
        colour=EMERALD,
    )
    for info in progress:
        page_images = convert_from_path(info["filepath"])

        # OCR PAGE BY PAGE, KEEPING PAGE ORDER
        page_texts = [image_to_string(image, lang="spa") for image in page_images]

        documents.append(Document(metadata=info, page_content="\n".join(page_texts)))

    return documents

In [None]:
# OCR every PDF found in PDF_DIR: one Document per file.
pytess_docs = pytess_directory_loader(PDF_DIR, "pdf")

In [None]:
# Print index, filename and full extracted text of every document.
# Text and filenames are escaped so that bracketed fragments of the OCR output
# are printed literally instead of being interpreted as rich markup; inner
# quotes are single so the f-strings also parse before Python 3.12.
for index, doc in enumerate(pytess_docs):
    print(
        f"[bold {BLUE}]> DOC N°:[/] [bold {WHITE}]{index}[/]\n",
        f"[bold {EMERALD}]> FILENAME:[/] [bold {WHITE}]{escape(doc.metadata['filename'])}[/]\n\n",
        f"[bold {YELLOW}]> CONTENT:[/]\n[{WHITE}]{escape(doc.page_content)}[/]",
    )

---
---

---


# SuryaOCR


In [None]:
%pip install -qU surya-ocr

In [None]:
from surya.recognition import RecognitionPredictor
from surya.detection import DetectionPredictor

In [None]:
# Convert PDF_FILE_1 with pdf2image (presumably one image per page - confirm).
pages = convert_from_path(PDF_FILE_1)

# Language hints handed to the recognition predictor for every page.
langs = ["es", "en"]

# NOTE(review): the predictor constructors and the call signature used below
# depend on the installed surya-ocr version - verify against its docs.
recognition_predictor = RecognitionPredictor()
detection_predictor = DetectionPredictor()

# The predictor is invoked once per page with a single-image batch
# ([page] / [langs]); each list element is the result of one such call.
predictions_per_page = [
    recognition_predictor([page], [langs], detection_predictor) for page in pages
]

In [47]:
# Flatten every text line of every OCR result of every page into a single
# string, each line preceded by a newline.
txt = "".join(
    f"\n{text_line.text}"
    for prediction in predictions_per_page
    for ocr_result in prediction
    for text_line in ocr_result.text_lines
)

---
---

---


# PyOCR


In [None]:
%pip install -qU pyocr

In [91]:
import pyocr
import pyocr.builders

In [92]:
def pyocr_directory_loader(dir_path: str, file_ext: str) -> List[Document]:
    """
    LOADS PDF DOCUMENTS FROM A GIVEN DIRECTORY

    Each file reported by ``search_dir`` is converted to page images with
    ``convert_from_path``; every image is read with the first available pyocr
    tool using the ``"spa"`` language and a ``TextBuilder``. Page texts are
    joined with newlines.

    Args:
        dir_path: Directory to scan.
        file_ext: Extension of the files to load.

    Returns:
        One ``Document`` per file, with the file's info dict as metadata.

    Raises:
        ValueError: If pyocr reports no available tool, or the selected tool
            does not list ``"spa"`` among its languages.
    """
    loaded_docs: List[Document] = []
    files_info: List[Dict[str, str]] = search_dir(dir_path, file_ext)

    # Fail fast: the errors must be raised, not stored in `tool` / `lang`,
    # otherwise the failure only surfaces later as an unrelated AttributeError.
    tools = pyocr.get_available_tools()
    if len(tools) == 0:
        raise ValueError("pyocr_directory_loader() => NO TOOLS FOUND")
    tool = tools[0]

    langs = tool.get_available_languages()
    if "spa" not in langs:
        raise ValueError("pyocr_directory_loader() => 'spa' IS NOT AVAILABLE")
    lang = "spa"

    for f in tqdm(
        files_info,
        desc="LOADING PDF FILES",
        total=len(files_info),
        colour=EMERALD,
    ):
        f_pages_imgs = convert_from_path(f["filepath"])

        # OCR PAGE BY PAGE, KEEPING PAGE ORDER
        pages = [
            tool.image_to_string(
                page, lang=lang, builder=pyocr.builders.TextBuilder()
            )
            for page in f_pages_imgs
        ]

        content = "\n".join(pages)

        loaded_file = Document(metadata=f, page_content=content)
        loaded_docs.append(loaded_file)

    return loaded_docs

In [None]:
# OCR every PDF found in PDF_DIR: one Document per file.
pyocr_docs = pyocr_directory_loader(PDF_DIR, "pdf")

In [None]:
# Print index, filename and full extracted text of every document.
# Text and filenames are escaped so that bracketed fragments of the OCR output
# are printed literally instead of being interpreted as rich markup; inner
# quotes are single so the f-strings also parse before Python 3.12.
for index, doc in enumerate(pyocr_docs):
    print(
        f"[bold {BLUE}]> DOC N°:[/] [bold {WHITE}]{index}[/]\n",
        f"[bold {EMERALD}]> FILENAME:[/] [bold {WHITE}]{escape(doc.metadata['filename'])}[/]\n\n",
        f"[bold {YELLOW}]> CONTENT:[/]\n[{WHITE}]{escape(doc.page_content)}[/]",
    )

---

In [None]:
# Same OCR over the second directory, followed by two reports.
test_docs = pyocr_directory_loader(PDF_DIR_2, "pdf")

# Report 1: red = text looks corrupt, green = text looks fine.
for doc in test_docs:
    color = RED if is_text_corrupt(doc.page_content) else GREEN
    print(f"[{color}]{escape(doc.metadata['filename'])}[/]")

# Report 2: first 250 characters of every document.
# Extracted text and filenames are escaped so that bracketed fragments are
# printed literally instead of being interpreted (or rejected) as rich markup.
# Inner quotes are single so the f-strings also parse before Python 3.12.
for index, doc in enumerate(test_docs):
    print(
        f"[bold {BLUE}]> DOC N°:[/] [bold {WHITE}]{index}[/]\n",
        f"[bold {EMERALD}]> FILENAME:[/] [bold {WHITE}]{escape(doc.metadata['filename'])}[/]\n\n",
        f"[bold {YELLOW}]> CONTENT:[/]\n[{WHITE}]{escape(doc.page_content[:250])}[/]",
    )

---
---

---


# EasyOCR


In [None]:
%pip install -qU easyocr

In [16]:
import easyocr
import numpy as np

In [26]:
def easyocr_directory_loader(dir_path: str, file_ext: str) -> List[Document]:
    """
    LOADS PDF DOCUMENTS FROM A GIVEN DIRECTORY WITH PROGRESS INDICATOR.

    Each file reported by ``search_dir`` is converted to page images with
    ``convert_from_path``; every image is turned into a NumPy array and read
    by an EasyOCR reader created for Spanish and English. The pieces read on a
    page are joined with spaces, and pages are joined with newlines.

    Args:
        dir_path: Directory to scan.
        file_ext: Extension of the files to load.

    Returns:
        One ``Document`` per file, with the file's info dict as metadata.
    """
    found_files: List[Dict[str, str]] = search_dir(dir_path, file_ext)

    # One reader, shared by every file and page
    ocr_reader = easyocr.Reader(["es", "en"])

    documents: List[Document] = []
    progress = tqdm(
        found_files,
        desc="LOADING PDF FILES",
        total=len(found_files),
        colour=EMERALD,
    )
    for info in progress:
        page_texts = []
        for image in convert_from_path(info["filepath"]):
            detections = ocr_reader.readtext(np.array(image))
            # Index 1 of each result entry is used as its text
            page_texts.append(" ".join(entry[1] for entry in detections))

        documents.append(Document(metadata=info, page_content="\n".join(page_texts)))

    return documents

In [None]:
# OCR every PDF found in PDF_DIR: one Document per file.
easyocr_docs = easyocr_directory_loader(PDF_DIR, "pdf")

In [None]:
# Print index, filename and full extracted text of every document.
# Text and filenames are escaped so that bracketed fragments of the OCR output
# are printed literally instead of being interpreted as rich markup; inner
# quotes are single so the f-strings also parse before Python 3.12.
for index, doc in enumerate(easyocr_docs):
    print(
        f"[bold {BLUE}]> DOC N°:[/] [bold {WHITE}]{index}[/]\n",
        f"[bold {EMERALD}]> FILENAME:[/] [bold {WHITE}]{escape(doc.metadata['filename'])}[/]\n\n",
        f"[bold {YELLOW}]> CONTENT:[/]\n[{WHITE}]{escape(doc.page_content)}[/]",
    )

---
---
---

# Docling

In [None]:
%pip install -qU docling langchain-docling

In [None]:
from docling.document_converter import DocumentConverter

In [18]:
def docling_directory_loader(dir_path: str, file_ext: str) -> List[Document]:
    """
    LOADS PDF DOCUMENTS FROM A GIVEN DIRECTORY

    Each file reported by ``search_dir`` is converted with a single shared
    ``DocumentConverter`` and its document is exported as plain text.

    Args:
        dir_path: Directory to scan.
        file_ext: Extension of the files to load.

    Returns:
        One ``Document`` per file, with the file's info dict as metadata.
    """
    # SEARCH IN THE GIVEN DIRECTORY FOR EACH PDF FILE IN IT AND GETS ITS PATH
    loaded_docs: List[Document] = []
    files_info: List[Dict[str, str]] = search_dir(dir_path, file_ext)

    # The converter does not depend on the file being processed, so it is
    # built once here instead of once per loop iteration.
    converter = DocumentConverter()

    # LOADS EACH PDF FILE: FILE --> DOCUMENT
    for f in tqdm(
        files_info, desc="LOADING PDF FILES", total=len(files_info), colour=EMERALD
    ):
        result = converter.convert(f["filepath"])
        extracted_text = result.document.export_to_text()

        loaded_docs.append(Document(metadata=f, page_content=extracted_text))

    return loaded_docs

In [None]:
# Convert every PDF found in PDF_DIR: one Document per file.
docling_docs = docling_directory_loader(PDF_DIR, "pdf")

In [None]:
# Print index, filename and full extracted text of every document.
# Text and filenames are escaped so that bracketed fragments of the extracted
# text are printed literally instead of being interpreted as rich markup;
# inner quotes are single so the f-strings also parse before Python 3.12.
for index, doc in enumerate(docling_docs):
    print(
        f"[bold {BLUE}]> DOC N°:[/] [bold {WHITE}]{index}[/]\n",
        f"[bold {EMERALD}]> FILENAME:[/] [bold {WHITE}]{escape(doc.metadata['filename'])}[/]\n\n",
        f"[bold {YELLOW}]> CONTENT:[/]\n[{WHITE}]{escape(doc.page_content)}[/]",
    )

---
---
---

---
---
---

# PyMuPDFLoader + Groq (Multimodal Model)

In [None]:
%pip install -qU langchain-groq pymupdf

In [2]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders.parsers import LLMImageBlobParser
from langchain_groq import ChatGroq

In [2]:
from getpass import getpass
from pydantic import SecretStr

# The key is typed at an interactive prompt (getpass) rather than written in
# the notebook, then wrapped in pydantic's SecretStr before being passed on.
GROQ_API_KEY = SecretStr(getpass("GROQ_API_KEY ="))

In [4]:
# Chat model handed to LLMImageBlobParser in the loader cell below.
# NOTE(review): "llama-3.2-11b-vision-preview" is a preview model id - confirm
# that it is still served by Groq before relying on this cell.
MODEL: ChatGroq = ChatGroq(
    api_key=GROQ_API_KEY, model="llama-3.2-11b-vision-preview", max_tokens=8192
)

# Instruction passed as `prompt` to LLMImageBlobParser: return the text only,
# leaving out barcodes.
PROMPT: str = (
    "You are an assistant tasked with extracting text from pdf files for retrieval."
    + " Extract only all the text from the pdf file."
    + " Do not exclude any text, except for the barcodes found in each page."
    + "\nAnswer only with the text from the pdf file."
    # + "\nFormat answer in markdown without explanatory text and without markdown delimiter ``` at the beginning. "
)

In [None]:
# Load PDF_FILE_1 with PyMuPDFLoader; images are delegated to
# LLMImageBlobParser, which is configured with MODEL and PROMPT.
# NOTE(review): mode="page" presumably yields one Document per page and
# images_inner_format="text" presumably inserts the parsed image text as plain
# text - confirm against the langchain_community documentation.
loader = PyMuPDFLoader(
    PDF_FILE_1,
    mode="page",
    images_inner_format="text",
    images_parser=LLMImageBlobParser(
        model=MODEL,
        prompt=PROMPT,
    ),
)
mmm_docs = loader.load()

In [None]:
# Print index, PDF title and extracted text of every loaded Document.
# Each item of `mmm_docs` is already a Document, so it is printed directly:
# iterating over a Document yields (field, value) tuples, which have no
# `.metadata` / `.page_content` attributes.
# Text and title are escaped so that bracketed fragments are printed literally
# instead of being interpreted as rich markup; inner quotes are single so the
# f-strings also parse before Python 3.12.
for index, doc in enumerate(mmm_docs):
    print(
        f"[bold {BLUE}]> DOC N°:[/] [bold {WHITE}]{index}[/]\n",
        f"[bold {EMERALD}]> FILENAME:[/] [bold {WHITE}]{escape(str(doc.metadata['title']))}[/]\n\n",
        f"[bold {YELLOW}]> CONTENT:[/]\n[{WHITE}]{escape(doc.page_content)}[/]",
    )