# GENERAL SETUP


In [None]:
%pip install -qU langchain langchain_community langchain_core pdf2image 

In [26]:
# GENERAL IMPORTS
import re
import numpy as np
from langchain.schema import Document
from pathlib import Path
from pdf2image import convert_from_path
from rich import print
from tqdm import tqdm
from typing import List, Dict

In [31]:
# RICH'S PRINT COLORS
# Hex colour codes interpolated into rich markup tags (e.g. f"[bold {BLUE}]...[/]")
# by the print cells of this notebook; EMERALD is also passed as tqdm's `colour`.
YELLOW = "#fde047"
ORANGE = "#f97316"
RED = "#ef4444"
BLUE = "#3b82f6"
CYAN = "#06b6d4"
EMERALD = "#34d399"
VIOLET = "#a855f7"
PINK = "#ec4899"
GRAY = "#64748b"
WHITE = "#cccccc"
GREEN = "#3fb618"

In [30]:
# GENERAL VARIABLES
# Input locations. ROOT_DIR is a relative path, so it is resolved against the
# kernel's current working directory.
ROOT_DIR = Path("../../../COLEGA DATA")
# Directory handed to the *_directory_loader calls in the sections below.
PDF_DIR = ROOT_DIR / "notificaciones"
# Individual sample PDFs; PDF_FILE_1 is the one read by the SuryaOCR cell.
PDF_FILE_1 = PDF_DIR / "RES 04-04-2024 - DILIGENCIA PRELIMINAR.pdf"
PDF_FILE_2 = (
    ROOT_DIR / "MÉTODO DE LA DEMANDA Y SU CONTESTACIÓN/1_EL_CASO_Y_SU_SOLUCIÓN.pdf"
)


# print(f"[{WHITE}]{PDF_DIR}\n\n{PDF_FILE_1}\n\n{PDF_FILE_2}[/]")

In [33]:
def search_dir(dir_path: str, file_ext: str) -> List[Dict[str, str]]:
    """List the files with a given extension that sit directly inside a directory.

    Args:
        dir_path: Directory to inspect (subdirectories are not searched).
        file_ext: Wanted extension, with or without the leading dot.

    Returns:
        One ``{"filename": ..., "filepath": ...}`` dict per matching regular file.

    Raises:
        ValueError: If the directory does not exist, is empty, or holds no file
            with the requested extension.
    """
    dir_path = Path(dir_path)

    # Guard clauses: the directory must exist and contain at least one entry.
    if not dir_path.is_dir():
        raise ValueError(f"search_dir() => DIRECTORY ({dir_path}) DOESN'T EXIST.")

    if next(dir_path.iterdir(), None) is None:
        raise ValueError(f"search_dir() => DIRECTORY ({dir_path}) IS EMPTY.")

    # Accept both "pdf" and ".pdf".
    file_ext = file_ext if file_ext.startswith(".") else f".{file_ext}"

    # Keep regular files only; a directory whose name matches the pattern is skipped.
    files_info: List[Dict[str, str]] = []
    for entry in dir_path.glob(f"*{file_ext}"):
        if entry.is_file():
            files_info.append({"filename": entry.name, "filepath": str(entry)})

    if len(files_info) == 0:
        raise ValueError(
            f"search_dir() => NO FILES WITH EXTENSION ({file_ext}) WERE FOUND IN DIRECTORY ({dir_path})."
        )

    return files_info

In [173]:
def text_cleaner(text: str) -> str:
    """Normalize whitespace in extracted text and strip hash symbols.

    Steps, in order: remove every ``#``; turn non-breaking spaces into regular
    spaces; collapse runs of horizontal whitespace into a single space; drop the
    space left next to a newline; squeeze three or more consecutive newlines into
    a blank line (two newlines); trim both ends.

    Newlines are kept. Collapsing with ``\\s+`` would also swallow them, which
    flattens the text to one line and turns the newline normalization into a no-op.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text. If ``text`` is not a string, a message is printed and
        the input is returned unchanged (best-effort behavior).
    """
    try:
        # Remove hash symbols first so the gaps they leave get normalized below
        text = text.replace("#", "")
        # Replace non-breaking spaces with regular spaces
        text = text.replace("\xa0", " ")
        # Collapse horizontal whitespace only: any whitespace that is not "\n"
        text = re.sub(r"[^\S\n]+", " ", text)
        # Drop the single space that may remain on either side of a newline
        text = re.sub(r" ?\n ?", "\n", text)
        # Keep at most one blank line between paragraphs
        text = re.sub(r"\n{3,}", "\n\n", text)
        # Trim leading and trailing whitespace
        return text.strip()
    except (AttributeError, TypeError) as e:
        # Non-string input (e.g. None or bytes): report it and hand it back as is
        print(f"An error occurred while cleaning the text: {e}")
        return text

---
---

---


# PyMuPDF4llm


In [None]:
%pip install -qU pymupdf4llm

In [88]:
from pymupdf4llm import to_markdown

In [174]:
def pdf4llm_directory_loader(dir_path: str, file_ext: str) -> List[Document]:
    """Load the files of a directory as markdown ``Document`` objects.

    Files are located with ``search_dir(dir_path, file_ext)`` and each one is
    converted with ``to_markdown``. The file's search_dir entry
    (``filename`` / ``filepath``) becomes the document metadata.

    Args:
        dir_path: Directory to search.
        file_ext: Extension of the files to load, with or without the dot.

    Returns:
        One ``Document`` per file found, in the order search_dir returned them.
    """
    found_files: List[Dict[str, str]] = search_dir(dir_path, file_ext)

    progress = tqdm(
        found_files, desc="LOADING PDF FILES", total=len(found_files), colour=EMERALD
    )

    # FILE --> DOCUMENT, one per entry, while the progress bar advances
    return [
        Document(
            metadata=file_info,
            page_content=to_markdown(file_info["filepath"], show_progress=False),
        )
        for file_info in progress
    ]

In [None]:
# Convert every ".pdf" located directly inside PDF_DIR to markdown text.
pdf4llm_docs = pdf4llm_directory_loader(PDF_DIR, "pdf")

In [None]:
# Show each loaded document: its position, source filename and extracted text.
# The metadata key uses single quotes because reusing the f-string's own double
# quotes inside a replacement field only parses on Python 3.12+.
for index, doc in enumerate(pdf4llm_docs):
    print(
        f"[bold {BLUE}]> DOC N°:[/] [bold {WHITE}]{index}[/]\n",
        f"[bold {EMERALD}]> FILENAME:[/] [bold {WHITE}]{doc.metadata['filename']}[/]\n\n",
        f"[bold {YELLOW}]> CONTENT:[/]\n[{WHITE}]{doc.page_content}[/]",
    )

---
---

---


# PyTesseract


In [None]:
%pip install -qU pytesseract

In [179]:
from pytesseract import image_to_string

In [181]:
def pytess_directory_loader(dir_path: str, file_ext: str) -> List[Document]:
    """OCR the files of a directory into ``Document`` objects.

    Files are located with ``search_dir(dir_path, file_ext)``. Each file is turned
    into page images with ``convert_from_path`` and every image is read with
    ``image_to_string`` using the Spanish (``"spa"``) language.

    Args:
        dir_path: Directory to search.
        file_ext: Extension of the files to load, with or without the dot.

    Returns:
        One ``Document`` per file: ``page_content`` is the text of its pages joined
        with newlines, ``metadata`` is the file's search_dir entry.
    """
    found_files: List[Dict[str, str]] = search_dir(dir_path, file_ext)
    documents: List[Document] = []

    progress = tqdm(
        found_files,
        desc="LOADING PDF FILES",
        total=len(found_files),
        colour=EMERALD,
    )
    for file_info in progress:
        # FILE --> PAGE IMAGES --> TEXT OF EACH PAGE
        page_texts = [
            image_to_string(page_img, lang="spa")
            for page_img in convert_from_path(file_info["filepath"])
        ]
        documents.append(
            Document(metadata=file_info, page_content="\n".join(page_texts))
        )

    return documents

In [None]:
# OCR every ".pdf" located directly inside PDF_DIR with the pytesseract loader.
pytess_docs = pytess_directory_loader(PDF_DIR, "pdf")

In [None]:
# Show each OCR'd document: its position, source filename and extracted text.
# The metadata key uses single quotes because reusing the f-string's own double
# quotes inside a replacement field only parses on Python 3.12+.
for index, doc in enumerate(pytess_docs):
    print(
        f"[bold {BLUE}]> DOC N°:[/] [bold {WHITE}]{index}[/]\n",
        f"[bold {EMERALD}]> FILENAME:[/] [bold {WHITE}]{doc.metadata['filename']}[/]\n\n",
        f"[bold {YELLOW}]> CONTENT:[/]\n[{WHITE}]{doc.page_content}[/]",
    )

---
---

---


# SuryaOCR


In [None]:
%pip install -qU surya-ocr

In [None]:
from surya.recognition import RecognitionPredictor
from surya.detection import DetectionPredictor

In [None]:
# Convert PDF_FILE_1 into images with pdf2image; the list is iterated below as
# one item per page.
# NOTE: `pages` and `langs` are plain notebook globals and are reassigned by the
# PyOCR cell further down, so re-run this cell before reusing them here.
pages = convert_from_path(PDF_FILE_1)

# Languages handed to the recognition predictor for every page.
langs = ["es", "en"]

recognition_predictor = RecognitionPredictor()
detection_predictor = DetectionPredictor()

# One predictor call per page, each with a single-image list and a matching
# single-entry list of languages; the outer list keeps one result per page.
# NOTE(review): the positional (images, langs, detection_predictor) call shape
# presumably depends on the installed surya-ocr release — confirm against it.
predictions_per_page = [
    recognition_predictor([page], [langs], detection_predictor) for page in pages
]

In [47]:
# Flatten the per-page predictions into a single string: every recognised text
# line is emitted on its own row, prefixed with a newline.
txt = "".join(
    f"\n{text_line.text}"
    for prediction in predictions_per_page
    for ocr_result in prediction
    for text_line in ocr_result.text_lines
)

---
---

---


# PyOCR


In [None]:
%pip install -qU pyocr

In [96]:
import pyocr
import pyocr.builders

# Path to the PDF file
pdf_path = (
    "../../../COLEGA DATA/notificaciones/RES 04-04-2024 - DILIGENCIA PRELIMINAR.pdf"
)
# pdf_path = "../../../COLEGA DATA/MÉTODO DE LA DEMANDA Y SU CONTESTACIÓN/1_EL_CASO_Y_SU_SOLUCIÓN.pdf"

# Convert the PDF into page images
pages = convert_from_path(pdf_path)

# Pick the first OCR tool pyocr can find. The error is raised, not assigned:
# storing the exception object in `tool` would only fail later with an unrelated
# AttributeError on `tool.get_available_languages()`.
tools = pyocr.get_available_tools()
if len(tools) == 0:
    raise ValueError("No tools found")
tool = tools[0]

# Make sure the Spanish language data is available before running the OCR
langs = tool.get_available_languages()
if "spa" not in langs:
    raise ValueError("'spa' is not available")
lang = "spa"

# OCR every page with the validated language and collect the text per page
loaded_pages = []
for page in pages:
    txt: str = tool.image_to_string(
        page, lang=lang, builder=pyocr.builders.TextBuilder()
    )
    loaded_pages.append(txt)

In [None]:
# Print the OCR text of all pages, separated by a blank line.
print("\n\n".join(loaded_pages))

---
---

---


# EasyOCR


In [None]:
%pip install -qU easyocr

In [None]:
"""
Valores comunes de DPI:

- 72-96: Calidad web/pantalla
- 150: Calidad media
- 300: Alta calidad, buen balance entre resolución y tamaño de archivo
- 600: Muy alta calidad, archivos más pesados
"""

In [6]:
import easyocr
import numpy as np

In [7]:
def easyocr_directory_loader(directory_path: str) -> List[Document]:
    """Load every PDF found under a directory by OCR-ing its pages with EasyOCR.

    The directory is searched recursively for files whose name ends in ``.pdf``.
    Each file is converted to page images with ``convert_from_path`` and every
    image is read with a single EasyOCR reader for Spanish and English.

    Args:
        directory_path: Directory to search (subdirectories included).

    Returns:
        One ``Document`` per PDF. ``page_content`` holds the text of all pages
        joined with newlines; ``metadata`` is
        ``{"file_name": ..., "file_path": ...}``. An empty list is returned when
        no PDF is found.

    Raises:
        ValueError: If ``directory_path`` is not an existing directory.
    """
    root_dir = Path(directory_path)
    if not root_dir.is_dir():
        raise ValueError(
            f"easyocr_directory_loader() >>> DIRECTORY {directory_path} DOESN'T EXIST."
        )

    # SEARCH THE DIRECTORY TREE FOR EACH PDF FILE AND GET ITS PATH
    # (pathlib instead of os.walk: `os` is not imported anywhere in the notebook;
    # sorted so that the order of the returned documents is deterministic)
    pdf_files_info = [
        {"file_name": pdf_file.name, "file_path": str(pdf_file)}
        for pdf_file in sorted(root_dir.rglob("*.pdf"))
        if pdf_file.is_file()
    ]
    if not pdf_files_info:
        # Nothing to read: skip building the OCR reader altogether
        return []

    # Build the Spanish/English reader once and reuse it for every file instead of
    # re-creating it on each loop iteration
    reader = easyocr.Reader(["es", "en"])

    loaded_docs: List[Document] = []
    for file_info in tqdm(
        pdf_files_info,
        desc="LOADING PDF FILES",
        total=len(pdf_files_info),
        colour=EMERALD,
    ):
        # PDF FILE --> PAGE IMAGES
        pages_imgs = convert_from_path(file_info["file_path"])

        # Each readtext() entry carries the recognised string at index 1; the
        # strings of a page are joined with spaces
        loaded_pages = [
            " ".join(entry[1] for entry in reader.readtext(np.array(page)))
            for page in pages_imgs
        ]

        loaded_docs.append(
            Document(metadata=file_info, page_content="\n".join(loaded_pages))
        )

    return loaded_docs

In [None]:
# OCR every PDF found under PDF_DIR (subdirectories included) with EasyOCR.
easyocr_docs = easyocr_directory_loader(PDF_DIR)