# GENERALS

In [None]:
%pip install -qU langchain langchain_community langchain_core pdf2image 

In [1]:
# GENERAL IMPORTS
import os
import re
from langchain.schema import Document
from pdf2image import convert_from_path
from rich import print
from tqdm import tqdm
from typing import List

In [2]:
# RICH'S PRINT COLORS
# Hex color codes interpolated into rich markup tags (e.g. f"[bold {BLUE}]...[/]")
# and passed as the `colour` argument of tqdm progress bars.
YELLOW = "#fde047"
ORANGE = "#f97316"
RED = "#ef4444"
BLUE = "#3b82f6"
CYAN = "#06b6d4"
EMERALD = "#34d399"
VIOLET = "#a855f7"
PINK = "#ec4899"
GRAY = "#64748b"
WHITE = "#cccccc"
GREEN = "#3fb618"

In [3]:
# GENERAL VARIABLES
# Base data directory, given as a relative path
ROOT_DIR = "../../../COLEGA DATA"
# Sub-directory of ROOT_DIR that holds the PDF files
PDF_DIR = os.path.join(ROOT_DIR, "notificaciones")
# Paths to individual PDF files
PDF_FILE_1 = os.path.join(PDF_DIR, "RES 04-04-2024 - DILIGENCIA PRELIMINAR.pdf")
PDF_FILE_2 = os.path.join(
    ROOT_DIR, "MÉTODO DE LA DEMANDA Y SU CONTESTACIÓN/1_EL_CASO_Y_SU_SOLUCIÓN.pdf"
)

# Debug helper: uncomment to display the resolved paths
# print(f"[{WHITE}]{PDF_DIR}\n\n{PDF_FILE_1}\n\n{PDF_FILE_2}[/]")

In [192]:
def text_cleaner(text: str) -> str:
    """
    Clean extracted text while keeping its paragraph structure.

    Steps, in order:
      1. Replace non-breaking spaces with regular spaces.
      2. Remove hash symbols.
      3. Collapse runs of horizontal whitespace (everything except newlines)
         into a single space.
      4. Drop the spaces that sit directly next to a newline.
      5. Collapse runs of three or more newlines into one blank line.
      6. Trim leading and trailing whitespace.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text. If cleaning fails because `text` is not a string
        (AttributeError / TypeError), an error message is printed and the
        input is returned unchanged.
    """
    try:
        # Replace non-breaking spaces with regular spaces
        text = text.replace("\xa0", " ")
        # Remove hash symbols before collapsing whitespace, so their removal
        # cannot leave double spaces behind
        text = text.replace("#", "")
        # Normalize spaces. Newlines are excluded on purpose: collapsing them
        # here would make the newline normalization below a no-op.
        text = re.sub(r"[^\S\n]+", " ", text)
        # Drop spaces around newlines so whitespace-only lines count as blank lines
        text = re.sub(r" ?\n ?", "\n", text)
        # Normalize newlines: at most one blank line between paragraphs
        text = re.sub(r"\n{3,}", "\n\n", text)
        # Trim leading and trailing whitespace
        return text.strip()
    except (AttributeError, TypeError) as e:
        print(f"An error occurred while cleaning the text: {e}")
        return text

---
---
---

# PyMuPDF4llm

In [None]:
%pip install -qU pymupdf4llm

In [198]:
from pymupdf4llm import to_markdown

In [204]:
def pymupdf4llm_directory_loader(directory_path: str) -> List[Document]:
    """
    LOADS PDF DOCUMENTS FROM A GIVEN DIRECTORY WITH PROGRESS INDICATOR.

    The directory tree is walked recursively and every PDF found is converted
    to Markdown with `pymupdf4llm.to_markdown`, one `Document` per file.

    Args:
        directory_path: Directory to search for PDF files.

    Returns:
        One `Document` per PDF file. `page_content` holds the Markdown text and
        `metadata` holds the `file_name` and `file_path` of the source file.

    Raises:
        ValueError: If `directory_path` is not an existing directory.
    """
    # isdir rather than exists: a path to a regular file would otherwise pass
    # the check and silently produce an empty result.
    if not os.path.isdir(directory_path):
        raise ValueError(
            f"pymupdf4llm_directory_loader() >>> DIRECTORY {directory_path} DOESN'T EXIST."
        )

    # SEARCH THE DIRECTORY TREE FOR PDF FILES AND RECORD THEIR NAME AND PATH
    pdf_files_info = []
    for parent_dir_path, subdirs, files in os.walk(directory_path):
        # Sorting in place makes os.walk visit subdirectories in a stable order,
        # so the returned documents keep the same indices between runs.
        subdirs.sort()
        for filename in sorted(files):
            # Case-insensitive match so files named "*.PDF" are not skipped
            if filename.lower().endswith(".pdf"):
                file_path = os.path.join(parent_dir_path, filename)
                pdf_files_info.append({"file_name": filename, "file_path": file_path})

    # LOADS EACH PDF FILE: FILE --> DOCUMENT
    loaded_docs: List[Document] = []
    for file_info in tqdm(
        pdf_files_info,
        desc="LOADING PDF FILES",
        total=len(pdf_files_info),
        colour=EMERALD,
    ):
        # show_progress=False: the tqdm bar above is the only progress indicator
        md_text = to_markdown(file_info["file_path"], show_progress=False)
        loaded_docs.append(Document(metadata=file_info, page_content=md_text))

    return loaded_docs

In [None]:
# Load every PDF found under PDF_DIR as a Markdown Document
pymupdf4llm_docs = pymupdf4llm_directory_loader(PDF_DIR)

In [None]:
# Number of documents loaded by the PyMuPDF4llm loader
len(pymupdf4llm_docs)

In [None]:
# Print every loaded document: its index, source file name and content.
# The metadata key uses single quotes because reusing the enclosing double
# quotes inside an f-string is a SyntaxError before Python 3.12.
for index, doc in enumerate(pymupdf4llm_docs):
    print(
        f"[bold {BLUE}]> DOC N°:[/] [bold {WHITE}]{index}[/]\n",
        f"[bold {EMERALD}]> FILENAME:[/] [bold {WHITE}]{doc.metadata['file_name']}[/]\n\n",
        f"[bold {YELLOW}]> CONTENT:[/]\n[{WHITE}]{doc.page_content}[/]",
    )

---
---
---

# PyTesseract

In [None]:
%pip install -qU pytesseract poppler-utils

In [None]:
import pytesseract

In [228]:
def pytesseract_directory_loader(directory_path: str) -> List[Document]:
    """
    LOADS PDF DOCUMENTS FROM A GIVEN DIRECTORY WITH PROGRESS INDICATOR.

    The directory tree is walked recursively. Every PDF found is converted to
    page images with `convert_from_path`, each image is passed to
    `pytesseract.image_to_string` with `lang="spa"`, and the per-page texts are
    joined with newlines into one `Document` per file.

    Args:
        directory_path: Directory to search for PDF files.

    Returns:
        One `Document` per PDF file. `page_content` holds the extracted text and
        `metadata` holds the `file_name` and `file_path` of the source file.

    Raises:
        ValueError: If `directory_path` is not an existing directory.
    """
    # isdir rather than exists: a path to a regular file would otherwise pass
    # the check and silently produce an empty result.
    if not os.path.isdir(directory_path):
        raise ValueError(
            f"pytesseract_directory_loader() >>> DIRECTORY {directory_path} DOESN'T EXIST."
        )

    # SEARCH THE DIRECTORY TREE FOR PDF FILES AND RECORD THEIR NAME AND PATH
    pdf_files_info = []
    for parent_dir_path, subdirs, files in os.walk(directory_path):
        # Sorting in place makes os.walk visit subdirectories in a stable order,
        # so the returned documents keep the same indices between runs.
        subdirs.sort()
        for filename in sorted(files):
            # Case-insensitive match so files named "*.PDF" are not skipped
            if filename.lower().endswith(".pdf"):
                file_path = os.path.join(parent_dir_path, filename)
                pdf_files_info.append({"file_name": filename, "file_path": file_path})

    # CONVERTS EACH PDF FILE INTO PAGE IMAGES AND RUNS OCR ON EVERY PAGE
    loaded_docs: List[Document] = []
    for file_info in tqdm(
        pdf_files_info,
        desc="LOADING PDF FILES",
        total=len(pdf_files_info),
        colour=EMERALD,
    ):
        pages_imgs = convert_from_path(file_info["file_path"])
        pages = [pytesseract.image_to_string(page, lang="spa") for page in pages_imgs]

        # One newline between the texts of consecutive pages
        content = "\n".join(pages)
        loaded_docs.append(Document(metadata=file_info, page_content=content))

    return loaded_docs

In [None]:
# Load every PDF found under PDF_DIR through the Tesseract OCR loader
pytesseract_docs = pytesseract_directory_loader(PDF_DIR)

In [None]:
# Number of documents loaded by the PyTesseract loader
len(pytesseract_docs)

In [None]:
# Print every loaded document: its index, source file name and content.
# The metadata key uses single quotes because reusing the enclosing double
# quotes inside an f-string is a SyntaxError before Python 3.12.
for index, doc in enumerate(pytesseract_docs):
    print(
        f"[bold {BLUE}]> DOC N°:[/] [bold {WHITE}]{index}[/]\n",
        f"[bold {EMERALD}]> FILENAME:[/] [bold {WHITE}]{doc.metadata['file_name']}[/]\n\n",
        f"[bold {YELLOW}]> CONTENT:[/]\n[{WHITE}]{doc.page_content}[/]",
    )

---
---
---

# SuryaOCR

In [None]:
%pip install -qU surya-ocr

In [261]:
from surya.recognition import RecognitionPredictor
from surya.detection import DetectionPredictor

In [None]:
# Convert PDF_FILE_1 with pdf2image; `pages` is iterated one item at a time below
pages = convert_from_path(PDF_FILE_1)

# Languages passed to the recognizer alongside every page
langs = ["es", "en"]

recognition_predictor = RecognitionPredictor()
detection_predictor = DetectionPredictor()

# One recognizer call per page: each call receives a single-image batch ([page]),
# its language list ([langs]) and the detection predictor.
# NOTE(review): this call signature depends on the installed surya-ocr version — confirm.
predictions_per_page = [
    recognition_predictor([page], [langs], detection_predictor) for page in pages
]

In [47]:
# Walk every per-page prediction, every OCR result inside it and every text
# line inside that result, then concatenate the lines with a leading newline
# each (so the final string also starts with a newline).
txt = "".join(
    f"\n{text_line.text}"
    for prediction in predictions_per_page
    for ocr_result in prediction
    for text_line in ocr_result.text_lines
)

---
---
---

# PyOCR

In [None]:
%pip install -qU pyocr

In [96]:
import pyocr
import pyocr.builders

# Path to the PDF file
pdf_path = (
    "../../../COLEGA DATA/notificaciones/RES 04-04-2024 - DILIGENCIA PRELIMINAR.pdf"
)
# Alternative input file:
# pdf_path = "../../../COLEGA DATA/MÉTODO DE LA DEMANDA Y SU CONTESTACIÓN/1_EL_CASO_Y_SU_SOLUCIÓN.pdf"

# Convert the PDF with pdf2image; every item of `pages` is OCR'd separately below
pages = convert_from_path(pdf_path)

# Use the first OCR tool pyocr reports as available. The error is raised here
# rather than assigned: an exception instance stored in `tool` would only
# surface later as an unrelated AttributeError.
tools = pyocr.get_available_tools()
if not tools:
    raise ValueError("No tools found")
tool = tools[0]

# Make sure the selected tool supports Spanish before running OCR
langs = tool.get_available_languages()
if "spa" not in langs:
    raise ValueError("'spa' is not available")
lang = "spa"

# Extract plain text from each page, using the language validated above
loaded_pages = []
for page in pages:
    txt: str = tool.image_to_string(
        page, lang=lang, builder=pyocr.builders.TextBuilder()
    )
    loaded_pages.append(txt)

In [None]:
# Show the text of all pages, separated by a blank line
print("\n\n".join(loaded_pages))

---
---
---

# EasyOCR

In [None]:
%pip install -qU easyocr

In [None]:
# Reference note (kept in Spanish) listing common DPI values:
#   72-96: web/screen quality
#   150:   medium quality
#   300:   high quality, good balance between resolution and file size
#   600:   very high quality, heavier files
"""
Valores comunes de DPI:

- 72-96: Calidad web/pantalla
- 150: Calidad media
- 300: Alta calidad, buen balance entre resolución y tamaño de archivo
- 600: Muy alta calidad, archivos más pesados
"""

In [7]:
import easyocr
import numpy as np

In [None]:
# Build an EasyOCR reader for Spanish and English
reader = easyocr.Reader(["es", "en"])

# Convert the PDF to images, requesting 300 DPI
pages = convert_from_path(PDF_FILE_1, dpi=300)

# Run OCR page by page and keep one space-joined string per page
loaded_pages = []
for page in pages:
    # EasyOCR is given the page as a NumPy array
    results = reader.readtext(np.array(page))

    # Index 1 of every result entry is the element the original code joined as text
    loaded_pages.append(" ".join(entry[1] for entry in results))

In [None]:
# Show the text of all pages, separated by a blank line
print("\n\n".join(loaded_pages))