In [None]:
# %pip install pymupdf

In [None]:
import random
import re
import statistics
from glob import glob

import numpy as np
import pymupdf
from rich.console import Console

In [None]:
# Rich console with a fixed 120-column width; used below to print the
# extracted paragraphs.
console = Console(width=120)

In [None]:
# Root folder holding the PDF subsample to explore.
pdf_dir = "/resources/data/restricted/ar-juz-pcyf-10/RESOLUCIONES DEL JUZGADO-pdf/subsample"

# Recursively gather every PDF path below the root folder; the trailing bare
# expression shows how many were found when run as a notebook cell.
pdfs = glob(f"{pdf_dir}/**/**.pdf", recursive=True)
len(pdfs)

In [None]:
# Pick one of the discovered PDFs at random and show which one was chosen.
# Note: random.choice raises IndexError when `pdfs` is empty.
filename = random.choice(pdfs)
print(filename)

In [None]:
def compute_median_margin_between_blocks(pdf_path: str) -> float:
    """
    Computes the median vertical gap between consecutive text blocks in a PDF.

    On every page the blocks are ordered by their top edge, and the distance
    from the bottom of one block to the top of the next one is measured. Only
    strictly positive gaps are kept (overlapping or touching blocks are
    ignored). Blocks whose text is empty or whitespace-only are skipped, so the
    statistic is measured over the same blocks that
    ``extract_and_merge_paragraphs`` compares when it applies a tolerance.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        float: Median positive gap between text blocks (in points) across all
        pages, or 0.0 when no positive gap was found.
    """
    margins = []

    with pymupdf.open(pdf_path) as doc:
        for page in doc:
            # Each block is a tuple (x0, y0, x1, y1, text, block_no, block_type).
            # Drop blocks without visible text before measuring gaps.
            blocks = [b for b in page.get_text("blocks") if b[4].strip()]

            # Order top-to-bottom by the top y-coordinate (y0).
            blocks.sort(key=lambda b: b[1])

            # Walk adjacent pairs: gap = top of lower block - bottom of upper block.
            for upper, lower in zip(blocks, blocks[1:]):
                gap = lower[1] - upper[3]
                if gap > 0:  # Ignore overlapping blocks
                    margins.append(gap)

    return statistics.median(margins) if margins else 0.0


def extract_and_merge_paragraphs(pdf_path: str, y_tolerance: float = 5) -> list:
    """
    Extracts and merges paragraphs from a PDF by grouping close text blocks.

    Blocks on each page are visited top-to-bottom (sorted by their top edge).
    A block is appended to the paragraph being built when either:

    * it is on the same page as the previous non-empty block and the vertical
      gap to that block is at most ``y_tolerance``; or
    * it is the first non-empty block of a page and its text starts with a
      lowercase letter, which is treated as a paragraph continuing across a
      page break.

    Otherwise the paragraph being built is closed and the block starts a new
    one. Blocks with empty or whitespace-only text are ignored. Merged blocks
    are joined with a single space.

    Args:
        pdf_path (str): Path to the PDF file.
        y_tolerance (float): Maximum vertical gap (in points) to consider blocks part of the same paragraph.

    Returns:
        list: A list of merged paragraphs as strings, in reading order,
        including the last paragraph of the document.
    """
    paragraphs = []
    current_parts = []  # Stripped block texts of the paragraph being built

    with pymupdf.open(pdf_path) as doc:
        for page in doc:
            # Each block is a tuple (x0, y0, x1, y1, text, block_no, block_type).
            blocks = sorted(page.get_text("blocks"), key=lambda b: b[1])

            # Bottom of the previous non-empty block on THIS page; None until
            # the first non-empty block of the page has been seen.
            last_y1 = None

            for block in blocks:
                text = block[4].strip()
                if not text:  # Skip empty blocks
                    continue

                y0, y1 = block[1], block[3]

                if last_y1 is None:
                    # First non-empty block of the page. Because blocks were
                    # re-sorted by y0, block_no is not a reliable position
                    # marker, so the position is tracked via last_y1 instead.
                    # str.islower() also recognises accented lowercase letters
                    # (e.g. "á", "ñ"), unlike the ASCII-only class [a-z].
                    continues = bool(current_parts) and text[0].islower()
                else:
                    # Same page: merge when the gap to the previous block is small.
                    continues = y0 - last_y1 <= y_tolerance

                if current_parts and not continues:
                    # Close the paragraph being built and start a new one.
                    paragraphs.append(" ".join(current_parts))
                    current_parts = []

                current_parts.append(text)
                last_y1 = y1

    # Flush the paragraph still being built; without this the final paragraph
    # of the document would be lost.
    if current_parts:
        paragraphs.append(" ".join(current_parts))

    return paragraphs

In [None]:
# Derive the merge tolerance from the selected PDF itself: take the median gap
# between its text blocks, round it up, and pass it as the maximum gap allowed
# between blocks of the same paragraph.
y_tolerance = compute_median_margin_between_blocks(filename)
paragraphs = extract_and_merge_paragraphs(filename, np.ceil(y_tolerance))

# Print each merged paragraph with a 1-based index
for i, paragraph in enumerate(paragraphs, 1):
    console.print(f"Paragraph {i}:\n{paragraph}\n")