In [1]:
import pymupdf

# Print the module docstring; it reports the PyMuPDF / MuPDF versions and the Python runtime in use.
print(pymupdf.__doc__)

PyMuPDF 1.26.3: Python bindings for the MuPDF 1.26.3 library (rebased implementation).
Python 3.13 running on win32 (64-bit).



# Problem
* Based on https://github.com/pymupdf/PyMuPDF/discussions/763
* Start with looking at PyMuPDF PDF engine and its capabilities
* No built-in equation detection, which is especially problematic for text-based equations

# Pipeline
1. Equation detection
    * (option) verify detection
2. Equation conversion to a LaTeX equation
    * (option) verify conversion
3. Replacing the original equation with the LaTeX equation


## Equation detection
### 1. heuristic based equation detection
In our case code examples are not a problem:

Yes, exactly!
In PDF, text is just text. The PDF specification contains nothing to sub-divide different kinds of text. Equations are also text and can be coded in any font, can be italic or normal, mono-spaced or proportional, serifed or sans-serifed.
Also note that the equation symbol appears in program code listings a lot - PyMuPDF.pdf is full of such examples.

So I would say, that you have to develop your own way of recognizing equations ... and whatever you will develop, may not work with the next PDF example.


In [6]:
import pprint
import os
import json

# Folder with the example PDFs, and the folder that receives one JSON dump per page.
test_folder = os.path.join("data", "raw", "test-data", "equation-examples")
dict_repr_folder = os.path.join("data", "raw", "test-data", "pymupdf-dict-repr")

# open(..., 'w') below raises FileNotFoundError if the target folder does not exist yet.
os.makedirs(dict_repr_folder, exist_ok=True)

# Sorted so the processing order does not depend on the file system's listing order.
for file_name in sorted(os.listdir(test_folder)):
    # Open the document as a context manager so it is closed once its pages are dumped.
    with pymupdf.open(os.path.join(test_folder, file_name)) as doc:
        for page in doc:
            # "dict" extraction with no optional extraction flags set (flags=0).
            blocks = page.get_text("dict", flags=0)["blocks"]
            page_dump_path = os.path.join(
                dict_repr_folder, f'{file_name}-page_{page.number}.json'
            )
            with open(page_dump_path, 'w') as f:
                json.dump({"blocks": blocks}, f)

In [None]:
# Idea: if the equation region can be detected reliably, it could be rendered
# as an image and handed to a multimodal model (details still to be worked out).

In [7]:
# WARNING THIS IS AI SLOP

import collections
import json
import re

# --- Heuristics Configuration ---

# Characters treated as strong evidence of mathematical notation when they occur in a span.
MATH_SYMBOLS = set('√∑×=+−∂∫≥≤≠')

# Points added to a line's score for every span that exhibits the named feature.
SCORING_WEIGHTS = dict(
    is_math_symbol=5,
    is_large_symbol=5,
    is_subscript=3,
    is_superscript=3,
    is_italic=1,
)

# Minimum accumulated score for a line to count as part of an equation.
LINE_SCORE_THRESHOLD = 4

# --- Helper Functions ---

def merge_bboxes(bboxes):
    """Return the smallest (x0, y0, x1, y1) box that encloses every box in ``bboxes``.

    Each box is indexed as ``box[0]..box[3]``. An empty input yields ``None``.
    """
    if not bboxes:
        return None
    # Collect each coordinate column separately, then take the outermost value of each.
    x0s, y0s, x1s, y1s = ([box[axis] for box in bboxes] for axis in range(4))
    return (min(x0s), min(y0s), max(x1s), max(y1s))

def get_dominant_line_properties(line):
    """Summarise a line by the most frequent properties among its spans.

    Returns a ``(size, baseline, is_bold)`` tuple where
      * ``size`` is the most common span font size, rounded to 2 decimals,
      * ``baseline`` is the most common value of ``bbox[3]`` over the spans,
        rounded to 2 decimals (used here as a stand-in for the baseline),
      * ``is_bold`` tells whether the most common font name contains 'Bold'.

    A line without spans yields ``(0, 0, False)``.
    """
    spans = line['spans']
    if not spans:
        return 0, 0, False

    def most_frequent(values):
        # On equal counts the value encountered first wins.
        return collections.Counter(values).most_common(1)[0][0]

    typical_bottom = most_frequent(round(span['bbox'][3], 2) for span in spans)
    typical_size = most_frequent(round(span['size'], 2) for span in spans)
    typical_font = most_frequent(span['font'] for span in spans)

    return typical_size, typical_bottom, 'Bold' in typical_font

def is_likely_heading_or_prose(line, is_bold_dominant):
    """
    Applies negative heuristics to determine if a line is likely a heading or regular text.

    Args:
        line: Line dict whose ``'spans'`` entries each carry a ``'text'`` string.
        is_bold_dominant: Whether the line's most common font is a bold one.

    Returns:
        True when the line looks like a heading, list item or prose; False
        otherwise, including when the line holds no visible text.
    """
    full_text = "".join(span['text'] for span in line['spans']).strip()

    if not full_text:
        return False

    # Heuristic 1: Starts with a one- or two-digit enumerator such as "1.", "2)" or "(3)".
    # Only digits are recognised; lettered items like "A." or "(d)" are not.
    # NOTE: a leading decimal such as "0.5" also matches this pattern.
    if re.match(r'^[\[\(]?\d{1,2}[\.\)]', full_text):
        return True

    # Heuristic 2: Line is dominantly bold.
    if is_bold_dominant:
        return True

    # Heuristic 3: High ratio of letters to other characters.
    # Equations have a low ratio of letters.
    # Remove every whitespace character, not just ASCII spaces: tabs and
    # non-breaking spaces would otherwise count as non-letters and pull
    # ordinary prose below the cutoff. full_text is non-empty after strip(),
    # so at least one visible character remains.
    visible_chars = "".join(full_text.split())
    alpha_chars = sum(1 for char in visible_chars if char.isalpha())
    alpha_ratio = alpha_chars / len(visible_chars)

    # If over 90% of characters are letters, it's likely prose/heading.
    return alpha_ratio > 0.90


# --- Main Detection Logic ---

def detect_equations(page_data):
    """
    Locate equation regions in a page's text dictionary representation.

    Every line of each text block (``type`` 0, also assumed when the key is
    missing) is scored span by span using SCORING_WEIGHTS. A line is kept when
    its score reaches LINE_SCORE_THRESHOLD and is_likely_heading_or_prose does
    not reject it. Kept lines are ordered by their top edge, and consecutive
    lines separated by less than half the previous line's height are merged
    into one equation.

    Returns a list of ``{'bbox': (x0, y0, x1, y1), 'spans': [...]}`` dicts,
    one per merged group; the list is empty when nothing qualifies.
    """

    def span_features(span, dominant_size, dominant_baseline):
        # Yield the SCORING_WEIGHTS key of every feature this span exhibits.
        if not MATH_SYMBOLS.isdisjoint(span['text']):
            yield 'is_math_symbol'
        if 'Italic' in span['font']:
            yield 'is_italic'

        bottom = span['bbox'][3]
        # A span much taller than the dominant font size counts as a large symbol.
        if bottom - span['bbox'][1] > dominant_size * 1.5:
            yield 'is_large_symbol'

        # Smaller text counts as a subscript when its bottom edge has a larger y
        # value than the dominant one, and as a superscript when it has a smaller one.
        if span['size'] < dominant_size * 0.9:
            if bottom > dominant_baseline + 1:
                yield 'is_subscript'
            elif bottom < dominant_baseline - 2:
                yield 'is_superscript'

    # Stage 1 + 2: score each line and drop heading/prose false positives.
    candidates = []
    for block in page_data.get('blocks', []):
        if block.get('type', 0) != 0:
            continue

        for line in block.get('lines', []):
            size, baseline, bold = get_dominant_line_properties(line)
            if size == 0:
                continue

            score = 0
            for span in line['spans']:
                for feature in span_features(span, size, baseline):
                    score += SCORING_WEIGHTS[feature]

            if score < LINE_SCORE_THRESHOLD or is_likely_heading_or_prose(line, bold):
                continue

            candidates.append({
                'score': score,
                'bbox': line['bbox'],
                'spans': line['spans']
            })

    if not candidates:
        return []

    # Stage 3: walk the lines top to bottom and group vertically adjacent ones.
    candidates.sort(key=lambda entry: entry['bbox'][1])
    groups = [[candidates[0]]]
    for entry in candidates[1:]:
        previous = groups[-1][-1]
        prev_top, prev_bottom = previous['bbox'][1], previous['bbox'][3]
        gap = entry['bbox'][1] - prev_bottom

        if gap < (prev_bottom - prev_top) * 0.5:
            groups[-1].append(entry)
        else:
            groups.append([entry])

    return [
        {
            'bbox': merge_bboxes([entry['bbox'] for entry in group]),
            'spans': [span for entry in group for span in entry['spans']],
        }
        for group in groups
    ]

if __name__ == '__main__':
    # Run the detector over every dumped page dictionary and print what it finds.
    dict_repr_folder = os.path.join("data", "raw", "test-data", "pymupdf-dict-repr")

    # os.listdir gives no ordering guarantee; sort so the report order is reproducible.
    for filename in sorted(os.listdir(dict_repr_folder)):
        # Skip anything that is not a JSON page dump (stray folders, checkpoint files, ...),
        # which would otherwise make open()/json.load() fail.
        if not filename.endswith('.json'):
            continue

        with open(os.path.join(dict_repr_folder, filename), 'r') as f:
            sample_page_data = json.load(f)

        equations = detect_equations(sample_page_data)

        print(f"Detected {len(equations)} equations.\n")

        for i, eq in enumerate(equations):
            # Concatenate span texts in stored order; this is not a layout-aware reconstruction.
            eq_text = "".join(s['text'] for s in eq['spans'])
            print(f"--- Equation {i+1} ---")
            print(f"  Bounding Box: {eq['bbox']}")
            print(f"  Reconstructed Text: {eq_text}")
            print("-" * 20 + "\n")

Detected 1 equations.

--- Equation 1 ---
  Bounding Box: (67.3510971069336, 701.406982421875, 379.53106689453125, 711.0025024414062)
  Reconstructed Text: (d)  r(t + 1) denotes the basic risk-free interest rate for the maturity of t + 1 years. 
--------------------

Detected 0 equations.

Detected 0 equations.

Detected 0 equations.

Detected 0 equations.

Detected 0 equations.

Detected 0 equations.

Detected 0 equations.

Detected 0 equations.

Detected 0 equations.

Detected 2 equations.

--- Equation 1 ---
  Bounding Box: (268.21240234375, 208.0092010498047, 346.2121276855469, 236.52261352539062)
  Reconstructed Text: √∑××j,irr=
--------------------

--- Equation 2 ---
  Bounding Box: (267.60467529296875, 661.0762329101562, 345.5970153808594, 689.5896606445312)
  Reconstructed Text: √∑××j,irr=
--------------------

Detected 2 equations.

--- Equation 1 ---
  Bounding Box: (260.9570007324219, 121.13847351074219, 338.9583435058594, 149.65188598632812)
  Reconstructed Text: √∑××j,irr

In [None]:
# Two open questions:
# 1. Can we detect equations accurately enough?
# 2. What are the next steps, beyond feeding the equation to an LLM?
#    In particular, how do we integrate this into the pipeline,
# e.g. how do you replace the equation text with