In [1]:
import pymupdf

# Show the PyMuPDF / MuPDF version banner so the environment used for this
# notebook is recorded next to the results.
print(pymupdf.__doc__)

PyMuPDF 1.26.3: Python bindings for the MuPDF 1.26.3 library (rebased implementation).
Python 3.13 running on win32 (64-bit).



# Problem
* Based on https://github.com/pymupdf/PyMuPDF/discussions/763
* Start by looking at the PyMuPDF PDF engine and its capabilities
* No built-in equation detection, which is especially problematic for text-based equations

# Pipeline
1. Equation detection
    * (optional) verify detection
2. Conversion of each detected equation to a LaTeX equation
    * (optional) verify conversion
3. Replacing the original equation with the LaTeX equation


## Equation detection
### 1. heuristic based equation detection
In our case code examples are not a problem:

Yes, exactly!
In PDF, text is just text. The PDF specification contains nothing to sub-divide different kinds of text. Equations are also text and can be coded in any font; they can be italic or normal, mono-spaced or proportional, serifed or sans-serifed.
Also note that the equation symbol appears in program code listings a lot - PyMuPDF.pdf is full of such examples.

So I would say, that you have to develop your own way of recognizing equations ... and whatever you will develop, may not work with the next PDF example.


In [3]:
# import pprint
# import os
# import json

# test_folder = os.path.join("data", "raw", "test-data", "equation-examples")

# for file_name in os.listdir(test_folder):
#     doc = pymupdf.open(os.path.join(test_folder, file_name))
#     for page in doc:
#         # print(page.get_text("html"))
#         text_in_dict = page.get_text("dict", flags=0)
#         # tables = page.find_tables()
#         # pprint.pprint(blocks)
#         with open(os.path.join("data", "raw", "test-data", "pymupdf-dict-repr", f'{file_name}-page_{page.number}.json'), 'w') as f:
#             json.dump(text_in_dict, f)
#         # page.get_pixmap().save("data/raw/test-data/solvency_II_level_1_v2_equations_page_{}.png".format(page.number))

In [None]:
# Idea: once an equation can be detected on a page,
# the page (or the detected region) can be rendered as an image
# and handed to a multimodal model for conversion.

In [None]:
# WARNING THIS IS AI SLOP
import pymupdf
import collections
import json
import re

# --- Heuristics Configuration ---

# 1. Define characters that indicate mathematical notation.
#    A span containing any WEAK symbol adds the 'is_weak_math_symbol' weight,
#    a span containing any STRONG symbol adds the 'is_strong_math_symbol' weight
#    (see calculate_positive_line_score).
#    NOTE(review): '¼', 'Þ' and '�' are not math symbols themselves; they are
#    presumably how this PDF's math font is extracted (the sample output shows
#    '¼' where the equation has '=', 'ð'/'Þ' where it has parentheses) -- confirm
#    before reusing these sets on other documents.
WEAK_MATH_SYMBOLS = {'×', '+', '−'}
STRONG_MATH_SYMBOLS = {'√', '∑', '=', '¼', '∂', '∫', '≥', '≤', '≠', '�', 'Þ'}

# 2. Define scoring weights for different features.
#    All weights are added per matching span, except 'is_largely_alphabetic',
#    which detect_equations subtracts once per line as a penalty when
#    is_likely_heading_or_prose returns True.
SCORING_WEIGHTS = {
    'is_strong_math_symbol': 15,
    'is_weak_math_symbol': 3,
    'is_large_symbol': 3,
    'is_subscript': 3,
    'is_superscript': 3,
    'is_italic': 1,
    'is_largely_alphabetic': 5
}

# 3. Threshold for a line to be considered part of an equation
#    (a line qualifies when its score is >= this value).
LINE_SCORE_THRESHOLD = 9

# --- Helper Functions ---

def merge_bboxes(bboxes):
    """Return the smallest (x0, y0, x1, y1) box enclosing every box in `bboxes`.

    Each box is indexed as [x0, y0, x1, y1]. Returns None when `bboxes` is empty.
    """
    if not bboxes:
        return None
    lefts, tops, rights, bottoms = (
        [box[edge] for box in bboxes] for edge in range(4)
    )
    return (min(lefts), min(tops), max(rights), max(bottoms))

def get_dominant_line_properties(line):
    """Return (dominant_size, dominant_baseline, is_bold_dominant) for a line.

    "Dominant" means the most frequent value over the line's spans:
      * size     -- span['size'] rounded to 2 decimals,
      * baseline -- the bottom edge span['bbox'][3] rounded to 2 decimals
                    (used as a stand-in for the baseline),
      * bold     -- whether the most frequent font name contains 'Bold'.

    A line without spans yields (0, 0, False).
    """
    spans = line['spans']
    if not spans:
        return 0, 0, False

    def most_frequent(values):
        # Ties resolve to the value seen first, as Counter.most_common does.
        return collections.Counter(values).most_common(1)[0][0]

    dominant_baseline = most_frequent(round(span['bbox'][3], 2) for span in spans)
    dominant_size = most_frequent(round(span['size'], 2) for span in spans)
    dominant_font = most_frequent(span['font'] for span in spans)

    return dominant_size, dominant_baseline, 'Bold' in dominant_font

def is_likely_heading_or_prose(line, is_bold_dominant):
    """Negative heuristic: True when the line looks like a heading or running text.

    The line's span texts are concatenated and stripped; an empty result is
    never treated as heading/prose. Otherwise the line is flagged when any of
    these holds:
      1. it starts with a one- or two-digit enumerator such as "1.", "(12)"
         or "[3)" (optional opening bracket, digits, then '.' or ')');
      2. the caller reports the line as dominantly bold;
      3. more than 90% of its non-space characters are ASCII. Note that this
         counts every ASCII character (digits and punctuation included), not
         only letters.
    """
    text = "".join(span['text'] for span in line['spans']).strip()
    if not text:
        return False

    starts_with_enumerator = re.match(r'^[\[\(]?\d{1,2}[\.\)]', text)
    if starts_with_enumerator or is_bold_dominant:
        return True

    compact = text.replace(" ", "")
    if not compact:
        return False  # nothing left to measure

    ascii_count = len([ch for ch in compact if ch.isascii()])
    return ascii_count / len(compact) > 0.90


def calculate_positive_line_score(line, dominant_size, dominant_baseline):
    """Sum the positive "looks like math" evidence over all spans of a line.

    Every span can contribute each of these SCORING_WEIGHTS once:
      * 'is_weak_math_symbol'   -- text contains a WEAK_MATH_SYMBOLS character
      * 'is_strong_math_symbol' -- text contains a STRONG_MATH_SYMBOLS character
      * 'is_italic'             -- font name contains 'Italic'
      * 'is_large_symbol'       -- bbox height exceeds 1.5 x dominant_size
      * 'is_subscript'          -- font smaller than 0.9 x dominant_size and
                                   bottom edge more than 1 below dominant_baseline
      * 'is_superscript'        -- same size test, bottom edge more than 2 above
                                   dominant_baseline

    Returns the accumulated score (0 for a line without spans).
    """
    total = 0
    for span in line['spans']:
        text = span['text']
        if not WEAK_MATH_SYMBOLS.isdisjoint(text):
            total += SCORING_WEIGHTS['is_weak_math_symbol']
        if not STRONG_MATH_SYMBOLS.isdisjoint(text):
            total += SCORING_WEIGHTS['is_strong_math_symbol']
        if 'Italic' in span['font']:
            total += SCORING_WEIGHTS['is_italic']

        top, bottom = span['bbox'][1], span['bbox'][3]
        if bottom - top > dominant_size * 1.5:
            total += SCORING_WEIGHTS['is_large_symbol']

        if span['size'] < dominant_size * 0.9:
            # y grows downwards, so a larger bottom edge means a lowered span.
            if bottom > dominant_baseline + 1:
                total += SCORING_WEIGHTS['is_subscript']
            if bottom < dominant_baseline - 2:
                total += SCORING_WEIGHTS['is_superscript']

    return total
    


# --- Main Detection Logic ---

def detect_equations(page_data, verbose=0):
    """Detect equations in a page's text-dictionary representation.

    `page_data` is the dict produced by `page.get_text("dict", ...)`; only
    blocks whose 'type' is 0 (or missing) are inspected.

    Steps:
      1. Score every line: positive evidence from
         calculate_positive_line_score, minus the 'is_largely_alphabetic'
         weight when is_likely_heading_or_prose flags the line.
      2. Keep lines whose score reaches LINE_SCORE_THRESHOLD.
      3. Sort the kept lines by their top edge and chain a line onto the
         previous one when the vertical gap between them is less than half
         the previous line's height.

    With a truthy `verbose`, every scored line is printed.

    Returns a list of {'bbox': merged box, 'spans': all spans} dicts, one per
    chain, ordered top to bottom; an empty list when no line qualifies.
    """
    candidates = []

    for block in page_data.get('blocks', []):
        if block.get('type', 0) != 0:
            continue

        for line in block.get('lines', []):
            dominant_size, dominant_baseline, is_bold_dominant = get_dominant_line_properties(line)
            if dominant_size == 0:
                continue

            line_score = calculate_positive_line_score(line, dominant_size, dominant_baseline)

            # Heading/prose lines are penalised rather than dropped outright.
            if is_likely_heading_or_prose(line, is_bold_dominant):
                line_score -= SCORING_WEIGHTS['is_largely_alphabetic']

            if verbose:
                print(f"Processing line: {[span['text'] for span in line['spans']]} with score {line_score}")

            if line_score < LINE_SCORE_THRESHOLD:
                continue

            candidates.append({
                'score': line_score,
                'bbox': line['bbox'],
                'spans': line['spans']
            })

    if not candidates:
        return []

    # Chain vertically adjacent math lines into one equation each.
    ordered = sorted(candidates, key=lambda entry: entry['bbox'][1])
    groups = [[ordered[0]]]
    for entry in ordered[1:]:
        previous = groups[-1][-1]
        previous_height = previous['bbox'][3] - previous['bbox'][1]
        gap = entry['bbox'][1] - previous['bbox'][3]
        if gap < previous_height * 0.5:
            groups[-1].append(entry)
        else:
            groups.append([entry])

    return [
        {
            'bbox': merge_bboxes([entry['bbox'] for entry in group]),
            'spans': [span for entry in group for span in entry['spans']],
        }
        for group in groups
    ]

def print_equations(equations):
    """Print a human-readable report of detected equations to stdout.

    For each equation (dict with 'spans' and 'bbox') the raw list of span
    texts is printed first, followed by a numbered header, the bounding box
    and the span texts joined into one string.
    """
    print(f"Detected {len(equations)} equations.\n")

    for number, equation in enumerate(equations, start=1):
        span_texts = [span['text'] for span in equation['spans']]
        reconstructed = "".join(span_texts)

        print(span_texts)
        print(f"--- Equation {number} ---")
        print(f"  Bounding Box: {equation['bbox']}")
        print(f"  Reconstructed Text: {reconstructed}")
        print("-" * 20 + "\n")

import os

# Alternative input: the per-page JSON dumps written by the export cell above.
# for filename in os.listdir(os.path.join("data", "raw", "test-data", "pymupdf-dict-repr")):
# with open(os.path.join("data", "raw", "test-data", "pymupdf-dict-repr", filename), 'r') as f:
#     sample_page_data = json.load(f)

# Load the source PDF for the experiments below.
# PyMuPDF documents `open()` as taking a file path (or in-memory bytes via
# `stream=`), so the path is passed directly. The previous version handed it
# a binary file handle inside a `with` block, which closed the handle before
# the document was ever used.
original_doc = pymupdf.open(
    os.path.join("data", "raw", "solvency-II-files", "solvency II - level 2.pdf")
)

# Manual check of a single page:
# page_number = 70

# equations = detect_equations(original_doc[page_number - 1].get_text("dict", flags=0), verbose=1)

# print(f"File: {filename}")
# print_equations(equations)



In [137]:
# Hand-built "line" used to poke at the heuristics interactively.
# equation_spans = "*Lapse*" "*up* ¼ 0,5 � *l* *up* � *n* *up* � *S* *up*
equation_spans = ["RM", "re,all", "�", "Recoverables", "[Recoverables]", "all", "[i]"]
test = {"spans": []}
for equation_span in equation_spans:
    # Only 'text' and 'font' are filled in. calculate_positive_line_score also
    # reads span['bbox'] and span['size'], so it would raise KeyError on this
    # fixture; is_likely_heading_or_prose only needs 'text'.
    test["spans"].append({"text": equation_span, "font": 'Italic'})

# is_likely_heading_or_prose(test, False)
# calculate_positive_line_score(test, 12, 5)
# 'ð'.isascii()


In [None]:
import pymupdf
# Open questions for the replacement step:
# 1. how to ensure that replacement is correct?
# 2. how to replace the exact text?

# Run detection on every page of the small replacement-test PDF and print the
# result. After the loop `equations_detected` holds the equations of the last
# page (None if the document has no pages).
equations_detected = None

# Open by path: PyMuPDF documents `open()` as taking a file path (or bytes via
# `stream=`), not a Python file handle.
doc = pymupdf.open(
    os.path.join("data", "raw", "test-data", "solvency II - level 2 - 78 - replacement-test.pdf")
)
for page_md in doc:  # NOTE: despite its name, `page_md` is a PyMuPDF page, not markdown
    blocks = page_md.get_text("dict", flags=0)["blocks"]
    equations = detect_equations({"blocks": blocks})
    # tables = page.find_tables()
    # pprint.pprint(tables)
    print_equations(equations)
    # page.get_pixmap(dpi=1200).save("data/raw/test-data/solvency_II_level_1_v2_equations_page_{}.png".format(page.number))
    equations_detected = equations


Detected 4 equations.

['CorrEQ', 'ð', 'r,s', 'Þ', ' �', 'SCR', 'ð', 'earthquake,r', 'Þ', ' �', 'SCR', 'ð', 'earthquake,s', 'Þ', 'Þ þ', ' SCR', '2', 'SCR', 'earthquake', ' ¼', 'ð', 'earthquake,other', 'Þ', 'ð', 'r,s', 'Þ']
--- Equation 1 ---
  Bounding Box: (82.716064453125, 240.76597595214844, 372.42034912109375, 259.61907958984375)
  Reconstructed Text: CorrEQðr,sÞ �SCRðearthquake,rÞ �SCRðearthquake,sÞÞ þ SCR2SCRearthquake ¼ðearthquake,otherÞðr,sÞ
--------------------

['L', 'ð', 'earthquake,r', 'Þ', ' ¼', ' Q', 'ð', 'earthquake,r', 'Þ', ' �', 'Corr', 'ð', 'earthquake,r,i,j', 'Þ', ' �', 'WSI', 'ð', 'earthquake,r,i', 'Þ', ' �', 'WSI', 'ð', 'earthquake,r,j', 'Þ', 'ð', 'i,j', 'Þ']
--- Equation 2 ---
  Bounding Box: (82.7159423828125, 418.3394470214844, 359.936279296875, 436.727783203125)
  Reconstructed Text: Lðearthquake,rÞ ¼ Qðearthquake,rÞ �Corrðearthquake,r,i,jÞ �WSIðearthquake,r,iÞ �WSIðearthquake,r,jÞði,jÞ
--------------------

['WSI', 'ð', 'earthquake,r,i', 'Þ', ' ¼', ' W', 'ð', 

# Research on regex patterns to find equations on a page

In [139]:
snippet_1 = """
1. The capital requirement for earthquake risk shall be equal to the following:


ffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffi
*SCR* *earthquake* ¼ ðX *CorrEQ* ð *r,s* Þ � *SCR* ð *earthquake,r* Þ � *SCR* ð *earthquake,s* Þ Þ þ *SCR* [2] ð *earthquake,other* Þ


s


*CorrEQ* ð *r,s* Þ � *SCR* ð *earthquake,r* Þ � *SCR* ð *earthquake,s* Þ Þ þ *SCR* [2] ð *earthquake,other* Þ


ð *r,s* Þ


where:
"""
snippet_2 = """
following amount:


ffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffiffi
*L* ð *earthquake,r* Þ ¼ *Q* ð *earthquake,r* Þ � X *Corr* ð *earthquake,r,i,j* Þ � *WSI* ð *earthquake,r,i* Þ � *WSI* ð *earthquake,r,j* Þ


s


*Corr* ð *earthquake,r,i,j* Þ � *WSI* ð *earthquake,r,i* Þ � *WSI* ð *earthquake,r,j* Þ


ð *i,j* Þ


where:

"""

In [None]:
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
import base64
from langchain_core.messages import HumanMessage
import os
import pprint

# Load environment variables from a .env file; GOOGLE_API_KEY is read from the
# environment below and is never hard-coded in the notebook.
load_dotenv()

# Multimodal chat model used to transcribe equations from page images.
llm = ChatGoogleGenerativeAI(
        model="gemini-2.5-pro", 
        api_key=os.environ["GOOGLE_API_KEY"], 
        temperature=0.2,
)

# Sample page image for trying out extract_equations manually.
page_number = 0
test_image = "data/raw/test-data/solvency_II_equations_page_{}.png".format(page_number)

def extract_equations(llm, image_url):
    """Ask a multimodal LLM to transcribe the equations on a page image to LaTeX.

    Args:
        llm: chat model object; it is called as `llm.invoke([message])` and the
            response must provide `.text()`.
        image_url: path of a PNG file on disk (despite the name, this is
            opened with `open()`, not fetched as a URL).

    Returns:
        A list with the contents of every ```latex ... ``` fenced block in the
        model's answer, in order of appearance. Surrounding newlines inside
        the fences are kept.
    """
    # from docs: https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/image-understanding
    # * If you want to detect text in an image, use prompts with a single image to produce better results than prompts with multiple images.
    # * If your prompt contains a single image, place the image before the text prompt in your request.

    # from docs, may not be fully applicable but related:
    # For most accurate OCR results from Document AI, document scans should be a minimum of 200 dpi (dots per inch). 300 dpi and higher generally produce the best results. OCR accuracy is dependent on both the resolution and the minimum font size, along with other factors like document (and if handwritten, handwriting) quality, so testing is recommended. The image quality analysis feature can help evaluate resolution concerns.
    # we have precomputed images where equations are detected for solvency level 2 at 400 dpi

    # Read and encode the image first so the file handle is released before
    # the (slow) network call to the model.
    with open(image_url, "rb") as f:
        image_data = base64.b64encode(f.read()).decode("utf-8")

    extraction_prompt = "Extract equation(s) in latex from this page, only consider full equation, not inline reference to variable"
    # NOTE(review): this is a raw string, so the leading "\n" below reaches the
    # model as a literal backslash + "n", not as a newline. Left unchanged to
    # keep the prompt identical.
    format_examples = r"""
                \n# Example output format
                Here are the equations from the page, in LaTeX format:
```latex
\text{SCR}_{\text{earthquake}} = \sqrt{\left(\sum_{(r,s)} \text{CorrEQ}_{(r,s)} \cdot \text{SCR}_{(\text{earthquake},r)} \cdot \text{SCR}_{(\text{earthquake},s)}\right) + \text{SCR}^2_{(\text{earthquake,other})}}
```
```latex
\text{WSI}_{(\text{earthquake},r,i)} = W_{(\text{earthquake},r,i)} \cdot \text{SI}_{(\text{earthquake}, r,i)}
```
```latex
\text{SI}_{(\text{earthquake},r,i)} = \text{SI}_{(\text{property},r,i)} + \text{SI}_{(\text{onshore-property},r,i)}
```

                # Answer based on the provided image adhering to the example format
                """

    # Image part first, text part second (see the guidance quoted above).
    message = HumanMessage(
        content=[
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{image_data}"},
            },
            {"type": "text", "text": extraction_prompt + format_examples},
        ]
    )
    response = llm.invoke([message])

    equations_converted = re.findall(r"```latex(.*?)```", response.text(), re.DOTALL)

    return equations_converted

# extract_equations(llm, test_image)


In [48]:
import pymupdf4llm
import os
import pprint
import time

# Pattern for one "equation snippet" in the page markdown (used with re.DOTALL).
# It spans from an introducing "following"/"follows" phrase to the next
# "where"/"provided" line, without crossing another "following"/"follows".
# Capture groups that other code relies on:
#   group 1: the introduction -- "following"/"follows" plus either the rest of
#            the line up to ':' or a single newline
#   group 4: the LAST single character of the body between introduction and
#            terminator (a repeated group keeps only its final iteration;
#            it is None when the body is empty)
#   group 5: the terminator including its leading newline
#            ("\nwhere...", "\nwhere" or "\nprovided")
#   group 7: set only when the terminator is "where" followed (without another
#            "where" in between) by "(a)"; the "(a)" itself is a lookahead and
#            is NOT part of the match
regex_without_page = r"((following|follows)([^\n]*?:|\n))((?!following|follows).)*?(\n((where(((?!where).)*?(?=\(a\))))|where|provided))"
# example of page within (following|follows) ... where block --- Page 78 --- in solvency II - level 2
# regex_with_page = r'((following|follows)[^\n]*?:)((?!following|follows).)*?(--- Page \d+ ---)((?!following|follows).)*?(\n(where|provided))'

def find_equation_snippets(markdown_file):
    """Return the text of every `regex_without_page` match, in order of appearance.

    `markdown_file` is the markdown text to scan (matching uses re.DOTALL).
    """
    snippets = []
    for hit in re.finditer(regex_without_page, markdown_file, re.DOTALL):
        snippets.append(hit.group(0))
    return snippets

def find_page(markdown_file, page_number, is_last_page):
    """Cut the markdown of one page out of the whole-document markdown.

    Pages are delimited by lines of the form "--- Page N ---".

    Args:
        markdown_file: markdown text of the whole document.
        page_number: 1-based page number to extract.
        is_last_page: True when no "--- Page N+1 ---" delimiter is required
            to follow (the caller passes this for the last page it processes).

    Returns:
        The page text starting at its own delimiter and ending right before
        the next page's delimiter (or at the end of the text for the last
        page). For a non-last page whose successor starts with "\\nwhere",
        "\\nwhere\\n" is appended so that an equation block split by the page
        break still has its terminating "where".

    Raises:
        Exception: if the page (or, for a non-last page, the following
            delimiter) cannot be found.
    """
    page_start = "--- Page " + str(page_number) + " ---"
    next_page_start = "--- Page " + str(page_number + 1) + " ---"

    if is_last_page:
        # A trailing lazy ".*?" with nothing after it matches the empty string,
        # which used to return only the delimiter. Anchor the lazy match on
        # "next delimiter or end of text" so the page body is included, without
        # swallowing later pages when the caller's last page is not the
        # document's last page.
        last_page_regex = "(" + page_start + r".*?)(?=" + next_page_start + r"|\Z)"
        match = re.search(last_page_regex, markdown_file, re.DOTALL)
        if not match:
            raise Exception(f"Expected exactly one match for page {page_number}, found 0.")
        return match.group(1)

    page_regex = "(" + page_start + ".*?)(" + next_page_start + ")(\nwhere)?"
    match = re.search(page_regex, markdown_file, re.DOTALL)
    if not match:
        raise Exception(f"Expected exactly one match for page {page_number}, found 0.")

    # group 3 is the optional "\nwhere" that opens the NEXT page.
    carried_where = match.group(3)
    return match.group(1) + (carried_where + "\n" if carried_where else "")

def evaluate_placement_likelihood(ordered_equation_spans, snippets, verbose=0, std_output_file=None):
    """Score how well an equation's spans fit each candidate markdown snippet.

    Every snippet is tokenised by removing '*', '[' and ']', turning newlines
    into spaces and splitting on single spaces (empty tokens dropped). Each
    span (with its spaces removed) is then looked up as a whole token.

    Args:
        ordered_equation_spans: span texts of one equation, in reading order.
        snippets: candidate markdown snippets.
        verbose: when truthy, trace the search via print().
        std_output_file: `file=` target for the trace (None means stdout).

    Returns:
        (count_per_snippet, count_in_expected_order), two lists with one entry
        per snippet:
          * count_per_snippet[i] -- spans that occur anywhere in snippet i;
          * count_in_expected_order[i] -- spans that were additionally found
            in sequence: the first hit is searched globally, later ones only
            in a window after the previous hit. The window widens after each
            miss and resets after each hit.
    """
    count_per_snippet = []
    count_in_expected_order = []

    for i, snippet in enumerate(snippets):
        cleaned = snippet
        for markup in ("*", "[", "]"):
            cleaned = cleaned.replace(markup, "")
        tokens = [tok for tok in cleaned.replace("\n", " ").split(" ") if tok != ""]

        if verbose:
            print(f"ordered equation_spans: {ordered_equation_spans}", file=std_output_file)
            print(f"evaluation for snippet {i}: {tokens}", file=std_output_file)

        present = 0    # spans found anywhere in this snippet
        in_order = 0   # spans found at the expected position
        cursor = 0     # token index right after the last ordered hit
        widen = 0      # consecutive misses since the last ordered hit

        for idx, span in enumerate(ordered_equation_spans):
            span = span.replace(" ", "")
            window = []

            if idx == 0 or cursor == 0:
                # No anchor yet: look for the span anywhere in the snippet.
                found_pos = tokens.index(span) if span in tokens else -1
            else:
                if cursor + widen + 3 < len(tokens):
                    width = widen + widen + 3
                else:
                    width = len(tokens)
                window = tokens[cursor:cursor + width]
                found_pos = cursor + window.index(span) if span in window else -1

            if verbose:
                print(f"WINDOW {window} for {span}", file=std_output_file)
                if found_pos != -1:
                    print(f"Found '{span}' at position {found_pos} in snippet {i}.", file=std_output_file)

            if span not in tokens:
                continue

            present += 1
            if found_pos != -1:
                in_order += 1
                cursor = found_pos + 1
                widen = 0
            else:
                widen += 1

        count_per_snippet.append(present)
        count_in_expected_order.append(in_order)

    return count_per_snippet, count_in_expected_order


def convert_equations(best_matches, image_url):
    """Attach LLM-produced LaTeX to each best match (mutates `best_matches`).

    The page image is sent to `extract_equations`; the k-th returned equation
    is stored under ["equation"]["latex"] of the match whose snippet comes
    k-th on the page (matches ordered by ["snippet"]["index"]).

    Pairing by rank instead of by `snippet index == k` gives the same result
    when the matched snippet indexes are exactly 0..n-1. It also works when a
    page has more snippets than equations (e.g. indexes 1 and 3), a case that
    previously crashed with a TypeError on None.

    Raises:
        Exception: if the number of extracted equations differs from the
            number of matches, or if a match has no snippet index.
    """
    equations_converted = extract_equations(llm, image_url)

    if len(equations_converted) != len(best_matches):
        raise Exception("[Error in converting equations to latex]")

    if any(best_match["snippet"]["index"] is None for best_match in best_matches):
        raise Exception("[Error in converting equations to latex]")

    matches_in_page_order = sorted(best_matches, key=lambda best_match: best_match["snippet"]["index"])
    for best_match, equation_converted in zip(matches_in_page_order, equations_converted):
        best_match["equation"]["latex"] = equation_converted


def handle_replacement(best_matches, page_md, page_number, std_output_file=None):
    """Replace matched equation snippets in a page's markdown.

    Every `regex_without_page` match in `page_md` is visited in order. The
    k-th match is rewritten when some entry of `best_matches` has
    ["snippet"]["index"] == k; all other matches are left untouched. A
    rewritten snippet keeps its introduction (regex group 1), gets a
    <REPLACEMENT_EQUATION> block in place of the equation body and keeps its
    terminator.

    Args:
        best_matches: list of match dicts with ["snippet"]["index"],
            ["snippet"]["text"], ["equation"]["text"] and, optionally,
            ["equation"]["latex"].
        page_md: markdown of one page, including its "--- Page N ---" line.
        page_number: 1-based page number, used for the delimiter check and in
            error messages.
        std_output_file: optional `file=` target for the delimiter error
            message (None means stdout). Previously the function read an
            undefined name here.

    Returns:
        The page markdown with the replacements applied.

    Raises:
        Exception: if the replacement changed the page delimiters, or if the
            number of replaced snippets differs from len(best_matches).
    """
    snippet_position = 0
    number_replaced = 0

    def replacer(match: re.Match):
        nonlocal snippet_position, number_replaced

        # Is the snippet at this position one of the best matches?
        first_match = next(
            (best_match for best_match in best_matches
             if best_match["snippet"]["index"] == snippet_position),
            None,
        )
        snippet_position += 1

        if not first_match:
            return match.group(0)

        equation_latex_llm = first_match["equation"]["latex"] if "latex" in first_match["equation"] else ""
        latex_equation = f"equation {equation_latex_llm} in snippet {first_match['snippet']['text']} replaced equation {first_match['equation']['text']}"
        number_replaced += 1

        replaced_head = f"{match.group(1)}\n<REPLACEMENT_EQUATION>\n {latex_equation} \n</REPLACEMENT_EQUATION>\n"

        if match.group(7):
            # "where ... (a)" terminator: the "(a)" is only a lookahead and is
            # still present right after the match, so it must not be emitted
            # again here (that produced "(a)(a)").
            return replaced_head + "where:\n"

        # Plain "\nwhere" / "\nprovided" terminator is group 5. (group 4 is
        # the last character of the equation body, or None.)
        return replaced_head + f"{match.group(5)}\n"

    page_delimiter = "--- Page " + str(page_number) + " ---"
    page_delimiter_before = re.findall(page_delimiter, page_md)

    processed_text = re.sub(regex_without_page, replacer, page_md, flags=re.DOTALL)

    # verify replacement did not change number of page delimiters
    page_delimiter_after = re.findall(page_delimiter, processed_text)
    if page_delimiter_after != page_delimiter_before:
        print(f"[ERROR, PAGE DELIMITER CHANGED]: Before: {page_delimiter_before}, After: {page_delimiter_after}", file=std_output_file)
        raise Exception(f"Page delimiter on page: {page_number}")

    if number_replaced != len(best_matches):
        print(f"Expected {len(best_matches)} replacements, but got {number_replaced}.")
        raise Exception(f"[ERROR, REPLACEMENT NOT SUCCESSFULL] for {page_number}")

    return processed_text

def match_equations(original_doc, markdown_file, process_page_mask, file_name, std_output_file):
    """Detect equations per PDF page, match them to markdown snippets and replace them.

    For every page of `original_doc` that is not masked out:
      1. equations are detected from the page's text dict (detect_equations);
      2. the page's markdown is cut out of `markdown_file` (find_page);
      3. candidate snippets are located in that markdown (find_equation_snippets);
      4. each equation is scored against each snippet
         (evaluate_placement_likelihood) and its best snippet is chosen;
      5. if every equation has an accepted, distinct snippet, the snippets are
         rewritten via handle_replacement; otherwise the page markdown is
         appended unchanged and the page is recorded as unresolved.

    Args:
        original_doc: iterable of pages providing `.number` (0-based) and
            `.get_text("dict", flags=0)`.
        markdown_file: markdown of the whole document with "--- Page N ---"
            delimiters (as consumed by find_page).
        process_page_mask: sequence indexed by `page.number`; pages whose entry
            equals 0 are skipped entirely.
        file_name: only used to build the page-image path.
        std_output_file: `file=` target for the per-page diagnostic prints.

    Returns:
        The concatenated markdown of all processed pages. Pages skipped by the
        mask contribute nothing to the result.

    Summary statistics are printed to stdout at the end.
    """
    # markdown_file = pymupdf4llm.to_markdown(doc)
    markdown_file_with_equation_replaced = ""
    page_numbers_unresolved = []
    equation_matching_unresolved = []
    total_number_of_pages_with_equations = 0
    total_matches_not_accepted = 0
    total_equation_detected = 0
    total_equation_matched = 0
    total_equation_matched_accepted = 0
    total_equation_matched_with_duplicate_match = 0
    total_not_enough_snippets = 0

    last_page_index = 0
    COUNT_ERROR_ACCEPTANCE = 0.8 #the portion of spans that must be matched
    ORDER_ERROR_ACCEPTANCE = 0.25



    # Find the highest page index enabled in the mask; find_page treats that
    # page as the last one (no following page delimiter required).
    for i in range(len(process_page_mask) - 1, -1, -1):
        if process_page_mask[i]:
            last_page_index = i
            break

    for page in original_doc: #[original_doc[36], original_doc[42]]:
        if process_page_mask[page.number] == 0:
            # print(f"Skipping page {page.number + 1} (masked)")
            continue

        # print(f"page with {page.number + 1} is being processed")
        page_text_in_dict = page.get_text("dict", flags=0)
        # detect_equation will look if the page has equations
        equations_detected = detect_equations(page_text_in_dict)
        total_equation_detected += len(equations_detected)
        # find_snippets will try to identify the exact replacement area
        # based on following: where regex

        page_md = find_page(markdown_file, page.number + 1, last_page_index == page.number)
        if equations_detected:
            print(f"*** Found {len(equations_detected)} equations on page {page.number + 1}. ***", file=std_output_file)
            total_number_of_pages_with_equations += 1

            # the original doc will not be changed, the snippets correspond the current representation of the original document in markdown.
            snippets = find_equation_snippets(page_md)

            
            # Fewer snippets than equations: log the details, keep the page
            # markdown unchanged and move on.
            if len(snippets) < len(equations_detected):
                print(f"[ERROR, NOT ENOUGH SNIPPETS]: Found {len(snippets)}, expected {len(equations_detected)}.", file=std_output_file)
                if len(snippets) != 0:
                    # not implemented yet
                    equation_texts = []

                    for idx, eq in enumerate(equations_detected):
                        equation_text = [s['text'] for s in eq['spans']]
                        print(f"Equations text: {equation_text}", file=std_output_file)
                        equation_texts.append(str(equation_text))
                        # print(f"Equations obj: {eq}", file=std_output_file)
                    for snippet in snippets:
                        print(f"Snippet text: {snippet}", file=std_output_file)
                    # raise Exception(f"Not enough snippets found. Found {len(snippets)}, expected {len(equations_detected)}.")
                    # NOTE: the page is only recorded as unresolved when at
                    # least one snippet was found.
                    page_numbers_unresolved.append(page.number + 1)
                total_not_enough_snippets += len(equations_detected)
                markdown_file_with_equation_replaced += page_md #+ f"<ERROR, NOT ENOUGH SNIPPETS>\nsnippets:\n{snippets} \n\n equations:\n{"\n".join([str(idx + 1) + "." + equation_text for idx, equation_text in enumerate(equation_texts)])}\n</ERROR, NOT ENOUGH SNIPPETS>\n"
                continue
                
            best_matches = []


            for idx, eq in enumerate(equations_detected):
                equation_spans = [s['text'] for s in eq['spans']]
                # print(f"\n*** Evaluating spans from equation {idx + 1}: {spans} ***")

                count_per_snippet, count_in_expected_order = evaluate_placement_likelihood(equation_spans, snippets)
                # print(f"Count per snippet: {count_per_snippet}")
                # max_index, max_count = max(enumerate(count_per_snippet), key=lambda x: x[1])
                # max_index_in_expected_order, max_in_expected_order = max(enumerate(count_in_expected_order), key=lambda x: x[1])

                highest_score = 0
                best_match = {"snippet":{"index": None, "score": 0, "text": None}, "equation": {"index": idx, "text": equation_spans}}
                snippet_scores = []
                # Snippet score = spans present + 2 * spans found in order;
                # on a tie the earlier snippet wins (strict '>').
                for i, snippet in enumerate(snippets):
                    snippet_score = count_per_snippet[i] + 2 * count_in_expected_order[i]
                    snippet_scores.append(snippet_score)
                    if snippet_score > highest_score:
                        highest_score = snippet_score
                        best_match.update({"snippet": {"index": i, "score": snippet_score, "text": snippet}})
                    # elif snippet_score == highest_score:
                        # raise Exception("A tie, multiple snippets have the same score.")

                # Minimum score for acceptance, proportional to the number of spans.
                SCORE_THRESHOLD = len(equation_spans) * ORDER_ERROR_ACCEPTANCE + len(equation_spans) * COUNT_ERROR_ACCEPTANCE
                if SCORE_THRESHOLD > highest_score:
                    print(f"<Equation {idx + 1} on page {page.number + 1} did not match sufficiently with any snippet.>", file=std_output_file)
                    # print(f"Did not meet total_score requirement, potentially {count_per_snippet[best_match_index]} / {len(equation_spans) * COUNT_ERROR_ACCEPTANCE} in snippet {best_match_index + 1}.", file=std_output_file)
                    pprint.pprint(count_per_snippet, stream=std_output_file)
                    # print(f"Did not meet total_score requirement, potentially {count_in_expected_order[best_match_index]} / {len(equation_spans) * ORDER_ERROR_ACCEPTANCE} in snippet {best_match_index + 1}.", file=std_output_file)
                    pprint.pprint(count_in_expected_order, stream=std_output_file)
                    # raise Exception("Not accepted match")
                    # Re-run the scoring verbosely so the log shows why it failed.
                    evaluate_placement_likelihood(equation_spans, snippets, verbose=1, std_output_file=std_output_file)
                    print(f"<End of equation NOT_ACCEPTED >", file=std_output_file)
                    # total_matches_not_accepted += 1
                    best_match["status"] = "NOT_ACCEPTED"
                else:
                    best_match["status"] = "ACCEPTED"
                
                    
                best_matches.append(best_match)
                # print(f"SNIPPET SCORES: {snippet_scores}")
                # print(f"BEST MATCH FOR EQUATION IS: {best_match_index + 1 if isinstance(best_match_index, int) else best_match_index} with score {highest_score}.")
                # print(f"*** END OF Evaluating spans from equation {idx + 1} ***\n")

            total_equation_matched += len(best_matches)

            snippets_indexes = [best_match["snippet"]["index"] for best_match in best_matches if best_match["status"] == "ACCEPTED"]
            # A single rejected equation makes the whole page unresolved.
            if "NOT_ACCEPTED" in [best_match["status"] for best_match in best_matches]:
                page_numbers_unresolved.append(page.number + 1)
                total_matches_not_accepted += len(best_matches)
                print(f"[ERROR, MATCHES NOT ACCEPTED] SKIPPING PAGE {page.number + 1}...", file=std_output_file)
                markdown_file_with_equation_replaced += page_md
                continue
            # Two equations claiming the same snippet also leave the page untouched.
            elif len(set(snippets_indexes)) != len(snippets_indexes):
                print("[ERROR, MATCHED MULTIPLE TIMES] ", file=std_output_file)

                for best_match in best_matches:
                    print(f"Best match for equation {best_match['equation']['text']} on page {page.number + 1}:\n{best_match['snippet']['text']}\n\n", file=std_output_file)

                # for eq in equations_detected:
                #     equation_text = [s['text'] for s in eq['spans']]
                #     print(f"Equations text: {equation_text}", file=std_output_file)
                # for idx, snippet in enumerate(snippets):
                #     print(f"*** MATCH FOR SNIPPET {idx + 1} ***", file=std_output_file)
                #     print(f"Snippet {idx + 1}:\n{snippet}", file=std_output_file)
                #     print(f"Best match for snippet {idx + 1}:\n{[s['text'] for s in equations_detected[best_matches[idx]]['spans']] if idx < len(best_matches_indexes) else 'None'}\n", file=std_output_file)
                #     print(f"*** END OF BEST MATCH OF SNIPPET {idx + 1} ***", file=std_output_file)

                print(f"Best matches: {best_matches}", file=std_output_file)
                equation_matching_unresolved.append((page.number + 1, best_matches))
                page_numbers_unresolved.append(page.number + 1)
                total_equation_matched_with_duplicate_match += len(best_matches)
                markdown_file_with_equation_replaced += page_md
                continue
                # raise Exception("equation matched with multiple")

            # NOTE(review): `dpi` is not defined in this function or its
            # parameters; unless a global `dpi` exists this line raises
            # NameError. `image_url` is only needed by the commented-out
            # convert_equations call below.
            image_url = f"data/raw/test-data/page-images/{file_name}/page_{page.number + 1}-{dpi}.png"
            # convert_equations(best_matches, image_url) # adds information to best_matches
            page_md_with_latex_equations = handle_replacement(best_matches, page_md, page.number + 1)
            markdown_file_with_equation_replaced += page_md_with_latex_equations
            total_equation_matched_accepted += len(best_matches)
            # print(f"Equation text: {''.join([s['text'] for s in eq['spans']])}")
        # tables = page.find_tables()
        # pprint.pprint(tables)
        # print_equations(equations_detected)
        # page.get_pixmap(dpi=1200).save("data/raw/test-data/solvency_II_level_1_v2_equations_page_{}.png".format(page.number))
            print(f"[ACCEPTED] page was matched", file=std_output_file)
            print(f"*** End of page {page.number + 1}. ***", file=std_output_file)
        else:
            print(f"[NO EQUATIONS] page {page.number + 1} has no equations.", file=std_output_file)
            markdown_file_with_equation_replaced += page_md
            # continue
        

    # Summary statistics go to stdout (not to std_output_file).
    print(f"Total equations detected: {total_equation_detected}",)
    print(f"Total not enough snippets: {total_not_enough_snippets}/{total_equation_detected}")
    print(f"Total equations matched: {total_equation_matched}/{total_equation_detected}")
    print(f"Total matches accepted: {total_equation_matched_accepted}/{total_equation_matched}")
    print(f"Total matches not accepted: {total_matches_not_accepted}/{total_equation_matched}")
    print(f"Total matches with duplicate match: {total_equation_matched_with_duplicate_match}/{total_equation_matched}")
    print(f"Unresolved {len(page_numbers_unresolved)} of {total_number_of_pages_with_equations} pages: {page_numbers_unresolved}")
    print(f"Unresolved matches: {equation_matching_unresolved} ")

    return markdown_file_with_equation_replaced
    

# test_file = "solvency II - level 2 - 1-295.pdf"


# Frozen copy of the document-structure description; used for testing only and
# not kept up to date.
# "toc" and "CORRELATION_TABLES" are consumed by the driver code below as
# one-indexed, inclusive (first_page, last_page) ranges to mask out.
copy_structure_solvency_II_level_2 = {
    "file_name": "solvency II - level 2.pdf",
    "toc": (1, 4),
    "recitals": (5, 20),
    "CORRELATION_TABLES": (296, 797),
    "NUM_TITLES": 3,
    "NUM_CHAPTERS": 25,
    "NUM_SECTIONS": 61,
    "NUM_SUBSECTIONS": 31,
    "NUM_ARTICLES": 381,
}

file_name = "solvency II - level 2"
# Set to True to render a PNG (at `dpi`) for every page on which equations are detected.
precompute_images = False

# Diagnostic log file handed to match_equations; opened with "w", so it is truncated on every run.
with open("print-output.txt", "w", encoding="utf-8") as std_output_file:

    with open(os.path.join("data", "raw", "solvency-II-files", f"{file_name}.pdf"), 'rb') as f:
        original_doc = pymupdf.open(f) # detection on original pdf
        # One entry per page: 1 by default, 0 for every page inside an excluded range.
        mask: list = [1] * original_doc.page_count

        exceptions = [copy_structure_solvency_II_level_2["toc"], copy_structure_solvency_II_level_2["CORRELATION_TABLES"]]

        for start, end in exceptions:
            for i in range(start - 1, end): #page delimiters are one-indexed
                if i >= original_doc.page_count:
                    raise Exception(f"Page index {i} out of bounds for document with {original_doc.page_count} pages.")
                mask[i] = 0

        # Module-level on purpose: match_equations builds its page-image path from `dpi` as well.
        dpi = 400

        if precompute_images:
            # https://cloud.google.com/document-ai/docs/file-types
            # For most accurate OCR results from Document AI, document scans should be a minimum of 200 dpi (dots per inch). 300 dpi and higher generally produce the best results. OCR accuracy is dependent on both the resolution and the minimum font size, along with other factors like document (and if handwritten, handwriting) quality, so testing is recommended. The image quality analysis feature can help evaluate resolution concerns.
            images_dir = f"data/raw/test-data/page-images/{file_name}"
            # Pixmap.save does not create missing directories, so make sure the target exists first.
            os.makedirs(images_dir, exist_ok=True)

            start_time = time.time()
            for page in original_doc:
                page_text_in_dict = page.get_text("dict", flags=0)
                # detect_equation will look if the page has equations
                equations_detected = detect_equations(page_text_in_dict)
                if not equations_detected:
                    continue
                file_path = f"{images_dir}/page_{page.number + 1}-{dpi}.png"
                if os.path.isfile(file_path):
                    # Image was rendered by an earlier run; do not render it again.
                    continue
                # Pixmap.save returns None, so there is nothing worth binding to a name.
                page.get_pixmap(dpi=dpi).save(file_path)
            end_time = time.time()
            print(f"Converted images in {end_time - start_time} seconds.")

        with open(os.path.join("data", "preprocessed-step-2", "solvency-II-files", "substep-5-solvency II - level 2.pdf.md"), 'r', encoding="utf-8") as md_file:
            markdown_file = md_file.read()

        # markdown_file = pymupdf4llm.to_markdown(original_doc) # can be changed to own preprocessed markdown_file
        markdown_file_with_equation_replaced = match_equations(original_doc, markdown_file, mask, file_name, std_output_file)

        # quick fix, not sure why on same line: where--- Page \d+
        # Puts the "--- Page N" marker on its own line whenever it is glued to a preceding "where".
        (markdown_file_with_equation_replaced, num_quick_fix) = re.subn(r"(where)(--- Page \d+)", r"\1\n\2", markdown_file_with_equation_replaced)
        print(f"quick fix {num_quick_fix}")

# Page numbers announced by the "--- Page N ---" markers in the rewritten markdown.
page_numbers = re.findall(r"--- Page (\d+) ---", markdown_file_with_equation_replaced)

# Sanity check, only run when the number of markers differs from the number of
# unmasked pages: walk the markers, report every jump in the numbering and
# resynchronise the tracker after each jump.
# NOTE(review): the start value 5 is presumably the first page after the
# masked table of contents (pages 1-4) — confirm.
page_tracker = 5
marker_count_matches_mask = mask.count(1) == len(page_numbers)
if not marker_count_matches_mask:
    for page_number in page_numbers:
        actual_page = int(page_number)
        if actual_page != page_tracker:
            print(f"Warning: Page number {page_number} does not match expected page number {page_tracker}.")
            page_tracker = actual_page
        page_tracker += 1
    print(f"Warning: Number of pages in mask ({mask.count(1)}) does not match number of pages in markdown ({len(page_numbers)}).")

# Persist the markdown with replaced equations as the substep-6 artefact.
substep_6_path = os.path.join("data", "preprocessed-step-2", "solvency-II-files", "substep-6_solvency II - level 2.pdf.md")
with open(substep_6_path, 'w', encoding="utf-8") as output_md:
    output_md.write(markdown_file_with_equation_replaced)



Total equations detected: 189
Total not enough snippets: 48/189
Total equations matched: 141/189
Total matches accepted: 126/141
Total matches not accepted: 8/141
Total matches with duplicate match: 7/141
Unresolved 15 of 74 pages: [69, 87, 111, 116, 127, 128, 274, 275, 276, 277, 278, 279, 280, 281, 285]
Unresolved matches: [(116, [{'snippet': {'index': 0, 'score': 24, 'text': 'following:\n\n*stress* *i* ¼ minð *b* *i* � *dur* *i* ;1Þ\n\nwhere:\n\n'}, 'equation': {'index': 0, 'text': ['stress', 'i', ' ¼', ' min', 'ð', 'b', 'i', ' �', 'dur', 'i', ';1', 'Þ']}, 'status': 'ACCEPTED'}, {'snippet': {'index': 0, 'score': 24, 'text': 'following:\n\n*stress* *i* ¼ minð *b* *i* � *dur* *i* ;1Þ\n\nwhere:\n\n'}, 'equation': {'index': 1, 'text': ['stress', 'i', ' ¼', ' min', 'ð', 'b', 'i', ' �', 'dur', 'i', ';1', 'Þ']}, 'status': 'ACCEPTED'}, {'snippet': {'index': 0, 'score': 24, 'text': 'following:\n\n*stress* *i* ¼ minð *b* *i* � *dur* *i* ;1Þ\n\nwhere:\n\n'}, 'equation': {'index': 2, 'text': ['s

In [69]:
# Dump every embedded image and the plain text of one page to files in the
# working directory, and print the page's image and XObject listings.
with open(os.path.join("data", "raw", "solvency-II-files", f"{file_name}.pdf"), 'rb') as f:
    original_doc = pymupdf.open(f) # detection on original pdf

    page = original_doc[272]  # zero-based index; page.number + 1 is used in the file names

    # Query the image list once and reuse it for extraction and for printing.
    page_images = page.get_images(full=True)
    for image in page_images:
        xref = image[0]  # first tuple entry is passed to extract_image as the xref
        image_data = original_doc.extract_image(xref)
        image_path = f"image-page-{page.number + 1}-{xref}.{image_data['ext']}"
        with open(image_path, "wb") as image_out:
            image_out.write(image_data["image"])
    print(page_images)
    print(page.get_xobjects())

    page_text = page.get_text()
    text_path = f"text-page-{page.number + 1}.txt"
    with open(text_path, "w", encoding="utf-8") as text_out:
        text_out.write(page_text)
    # print(page.get_image_info(0)[0])
            # print(page.get_image_info(0)[0])



[(556, 0, 1235, 165, 1, '', '', 'I0', 'CCITTFaxDecode', 0), (1721, 0, 39, 48, 1, '', '', 'I1', 'CCITTFaxDecode', 0), (1720, 0, 34, 64, 1, '', '', 'I2', 'CCITTFaxDecode', 0), (1719, 0, 31, 62, 1, '', '', 'I3', 'CCITTFaxDecode', 0), (1718, 0, 1155, 388, 1, '', '', 'I4', 'CCITTFaxDecode', 0), (1720, 0, 34, 64, 1, '', '', 'I5', 'CCITTFaxDecode', 0), (1719, 0, 31, 62, 1, '', '', 'I6', 'CCITTFaxDecode', 0), (557, 0, 1020, 267, 1, '', '', 'I7', 'CCITTFaxDecode', 0), (1720, 0, 34, 64, 1, '', '', 'I8', 'CCITTFaxDecode', 0), (1719, 0, 31, 62, 1, '', '', 'I9', 'CCITTFaxDecode', 0)]
[]


In [None]:

# Extract the image stored under xref 170 from the currently open document.
# NOTE(review): 170 is hard-coded and is not among the xrefs printed by the
# get_images() listing of the previous cell — confirm which page it belongs to.
image_data = original_doc.extract_image(170)

# with open(f"image.{image_data['ext']}", "wb") as img_file:
#     img_file.write(image_data["image"])

In [62]:
# Display the extracted image dict (metadata fields plus the raw image bytes).
image_data

{'width': 485,
 'height': 94,
 'ext': 'png',
 'colorspace': 1,
 'xres': 96,
 'yres': 96,
 'bpc': 1,
 'size': 1404,
 'image': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x01\xe5\x00\x00\x00^\x08\x00\x00\x00\x008!S\xbc\x00\x00\x00\tpHYs\x00\x00\x0e\xc4\x00\x00\x0e\xc4\x01\x95+\x0e\x1b\x00\x00\x05.IDATx\x9c\xed\x9a\xdb\x96\xc3 \x08E\xf3\xff?\xed\xac5m\x13\xe5&\n\x9a\x189/3Q\x04e\x07c\xda\x1eGh\xbe\xd2\xc2\xdeCJ\xa5\xc1\x94\x03\xf3\xfdJ\xc3)\x8c\x8f\x10\xaah\x06\x82\xc0|\xb3\xe6\x00\x08\xcc\xb7jV\xfa\x03\xf3\x8d\x9a\x96\xfc\x14\x98o\xd3\xc4\xdc\x07\xe6\xdb43\xf3A\xf9&M\xad\xaf(\xe6{49\xef\x81\xf9\x16\x05\xe5\r4=\xeb\x81\xf9\x06\x05\xe5\r\x94n\xa1\x1c\x98\xe7JNy\xfa\xea\xbc\x1e\x162AyD\n}$%\x14g=\xc3m`\xc2\x0e\x08\xc2\x83$$5\xeb\xa2\xd3\xdf\x0b\x85\x1f\x13\x8c\x87HHk\xd9!Pv\x0c\x1a\x94\x87H\x0b\x996\x1dC\xb9\xd9_\xa8\xa2\x96|3\x94\x1d\xa3F)\x0f\x91\\U\xb5\xa6n(\xdc\xc0\x80<D,\'\xb2\xdd\xab\x94\x83\xf2\\\x05\xe5\r\xc4\x1f\x92gP&\xfc\x07\xe5\x01\xb2Q\xee\x87"Q\xee\xf1\x17\x92Ty\xa5\xc1\xc7/

# Integration into existing pipeline

In [None]:
# Question on interference. I need access 

# Integration
# Instead of doc[0].get_text(), you can simply use langchain.Document.page_content.
# The problem is that the doc[0].get_text("dict", flags=0) representation is needed as well;
# therefore, preferably, I get access to the pymupdf.Document inside the pymupdf4llm code.


# Exploring other solvency II-files
The solvency II - level 2 regulation seems to be the most equation-heavy file (~100 equations). However, we will also analyse whether the other files contain many equations.

# NOTE(review): this cell looks stale. The only `test_file` assignment visible in
# this part of the notebook is commented out, and the main driver cell calls
# match_equations with five arguments (document, markdown, mask, file name,
# log file) whereas this call passes two — confirm against the current
# match_equations signature before running.
# Opening print-output.txt with "w" truncates the log written by the main driver cell.
with open("print-output.txt", "w", encoding="utf-8") as std_output_file:
    with open(os.path.join("data", "raw", "test-data", "equation-examples", test_file), 'rb') as f:
        match_equations(f, std_output_file)