This notebook explores options for finding the page number for any passage / chunk. The process is as follows:
- Given a chunk, get the headers from its metadata.
- In the Mistral OCR result, find a match for the largest (top-level) header.
- If found, record the corresponding page number.
- Look for the second-largest header; if found, replace the recorded page number with that header's page number.
- Repeat for the third header.

Using this technique, the code should find the page number of the smallest header that contains the chunk's text.
Note that it's also possible to look for matches for the chunk text itself, but this becomes tricky when a chunk is split over multiple pages.

Since the markdown file is manually edited, it might be possible that header names are edited as well, so the code uses fuzzy matching.

TODO: currently very inefficient. Rewrite to process all chunks at once; in order:
- start at page 1, find first match
- starting at that page, find next match etc
- This prevents looking through the entire doc for every chunk

In [None]:
from fuzzywuzzy import fuzz
import json
from pathlib import Path
import re

from src.config import FilePaths
from src.document_chunking import chunk_markdown_file
from src.enums import Party

In [None]:
def get_chunk_page_with_fuzzy_matching(
    raw_ocr_path: Path,
    chunk_metadata: dict,
    confidence_threshold: int = 85
) -> int:
    """
    Find a chunk's page number by fuzzy-matching its headers against OCR headers.

    The chunk's headers are looked up from the largest ("Hoofdstuk") to the
    smallest ("Subsectie"). Each level that produces a match overrides the page
    found for the previous level, so the result belongs to the smallest header
    that could be located. Match scores are compared within one level only: a
    perfect match on a larger header never prevents a smaller header that
    matches above the threshold from taking over.

    Within one level, only the first qualifying header of each page is used,
    and when several pages match equally well the last of those pages wins.

    Args:
        raw_ocr_path (Path): Path to the raw OCR JSON file.
        chunk_metadata (dict): The metadata dictionary from a chunk.
        confidence_threshold (int): The minimum fuzzy match score to be considered a match.

    Returns:
        int: The "index" value of the best matching page, or -1 if the file
        cannot be read or parsed, contains no pages, or no header matches.
    """
    try:
        with open(raw_ocr_path, 'r', encoding='utf-8') as f:
            ocr_data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return -1

    pages = ocr_data.get("pages", [])
    if not pages:
        return -1

    # Header levels ordered from largest to smallest.
    header_hierarchy = ["Hoofdstuk", "Sectie", "Subsectie"]
    header_titles = [
        title
        for title in (chunk_metadata.get(level) for level in header_hierarchy)
        if title
    ]
    if not header_titles:
        return -1

    header_line_pattern = re.compile(r"^#{1,3}\s.*", re.MULTILINE)
    header_prefix_pattern = re.compile(r"^#{1,3}\s")

    # Extract the markdown headers of every page once instead of once per
    # header level. Each entry is (page index, [(raw header, header text)]).
    headers_per_page = []
    for page in pages:
        # A missing or null "markdown" value is treated as an empty page.
        markdown_text = page.get("markdown") or ""
        page_headers = [
            (header, header_prefix_pattern.sub("", header).strip())
            for header in header_line_pattern.findall(markdown_text)
        ]
        headers_per_page.append((page.get("index", -1), page_headers))

    found_page = -1

    for header_title in header_titles:
        print("Looking for", header_title)

        # Reset per level so that the score reached by a larger header does
        # not block a smaller header from replacing the page.
        best_match_score = 0

        for page_index, page_headers in headers_per_page:
            for header, header_text in page_headers:
                match_score = fuzz.partial_ratio(header_title, header_text)

                if match_score >= confidence_threshold and match_score >= best_match_score:
                    best_match_score = match_score
                    found_page = page_index
                    print(header, match_score, found_page)
                    # Only the first qualifying header on a page is needed.
                    break

    return found_page



In [None]:
# Party whose documents are processed in the cells below.
party = Party.VVD

# Manually cleaned markdown file of that party.
clean_markdown_file = FilePaths.clean_markdown_dir / f"{party}_clean.md"

with open(clean_markdown_file, encoding='utf-8') as markdown_handle:
    markdown_string = markdown_handle.read()

# Split the markdown text into chunks.
chunks = chunk_markdown_file(markdown_string)

In [3]:
# Pick a single chunk (position 100) as a sample and display it.
chunk = chunks[100]
chunk

Document(metadata={'Hoofdstuk': 'Missie 1: Radicale economische groei', 'Sectie': 'Groei met een banenpakket voor Nederland'}, page_content='- Meer en langer doorwerken moet lonen: We willen maatregelen nemen om mensen meer te laten werken. Iedereen is nodig op de arbeidsmarkt. We vinden daarom dat bij een stijgende levensverwachting, de pensioenleeftijd mee moet stijgen. We hebben oog voor de kwetsbare groepen. We ondersteunen het langer doorwerken door meer in te zetten op een Leven Lang Ontwikkelen. Zo zorgen we dat er genoeg mensen zijn om bijvoorbeeld in de zorg, het onderwijs of in de techniek te werken.')

In [None]:
# Look up the page of the sample chunk in this party's raw OCR JSON file.
get_chunk_page_with_fuzzy_matching(
    raw_ocr_path=FilePaths.json_dir / f"{party}.json",
    chunk_metadata=chunk.metadata
)

Looking for Missie 1: Radicale economische groei
# Missie 1: Radicale economische groei  100 5
# Missie 1: Radicale economische groei  100 6
Looking for Groei met een banenpakket voor Nederland
# Groei met een banenpakket voor Nederland  100 12


12

In [None]:
from langchain_core.documents import Document

def find_page_index_for_chunk(
    chunk: Document,
    pages,
    confidence_threshold: int = 100
) -> int:
    """
    Find the page index of a chunk by fuzzy-matching its headers against pages.

    The chunk's headers are looked up from the largest ("Hoofdstuk") to the
    smallest ("Subsectie"). Each level that produces a match overrides the
    index found for the previous level. Match scores are compared within one
    level only.

    Within one level, only the first qualifying header of each page is used,
    and when several pages match equally well the last of those pages wins.

    Args:
        chunk (Document): The chunk; only its ``metadata`` mapping is read.
        pages: Iterable of page dicts; the "markdown" and "index" keys are read.
        confidence_threshold (int): The minimum fuzzy match score to be considered a match.

    Returns:
        int: The "index" value of the best matching page, or -1 if no header
        of the chunk matches any header in ``pages``.
    """
    header_hierarchy = ["Hoofdstuk", "Sectie", "Subsectie"]
    header_line_pattern = re.compile(r"^#{1,3}\s.*", re.MULTILINE)
    header_prefix_pattern = re.compile(r"^#{1,3}\s")

    found_index = -1
    chunk_metadata = chunk.metadata

    for header_type in header_hierarchy:
        header_title = chunk_metadata.get(header_type)
        if not header_title:
            continue

        print("Looking for", header_title)

        # Reset per level so that the score reached by a larger header does
        # not block a smaller header from replacing the index.
        best_match_score = 0

        # Iterate over pages to find a match
        for page in pages:
            # A missing or null "markdown" value is treated as an empty page.
            markdown_text = page.get("markdown") or ""

            for header in header_line_pattern.findall(markdown_text):
                header_text = header_prefix_pattern.sub("", header).strip()
                match_score = fuzz.partial_ratio(header_title, header_text)

                if match_score >= confidence_threshold and match_score >= best_match_score:
                    best_match_score = match_score
                    found_index = page.get("index", -1)
                    print(header, match_score, found_index)
                    # Only the first qualifying header on a page is needed.
                    break

    # The result must be returned explicitly; callers use it to continue the
    # search from the page that was found.
    return found_index
    

def add_chunk_pages_to_metadata(
    raw_ocr_path: Path,
    chunks: list[Document]
):
    """
    Look up the page of every chunk and store it in the chunk's metadata.

    The OCR file is read once. Chunks are processed in the given order, and the
    search for each chunk starts at the page found for the previous chunk, so
    the document is not scanned from the beginning for every chunk.

    For each chunk whose page is found, the page index is stored in
    ``chunk.metadata["page_index"]``. Chunks without a match are left
    unchanged and do not move the search window.

    Args:
        raw_ocr_path (Path): Path to the raw OCR JSON file.
        chunks (list[Document]): The chunks, in document order.

    Returns:
        -1 if the OCR file cannot be read or parsed, or contains no pages;
        otherwise None.
    """
    try:
        with open(raw_ocr_path, 'r', encoding='utf-8') as f:
            ocr_data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return -1

    pages = ocr_data.get("pages", [])
    if not pages:
        return -1

    # Map each page's "index" value to its position in the list, so the search
    # window stays correct even if the two are not identical.
    position_by_page_index = {}
    for position, page in enumerate(pages):
        position_by_page_index.setdefault(page.get("index"), position)

    current_index = 0

    for chunk in chunks:
        remaining_pages = pages[current_index:]

        page_index = find_page_index_for_chunk(
            chunk=chunk,
            pages=remaining_pages
        )
        print("Found page index: ", page_index)

        # No usable result (None or negative): keep the current window rather
        # than letting the value be used as a slice start.
        if not isinstance(page_index, int) or page_index < 0:
            continue

        chunk.metadata["page_index"] = page_index
        current_index = position_by_page_index.get(page_index, current_index)

In [5]:
# The OCR file is the same for every chunk, so build its path once.
ocr_json_path = FilePaths.json_dir / f"{party}.json"

for chunk in chunks:
    page_number = get_chunk_page_with_fuzzy_matching(
        raw_ocr_path=ocr_json_path,
        chunk_metadata=chunk.metadata,
    )
    print("Page number: ", page_number)

Looking for **Sterker uit de storm**
# **Sterker uit de storm** 100 0
Page number:  0
Looking for Liberale waarden en keuzes voor Nederland
# Liberale waarden en keuzes voor Nederland  100 1
Page number:  1
Looking for Liberale waarden en keuzes voor Nederland
# Liberale waarden en keuzes voor Nederland  100 1
Page number:  1
Looking for Liberale waarden en keuzes voor Nederland
# Liberale waarden en keuzes voor Nederland  100 1
Page number:  1
Looking for Liberale waarden en keuzes voor Nederland
# Liberale waarden en keuzes voor Nederland  100 1
Page number:  1
Looking for Liberale waarden en keuzes voor Nederland
# Liberale waarden en keuzes voor Nederland  100 1
Page number:  1
Looking for Liberale waarden en keuzes voor Nederland
# Liberale waarden en keuzes voor Nederland  100 1
Page number:  1
Looking for Liberale waarden en keuzes voor Nederland
# Liberale waarden en keuzes voor Nederland  100 1
Page number:  1
Looking for Liberale waarden en keuzes voor Nederland
# Liberale wa

KeyboardInterrupt: 