This notebook explores options for finding the page number for any passage / chunk. The process is as follows:
- Given a chunk, get the headers from its metadata.
- In the Mistral OCR result, look for a match for the highest-level (largest) header.
- If found, take the corresponding page number.
- Look for the second-level header; if found, replace the page number found so far with the page number of this match.
- Repeat for the third-level header.

Using this technique, the code should find the page number of the smallest (most specific) header containing this text.
Note that it's also possible to look for matches for the chunk text itself, but this becomes tricky when a chunk is split over multiple pages.

Since the markdown file is manually edited, header names may have been edited as well, so the code uses fuzzy matching.

TODO: currently very inefficient. Rewrite to process all chunks at once; in order:
- start at page 1, find first match
- starting at that page, find next match etc
- This prevents looking through the entire doc for every chunk

TODO: define page_offset for every party to align with page number in document

NOTE: If the OCR makes mistakes in a header, exact matching fails, so fuzzy matching needs to be used.

In [None]:
from fuzzywuzzy import fuzz
import json
from pathlib import Path
import re

from src.config import FilePaths
from src.document_chunking import chunk_markdown_file
from src.enums import Party

In [None]:
# Party whose documents are processed by the rest of the notebook.
party = Party.CDA

# Cleaned markdown file of the selected party.
# NOTE(review): relies on f"{party}" rendering as the file-name stem -- confirm
# how the Party enum formats itself.
clean_markdown_file = FilePaths.clean_markdown_dir / f"{party}_clean.md"

# Read the whole markdown document into memory.
with open(clean_markdown_file, 'r', encoding='utf-8') as file:
    markdown_string = file.read()

# Split the document into chunks; later cells read header names from each
# chunk's metadata.
chunks = chunk_markdown_file(markdown_string)

In [None]:
# Pick a single chunk to inspect by hand. The position 100 is hard-coded, so
# this assumes the document produced at least 101 chunks.
chunk = chunks[100]
# Bare expression: the notebook shows its value as the cell output.
chunk

In [None]:
from langchain_core.documents import Document

def find_page_index_for_chunk(
    chunk: Document,
    pages: list[dict],
    confidence_threshold: int = 60
) -> int:
    """Return the page index of the most specific header of ``chunk`` found in ``pages``.

    The header titles stored in the chunk metadata under ``"Hoofdstuk"``,
    ``"Sectie"`` and ``"Subsectie"`` (Dutch for chapter, section and
    subsection) are looked up from least to most specific. Every markdown
    header line (``#`` to ``###``) of every page is fuzzy-matched against the
    title. When a level is matched, its page replaces the page found for the
    previous, less specific level.

    Args:
        chunk: Document whose ``metadata`` may contain the header titles.
        pages: Page dictionaries; the ``"markdown"`` and ``"index"`` keys are
            read from each of them.
        confidence_threshold: Minimum ``fuzz.partial_ratio`` score for a
            header to count as a match.

    Returns:
        The ``"index"`` value of the matching page, or ``-1`` when no header
        of the chunk was matched (or the matching page has no ``"index"``).
    """
    header_hierarchy = ["Hoofdstuk", "Sectie", "Subsectie"]

    # Compiled once here instead of being rebuilt for every page and header.
    header_line_pattern = re.compile(r"^#{1,3}\s.*", re.MULTILINE)
    header_prefix_pattern = re.compile(r"^#{1,3}\s")

    found_index = -1
    chunk_metadata = chunk.metadata

    for header_type in header_hierarchy:
        header_title = chunk_metadata.get(header_type)
        if not header_title:
            continue

        print("Looking for", header_title)

        # The best score is tracked per header level. Sharing it across levels
        # would stop a more specific header from replacing the page of its
        # parent whenever the parent happened to match with a higher score.
        level_best_score = 0

        for page in pages:
            # ``or ""`` also covers pages whose markdown is explicitly null.
            markdown_text = page.get("markdown") or ""
            page_headers = header_line_pattern.findall(markdown_text)
            if not page_headers:
                continue

            # Score every header on the page and keep the best one, so a weak
            # first header cannot hide a better match further down the page.
            match_score, header = max(
                (
                    (
                        fuzz.partial_ratio(
                            header_title,
                            header_prefix_pattern.sub("", candidate).strip(),
                        ),
                        candidate,
                    )
                    for candidate in page_headers
                ),
                key=lambda scored: scored[0],
            )

            # ``>=`` keeps the existing tie rule: on equal scores the later
            # page wins.
            if match_score >= confidence_threshold and match_score >= level_best_score:
                level_best_score = match_score
                found_index = page.get("index", -1)
                print(header, match_score, found_index)
    return found_index
    

def add_chunk_pages_to_metadata(
    raw_ocr_path: Path,
    chunks: list[Document],
    window_size: int = 3
):
    """Store the page index of every chunk in its metadata under ``"Pagina"``.

    The chunks are processed in order. For each chunk only a window of
    ``window_size`` pages, starting at the page matched for the previous
    chunk, is searched with ``find_page_index_for_chunk``. This avoids
    scanning the whole document for every chunk.

    Args:
        raw_ocr_path: JSON file holding the OCR result; its ``"pages"`` list is
            used.
        chunks: Chunks to annotate; their ``metadata`` is modified in place.
        window_size: Number of consecutive pages searched per chunk.

    Returns:
        ``-1`` when the OCR file is missing, is not valid JSON or contains no
        pages (no chunk is modified in that case); ``None`` otherwise.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    try:
        with open(raw_ocr_path, 'r', encoding='utf-8') as f:
            ocr_data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError) as error:
        # Keep the best-effort -1 result, but say why nothing was annotated.
        print(f"Could not load OCR data from {raw_ocr_path}: {error}")
        return -1

    # A JSON document whose root is not an object has no "pages" entry.
    pages = ocr_data.get("pages", []) if isinstance(ocr_data, dict) else []
    if not pages:
        print(f"No pages found in {raw_ocr_path}")
        return -1

    # Position in ``pages`` where the search window for the next chunk starts.
    window_start = 0

    for chunk in chunks:
        window = pages[window_start:window_start + window_size]

        page_index = find_page_index_for_chunk(
            chunk=chunk,
            pages=window
        )
        print("Found page index: ", page_index)
        chunk.metadata["Pagina"] = page_index
        if page_index == -1:
            continue

        # ``page_index`` is the page's "index" field, which need not equal its
        # position in ``pages``. Advance the window to the position of the
        # matched page instead of reusing the field value as a list offset.
        for offset, page in enumerate(window):
            if page.get("index") == page_index:
                window_start += offset
                break


In [None]:
# Annotate every chunk with its page index (metadata key "Pagina"), using the
# raw OCR JSON of the selected party.
add_chunk_pages_to_metadata(
    raw_ocr_path=FilePaths.json_dir / f"{party}.json",
    chunks=chunks
)

In [None]:
# Show every chunk for which no page was found. ``.get`` is used because the
# "Pagina" key is absent when add_chunk_pages_to_metadata returned early
# (missing or empty OCR file); such chunks are reported as unmatched too.
for chunk in chunks:
    if chunk.metadata.get("Pagina", -1) == -1:
        print(chunk)