In [2]:
from collections import Counter
import re
import fitz
import pymupdf4llm
from pathlib import Path
from scripts.ocr_text import analyze_markdown_header_hierarchy

def test_header_tuning(
    pdf_path,
    *,
    body_limits=(9, 10, 11),
    max_levels_options=(2, 3, 4, 5),
    margins=(0, 50, 0, 30),
):
    """Sweep header-detection settings for one PDF and print a report for each.

    For every combination of ``body_limit`` and ``max_levels`` the document is
    converted to Markdown with ``pymupdf4llm`` and the result is passed to
    ``analyze_markdown_header_hierarchy``; the per-level header counts and the
    textual assessment are printed. Nothing is returned.

    NOTE(review): despite the ``test_`` prefix this is an exploratory helper
    that requires an explicit ``pdf_path``; it is not a self-contained unit
    test.

    Args:
        pdf_path: Path of the PDF to open with ``fitz.open``.
        body_limits: Values tried for the ``body_limit`` argument of
            ``pymupdf4llm.IdentifyHeaders`` (presumably the font size at or
            below which text counts as body text -- confirm against the
            pymupdf4llm docs).
        max_levels_options: Values tried for the ``max_levels`` argument of
            ``pymupdf4llm.IdentifyHeaders``.
        margins: Passed unchanged to ``pymupdf4llm.to_markdown``
            (presumably ``(left, top, right, bottom)`` -- confirm against the
            pymupdf4llm docs).
    """
    # Use the document as a context manager so the file handle is released
    # even if header detection or conversion raises part-way through the sweep.
    with fitz.open(pdf_path) as doc:
        for body_limit in body_limits:
            for max_levels in max_levels_options:
                print(f"\n🔧 Testing body_limit={body_limit}, max_levels={max_levels}")

                headers = pymupdf4llm.IdentifyHeaders(
                    doc,
                    max_levels=max_levels,
                    body_limit=body_limit,
                )

                md = pymupdf4llm.to_markdown(doc, hdr_info=headers, margins=margins)
                result = analyze_markdown_header_hierarchy(md)

                # Only the per-level counters are shown here; the assessment
                # string is printed on its own line below.
                level_counts = {
                    key: value
                    for key, value in result.items()
                    if key.startswith("hdr_level")
                }
                print("📊 Header Levels:", level_counts)
                print("🧠 Assessment:", result["assessment"])

# Run the parameter sweep against one sample paper. The path is relative, so
# it resolves against the notebook's current working directory.
test_pdf = "pdfs/Allossogbe_et_al_2017_Mal_J.pdf"
test_header_tuning(pdf_path=test_pdf)



🔧 Testing body_limit=9, max_levels=2
📊 Header Levels: {'hdr_level_1': 1, 'hdr_level_2': 1}
🧠 Assessment: Header hierarchy appears reasonable.

🔧 Testing body_limit=9, max_levels=3
📊 Header Levels: {'hdr_level_1': 1, 'hdr_level_2': 1, 'hdr_level_3': 2}
🧠 Assessment: Header hierarchy appears reasonable.

🔧 Testing body_limit=9, max_levels=4
📊 Header Levels: {'hdr_level_1': 1, 'hdr_level_2': 1, 'hdr_level_3': 2, 'hdr_level_4': 1}
🧠 Assessment: Header levels are too deeply nested.

🔧 Testing body_limit=9, max_levels=5
📊 Header Levels: {'hdr_level_1': 1, 'hdr_level_2': 1, 'hdr_level_3': 2, 'hdr_level_4': 1}
🧠 Assessment: Header levels are too deeply nested.

🔧 Testing body_limit=10, max_levels=2
📊 Header Levels: {'hdr_level_1': 1, 'hdr_level_2': 1}
🧠 Assessment: Header hierarchy appears reasonable.

🔧 Testing body_limit=10, max_levels=3
📊 Header Levels: {'hdr_level_1': 1, 'hdr_level_2': 1, 'hdr_level_3': 2}
🧠 Assessment: Header hierarchy appears reasonable.

🔧 Testing body_limit=10, max_le