In [61]:
import os
import fitz  # PyMuPDF
import ocrmypdf
import shutil
from pathlib import Path
from pymupdf4llm import to_markdown, IdentifyHeaders
import re

# Ensure the Markdown output folder is present (no-op when it already exists)
os.makedirs("output_markdown", exist_ok=True)

In [62]:
def is_scanned_pdf(pdf_path):
    """Report whether a PDF has no extractable text on any page.

    Args:
        pdf_path: Location of the PDF to inspect; passed straight to
            ``fitz.open``.

    Returns:
        bool: ``False`` as soon as one page yields non-whitespace text,
        ``True`` when no page does (which includes a document without pages).
    """
    # The context manager closes the document on every exit path, including
    # the short-circuit taken when the first page with text is found.
    with fitz.open(pdf_path) as doc:
        return not any(page.get_text().strip() for page in doc)

In [63]:
def run_ocr(input_path, output_path):
    """Run OCRmyPDF on a PDF and write the OCRed copy to ``output_path``.

    Pages are auto-rotated and deskewed, and OCR is forced on every page.

    Args:
        input_path: PDF to run OCR on.
        output_path: Destination of the OCRed PDF.
    """
    print("🔁 Running OCRmyPDF...")
    ocrmypdf.ocr(
        input_path,
        output_path,
        rotate_pages=True,
        deskew=True,
        # OCRmyPDF accepts only one of force_ocr / skip_text / redo_ocr;
        # passing force_ocr together with skip_text is rejected before any
        # OCR happens. This helper is called for PDFs where no page yielded
        # text, so forcing OCR on every page is the mode that is kept.
        force_ocr=True,
    )
    print(f"✅ OCR complete: {output_path}")

In [64]:
def normalize_heading_hierarchy(md_text):
    """Re-level ATX headings in a Markdown string.

    Rules, applied line by line:
      * a level-1 heading becomes level 2;
      * a level-2 heading becomes level 3 when its text contains one of the
        trigger words (case-insensitive substring match);
      * every other line, including deeper headings, keeps its level.

    Any heading line is re-emitted as ``<hashes> <text>`` with a single space
    after the hashes.

    Args:
        md_text: Markdown source.

    Returns:
        str: The Markdown with adjusted heading levels.
    """
    trigger_words = ('mortality', 'knock-down', 'resistance', 'vector', 'prevalence')

    def _relevel(raw_line):
        found = re.match(r'^(#+)\s+(.*)', raw_line)
        if found is None:
            return raw_line
        marker, title = found.group(1), found.group(2)
        depth = len(marker)
        if depth == 1:
            marker = '##'
        elif depth == 2 and any(term in title.lower() for term in trigger_words):
            marker = '###'
        return f"{marker} {title}"

    return '\n'.join(_relevel(raw_line) for raw_line in md_text.split('\n'))

In [65]:
# Enhanced version with debugging
def extract_markdown_with_hierarchy(pdf_path, md_output_path):
    """Convert a PDF to Markdown and save it, picking a header detector first.

    When the document has a table of contents, header levels are taken from
    it; otherwise headers are detected automatically with
    ``IdentifyHeaders``, and the spans it recognises on the first three pages
    are printed for debugging.

    Args:
        pdf_path: PDF to convert; passed to ``fitz.open``.
        md_output_path: File the Markdown is written to (UTF-8).

    Returns:
        str: The generated Markdown text.
    """
    # The context manager closes the document even when conversion raises.
    with fitz.open(pdf_path) as doc:
        toc = doc.get_toc()

        if toc:
            print(f"📋 Found TOC with {len(toc)} entries")
            for level, title, page in toc[:3]:
                print(f"  Level {level}: '{title}' (page {page})")
            # to_markdown() takes a callable or an object exposing
            # get_header_id() as hdr_info. The raw TOC list is neither, so
            # handing it over directly would not apply the TOC; TocHeaders
            # is the TOC-based detector shipped by pymupdf4llm.
            try:
                from pymupdf4llm import TocHeaders
            except ImportError:
                # Older pymupdf4llm without TocHeaders: use font-size detection
                print("⚠️ TocHeaders unavailable, using automatic header detection")
                hdr_info = IdentifyHeaders(doc)
            else:
                hdr_info = TocHeaders(doc)
        else:
            print("🔍 No TOC found, using automatic header detection")
            hdr_info = IdentifyHeaders(doc)

            # Debug: Show what headers were identified
            if hasattr(hdr_info, 'get_header_id'):
                print("📝 Identified headers:")
                for page_num in range(min(3, len(doc))):  # Check first 3 pages
                    page = doc[page_num]
                    blocks = page.get_text("dict")["blocks"]
                    for block in blocks:
                        if "lines" in block:  # blocks without "lines" are skipped
                            for line in block["lines"]:
                                for span in line["spans"]:
                                    if hdr_info.get_header_id(span, page=page):
                                        print(f"  Page {page_num+1}: '{span['text'].strip()}'")

        # Generate markdown while the document is still open
        markdown_text = to_markdown(doc, hdr_info=hdr_info)

    with open(md_output_path, "w", encoding="utf-8") as f:
        f.write(markdown_text)

    print(f"✅ Markdown saved: {md_output_path}")
    return markdown_text


In [66]:
def process_pdf_pipeline(pdf_path):
    """Turn one PDF into a Markdown file under ``output_markdown/``.

    A PDF reported as scanned by ``is_scanned_pdf`` is first OCRed into
    ``temp_ocr/<stem>_ocr.pdf`` and the OCRed copy is converted; any other
    PDF is converted directly.

    Args:
        pdf_path: Path of the source PDF.

    Returns:
        Whatever ``extract_markdown_with_hierarchy`` returns for the PDF that
        was converted.
    """
    source = Path(pdf_path)
    stem = source.stem

    # Both working folders are created up front, whichever branch runs
    markdown_dir = Path("output_markdown")
    markdown_dir.mkdir(parents=True, exist_ok=True)
    ocr_dir = Path("temp_ocr")
    ocr_dir.mkdir(exist_ok=True)

    markdown_file = markdown_dir / f"{stem}.md"

    print(f"🔍 Processing PDF: {pdf_path}")
    if not is_scanned_pdf(pdf_path):
        print("📄 Detected born-digital PDF")
        return extract_markdown_with_hierarchy(source, markdown_file)

    print("🧾 Detected scanned PDF")
    ocr_file = ocr_dir / f"{stem}_ocr.pdf"
    run_ocr(pdf_path, ocr_file)
    return extract_markdown_with_hierarchy(ocr_file, markdown_file)

In [67]:
# Sample input PDF, given as a relative path
test_pdf = "pdfs/Allossogbe_et_al_2017_Mal_J.pdf"
# Run the whole pipeline on it; markdown_text holds the Markdown it returns
markdown_text = process_pdf_pipeline(test_pdf)

🔍 Processing PDF: pdfs/Allossogbe_et_al_2017_Mal_J.pdf
📄 Detected born-digital PDF
📋 Found TOC with 36 entries
  Level 1: 'WHO cone bio-assays of classical and new-generation long-lasting insecticidal nets call for innovative insecticides targeting the knock-down resistance mechanism in Benin' (page 1)
  Level 2: 'Abstract ' (page 1)
  Level 3: 'Background: ' (page 1)
✅ Markdown saved: output_markdown\Allossogbe_et_al_2017_Mal_J.md


In [68]:
## DOI if in metadata

# PyMuPDF's metadata dict only carries the standard Info keys (title, author,
# subject, keywords, ...) and has no "doi" entry, so a plain .get("doi") is
# always None. Search the Info values and the XMP packet for a DOI-shaped
# string instead.
with fitz.open(test_pdf) as doc:
    # Info values may be None; keep only the strings
    info_values = [value for value in (doc.metadata or {}).values() if isinstance(value, str)]
    xmp_packet = doc.get_xml_metadata() or ""

doi_in_metadata = re.search(
    r'(10\.\d{4,9}/[-._;()/:A-Z0-9]+)',
    "\n".join(info_values + [xmp_packet]),
    re.I,
)
doi = doi_in_metadata.group(1) if doi_in_metadata else None
print("DOI:", doi)

DOI: None


In [69]:
## DOI taken from the first page's text (fallback for PDFs without one in metadata)

# NOTE(review): doc is left open here in case later cells reuse it — confirm
doc = fitz.open(test_pdf)
# Guard against a PDF without pages, where doc[0] would raise
first_page_text = doc[0].get_text() if doc.page_count else ""
match = re.search(r'(10\.\d{4,9}/[-._;()/:A-Z0-9]+)', first_page_text, re.I)
# The character class accepts '.', ';' and ':', so sentence punctuation that
# directly follows the DOI gets captured as well; trim it off the end.
doi = match.group(1).rstrip('.;:') if match else None
print("DOI from text:", doi)


DOI from text: 10.1186/s12936-017-1727-x
