In [2]:
import os
import fitz  # PyMuPDF
import ocrmypdf
import shutil
from pathlib import Path
from pymupdf4llm import to_markdown, IdentifyHeaders
import re

# Make sure the folder that receives the generated Markdown files exists
# (intermediate directories are created as needed; an existing folder is fine).
os.makedirs("output_markdown", exist_ok=True)

In [3]:
def is_scanned_pdf(pdf_path):
    """Report whether a PDF has no extractable text on any page.

    Args:
        pdf_path: Location of the PDF; passed straight to ``fitz.open``.

    Returns:
        bool: ``False`` as soon as one page yields non-whitespace text,
        ``True`` otherwise (including for a document without pages).
    """
    # The context manager closes the document on every exit path, so the
    # underlying file handle is no longer leaked.
    with fitz.open(pdf_path) as doc:
        # any() short-circuits on the first page that contains text.
        return not any(page.get_text().strip() for page in doc)

In [4]:
def run_ocr(input_path, output_path):
    """Run OCRmyPDF on a PDF and write the result to ``output_path``.

    Args:
        input_path: PDF to OCR.
        output_path: Where the OCRed PDF is written.

    Progress messages are printed before and after the OCR call. Any error
    raised by ``ocrmypdf.ocr`` propagates to the caller.
    """
    print("🔁 Running OCRmyPDF...")
    ocrmypdf.ocr(
        input_file=input_path,
        output_file=output_path,
        rotate_pages=True,
        deskew=True,
        # OCRmyPDF accepts only one of force_ocr / skip_text / redo_ocr;
        # passing force_ocr=True together with skip_text=True is rejected as
        # bad arguments. skip_text is kept: pages without text are OCRed and
        # pages that already carry text are left untouched.
        skip_text=True,
    )
    print(f"✅ OCR complete: {output_path}")

In [5]:
def normalize_heading_hierarchy(
    md_text,
    subsection_keywords=('mortality', 'knock-down', 'resistance', 'vector', 'prevalence'),
):
    """Re-level Markdown headings so the document has a consistent hierarchy.

    Rules applied to every ATX heading line (``#...`` followed by whitespace):
      * level 1 (``#``) is demoted to level 2 (``##``);
      * level 2 (``##``) is demoted to level 3 (``###``) when its text
        contains one of ``subsection_keywords`` (case-insensitive substring);
      * deeper headings keep their level.
    The whitespace between the hashes and the heading text is collapsed to a
    single space on every matched heading line.

    Lines inside fenced code blocks (delimited by lines starting with three
    backticks) are left untouched, so ``# comment`` lines in code are not
    mistaken for headings.

    Args:
        md_text: Markdown text to normalize.
        subsection_keywords: Words marking a level-2 heading as a subsection.
            Defaults to the keyword set this function originally hard-coded.

    Returns:
        str: The Markdown with adjusted heading levels.
    """
    keywords = tuple(word.lower() for word in subsection_keywords)
    heading_re = re.compile(r'^(#+)\s+(.*)')

    updated_lines = []
    in_code_fence = False
    for line in md_text.split('\n'):
        # A fence line toggles code-block mode and is copied through as-is.
        if line.lstrip().startswith('```'):
            in_code_fence = not in_code_fence
            updated_lines.append(line)
            continue

        match = None if in_code_fence else heading_re.match(line)
        if match:
            hashes, content = match.groups()
            if len(hashes) == 1:
                hashes = '##'
            elif len(hashes) == 2 and any(word in content.lower() for word in keywords):
                hashes = '###'
            line = f"{hashes} {content}"
        updated_lines.append(line)
    return '\n'.join(updated_lines)

In [6]:
def extract_markdown_with_hierarchy(pdf_path, md_output_path):
    """Convert a PDF to Markdown, normalize its heading levels and save it.

    Args:
        pdf_path: PDF to convert; passed to ``fitz.open``.
        md_output_path: Destination file for the Markdown (written as UTF-8).

    Returns:
        str: The normalized Markdown text that was written to disk.
    """
    # TocHeaders is imported lazily so the function still works with a
    # pymupdf4llm release that does not provide it.
    try:
        from pymupdf4llm import TocHeaders
    except ImportError:
        TocHeaders = None

    # Close the document once conversion is done instead of leaking the handle.
    with fitz.open(pdf_path) as doc:
        # `hdr_info` has to be a header-detector object (or callable). The
        # previous plain dict ({"method": ..., "toc": ...}) is not one, so the
        # table of contents was never actually used. Use the TOC-based
        # detector when the PDF has a TOC; otherwise pass None so that
        # to_markdown falls back to its default font-size-based detection.
        if TocHeaders is not None and doc.get_toc():
            hdr_info = TocHeaders(doc)
        else:
            hdr_info = None
        raw_md = to_markdown(doc, hdr_info=hdr_info)

    normalized_md = normalize_heading_hierarchy(raw_md)

    with open(md_output_path, "w", encoding="utf-8") as f:
        f.write(normalized_md)

    print(f"✅ Markdown with hierarchy saved: {md_output_path}")
    return normalized_md

In [7]:
def process_pdf_pipeline(pdf_path):
    """Turn one PDF into a Markdown file under ``output_markdown/``.

    The PDF is first classified with ``is_scanned_pdf``. A scanned PDF is
    OCRed into ``temp_ocr/<stem>_ocr.pdf`` and that copy is converted; a
    born-digital PDF is converted directly. The Markdown is written to
    ``output_markdown/<stem>.md``.

    Args:
        pdf_path: Path of the source PDF.

    Returns:
        Whatever ``extract_markdown_with_hierarchy`` returns for the PDF that
        was ultimately converted.
    """
    source = Path(pdf_path)
    stem = source.stem

    # Both working folders are created up front, regardless of the PDF type.
    markdown_dir = Path("output_markdown")
    markdown_dir.mkdir(parents=True, exist_ok=True)
    ocr_dir = Path("temp_ocr")
    ocr_dir.mkdir(exist_ok=True)

    markdown_target = markdown_dir / f"{stem}.md"

    print(f"🔍 Processing PDF: {pdf_path}")

    # Born-digital documents need no OCR pass: convert them as they are.
    if not is_scanned_pdf(pdf_path):
        print("📄 Detected born-digital PDF")
        return extract_markdown_with_hierarchy(source, markdown_target)

    print("🧾 Detected scanned PDF")
    ocr_target = ocr_dir / f"{stem}_ocr.pdf"
    run_ocr(pdf_path, ocr_target)
    return extract_markdown_with_hierarchy(ocr_target, markdown_target)

In [8]:
# Run the whole pipeline on one sample paper. `test_pdf` is reused by the DOI
# cells further down; `markdown_text` holds the pipeline's return value.
test_pdf = "pdfs/Allossogbe_et_al_2017_Mal_J.pdf"
markdown_text = process_pdf_pipeline(test_pdf)

🔍 Processing PDF: pdfs/Allossogbe_et_al_2017_Mal_J.pdf
📄 Detected born-digital PDF
✅ Markdown with hierarchy saved: output_markdown\Allossogbe_et_al_2017_Mal_J.md


In [None]:
## DOI if in metadata

# PyMuPDF's `metadata` dict only carries the standard PDF Info entries
# (title, author, subject, keywords, ...). It has no "doi" key, so
# `metadata.get("doi")` is always None. Instead, look for a DOI-shaped string
# inside the metadata values and in the XMP metadata packet.
with fitz.open(test_pdf) as doc:
    metadata_text = "\n".join(str(value) for value in (doc.metadata or {}).values() if value)
    metadata_text += "\n" + (doc.get_xml_metadata() or "")

match = re.search(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', metadata_text, re.I)
doi = match.group(0) if match else None
print("DOI:", doi)

DOI: None


In [None]:
## Added DOI if not in metadata

# Fallback: search the text of the first page for a DOI-shaped string.
with fitz.open(test_pdf) as doc:
    # An empty document has no page 0; use empty text instead of raising.
    first_page_text = doc[0].get_text() if doc.page_count else ""

match = re.search(r'(10\.\d{4,9}/[-._;()/:A-Z0-9]+)', first_page_text, re.I)
# The character class allows '.', ';' and ':', so punctuation that directly
# follows the DOI in running text is captured as well; trim it off.
doi = match.group(1).rstrip(".,;:") if match else None
print("DOI from text:", doi)


DOI from text: 10.1186/s12936-017-1727-x
