In [64]:
!pip install --upgrade pymupdf4llm
import os
import re
import shutil
from pathlib import Path
from typing import Optional

import fitz  # PyMuPDF
import ocrmypdf
import pymupdf4llm


# Create the Markdown output directory up front. parents=True creates any
# missing parent folders and exist_ok=True makes the cell safe to re-run.
Path("output_markdown").mkdir(parents=True, exist_ok=True)



In [65]:
def is_scanned_pdf(pdf_path):
    """Report whether a PDF has no extractable text on any page.

    Args:
        pdf_path: Path of the PDF to inspect (anything ``fitz.open`` accepts).

    Returns:
        ``False`` as soon as one page yields non-whitespace text, ``True``
        when no page does (including a document with no pages).
    """
    # The context manager closes the document on every exit path; the
    # original early ``return False`` left the file handle open.
    with fitz.open(pdf_path) as doc:
        # any() stops at the first page with text, like the original loop.
        return not any(page.get_text().strip() for page in doc)

In [66]:
def run_ocr(input_path, output_path):
    """Run OCRmyPDF on ``input_path`` and write the searchable PDF to ``output_path``.

    Pages are auto-rotated and deskewed, and OCR is forced on every page.

    Args:
        input_path: Source PDF.
        output_path: Destination for the OCR'd PDF.

    Raises:
        Whatever ``ocrmypdf.ocr`` raises on failure; nothing is caught here.
    """
    print("🔁 Running OCRmyPDF...")
    # force_ocr and skip_text are mutually exclusive OCR modes in OCRmyPDF;
    # passing both (as the original did) is rejected before any page is
    # processed. This function is only called for PDFs with no text layer,
    # so forcing OCR is the mode that matches the intent.
    ocrmypdf.ocr(
        input_path,
        output_path,
        rotate_pages=True,
        deskew=True,
        force_ocr=True,
    )
    print(f"✅ OCR complete: {output_path}")

In [67]:
def normalize_heading_hierarchy(md_text):
    """Re-level Markdown headings line by line.

    Rules applied to every line of the form ``#... <whitespace> text``:
      * a level-1 heading becomes level 2;
      * a level-2 heading becomes level 3 when its text contains one of the
        topic keywords (case-insensitive substring match);
      * every other heading keeps its level.
    Any matched heading is re-emitted with exactly one space after the hashes.
    Lines that are not headings are passed through untouched.

    Args:
        md_text: Markdown text, lines separated by ``\\n``.

    Returns:
        The text with adjusted headings, joined with ``\\n``.
    """
    topic_keywords = ('mortality', 'knock-down', 'resistance', 'vector', 'prevalence')

    def _relevel(raw_line):
        # Rewrite a single line if it is a heading, else return it unchanged.
        found = re.match(r'^(#+)\s+(.*)', raw_line)
        if found is None:
            return raw_line
        marker, title = found.group(1), found.group(2)
        depth = len(marker)
        if depth == 1:
            marker = '##'
        elif depth == 2 and any(word in title.lower() for word in topic_keywords):
            marker = '###'
        return f"{marker} {title}"

    return '\n'.join(_relevel(raw_line) for raw_line in md_text.split('\n'))

In [None]:
def extract_markdown_with_hierarchy(pdf_path, md_output_path, margins=(0, 50, 0, 30)):
    """Convert a PDF to Markdown with heading levels and write it to disk.

    Heading detection uses the PDF's table of contents when one exists
    (``pymupdf4llm.TocHeaders``); otherwise ``pymupdf4llm.IdentifyHeaders``
    is used with ``max_levels=4`` and ``body_limit=10``.

    Args:
        pdf_path: PDF to convert.
        md_output_path: File the Markdown is written to (UTF-8).
        margins: ``(left, top, right, bottom)`` page borders in points passed
            to ``pymupdf4llm.to_markdown``. The default skips the top 50 and
            bottom 30 points of each page to drop running headers/footers.

    Returns:
        The generated Markdown text.
    """
    # Close the document when done; the original never released it.
    with fitz.open(pdf_path) as doc:
        toc = doc.get_toc()
        if toc:
            # Use the table of contents for header detection.
            hdr_info = pymupdf4llm.TocHeaders(doc)
        else:
            # No TOC: let pymupdf4llm derive header levels from the document.
            hdr_info = pymupdf4llm.IdentifyHeaders(
                doc,
                max_levels=4,
                body_limit=10,
            )
        md_text = pymupdf4llm.to_markdown(
            doc,
            hdr_info=hdr_info,
            margins=margins,
        )

    if toc:
        print(f"📋 Used TocHeaders with {len(toc)} TOC entries and margins {margins}")
    else:
        print("🔍 Used IdentifyHeaders with custom settings and margins")

    with open(md_output_path, "w", encoding="utf-8") as f:
        f.write(md_text)

    print(f"✅ Markdown with hierarchy and margins saved: {md_output_path}")
    return md_text


In [69]:
def process_pdf_pipeline(pdf_path):
    """Turn one PDF into a Markdown file, OCR-ing it first if it has no text.

    Ensures the ``output_markdown`` and ``temp_ocr`` directories exist, asks
    ``is_scanned_pdf`` whether the input has a text layer, runs ``run_ocr``
    into ``temp_ocr/<stem>_ocr.pdf`` when it does not, and then hands the
    chosen PDF to ``extract_markdown_with_hierarchy``.

    Args:
        pdf_path: Path of the source PDF.

    Returns:
        The Markdown text returned by ``extract_markdown_with_hierarchy``;
        the same text is written to ``output_markdown/<stem>.md``.
    """
    source_pdf = Path(pdf_path)
    stem = source_pdf.stem

    markdown_dir = Path("output_markdown")
    markdown_dir.mkdir(parents=True, exist_ok=True)

    ocr_dir = Path("temp_ocr")
    ocr_dir.mkdir(exist_ok=True)

    markdown_target = markdown_dir / f"{stem}.md"

    print(f"🔍 Processing PDF: {pdf_path}")

    # Born-digital input goes straight to extraction.
    if not is_scanned_pdf(pdf_path):
        print("📄 Detected born-digital PDF")
        return extract_markdown_with_hierarchy(source_pdf, markdown_target)

    # Scanned input: add a text layer first, then extract from the OCR copy.
    print("🧾 Detected scanned PDF")
    ocr_target = ocr_dir / f"{stem}_ocr.pdf"
    run_ocr(pdf_path, ocr_target)
    return extract_markdown_with_hierarchy(ocr_target, markdown_target)

In [78]:
# Run the full pipeline on one sample paper. `test_pdf` is reused by the DOI
# cells below, and `markdown_text` is the Markdown later passed to
# analyze_hierarchy.
test_pdf = "pdfs/Muller_et_al_2006_Bull_World_Health_Org.pdf"
markdown_text = process_pdf_pipeline(test_pdf)

🔍 Processing PDF: pdfs/Muller_et_al_2006_Bull_World_Health_Org.pdf
📄 Detected born-digital PDF
🔍 Used IdentifyHeaders with custom settings and margins
✅ Markdown with hierarchy and margins saved: output_markdown\Muller_et_al_2006_Bull_World_Health_Org.md


In [79]:
## DOI if in metadata

# PyMuPDF's `Document.metadata` only exposes the standard document-info keys
# (title, author, subject, keywords, ...). It has no "doi" key, so the original
# `metadata.get("doi")` could only ever return None. When a DOI is present in
# metadata it sits inside one of those values or in the XMP packet, so search
# both with the same DOI pattern used for page text.
with fitz.open(test_pdf) as doc:
    metadata_blob = "\n".join(str(value) for value in (doc.metadata or {}).values() if value)
    metadata_blob += "\n" + (doc.get_xml_metadata() or "")

match = re.search(r'(10\.\d{4,9}/[-._;()/:A-Z0-9]+)', metadata_blob, re.I)
doi = match.group(1) if match else None
print("DOI:", doi)

DOI: None


In [80]:
## Fallback: look for a DOI in the first page's text when metadata has none

# `doc` is not closed in this cell; the IdentifyHeaders parameter-sweep cell
# further down reuses this same open document.
doc = fitz.open(test_pdf)
first_page_text = doc[0].get_text()
# Pattern: "10." + 4-9 digits + "/" + a run of suffix characters, matched
# case-insensitively; only the first hit on the page is kept.
match = re.search(r'(10\.\d{4,9}/[-._;()/:A-Z0-9]+)', first_page_text, re.I)
doi = match.group(1) if match else None
print("DOI from text:", doi)


DOI from text: None


In [81]:
def extract_doi(pdf_path: str, bib_entry: Optional[dict] = None) -> Optional[str]:
    """Find a DOI for a paper, preferring BibTeX data over the PDF text.

    Args:
        pdf_path: Path of the PDF; only opened when ``bib_entry`` supplies no DOI.
        bib_entry: Optional BibTeX-style mapping. A non-empty ``'doi'`` value
            is returned as-is without touching the PDF.

    Returns:
        The DOI string, or ``None`` when neither source yields one.
    """
    # 1. Use the BibTeX DOI when it is actually filled in. An empty value
    #    falls through to the PDF instead of being returned as the answer.
    if bib_entry and bib_entry.get('doi'):
        return bib_entry['doi']

    # 2. Otherwise scan the first and last page of the PDF.
    doi_pattern = re.compile(r'(10\.\d{4,9}/[-._;()/:A-Z0-9]+)', re.I)
    # Context manager closes the document on every return path.
    with fitz.open(pdf_path) as doc:
        page_count = len(doc)
        if page_count == 0:
            return None  # nothing to index; doc[0] would raise
        pages_to_check = [0, page_count - 1] if page_count > 1 else [0]

        for page_num in pages_to_check:
            match = doi_pattern.search(doc[page_num].get_text())
            if match:
                # The suffix character class also accepts '.', ',' is excluded
                # but ';' and ':' are not, so sentence punctuation right after
                # a DOI gets captured; trim it off the end.
                return match.group(1).rstrip('.,;:')

    return None  # No DOI found


In [82]:
def analyze_hierarchy(md_text):
    """Print a banner followed by every line of ``md_text`` that starts with '#'.

    Args:
        md_text: Markdown text to scan.

    Returns:
        None; the heading lines are written to stdout in document order.
    """
    print("📐 Extracted Header Structure:")
    heading_rows = [row for row in md_text.splitlines() if row[:1] == "#"]
    for row in heading_rows:
        print(row)

# Show the heading outline of the Markdown produced for `test_pdf` above.
analyze_hierarchy(markdown_text)


📐 Extracted Header Structure:
# **Effects of insecticide-treated bednets during early infancy in** **an African area of intense malaria transmission: a randomized** **controlled trial**
### Olaf Müller, [a] Corneille Traoré, [b] Bocar Kouyaté, [b] Yazoumé Yé, [b] Claudia Frey, [a] Boubacar Coulibaly, [b] & Heiko Becher [a]
## **Introduction**
### 120 Bulletin of the World Health Organization | February 2006, 84 (2)
## **Methods**
#### **Study area**
#### **Participants**
#### **Study design**
#### **Outcomes**
#### **Sample size**
#### **Randomization**
#### **Procedures**
### Bulletin of the World Health Organization | February 2006, 84 (2) 121
#### **Statistical analysis**
#### **Analysis of longitudinal data**
#### **Analysis of data from cross-** **sectional surveys**
#### **Ethics approval**
### 122 Bulletin of the World Health Organization | February 2006, 84 (2)
## **Results**
#### **Mortality**
#### **Morbidity** **Longitudinal data**
#### **Cross-sectional survey data**
### Bu

In [83]:
# Sweep IdentifyHeaders settings (3 body_limit values x 3 max_levels values)
# and print the heading outline each combination produces, separated by a
# rule. Reuses the `doc` opened in the DOI-from-text cell and the same
# (0, 50, 0, 30) margins that extract_markdown_with_hierarchy uses.
for body_limit in [9, 10, 11]:
    for max_levels in [3, 4, 5]:
        print(f"🔧 Trying body_limit={body_limit}, max_levels={max_levels}")
        headers = pymupdf4llm.IdentifyHeaders(doc, body_limit=body_limit, max_levels=max_levels)
        md = pymupdf4llm.to_markdown(doc, hdr_info=headers, margins=(0,50,0,30))
        analyze_hierarchy(md)
        print("-" * 80)


🔧 Trying body_limit=9, max_levels=3
📐 Extracted Header Structure:
# **Effects of insecticide-treated bednets during early infancy in** **an African area of intense malaria transmission: a randomized** **controlled trial**
### Olaf Müller, [a] Corneille Traoré, [b] Bocar Kouyaté, [b] Yazoumé Yé, [b] Claudia Frey, [a] Boubacar Coulibaly, [b] & Heiko Becher [a]
## **Introduction**
### 120 Bulletin of the World Health Organization | February 2006, 84 (2)
## **Methods**
### Bulletin of the World Health Organization | February 2006, 84 (2) 121
### 122 Bulletin of the World Health Organization | February 2006, 84 (2)
## **Results**
### Bulletin of the World Health Organization | February 2006, 84 (2) 123
## **Discussion**
### 124 Bulletin of the World Health Organization | February 2006, 84 (2)
### **Résumé**
### **Resumen**
### Bulletin of the World Health Organization | February 2006, 84 (2) 125
### **References**
### 126 Bulletin of the World Health Organization | February 2006, 84 (2)
---