In [17]:
import time
from pathlib import Path

import fitz
from tqdm import tqdm

Convert PDFs to PNG page images for the RTK pipeline (YALTAi/Kraken OCR). Each PDF is rendered page by page into its own output folder.

In [18]:
def extract_pdfs(input_path, output_dir="Editions", dpi=300):
    """Render every page of one or more PDFs to PNG images.

    Each PDF gets its own sub-folder inside ``output_dir``, named after the
    PDF's stem with parentheses and apostrophes removed and spaces replaced
    by underscores. Pages are written as ``page_0001.png``,
    ``page_0002.png``, and so on.

    Args:
        input_path: Path to a single PDF file, or to a directory whose
            top-level ``.pdf`` files (any letter case) are all processed.
        output_dir: Directory that receives the per-PDF folders. It is
            created, including missing parents, if it does not exist.
        dpi: Resolution passed to ``page.get_pixmap`` when rendering.

    Returns:
        None. Per-PDF progress and a final summary are printed.
    """
    input_path = Path(input_path)
    output_dir = Path(output_dir)
    # parents=True so a nested output location works on first use.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Collect candidates, then filter on a case-insensitive suffix. This
    # catches mixed-case suffixes (".Pdf"), never lists a file twice, and
    # gives a stable processing order.
    if input_path.is_file():
        candidates = [input_path]
    elif input_path.is_dir():
        candidates = sorted(p for p in input_path.iterdir() if p.is_file())
    else:
        candidates = []
    pdfs = [p for p in candidates if p.suffix.lower() == ".pdf"]

    if not pdfs:
        print(f"No PDFs found in {input_path}")
        return

    # Folder names: drop parentheses/apostrophes, turn spaces into underscores.
    folder_chars = str.maketrans({"(": None, ")": None, "'": None, " ": "_"})

    total_pages = 0
    start_time = time.perf_counter()

    for pdf_path in tqdm(pdfs, desc="Processing PDFs"):
        folder_name = pdf_path.stem.translate(folder_chars)
        pdf_output = output_dir / folder_name
        pdf_output.mkdir(exist_ok=True)

        doc = fitz.open(pdf_path)
        try:
            pages = len(doc)
            for page_num in tqdm(range(pages), desc=f"Pages from {pdf_path.name}", leave=False):
                page = doc.load_page(page_num)
                pix = page.get_pixmap(dpi=dpi)
                # File names are 1-based and zero-padded so they sort in page order.
                pix.save(pdf_output / f"page_{page_num+1:04d}.png")
        finally:
            # Release the document even if rendering a page fails.
            doc.close()

        total_pages += pages
        print(f"{pdf_path.name}: {pages} pages -> {folder_name}/")

    elapsed = time.perf_counter() - start_time
    # Guard against a zero-length interval on very fast runs.
    rate = total_pages / elapsed if elapsed > 0 else 0.0
    print(f"Total: {total_pages} pages extracted in {elapsed:.1f}s ({rate:.1f} pages/s)")

Usage examples:

**Single PDF**

extract_pdfs("/Users/carboni/Library/CloudStorage/Box-Box/Projects/Iconology/Ripa's edition/Padova 1618 (Cesare Ripa).pdf")

**Entire folder**

extract_pdfs("/Users/carboni/Library/CloudStorage/Box-Box/Projects/Iconology/Ripa's edition/")

**Custom settings**

extract_pdfs("/path/to/pdfs", output_dir="MyOutput", dpi=400)

In [19]:
# Extract a single edition using the defaults (output_dir="Editions", dpi=300).
extract_pdfs("/Users/carboni/Library/CloudStorage/Box-Box/Projects/Iconology/Ripa's edition/Padova 1618 (Cesare Ripa).pdf")

Processing PDFs:   0%|          | 0/1 [00:00<?, ?it/s]
Pages from Padova 1618 (Cesare Ripa).pdf:   0%|          | 0/704 [00:00<?, ?it/s][A
Pages from Padova 1618 (Cesare Ripa).pdf:   0%|          | 1/704 [00:00<05:00,  2.34it/s][A
Pages from Padova 1618 (Cesare Ripa).pdf:   0%|          | 2/704 [00:00<03:55,  2.98it/s][A
Pages from Padova 1618 (Cesare Ripa).pdf:   0%|          | 3/704 [00:00<03:36,  3.24it/s][A
Pages from Padova 1618 (Cesare Ripa).pdf:   1%|          | 4/704 [00:01<03:27,  3.38it/s][A
Pages from Padova 1618 (Cesare Ripa).pdf:   1%|          | 5/704 [00:01<03:39,  3.18it/s][A
Pages from Padova 1618 (Cesare Ripa).pdf:   1%|          | 6/704 [00:01<03:34,  3.26it/s][A
Pages from Padova 1618 (Cesare Ripa).pdf:   1%|          | 7/704 [00:02<03:43,  3.12it/s][A
Pages from Padova 1618 (Cesare Ripa).pdf:   1%|          | 8/704 [00:02<03:50,  3.02it/s][A
Pages from Padova 1618 (Cesare Ripa).pdf:   1%|▏         | 9/704 [00:02<03:58,  2.92it/s][A
Pages from Padova 1618 

Padova 1618 (Cesare Ripa).pdf: 704 pages -> Padova_1618_Cesare_Ripa/
Total: 704 pages extracted in 271.5s (2.6 pages/s)



