In [7]:
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered, save_output
from marker.config.parser import ConfigParser
import torch
import time
from pathlib import Path

In [2]:
# Marker conversion settings, handed to ConfigParser as a single options dict.
config = ConfigParser(
    dict(
        output_format="markdown",
        # Document language setting; change to suit your input
        # (e.g. "dan" for Danish).
        languages="eng",
        # LLM features are switched off for this run.
        use_llm=False,
        # False leaves image extraction enabled.
        disable_image_extraction=False,
    )
)

In [3]:
# Prepare the converter inputs first: models, then settings, then renderer.
model_artifacts = create_model_dict()  # loads the layout/OCR/table models
converter_settings = config.generate_config_dict()
output_renderer = config.get_renderer()

# A single converter instance that is called once per PDF further down.
converter = PdfConverter(
    artifact_dict=model_artifacts,
    config=converter_settings,
    renderer=output_renderer,
)

Downloading layout model...: 100%|██████████| 5/5 [00:05<00:00,  1.04s/it]


Loaded layout model s3://layout/2025_02_18 on device mps with dtype torch.float16


Downloading texify model...: 100%|██████████| 9/9 [00:06<00:00,  1.29it/s]


Loaded texify model s3://texify/2025_02_18 on device mps with dtype torch.float16


Downloading text_recognition model...: 100%|██████████| 9/9 [00:18<00:00,  2.03s/it]


Loaded recognition model s3://text_recognition/2025_02_18 on device mps with dtype torch.float16


Downloading table_recognition model...: 100%|██████████| 5/5 [00:03<00:00,  1.25it/s]


Loaded table recognition model s3://table_recognition/2025_02_18 on device mps with dtype torch.float16


Downloading text_detection model...: 100%|██████████| 6/6 [00:04<00:00,  1.39it/s]


Loaded detection model s3://text_detection/2025_02_18 on device mps with dtype torch.float16


Downloading inline_math_detection model...: 100%|██████████| 5/5 [00:02<00:00,  2.50it/s]


Loaded detection model s3://inline_math_detection/2025_02_18 on device mps with dtype torch.float16


Downloading ocr_error_detection model...: 100%|██████████| 8/8 [00:04<00:00,  1.61it/s]


In [8]:
# Convert each sample PDF (expected at ./<name>.pdf) and report how long the
# conversion itself took; writing the output is not included in the timing.
pdf_names = ["wildfire", "spanned-table", "attention", "complex", "cj"]
for pdf in pdf_names:
    # One output folder per document: ./marker/<name>/
    out_dir = Path("./marker") / pdf
    out_dir.mkdir(parents=True, exist_ok=True)

    # perf_counter() is a monotonic clock meant for measuring intervals, so
    # the reported duration cannot be skewed by system clock adjustments.
    start_time = time.perf_counter()
    rendered = converter(f"./{pdf}.pdf")
    elapsed = time.perf_counter() - start_time
    print(f"Document {pdf} converted in {elapsed:.2f} seconds.")

    # Reuse out_dir so the directory created above and the directory written
    # to here can never drift apart.
    save_output(rendered, str(out_dir), pdf)

Recognizing layout: 100%|██████████| 1/1 [00:02<00:00,  2.45s/it]
Running OCR Error Detection: 100%|██████████| 1/1 [00:00<00:00,  6.31it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 1/1 [00:01<00:00,  1.69s/it]


Document wildfire converted in 5.21 seconds.


Recognizing layout: 100%|██████████| 2/2 [00:03<00:00,  1.79s/it]
Running OCR Error Detection: 100%|██████████| 3/3 [00:00<00:00, 12.32it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 1/1 [00:07<00:00,  7.13s/it]


Document spanned-table converted in 12.34 seconds.


Recognizing layout: 100%|██████████| 2/2 [00:04<00:00,  2.37s/it]
Running OCR Error Detection: 100%|██████████| 3/3 [00:00<00:00, 11.31it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Texify inference: 100%|██████████| 1/1 [00:04<00:00,  4.62s/it]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 1/1 [00:03<00:00,  3.54s/it]

Document attention converted in 14.75 seconds.



Recognizing layout: 100%|██████████| 5/5 [00:13<00:00,  2.79s/it]
Running OCR Error Detection: 100%|██████████| 8/8 [00:00<00:00, 10.85it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.47it/s]
Recognizing Text: 100%|██████████| 1/1 [00:02<00:00,  2.92s/it]
Recognizing tables: 100%|██████████| 2/2 [00:07<00:00,  3.98s/it]


Document complex converted in 32.08 seconds.


Recognizing layout: 100%|██████████| 4/4 [00:07<00:00,  1.76s/it]
Running OCR Error Detection: 100%|██████████| 6/6 [00:00<00:00, 11.89it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Detecting bboxes: 100%|██████████| 2/2 [00:01<00:00,  1.11it/s]
Recognizing Text: 100%|██████████| 86/86 [09:00<00:00,  6.29s/it]
Recognizing tables: 100%|██████████| 2/2 [00:11<00:00,  5.87s/it]


Document cj converted in 564.90 seconds.
