In [None]:
# --- Extract Paper (Docling) with Figures, Tables, Formulas, and Markdown Export ---
import logging
import time
from pathlib import Path
import pandas as pd
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, FormulaItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Configure root logging at INFO level and create a logger named after this
# module.
logging.basicConfig(level=logging.INFO)
_log = logging.getLogger(__name__)

# Source PDF to convert, and the directory that every export step below writes
# into. The directory (and any missing parents) is created up front; an
# already-existing directory is not an error.
INPUT_PDF_PATH = Path("D:/WorkSpace/PDF_Parser/1706.03762v7.pdf")
OUTPUT_DIR = Path("output_parser")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Configure Docling Pipeline ---
# All options are supplied at construction time: an image scale factor of 2.0,
# generation of page images and picture images, and formula enrichment.
pipeline_options = PdfPipelineOptions(
    images_scale=2.0,
    generate_page_images=True,
    generate_picture_images=True,
    do_formula_enrichment=True,  # Enable formula detection
)

# PDF inputs are the only format given custom options.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

# The timer starts right before conversion; the elapsed time reported at the
# end of the script therefore covers conversion plus all exports.
start_time = time.time()
result = converter.convert(INPUT_PDF_PATH)
doc = result.document
# Stem of the input file name, used as the prefix of every output file.
pdf_stem = result.input.file.stem



# --- Export Page Images ---
# Save the rendered image of every page as "<pdf_stem>-page-<n>.png" inside
# OUTPUT_DIR, where <n> is the key of the page in doc.pages.
for number, rendered_page in doc.pages.items():
    target = OUTPUT_DIR / f"{pdf_stem}-page-{number}.png"
    pil_page = rendered_page.image.pil_image
    pil_page.save(target, format="PNG")
    print(f"[✓] Saved page image: {target}")



# --- Export Figures (Pictures) ---
# Figures get their own 1-based counter so the files are named figure-1,
# figure-2, ... in iteration order. Numbering them with the enumerate() index
# of iterate_items() counts every document item (text, tables, ...) and yields
# sparse, misleading names such as figure-29 or figure-170.
figure_no = 0
for element, _level in doc.iterate_items():
    if not isinstance(element, PictureItem):
        continue

    figure_image = element.get_image(doc)
    if figure_image is None:
        # NOTE(review): get_image() is assumed to return None when no image
        # data is available for the picture; skip instead of crashing on
        # `.save` -- confirm against the installed docling-core version.
        _log.warning("Skipping a picture item with no image data.")
        continue

    figure_no += 1
    img_path = OUTPUT_DIR / f"{pdf_stem}-figure-{figure_no}.png"
    figure_image.save(img_path, "PNG")
    print(f"[✓] Saved figure: {img_path}")



# --- Export Tables (CSV, HTML) ---
# Each table is written twice, as "<pdf_stem>-table-<n>.csv" and
# "<pdf_stem>-table-<n>.html", with <n> counting from 1.
for table_no, table in enumerate(doc.tables, start=1):
    # Pass the document explicitly, matching the export_to_html() call below;
    # recent docling-core releases deprecate calling export_to_dataframe()
    # without the `doc` argument.
    table_df: pd.DataFrame = table.export_to_dataframe(doc=doc)

    csv_path = OUTPUT_DIR / f"{pdf_stem}-table-{table_no}.csv"
    table_df.to_csv(csv_path, index=False)
    print(f"[✓] Saved table CSV: {csv_path}")

    html_path = OUTPUT_DIR / f"{pdf_stem}-table-{table_no}.html"
    html_path.write_text(table.export_to_html(doc=doc), encoding="utf-8")
    print(f"[✓] Saved table HTML: {html_path}")



# --- Export Full Markdown (Embedded and Referenced Images) ---
# Two Markdown renditions of the whole document are written: one saved with
# ImageRefMode.EMBEDDED and one with ImageRefMode.REFERENCED.
md_embed = OUTPUT_DIR / f"{pdf_stem}-embedded.md"
md_ref = OUTPUT_DIR / f"{pdf_stem}-referenced.md"

doc.save_as_markdown(md_embed, image_mode=ImageRefMode.EMBEDDED)
doc.save_as_markdown(md_ref, image_mode=ImageRefMode.REFERENCED)



# --- Export Full HTML ---
# Write the whole document as HTML using ImageRefMode.REFERENCED.
html_file = OUTPUT_DIR / f"{pdf_stem}-referenced.html"
doc.save_as_html(html_file, image_mode=ImageRefMode.REFERENCED)

# Report wall-clock seconds since start_time was taken.
elapsed = time.time() - start_time
print(f"[✓] Done. Elapsed time: {elapsed:.2f}s")

2025-10-14 10:23:27,657 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-14 10:23:27,668 - INFO - Going to convert document batch...
2025-10-14 10:23:27,669 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 7b0c7615fd324ea001c6e9cb34b0bb94
2025-10-14 10:23:27,671 - INFO - rapidocr cannot be used because onnxruntime is not installed.
2025-10-14 10:23:27,672 - INFO - easyocr cannot be used because it is not installed.
2025-10-14 10:23:27,673 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-10-14 10:23:27,701 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-10-14 10:23:27,722 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\vanch\anaconda3\envs\paperreader\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-10-14 10:23:27,724 [RapidOCR] torch.py:54: Using C:\Users\vanch\anaconda3\envs\paperreader\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-10-14

[✓] Saved page image: output_parser\1706.03762v7-page-1.png
[✓] Saved page image: output_parser\1706.03762v7-page-2.png
[✓] Saved page image: output_parser\1706.03762v7-page-3.png
[✓] Saved page image: output_parser\1706.03762v7-page-4.png
[✓] Saved page image: output_parser\1706.03762v7-page-5.png
[✓] Saved page image: output_parser\1706.03762v7-page-6.png
[✓] Saved page image: output_parser\1706.03762v7-page-7.png
[✓] Saved page image: output_parser\1706.03762v7-page-8.png
[✓] Saved page image: output_parser\1706.03762v7-page-9.png
[✓] Saved page image: output_parser\1706.03762v7-page-10.png
[✓] Saved page image: output_parser\1706.03762v7-page-11.png
[✓] Saved page image: output_parser\1706.03762v7-page-12.png




[✓] Saved page image: output_parser\1706.03762v7-page-13.png
[✓] Saved page image: output_parser\1706.03762v7-page-14.png
[✓] Saved page image: output_parser\1706.03762v7-page-15.png
[✓] Saved figure: output_parser\1706.03762v7-figure-29.png
[✓] Saved figure: output_parser\1706.03762v7-figure-38.png
[✓] Saved figure: output_parser\1706.03762v7-figure-39.png
[✓] Saved figure: output_parser\1706.03762v7-figure-170.png
[✓] Saved figure: output_parser\1706.03762v7-figure-173.png
[✓] Saved figure: output_parser\1706.03762v7-figure-176.png




[✓] Saved table CSV: output_parser\1706.03762v7-table-1.csv
[✓] Saved table HTML: output_parser\1706.03762v7-table-1.html
[✓] Saved table CSV: output_parser\1706.03762v7-table-2.csv
[✓] Saved table HTML: output_parser\1706.03762v7-table-2.html
[✓] Saved table CSV: output_parser\1706.03762v7-table-3.csv
[✓] Saved table HTML: output_parser\1706.03762v7-table-3.html
[✓] Saved table CSV: output_parser\1706.03762v7-table-4.csv
[✓] Saved table HTML: output_parser\1706.03762v7-table-4.html
[✓] Done. Elapsed time: 45.44s
