In [28]:
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from pathlib import Path
import time
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling_core.types.doc import ImageRefMode, PictureItem, TextItem

In [29]:
# Scale factor that analyze_doc assigns to PdfPipelineOptions.images_scale.
# NOTE(review): Docling's docs describe 1.0 as roughly 72 DPI, with larger
# values giving higher-resolution images — confirm before tuning.
IMAGE_RESOLUTION_SCALE = 1.0

In [34]:
def analyze_doc(input_doc_path: Path, image_dir: Path, output_doc_dir: Path) -> None:
    """Convert one PDF with Docling and export its pictures and Markdown.

    Every ``PictureItem`` found in the converted document is written to
    ``image_dir`` as ``<page_no>.<index>.png``, where ``index`` starts at 0
    and counts the pictures saved so far for that page. The document itself
    is saved to ``output_doc_dir`` as ``<input file stem>.md`` with
    ``ImageRefMode.PLACEHOLDER``. The time spent in ``converter.convert`` is
    printed at the end.

    Args:
        input_doc_path: Path of the PDF to convert.
        image_dir: Directory that receives the picture PNGs; created
            (including parents) if missing.
        output_doc_dir: Directory that receives the Markdown file; created
            (including parents) if missing.

    Returns:
        None. Results are written to disk and the timing is printed.
    """
    image_dir.mkdir(parents=True, exist_ok=True)
    output_doc_dir.mkdir(parents=True, exist_ok=True)

    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    # Page images are generated as well, although only picture images are
    # written out below.
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    # perf_counter is the clock meant for measuring durations; unlike
    # time.time() it is not affected by system clock adjustments.
    start_time = time.perf_counter()
    result = converter.convert(input_doc_path)
    elapsed = time.perf_counter() - start_time
    doc_filename = result.input.file.stem

    # Save an image for every picture item. The index is tracked per page so
    # that filenames stay unique even if pictures from the same page are not
    # yielded consecutively (a single shared counter could repeat a name and
    # overwrite an earlier file).
    pictures_saved_per_page = {}
    for element, _level in result.document.iterate_items():
        if not isinstance(element, PictureItem):
            continue
        if not element.prov:
            # No provenance means no page number to build the filename from.
            continue
        # Fetch the image before opening the file so that a missing image
        # does not leave an empty PNG behind.
        # NOTE(review): get_image is assumed to be able to return None when
        # no image data is available — confirm against docling-core.
        picture = element.get_image(result.document)
        if picture is None:
            continue

        page_number = element.prov[0].page_no
        pic_index = pictures_saved_per_page.get(page_number, 0)
        pictures_saved_per_page[page_number] = pic_index + 1

        element_image_filename = image_dir / f"{page_number}.{pic_index}.png"
        with element_image_filename.open("wb") as fp:
            picture.save(fp, "PNG")

    # Save Markdown; pictures are emitted in PLACEHOLDER mode, not embedded.
    md_filename = output_doc_dir / f"{doc_filename}.md"
    result.document.save_as_markdown(md_filename, image_mode=ImageRefMode.PLACEHOLDER)

    print(f"Document converted in {elapsed:.2f} seconds.")

In [36]:
# Batch-convert the sample PDFs; paths are relative to the working directory.
pdf_names = ["wildfire", "spanned-table", "attention", "cj"]
md_path = "./markdown/docling/"  # same output directory for every document, so set once
for pdf in pdf_names:
    # analyze_doc returns nothing (it writes files and prints the timing),
    # so its return value is not bound to a name.
    analyze_doc(Path(".") / f"{pdf}.pdf", Path("./images/docling") / pdf, Path(md_path))

Document converted in 9.98 seconds.
Document converted in 15.02 seconds.
Document converted in 12.25 seconds.
Document converted in 77.11 seconds.


In [33]:
# Convert the "complex" sample PDF; paths are relative to the working directory.
pdf_names = ["complex"]
md_path = "./markdown/docling/"  # same output directory for every document, so set once
for pdf in pdf_names:
    # analyze_doc returns nothing (it writes files and prints the timing),
    # so its return value is not bound to a name.
    analyze_doc(Path(".") / f"{pdf}.pdf", Path("./images/docling") / pdf, Path(md_path))

Annual Report 2024
Annual Report 2024
As the world changes , we make it easier to be tryg
As the world changes , we make it easier to be tryg
-
-
Contents
Contents
Management's review
Management's review
Introduction
Introduction
3
3
A message from the Chair and Group CEO
A message from the Chair and Group CEO
4
4
Tryg at a glance
Tryg at a glance
6
6
Events in 2024
Events in 2024
7
7
Strategy
Strategy
9
9
2024 financial targets fully delivered
2024 financial targets fully delivered
10
10
United Towards '27
United Towards '27
12
12
Financial results
Financial results
15
15
Financial highlights 2024
Financial highlights 2024
16
16
Financial highlights Q4 2024
Financial highlights Q4 2024
17
17
Income overview
Income overview
18
18
Financial outlook
Financial outlook
19
19
Tryg's results
Tryg's results
21
21
Business areas
Business areas
24
24
Private
Private
25
25
Commercial
Commercial
27
27
Corporate
Corporate
29
29
Investment activities
Investment activities
31
31
Tax governance
Tax g

In [None]:
# Single-document run on the wildfire sample.
input_doc_path = Path("./wildfire.pdf")
image_dir = Path("./images/docling") / "wildfire"
output_doc_dir = Path("./markdown/docling")
# No mkdir calls here: analyze_doc creates both directories (with parents)
# before it does anything else.
analyze_doc(input_doc_path, image_dir, output_doc_dir)




Figure 1. Annual Wildfires and Acres Burned, 1993-2022
Figure 2. Top Five Years with Largest Wildfire Acreage Burned Since 1960
Figure 4. Acreage Burned by Region and Ownership

Document converted in 9.91 seconds.
