In [0]:
%pip install pandas docling
%restart_python
# The two magics above install pandas and docling into the notebook's Python
# environment (no versions are pinned, so whatever pip resolves at install time
# is used). `%restart_python` is presumably the Databricks magic that restarts
# the Python process so the freshly installed packages can be imported by the
# cells below -- TODO confirm for the runtime in use.

In [0]:
from pathlib import Path
import pandas as pd
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption

In [0]:
def main(input_doc_path: "str | Path") -> dict:
    """Convert a PDF with docling and summarise the result as a plain dict.

    Args:
        input_doc_path: Location of the PDF to convert. It is handed unchanged
            to ``DocumentConverter.convert``; the notebook calls this function
            with a ``pathlib.Path``.

    Returns:
        A dict with the keys:
            ``uuid``: ``doc.origin.binary_hash`` of the converted document.
            ``document``: ``doc.name``.
            ``num_pages``: the value of ``doc.num_pages()``.
            ``full_markdown``: the whole document exported to markdown.
            ``tables_markdown``: one string per entry in ``doc.tables`` -- a
                ``Table <index>:`` heading line (zero-based index) followed by
                the table rendered as markdown.
    """
    # Define processing pipeline: OCR plus table-structure recognition with
    # cell matching enabled.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    # Create the converter; the options above apply to PDF inputs only.
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
            )
        }
    )

    # Convert the document
    conversion_result = converter.convert(input_doc_path)
    doc = conversion_result.document
    raw_document_markdown = doc.export_to_markdown()

    # "\n" (a real newline) separates the heading from the table body; the
    # previous "/n" was emitted literally and glued both onto one line.
    tables_markdown = [
        f"Table {table_ix}:\n{table.export_to_dataframe().to_markdown()}"
        for table_ix, table in enumerate(doc.tables)
    ]

    return {
        "uuid": doc.origin.binary_hash,
        "document": doc.name,
        "num_pages": doc.num_pages(),
        "full_markdown": raw_document_markdown,
        "tables_markdown": tables_markdown,
    }

In [0]:
# Uses a local file from the Workspace directory; Volumes are the recommended
# place for input data instead.
data_folder = Path("./assets").resolve()
input_doc_path = data_folder / "FY24_Q4_Consolidated_Financial_Statements.pdf"
# Keep the result in a variable so later cells can reuse it without running
# the conversion again; the bare name on the last line still shows it as the
# cell output.
conversion_summary = main(input_doc_path)
conversion_summary

In [0]:
# Extracting tables
# `doc` is a local variable inside main() and never reaches notebook scope, so
# this cell converts the document itself (same pipeline settings as main())
# instead of relying on a variable left over from an earlier kernel session.
table_pipeline_options = PdfPipelineOptions()
table_pipeline_options.do_ocr = True
table_pipeline_options.do_table_structure = True
table_pipeline_options.table_structure_options.do_cell_matching = True
table_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=table_pipeline_options)
    }
)
doc = table_converter.convert(input_doc_path).document

tables = {}  # table index (zero-based) -> DataFrame
for table_ix, table in enumerate(doc.tables):
    table_df: pd.DataFrame = table.export_to_dataframe()
    print(f"## Table {table_ix}")
    print(table_df)
    tables[table_ix] = table_df
    # Optional: save each table as CSV. `output_dir` and `doc_filename` are not
    # defined in the cells above; define them before enabling the line below.
    # table_df.to_csv(output_dir / f"{doc_filename}-table-{table_ix + 1}.csv")