# Convert and extract tables


In this example we will use the output of the converted document and extract the tables detected on each page.




### Set notebook parameters

In [None]:
from pathlib import Path
from dsnotebooks.settings import ProjectNotebookSettings

# Notebook settings auto-loaded from .env / env vars
notebook_settings = ProjectNotebookSettings()

# PROFILE_NAME is passed to `ds.CpsApi.from_env` and PROJ_KEY to
# `ds.convert_documents` further down in this notebook.
PROFILE_NAME = notebook_settings.profile  # the profile to use
PROJ_KEY = notebook_settings.proj_key  # the project to use
# PDF submitted for conversion (used as `source_path` of `ds.convert_documents`).
# NOTE(review): the path is relative — presumably to the notebook's working directory; confirm.
INPUT_FILE = Path("../../data/samples/2206.00785.pdf")

## Helper functions

The following blocks define the helper functions used for the visualization of the output.

In [None]:
# Import standard dependencies
import json
import tempfile
from zipfile import ZipFile

# IPython utilities
from IPython.display import display, Markdown, HTML

# Import the deepsearch-toolkit
import deepsearch as ds

In [None]:
def get_tablecell_span(cell, ix):
    """
    Summarize the extent of a table cell along one axis.

    Parameters
    ----------
    cell :
        JSON table cell; its "spans" entry is iterated and each element is
        indexed with `ix`.
    ix : int
        Position to read inside every span entry (`write_table` passes 0 for
        the row axis and 1 for the column axis).

    Returns
    -------
    tuple
        ``(count, first, last)``: the number of distinct indices, the smallest
        index and the largest index. A cell without spans yields
        ``(1, None, None)``.
    """
    # Duplicates collapse in the set, so the count is the number of distinct
    # rows/columns touched by the cell.
    indices = {entry[ix] for entry in cell["spans"]}
    if not indices:
        return 1, None, None
    return len(indices), min(indices), max(indices)


def write_table(item):
    """
    Convert the JSON table representation to HTML, including column and row spans.

    Parameters
    ----------
    item :
        JSON table. The keys "#-rows", "#-cols" and "data" are read; "data" is
        indexed as ``data[row][col]`` and every cell must provide "spans",
        "text" and "type".

    Returns
    -------
    str
        The ``<table>`` markup. Cells typed as row/column headers become
        centered ``<th>`` elements, all others ``<td>``. A spanning cell is
        emitted only once, at the first row/column of its span.
    """
    # Function-scope import keeps this helper self-contained.
    import html

    header_labels = (
        "row_header",
        "row_multi_header",
        "row_title",
        "col_header",
        "col_multi_header",
    )

    table = item
    nrows = table["#-rows"]
    ncols = table["#-cols"]

    # Collect fragments and join once instead of growing a string with `+=`.
    parts = ["<table>\n"]
    for i in range(nrows):
        parts.append("  <tr>\n")
        for j in range(ncols):
            cell = table["data"][i][j]

            rowspan, rowstart, rowend = get_tablecell_span(cell, 0)
            colspan, colstart, colend = get_tablecell_span(cell, 1)

            # A spanning cell is visited at every grid position it covers;
            # render it only at the first row and column of its span.
            if rowstart is not None and rowstart != i:
                continue
            if colstart is not None and colstart != j:
                continue

            # Cells without span information occupy exactly their grid position.
            if rowstart is None:
                rowstart = i
            if colstart is None:
                colstart = j

            # Escape the extracted text so that characters such as "<" or "&"
            # are displayed literally instead of being parsed as markup.
            text = cell["text"]
            if text == "":
                content = "&nbsp;"
            else:
                content = html.escape(str(text), quote=False)

            is_header = cell["type"] in header_labels
            celltag = "th" if is_header else "td"
            style = 'style="text-align: center;"' if is_header else ""

            parts.append(
                f'    <{celltag} rowstart="{rowstart}" colstart="{colstart}" rowspan="{rowspan}" colspan="{colspan}" {style}>{content}</{celltag}>\n'
            )

        parts.append("  </tr>\n")

    parts.append("</table>")

    return "".join(parts)

In [None]:
def visualize_document_tables(doc_jsondata):
    """
    Display every table identified in the converted document.

    Each table is rendered with `write_table` and shown below a Markdown
    heading that gives its running number on the page and the page number
    taken from the table's first provenance entry.

    Parameters
    ----------
    doc_jsondata :
        Converted document; its optional "tables" entry is iterated.
    """

    # Number of tables shown so far, keyed by page
    tables_seen = {}
    for tbl in doc_jsondata.get("tables", []):
        page_no = tbl["prov"][0]["page"]
        tables_seen[page_no] = tables_seen.get(page_no, 0) + 1

        rendered = write_table(tbl)
        heading = f"## Table {tables_seen[page_no]} on page {page_no}"
        display(Markdown(heading))
        display(HTML(rendered))

## Document conversion and visualization with Deep Search

In [None]:
# Create the Deep Search API client for the profile selected in PROFILE_NAME.
# NOTE(review): `from_env` presumably reads the profile configuration from the
# environment — confirm against the deepsearch-toolkit documentation.
api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)

In [None]:
# Launch the document conversion of INPUT_FILE in project PROJ_KEY.
# The returned object is kept in `documents`; the results are downloaded
# afterwards via `documents.download_all(...)`.
documents = ds.convert_documents(
    api=api, proj_key=PROJ_KEY, source_path=INPUT_FILE, progress_bar=True
)

In [None]:
# Download the conversion results into a temporary directory (removed again
# when the `with` block exits), load the converted JSON document and show its tables.
with tempfile.TemporaryDirectory() as output_dir:
    documents.download_all(result_dir=output_dir, progress_bar=True)

    # Glob order is not defined, so sort to pick the same file on every run,
    # and fail with a descriptive error instead of a bare IndexError when the
    # download produced no JSON file.
    json_files = sorted(Path(output_dir).glob("*.json"))
    if not json_files:
        raise FileNotFoundError(
            f"No converted JSON document found in {output_dir}"
        )
    json_file = json_files[0]

    # Read as UTF-8 explicitly: the platform default encoding may differ
    # (e.g. on Windows) and would garble non-ASCII text.
    with open(json_file, encoding="utf-8") as f:
        doc_jsondata = json.load(f)

    # The document is fully in memory; the file handle is no longer needed.
    visualize_document_tables(doc_jsondata)
