# Convert and extract tables


In this example we will use the output of the converted document and extract the tables detected on each page.




### Authentication via stored credentials

In this example, we initialize the Deep Search client from the credentials
contained in the file `../../ds-auth.json`. This can be generated with

```shell
!deepsearch login --output ../../ds-auth.json
```

More details in the [docs](https://ds4sd.github.io/deepsearch-toolkit/getting_started/#authentication).

### Notebook parameters

The following block defines the parameters used to execute the notebook:

- `CONFIG_FILE`: location of the Deep Search configuration file
- `PROJ_KEY`: the project key passed to the document conversion
- `INPUT_FILE`: the input PDF to be converted and analyzed


In [1]:
# Input parameters for the example flow
from pathlib import Path

# Credentials file, e.g. generated with `deepsearch login --output ../../ds-auth.json`
CONFIG_FILE = Path("../../ds-auth.json")

# Handed to the conversion as `proj_key`.
# NOTE(review): the value looks like a placeholder -- replace it with a real project key
PROJ_KEY = "1234567890abcdefghijklmnopqrstvwyz123456"

# PDF to convert; another sample in the same folder is 2206.01062.pdf
INPUT_FILE = Path("../../data/samples/2206.00785.pdf")


## Helper functions

The following blocks define the helper functions used for the visualization of the output

In [2]:
# Import standard dependencies
import json
import tempfile
import typing
from zipfile import ZipFile

# IPython utilities
from IPython.display import display, Markdown, HTML, display_html

# Import the deepsearch-toolkit
import deepsearch as ds

In [3]:
def get_tablecell_span(cell, ix):
    """
    Summarize the extent of a table cell along one axis.

    Parameters
    ----------
    cell :
        Table cell; its `spans` entry is iterated and each element is indexed with `ix`.
    ix : int
        Position to read from every span entry (the caller uses 0 for rows, 1 for columns).

    Returns
    -------
    tuple
        `(count, lowest, highest)` of the distinct values found, or
        `(1, None, None)` when the cell has no spans.
    """
    indices = {entry[ix] for entry in cell['spans']}
    if not indices:
        return 1, None, None
    return len(indices), min(indices), max(indices)



def write_table(item):
    """
    Convert the JSON table representation to HTML, including column and row spans.

    A cell covering several rows/columns is written once, at the first row and
    column of its span, with the matching `rowspan`/`colspan` attributes. Header
    cells are rendered as centered `<th>`, every other cell as `<td>`. The cell
    text is HTML-escaped; empty cells are rendered as `&nbsp;`.

    Parameters
    ----------
    item : dict
        JSON table. The keys read are `#-rows`, `#-cols` and `data`, where
        `data[i][j]` is a cell providing `text`, `type` and `spans`.

    Returns
    -------
    str
        The HTML markup of the table.
    """
    import html  # local import, only needed to escape the cell text

    # Cell types which are rendered as header cells
    header_labels = (
        'row_header', 'row_multi_header', 'row_title',
        'col_header', 'col_multi_header',
    )

    table = item
    nrows = table['#-rows']
    ncols = table['#-cols']

    lines = ["<table>"]
    for i in range(nrows):
        lines.append("  <tr>")
        for j in range(ncols):
            cell = table['data'][i][j]

            rowspan, rowstart, _ = get_tablecell_span(cell, 0)
            colspan, colstart, _ = get_tablecell_span(cell, 1)

            # Skip positions which are not the first row/column of the span,
            # such that a spanning cell is written only once
            if rowstart is not None and rowstart != i:
                continue
            if colstart is not None and colstart != j:
                continue

            # Without span information the cell starts at its own position
            if rowstart is None:
                rowstart = i
            if colstart is None:
                colstart = j

            # Escape the text so that characters like `<` or `&` cannot break the markup
            text = cell['text']
            content = '&nbsp;' if text == '' else html.escape(str(text), quote=False)

            is_header = cell['type'] in header_labels
            celltag = 'th' if is_header else 'td'
            style = 'style="text-align: center;"' if is_header else ''

            lines.append(
                f'    <{celltag} rowstart="{rowstart}" colstart="{colstart}" rowspan="{rowspan}" colspan="{colspan}" {style}>{content}</{celltag}>'
            )

        lines.append("  </tr>")

    lines.append("</table>")

    return "\n".join(lines)

In [4]:
def visualize_document_tables(doc_jsondata):
    """
    Display every table of the converted document as rendered HTML.

    Each table is preceded by a Markdown heading with its running number
    on the page and the page number.

    Parameters
    ----------
    doc_jsondata :
        Converted document. Its `tables` entry is iterated (nothing is shown when
        it is missing); the page is taken from the first `prov` item of each table.
    """
    tables_seen_per_page = {}
    for table in doc_jsondata.get("tables", []):
        page = table["prov"][0]["page"]

        # Running number of this table among the tables of the same page
        table_number = tables_seen_per_page.get(page, 0) + 1
        tables_seen_per_page[page] = table_number

        output_html = write_table(table)
        display(Markdown(f"## Table {table_number} on page {page}"))
        display(HTML(output_html))


## Document conversion and visualization with Deep Search

In [5]:
# Initialize the Deep Search client from the config file
# (CONFIG_FILE is set in the parameters cell at the top of the notebook)
config = ds.DeepSearchConfig.parse_file(CONFIG_FILE)
client = ds.CpsApiClient(config)
# `api` is the object handed to `ds.convert_documents` in the conversion cell
api = ds.CpsApi(client)

In [6]:
# Launch the document conversion of INPUT_FILE, using the project key PROJ_KEY.
# The results are downloaded in the following cell via `documents.download_all`.
documents = ds.convert_documents(
    api=api, proj_key=PROJ_KEY, source_path=INPUT_FILE, progress_bar=True
)

Processing input:     : 100%|██████████████████████████████| 1/1 [00:00<00:00, 127.04it/s]
Submitting input:     : 100%|██████████████████████████████| 1/1 [00:18<00:00, 18.31s/it]
Converting input:     : 100%|██████████████████████████████| 1/1 [00:32<00:00, 32.61s/it]


In [7]:
# Download the conversion results into a temporary directory, which is removed
# again as soon as all the tables have been displayed
with tempfile.TemporaryDirectory() as output_dir:
    documents.download_all(result_dir=output_dir, progress_bar=True)

    for output_file in Path(output_dir).rglob("json*.zip"):
        with ZipFile(output_file) as archive:
            for name in archive.namelist():
                # Skip the archive members which are not JSON files
                if not name.endswith(".json"):
                    continue

                # Read the member by its exact archive name. Rebuilding the name with
                # str.rstrip(".json") is wrong: rstrip removes a set of characters,
                # not a suffix (e.g. "session.json" would become "sessi")
                doc_jsondata = json.loads(archive.read(name))

                visualize_document_tables(doc_jsondata)

Downloading result:   : 100%|██████████████████████████████| 1/1 [00:02<00:00,  2.21s/it]


## Table 1 on page 7

0,1,2,3
,,component,component
,,worker,ML model
,Memory,450 MB,500 MB
Profile A,CPU,0.4 cores,0.7 cores
,Replica ratio,1,1
,Memory,700 MB,500 MB
Profile B,CPU,1.2 cores,0.7 cores
,Replica ratio,1,4


## Table 1 on page 9

0,1,2
,distribution scheme,distribution scheme
profile,system state,document-level page-level
A,idle 32.6,± 1.1 26.3 ± 0.9
,busy 114.9 ± 17.5,248.1 ± 17.3
,idle 32.6,± 0.2 25.7 ± 0.7
B,busy 109.6 ± 47.7,220.4 ± 34.2
