# Tables demo/ workbook 



In [1]:
# INITIALIZATION BOILERPLATE

# The Jupyter kernel for this notebook usually starts up inside the notebooks
# directory, but the text_extensions_for_pandas package code is in the parent
# directory. Add that parent directory to the front of the Python include path.
import sys
if (sys.path[0] != ".."):
    sys.path[0] = ".."

import json
import os
from ibm_watson import CompareComplyV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson import ApiException

import pandas as pd 
import pickle 



In [2]:
# Retrieve the APIKEY for authentication
apikey = os.environ.get("IBM_API_KEY_TABLES")
if apikey is None:
    raise ValueError("Expected apikey in the environment variable 'IBM_API_KEY'")

# Get the service URL for your IBM Cloud instance
ibm_cloud_service_url = os.environ.get("IBM_SERVICE_URL_TABLES")
if ibm_cloud_service_url is None:
    raise ValueError("Expected IBM cloud service URL in the environment variable 'IBM_SERVICE_URL'")


In [3]:
#initialize the authenticator
authenticator = IAMAuthenticator(apikey)
compare_comply = CompareComplyV1(
    version = '2020-1-1',
    authenticator=authenticator
)

compare_comply.set_service_url(ibm_cloud_service_url)

In [194]:
base_example_path = "../resources/tables/"
file_names = [ "20-populous-countries", "cali-temp-chart", "california-population-chart","WHO-covid-overview-chart", "double_header_table"]

archive_name = "archive1"
#set to true to query watson
re_query_watson = False



In [78]:
#responses_dict = {}
if re_query_watson:
    print("This shouldn't happen")
    for file_n in file_names:
        with open(f"{base_example_path}{file_n}.pdf", 'rb') as base_file:
            try:
                result = compare_comply.extract_tables(base_file).get_result()
                responses_dict[file_n] = result
            except ApiException as ex:
                print ("Method failed with status code " + str(ex.code) + ": " + ex.message)
     #archive file
    with open(f"{base_example_path}{archive_name}.pkl", 'wb') as archive_file:
        pickle.dump(responses_dict, archive_file)
else:
    with open(f"{base_example_path}{archive_name}.pkl", 'rb') as archive_file:
        responses_dict = pickle.load(archive_file)

This shouldn't happen


In [82]:
json.dumps(responses_dict["double_header_table"], indent = 2)

'{\n  "document": {\n    "title": "Document1",\n    "html": "<?xml version=\'1.0\' encoding=\'UTF-8\' standalone=\'yes\'?><html>\\n<head>\\n    <meta content=\\"text/html; charset=UTF-8\\" http-equiv=\\"Content-Type\\"/><meta content=\\"Zachary Eichenberger\\" name=\\"author\\"/><meta content=\\"2020-07-23\\" name=\\"publicationdate\\"/><meta content=\\"1\\" name=\\"numPages\\"/><title>Document1</title><style>/**/\\n .Calibri_Black_12_0{font-size: 12.0pt; font-family: Calibri; color: Black; }\\n/**/</style></head>\\n<body><table border=\\"1\\" data-max-height=\\"115.12005615234375\\" data-max-width=\\"522.1603393554688\\" data-max-x=\\"22.501543045043945\\" data-max-y=\\"67.73318481445312\\" data-min-height=\\"115.12005615234375\\" data-min-width=\\"522.1603393554688\\" data-min-x=\\"22.501543045043945\\" data-min-y=\\"67.73318481445312\\" data-page=\\"1\\"><tbody><tr><td colspan=\\"1\\" colstart=\\"1\\" rowspan=\\"1\\" rowstart=\\"1\\">\\u00a0</td><td colspan=\\"2\\" colstart=\\"2\\" 

In [7]:
import text_extensions_for_pandas as tp

In [180]:
#select a response and process it, printing the outputs
response = responses_dict["double_header_table"]

dfs_dict = tp.watson_tables_parse_response(response)

1


In [181]:
print("displaying headers")
display(dfs_dict['row_headers'])
display(dfs_dict['col_headers'])
print("displaying objects")
display(dfs_dict['body_cells'])

displaying headers


Unnamed: 0,text,column_index_begin,column_index_end,row_index_begin,row_index_end,cell_id,text_normalized
0,Statatory tax rate,0,0,2,2,rowHeader-2810-2829,Statatory tax rate
1,IRS audit settlement,0,0,3,3,rowHeader-4068-4089,IRS audit settlement
2,Dividends received,0,0,4,4,rowHeader-5329-5348,Dividends received
3,Total tax rate,0,0,5,5,rowHeader-6586-6601,Total tax rate


Unnamed: 0,text,column_index_begin,column_index_end,row_index_begin,row_index_end,cell_id,text_normalized
0,,0,0,0,0,colHeader-786-787,
1,Three months ended setptember 30,1,2,0,0,colHeader-1012-1206,Three months ended setptember 30
2,Nine months ended setptember 30,3,4,0,0,colHeader-1444-1514,Nine months ended setptember 30
3,,0,0,1,1,colHeader-1586-1587,
4,2005,1,1,1,1,colHeader-1813-1818,2005
5,2004,2,2,1,1,colHeader-2061-2066,2004
6,2005,3,3,1,1,colHeader-2305-2310,2005
7,2004,4,4,1,1,colHeader-2553-2558,2004


displaying objects


Unnamed: 0,text,column_index_begin,column_index_end,row_index_begin,row_index_end,cell_id,column_header_ids,column_header_texts,row_header_ids,row_header_texts
0,35%,1,1,2,2,bodyCell-3073-3077,"[colHeader-1012-1206, colHeader-1813-1818]","[Three months ended setptember 30, 2005]",[rowHeader-2810-2829],[Statatory tax rate]
1,36%,2,2,2,2,bodyCell-3320-3324,"[colHeader-1012-1206, colHeader-2061-2066]","[Three months ended setptember 30, 2004]",[rowHeader-2810-2829],[Statatory tax rate]
2,37%,3,3,2,2,bodyCell-3564-3568,"[colHeader-1444-1514, colHeader-2305-2310]","[Nine months ended setptember 30, 2005]",[rowHeader-2810-2829],[Statatory tax rate]
3,38%,4,4,2,2,bodyCell-3811-3815,"[colHeader-1444-1514, colHeader-2553-2558]","[Nine months ended setptember 30, 2004]",[rowHeader-2810-2829],[Statatory tax rate]
4,97%,1,1,3,3,bodyCell-4333-4337,"[colHeader-1012-1206, colHeader-1813-1818]","[Three months ended setptember 30, 2005]",[rowHeader-4068-4089],[IRS audit settlement]
5,35.5%,2,2,3,3,bodyCell-4579-4585,"[colHeader-1012-1206, colHeader-2061-2066]","[Three months ended setptember 30, 2004]",[rowHeader-4068-4089],[IRS audit settlement]
6,58%,3,3,3,3,bodyCell-4825-4829,"[colHeader-1444-1514, colHeader-2305-2310]","[Nine months ended setptember 30, 2005]",[rowHeader-4068-4089],[IRS audit settlement]
7,15.2%,4,4,3,3,bodyCell-5071-5077,"[colHeader-1444-1514, colHeader-2553-2558]","[Nine months ended setptember 30, 2004]",[rowHeader-4068-4089],[IRS audit settlement]
8,13.2%,1,1,4,4,bodyCell-5591-5597,"[colHeader-1012-1206, colHeader-1813-1818]","[Three months ended setptember 30, 2005]",[rowHeader-5329-5348],[Dividends received]
9,3.3%,2,2,4,4,bodyCell-5838-5843,"[colHeader-1012-1206, colHeader-2061-2066]","[Three months ended setptember 30, 2004]",[rowHeader-5329-5348],[Dividends received]


In [184]:
body = dfs_dict['body_cells'] 
cols = dfs_dict["col_headers"] 
rows = dfs_dict["row_headers"] 

In [203]:

def horiz_explode(df_in,column):
    # a function to horizontally "explode" column headings 
    
    df = df_in.copy()
    #find max length of list 
    feild_names = []
    for ind, row in df.iterrows():
        feild_list = row[column]
        for num, val in enumerate(feild_list):
            colname = f"{column}_{num}"
            df.loc[ind, colname] = val
            if not(colname in feild_names): feild_names.append(colname)
    df.drop(columns = column)
    return df, feild_names
                
    
exploded, col_header_names = horiz_explode(body,"column_header_texts")
exploded, row_header_names = horiz_explode(exploded,"row_header_texts")

display(col_header_names)

display(exploded)

table = exploded.pivot_table(index= row_header_names, columns= col_header_names, values= "text", aggfunc=(lambda a:" | ".join(a)))
table

['column_header_texts_0', 'column_header_texts_1']

Unnamed: 0,text,column_index_begin,column_index_end,row_index_begin,row_index_end,cell_id,column_header_ids,column_header_texts,row_header_ids,row_header_texts,column_header_texts_0,column_header_texts_1,row_header_texts_0
0,35%,1,1,2,2,bodyCell-3073-3077,"[colHeader-1012-1206, colHeader-1813-1818]","[Three months ended setptember 30, 2005]",[rowHeader-2810-2829],[Statatory tax rate],Three months ended setptember 30,2005,Statatory tax rate
1,36%,2,2,2,2,bodyCell-3320-3324,"[colHeader-1012-1206, colHeader-2061-2066]","[Three months ended setptember 30, 2004]",[rowHeader-2810-2829],[Statatory tax rate],Three months ended setptember 30,2004,Statatory tax rate
2,37%,3,3,2,2,bodyCell-3564-3568,"[colHeader-1444-1514, colHeader-2305-2310]","[Nine months ended setptember 30, 2005]",[rowHeader-2810-2829],[Statatory tax rate],Nine months ended setptember 30,2005,Statatory tax rate
3,38%,4,4,2,2,bodyCell-3811-3815,"[colHeader-1444-1514, colHeader-2553-2558]","[Nine months ended setptember 30, 2004]",[rowHeader-2810-2829],[Statatory tax rate],Nine months ended setptember 30,2004,Statatory tax rate
4,97%,1,1,3,3,bodyCell-4333-4337,"[colHeader-1012-1206, colHeader-1813-1818]","[Three months ended setptember 30, 2005]",[rowHeader-4068-4089],[IRS audit settlement],Three months ended setptember 30,2005,IRS audit settlement
5,35.5%,2,2,3,3,bodyCell-4579-4585,"[colHeader-1012-1206, colHeader-2061-2066]","[Three months ended setptember 30, 2004]",[rowHeader-4068-4089],[IRS audit settlement],Three months ended setptember 30,2004,IRS audit settlement
6,58%,3,3,3,3,bodyCell-4825-4829,"[colHeader-1444-1514, colHeader-2305-2310]","[Nine months ended setptember 30, 2005]",[rowHeader-4068-4089],[IRS audit settlement],Nine months ended setptember 30,2005,IRS audit settlement
7,15.2%,4,4,3,3,bodyCell-5071-5077,"[colHeader-1444-1514, colHeader-2553-2558]","[Nine months ended setptember 30, 2004]",[rowHeader-4068-4089],[IRS audit settlement],Nine months ended setptember 30,2004,IRS audit settlement
8,13.2%,1,1,4,4,bodyCell-5591-5597,"[colHeader-1012-1206, colHeader-1813-1818]","[Three months ended setptember 30, 2005]",[rowHeader-5329-5348],[Dividends received],Three months ended setptember 30,2005,Dividends received
9,3.3%,2,2,4,4,bodyCell-5838-5843,"[colHeader-1012-1206, colHeader-2061-2066]","[Three months ended setptember 30, 2004]",[rowHeader-5329-5348],[Dividends received],Three months ended setptember 30,2004,Dividends received


column_header_texts_0,Nine months ended setptember 30,Nine months ended setptember 30,Three months ended setptember 30,Three months ended setptember 30
column_header_texts_1,2004,2005,2004,2005
row_header_texts_0,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Dividends received,4.7%,15.4%,3.3%,13.2%
IRS audit settlement,15.2%,58%,35.5%,97%
Statatory tax rate,38%,37%,36%,35%
Total tax rate,15.1%,38.8%,4.3%,76.1%


In [145]:
#"Third option" for combination. combines headings. Ideally there would be a more robust way to do this rather than just appending strings 
#  this should work, but is sub ideal. 
def agg_headers(series ):
    for rowOrCol in ["row", "column"]:
        feild = f"{rowOrCol}_header_texts"
        series[feild] = ", ".join(series[feild])
    return series
filled = body.apply(agg_headers, axis=1) 
display(filled)

table = filled.pivot(index= "row_header_texts", columns= "column_header_texts", values= "text")
table

Unnamed: 0,text,column_index_begin,column_index_end,row_index_begin,row_index_end,cell_id,column_header_ids,column_header_texts,row_header_ids,row_header_texts
0,35%,1,1,2,2,bodyCell-3073-3077,"[colHeader-1012-1206, colHeader-1813-1818]","Three months ended setptember 30, 2005",[rowHeader-2810-2829],Statatory tax rate
1,36%,2,2,2,2,bodyCell-3320-3324,"[colHeader-1012-1206, colHeader-2061-2066]","Three months ended setptember 30, 2004",[rowHeader-2810-2829],Statatory tax rate
2,37%,3,3,2,2,bodyCell-3564-3568,"[colHeader-1444-1514, colHeader-2305-2310]","Nine months ended setptember 30, 2005",[rowHeader-2810-2829],Statatory tax rate
3,38%,4,4,2,2,bodyCell-3811-3815,"[colHeader-1444-1514, colHeader-2553-2558]","Nine months ended setptember 30, 2004",[rowHeader-2810-2829],Statatory tax rate
4,97%,1,1,3,3,bodyCell-4333-4337,"[colHeader-1012-1206, colHeader-1813-1818]","Three months ended setptember 30, 2005",[rowHeader-4068-4089],IRS audit settlement
5,35.5%,2,2,3,3,bodyCell-4579-4585,"[colHeader-1012-1206, colHeader-2061-2066]","Three months ended setptember 30, 2004",[rowHeader-4068-4089],IRS audit settlement
6,58%,3,3,3,3,bodyCell-4825-4829,"[colHeader-1444-1514, colHeader-2305-2310]","Nine months ended setptember 30, 2005",[rowHeader-4068-4089],IRS audit settlement
7,15.2%,4,4,3,3,bodyCell-5071-5077,"[colHeader-1444-1514, colHeader-2553-2558]","Nine months ended setptember 30, 2004",[rowHeader-4068-4089],IRS audit settlement
8,13.2%,1,1,4,4,bodyCell-5591-5597,"[colHeader-1012-1206, colHeader-1813-1818]","Three months ended setptember 30, 2005",[rowHeader-5329-5348],Dividends received
9,3.3%,2,2,4,4,bodyCell-5838-5843,"[colHeader-1012-1206, colHeader-2061-2066]","Three months ended setptember 30, 2004",[rowHeader-5329-5348],Dividends received


column_header_texts,"Nine months ended setptember 30, 2004","Nine months ended setptember 30, 2005","Three months ended setptember 30, 2004","Three months ended setptember 30, 2005"
row_header_texts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dividends received,4.7%,15.4%,3.3%,13.2%
IRS audit settlement,15.2%,58%,35.5%,97%
Statatory tax rate,38%,37%,36%,35%
Total tax rate,15.1%,38.8%,4.3%,76.1%


In [184]:
all_cells = body.append(rows).append(cols)

# this method is guarunteed to give something that does indeed work. every row / column pair is unique
# The downside being that right now, it doesn't have proper row/column titles. 
# aesthetically similar, but not really good. 

all_cells['row_index'] = [list(range(r.row_index_begin, r.row_index_end+1)) for _, r in all_cells.iterrows()]
all_cells['column_index'] = [list(range(r.column_index_begin, r.column_index_end+1)) for _, r in all_cells.iterrows()]
all_cells = all_cells.explode("column_index")
all_cells = all_cells.explode("row_index")


table = all_cells.pivot(index= "row_index", columns= "column_index", values = "text")
table

column_index,0,1,2,3,4
row_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,,Three months ended setptember 30,Three months ended setptember 30,Nine months ended setptember 30,Nine months ended setptember 30
1,,2005,2004,2005,2004
2,Statatory tax rate,35%,36%,37%,38%
3,IRS audit settlement,97%,35.5%,58%,15.2%
4,Dividends received,13.2%,3.3%,15.4%,4.7%
5,Total tax rate,76.1%,4.3%,38.8%,15.1%
