# Tables demo / workbook



In [1]:
# INITIALIZATION BOILERPLATE

# The Jupyter kernel for this notebook usually starts up inside the notebooks
# directory, but the text_extensions_for_pandas package code is in the parent
# directory. Point the first entry of the Python import path at that parent
# directory so the package can be imported below.
# NOTE: this overwrites sys.path[0] instead of inserting a new entry, so
# whatever was first on the path before is no longer searched.
import sys
if (sys.path[0] != ".."):
    sys.path[0] = ".."

import json
import os
from ibm_watson import CompareComplyV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson import ApiException

import pandas as pd 
import text_extensions_for_pandas as tp
from IPython.core.display import HTML



In [27]:
def init_watson_table_api():
    """Build a Watson Compare and Comply client configured from the environment.

    The API key is read from the ``IBM_API_KEY_TABLES`` environment variable
    and the service URL from ``IBM_SERVICE_URL_TABLES``.

    Returns:
        A ``CompareComplyV1`` client with its service URL already set.

    Raises:
        ValueError: If either environment variable is not set. The message
            names the variable that is actually missing.
    """
    apikey_var = "IBM_API_KEY_TABLES"
    service_url_var = "IBM_SERVICE_URL_TABLES"

    # Retrieve the API key used for authentication.
    apikey = os.environ.get(apikey_var)
    if apikey is None:
        raise ValueError(f"Expected apikey in the environment variable '{apikey_var}'")

    # Get the service URL for your IBM Cloud instance.
    ibm_cloud_service_url = os.environ.get(service_url_var)
    if ibm_cloud_service_url is None:
        raise ValueError(
            f"Expected IBM cloud service URL in the environment variable '{service_url_var}'"
        )

    # Initialize the authenticator and the client.
    authenticator = IAMAuthenticator(apikey)
    compare_comply = CompareComplyV1(
        # NOTE(review): Watson version strings are normally written as
        # zero-padded YYYY-MM-DD dates; confirm the service accepts '2020-1-1'.
        version = '2020-1-1',
        authenticator=authenticator
    )

    compare_comply.set_service_url(ibm_cloud_service_url)
    return compare_comply

In [60]:
# Directory, relative to this notebook, that holds the example PDFs and the
# archived responses.
base_example_path = "../resources/tables/"

# Base names (without the ".pdf" extension) of the example documents.
file_names = [
    "who_covid_report_table",
    "20-populous-countries",
    "cali-temp-chart",
    "california-population-chart",
    "double_header_table",
]

# Base name (without the ".json" extension) of the archive of responses.
archive_name = "archive"

# Set to True to query Watson again instead of loading the archive.
re_query_watson = False



In [61]:
# Populate `responses_dict` (document base name -> Watson response), either
# by querying the service for every example PDF and archiving the results,
# or by loading the previously archived responses from disk.
if re_query_watson:
    print("This shouldn't happen")
    # Start from an empty mapping so this branch also works on a fresh kernel
    # and never mixes in stale responses from an earlier run.
    responses_dict = {}
    compare_comply = init_watson_table_api()
    for file_n in file_names:
        with open(f"{base_example_path}{file_n}.pdf", 'rb') as base_file:
            try:
                result = compare_comply.extract_tables(base_file).get_result()
                responses_dict[file_n] = result
            except ApiException as ex:
                # Format with an f-string so that a missing (None) message
                # cannot turn error reporting into a TypeError.
                print(f"Method failed with status code {ex.code}: {ex.message}")
    # Archive the responses so later runs do not need to hit the service.
    with open(f"{base_example_path}{archive_name}.json", 'w') as archive_file:
        json.dump(responses_dict, archive_file)
else:
    with open(f"{base_example_path}{archive_name}.json", 'r') as archive_file:
        responses_dict = json.load(archive_file)

In [62]:
# Select one archived response by document name and process it; the value of
# the last expression is what the notebook displays for this cell.
response = responses_dict["double_header_table"]

# Most basic workflow for processing tables: parse the response into a dict of
# DataFrames (keys used in this notebook: 'row_headers', 'col_headers',
# 'body_cells'), then reconstruct the table from that dict.
dfs_dict = tp.watson_tables_parse_response(response)
tp.make_table(dfs_dict)

Unnamed: 0_level_0,Nine months ended setptember 30,Nine months ended setptember 30,Three months ended setptember 30,Three months ended setptember 30
Unnamed: 0_level_1,2004,2005,2004,2005
Dividends received,4.7%,15.4%,3.3%,13.2%
IRS audit settlement,15.2%,58%,35.5%,97%
Statatory tax rate,38%,37%,36%,35%
Total tax rate,15.1%,38.8%,4.3%,76.1%


# Breaking down the process of reconstructing a table

The raw JSON output is omitted for brevity.

The first stage we see is the deconstructed table, converted to pandas DataFrames. This gives a good idea of the schema of the data we are dealing with. It is produced by calling `tp.watson_tables_parse_response(...)`.
It contains information about the row headers, column headers, and body cells, along with their contents.


Next we have the value-attribute correlated table. This phase correlates each value with its respective row and column header cells, using the cell IDs, row and column numbers, and other information from the original table.

In [8]:
# Show the two header frames first, then the first 30 body cells.
print("displaying headers")
display(dfs_dict['row_headers'], dfs_dict['col_headers'])

print("displaying objects")
body_cells_preview = dfs_dict['body_cells'].head(30)
display(body_cells_preview)

displaying headers


Unnamed: 0,text,column_index_begin,column_index_end,row_index_begin,row_index_end,cell_id,text_normalized
0,Statatory tax rate,0,0,2,2,rowHeader-2810-2829,Statatory tax rate
1,IRS audit settlement,0,0,3,3,rowHeader-4068-4089,IRS audit settlement
2,Dividends received,0,0,4,4,rowHeader-5329-5348,Dividends received
3,Total tax rate,0,0,5,5,rowHeader-6586-6601,Total tax rate


Unnamed: 0,text,column_index_begin,column_index_end,row_index_begin,row_index_end,cell_id,text_normalized
0,,0,0,0,0,colHeader-786-787,
1,Three months ended setptember 30,1,2,0,0,colHeader-1012-1206,Three months ended setptember 30
2,Nine months ended setptember 30,3,4,0,0,colHeader-1444-1514,Nine months ended setptember 30
3,,0,0,1,1,colHeader-1586-1587,
4,2005,1,1,1,1,colHeader-1813-1818,2005
5,2004,2,2,1,1,colHeader-2061-2066,2004
6,2005,3,3,1,1,colHeader-2305-2310,2005
7,2004,4,4,1,1,colHeader-2553-2558,2004


displaying objects


Unnamed: 0,text,column_index_begin,column_index_end,row_index_begin,row_index_end,cell_id,column_header_ids,column_header_texts,row_header_ids,row_header_texts
0,35%,1,1,2,2,bodyCell-3073-3077,"[colHeader-1012-1206, colHeader-1813-1818]","[Three months ended setptember 30, 2005]",[rowHeader-2810-2829],[Statatory tax rate]
1,36%,2,2,2,2,bodyCell-3320-3324,"[colHeader-1012-1206, colHeader-2061-2066]","[Three months ended setptember 30, 2004]",[rowHeader-2810-2829],[Statatory tax rate]
2,37%,3,3,2,2,bodyCell-3564-3568,"[colHeader-1444-1514, colHeader-2305-2310]","[Nine months ended setptember 30, 2005]",[rowHeader-2810-2829],[Statatory tax rate]
3,38%,4,4,2,2,bodyCell-3811-3815,"[colHeader-1444-1514, colHeader-2553-2558]","[Nine months ended setptember 30, 2004]",[rowHeader-2810-2829],[Statatory tax rate]
4,97%,1,1,3,3,bodyCell-4333-4337,"[colHeader-1012-1206, colHeader-1813-1818]","[Three months ended setptember 30, 2005]",[rowHeader-4068-4089],[IRS audit settlement]
5,35.5%,2,2,3,3,bodyCell-4579-4585,"[colHeader-1012-1206, colHeader-2061-2066]","[Three months ended setptember 30, 2004]",[rowHeader-4068-4089],[IRS audit settlement]
6,58%,3,3,3,3,bodyCell-4825-4829,"[colHeader-1444-1514, colHeader-2305-2310]","[Nine months ended setptember 30, 2005]",[rowHeader-4068-4089],[IRS audit settlement]
7,15.2%,4,4,3,3,bodyCell-5071-5077,"[colHeader-1444-1514, colHeader-2553-2558]","[Nine months ended setptember 30, 2004]",[rowHeader-4068-4089],[IRS audit settlement]
8,13.2%,1,1,4,4,bodyCell-5591-5597,"[colHeader-1012-1206, colHeader-1813-1818]","[Three months ended setptember 30, 2005]",[rowHeader-5329-5348],[Dividends received]
9,3.3%,2,2,4,4,bodyCell-5838-5843,"[colHeader-1012-1206, colHeader-2061-2066]","[Three months ended setptember 30, 2004]",[rowHeader-5329-5348],[Dividends received]


In [87]:
# Take the first element returned by make_exploded_df and show only the cell
# text next to its per-level column header texts and its row header text.
tp.make_exploded_df(dfs_dict)[0][
    [
        "text",
        "column_header_texts_0",
        "column_header_texts_1",
        "row_header_texts_0",
    ]
]

Unnamed: 0,text,column_header_texts_0,column_header_texts_1,row_header_texts_0
0,35%,Three months ended setptember 30,2005,Statatory tax rate
1,36%,Three months ended setptember 30,2004,Statatory tax rate
2,37%,Nine months ended setptember 30,2005,Statatory tax rate
3,38%,Nine months ended setptember 30,2004,Statatory tax rate
4,97%,Three months ended setptember 30,2005,IRS audit settlement
5,35.5%,Three months ended setptember 30,2004,IRS audit settlement
6,58%,Nine months ended setptember 30,2005,IRS audit settlement
7,15.2%,Nine months ended setptember 30,2004,IRS audit settlement
8,13.2%,Three months ended setptember 30,2005,Dividends received
9,3.3%,Three months ended setptember 30,2004,Dividends received


In [72]:
# Short aliases for the three frames of the parsed table.
body, cols, rows = (
    dfs_dict['body_cells'],
    dfs_dict["col_headers"],
    dfs_dict["row_headers"],
)



In [11]:
#"Third option" for combination. combines headings. Ideally there would be a more robust way to do this rather than just appending strings 
#  this should work, but is sub ideal. 
def agg_headers(series):
    """Collapse the header-text lists of one body-cell row into plain strings.

    Intended for use with ``DataFrame.apply(..., axis=1)``.

    Args:
        series: One row holding ``row_header_texts`` and
            ``column_header_texts`` entries, each an iterable of strings.

    Returns:
        A copy of ``series`` in which both entries are replaced by their
        items joined with ``", "`` (an empty iterable becomes ``""``).
        The input row is left untouched.
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by ``apply``.
    joined = series.copy()
    for row_or_col in ("row", "column"):
        field = f"{row_or_col}_header_texts"
        joined[field] = ", ".join(series[field])
    return joined
# Apply agg_headers to every row of `body` and show the result.
# NOTE(review): the saved output for this cell has Rank / Country / Population
# column headers, so `body` presumably came from a different response when the
# cell was last run -- re-run the notebook top to bottom to confirm.
filled = body.apply(agg_headers, axis=1) 
display(filled)

# Disabled experiment: pivot the joined header strings back into a table.
# table = filled.pivot(index= "row_header_texts", columns= "column_header_texts", values= "text")
# table

Unnamed: 0,text,column_index_begin,column_index_end,row_index_begin,row_index_end,cell_id,column_header_ids,column_header_texts,row_header_ids,row_header_texts
0,1,0,0,1,1,bodyCell-3552-3554,[colHeader-1611-1616],Rank,[],
1,China [b],1,1,1,1,bodyCell-3799-3975,[colHeader-1859-2250],Country (or\ndependent\nterritory),[],
2,1403627360,2,2,1,1,bodyCell-4226-4240,[colHeader-2504-2515],Population,[],
3,18.0%,3,3,1,1,bodyCell-4483-4489,[colHeader-2758-2779],% of worldpopulation,[],
4,21 Jul 2020,4,4,1,1,bodyCell-4734-4746,[colHeader-3034-3039],Date,[],
...,...,...,...,...,...,...,...,...,...,...
121,World,1,1,21,21,bodyCell-39162-39169,[colHeader-1859-2250],Country (or\ndependent\nterritory),[],
122,7800767000,2,2,21,21,bodyCell-39425-39439,[colHeader-2504-2515],Population,[],
123,100%,3,3,21,21,bodyCell-39694-39699,[colHeader-2758-2779],% of worldpopulation,[],
124,21 Jul 2020,4,4,21,21,bodyCell-39954-39966,[colHeader-3034-3039],Date,[],


In [54]:
# Re-parse the currently selected response and rebuild the table from it.
dfs_dict = tp.watson_tables_parse_response(response)
table = tp.make_table(dfs_dict)
# Show the reconstructed table, then render the HTML stored in the response
# under "document" -> "html" so the two can be compared by eye.
display(table)
display(HTML(response["document"]["html"]))

# Select one top-level column header; the saved output shows the sub-columns
# (2004, 2005) that sit under it.
table["Three months ended setptember 30"]

Unnamed: 0_level_0,Nine months ended setptember 30,Nine months ended setptember 30,Three months ended setptember 30,Three months ended setptember 30
Unnamed: 0_level_1,2004,2005,2004,2005
Dividends received,4.7%,15.4%,3.3%,13.2%
IRS audit settlement,15.2%,58%,35.5%,97%
Statatory tax rate,38%,37%,36%,35%
Total tax rate,15.1%,38.8%,4.3%,76.1%


0,1,2,3,4
,Three months ended setptember 30,Three months ended setptember 30,Nine months ended setptember 30,Nine months ended setptember 30
,2005,2004,2005,2004
Statatory tax rate,35%,36%,37%,38%
IRS audit settlement,97%,35.5%,58%,15.2%
Dividends received,13.2%,3.3%,15.4%,4.7%
Total tax rate,76.1%,4.3%,38.8%,15.1%


Unnamed: 0,2004,2005
Dividends received,3.3%,13.2%
IRS audit settlement,35.5%,97%
Statatory tax rate,36%,35%
Total tax rate,4.3%,76.1%


In [34]:
# Reconstruct every table found in the WHO report response and stack them
# into a single DataFrame `df`.
response = responses_dict["who_covid_report_table"]
num_tabs = len(response.get("tables", []))

# Collect the per-table frames and concatenate once at the end. Growing a
# DataFrame with DataFrame.append inside the loop is quadratic, and that
# method no longer exists in pandas 2.x.
page_tables = []
for i in range(num_tabs):
    dfs_dict = tp.watson_tables_parse_response(response, table_number=i)
    table = tp.make_table(dfs_dict, concat_with='')
    page_tables.append(table)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# response contains no tables (matching the previous starting value of `df`).
df = pd.concat(page_tables, ignore_index=True) if page_tables else pd.DataFrame()
    

row_headers = none
row_headers = none
row_headers = none
row_headers = none
row_headers = none
row_headers = none
row_headers = none
row_headers = none
row_headers = none
row_headers = none


In [38]:
# Keep only the rows in which no cell is the empty string, then preview them.
rows_to_keep = ['' not in row.to_list() for _, row in df.iterrows()]
df = df.loc[rows_to_keep]
df.head(100)


Unnamed: 0_level_0,Days since last,Reporting Country/ Territory/Area,Total confirmed,Total confirmed,Total deaths,Total new deaths,Transmission
Unnamed: 0_level_1,reported case,Unnamed: 2_level_1,cases,new cases,Unnamed: 5_level_1,Unnamed: 6_level_1,classification i
1,0,South Africa,287 796,11 554,4 172,93,Community transmission
2,0,Nigeria,33153,595,744,4,Community transmission
3,0,Ghana,24988,470,139,0,Community transmission
4,0,Algeria,19689,494,1 018,7,Community transmission
5,1,Cameroon,15173,0,359,0,Community transmission
...,...,...,...,...,...,...,...
99,0,Curaçao,26,1,1,0,Sporadic cases
100,79,Falkland Islands (Malvinas),13,0,0,0,No cases
101,3,Montserrat,12,0,1,0,No cases
102,59,British Virgin Islands,8,0,1,0,No cases


In [59]:
# Sanity check: the archive on disk should match the responses held in memory.
# Build the path from `archive_name`, like the cell that loads the archive,
# so the check keeps pointing at the right file if that setting changes.
with open(f"{base_example_path}{archive_name}.json", 'r') as json_archive:
    obj = json.load(json_archive)
print(obj == responses_dict)

True
