# Tables demo/ workbook 



In [1]:
# INITIALIZATION BOILERPLATE

# The Jupyter kernel for this notebook usually starts up inside the notebooks
# directory, but the text_extensions_for_pandas package code is in the parent
# directory. Add that parent directory to the front of the Python include path.
import sys
if (sys.path[0] != ".."):
    sys.path[0] = ".."

import json
import os
from ibm_watson import CompareComplyV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson import ApiException

import pandas as pd 
import text_extensions_for_pandas as tp
from IPython.core.display import HTML



In [2]:
def init_watson_table_api():
    # Retrieve the APIKEY for authentication
    apikey = os.environ.get("IBM_API_KEY_TABLES")
    if apikey is None:
        raise ValueError("Expected apikey in the environment variable 'IBM_API_KEY'")

    # Get the service URL for your IBM Cloud instance
    ibm_cloud_service_url = os.environ.get("IBM_SERVICE_URL_TABLES")
    if ibm_cloud_service_url is None:
        raise ValueError("Expected IBM cloud service URL in the environment variable 'IBM_SERVICE_URL'")

        #initialize the authenticator
    authenticator = IAMAuthenticator(apikey)
    compare_comply = CompareComplyV1(
        version = '2020-1-1',
        authenticator=authenticator
    )

    compare_comply.set_service_url(ibm_cloud_service_url)
    return compare_comply

In [3]:
base_example_path = "../resources/tables/"
file_names = [ "who_covid_report_table", "20-populous-countries", "cali-temp-chart", "california-population-chart", "double_header_table"]

archive_name = "archive"
#set to true to query watson
re_query_watson = False



In [4]:
#responses_dict = {}
if re_query_watson:
    print("This shouldn't happen")
    compare_comply = init_watson_table_api()
    for file_n in file_names:
        with open(f"{base_example_path}{file_n}.pdf", 'rb') as base_file:
            try:
                result = compare_comply.extract_tables(base_file).get_result()
                responses_dict[file_n] = result
            except ApiException as ex:
                print ("Method failed with status code " + str(ex.code) + ": " + ex.message)
     #archive file
    with open(f"{base_example_path}{archive_name}.json", 'w') as archive_file:
        json.dump(responses_dict, archive_file)
else:
    with open(f"{base_example_path}{archive_name}.json", 'r') as archive_file:
        responses_dict = json.load(archive_file)

In [5]:
#select a response and process it, printing the outputs
response = responses_dict["double_header_table"]

#most basic work flow for processing tables
dfs_dict = tp.watson_tables_parse_response(response)
table = tp.make_table(dfs_dict)
table

Unnamed: 0_level_0,Nine months ended setptember 30,Nine months ended setptember 30,Three months ended setptember 30,Three months ended setptember 30
Unnamed: 0_level_1,2004,2005,2004,2005
Dividends received,4.7%,15.4%,3.3%,13.2%
IRS audit settlement,15.2%,58%,35.5%,97%
Statatory tax rate,38%,37%,36%,35%
Total tax rate,15.1%,38.8%,4.3%,76.1%


# Break down process of reconstructing table: 

The raw JSON output is omitted for brevity

The first stage we see is the pandas-ified deconstructed table. This gives a good idea of the schema of data we are dealing with. It can be reached by running the ``` tp.watson_tables_parse_response(...)``` command. 
It contains information about the row headers, column headers and body cells and their contents


Next we have the value-attribute correlated table. This phase correlates each value wits respective row and header cells using information about cell ID' row and column numbers and other information from the original table. 

In [17]:
#select a response and process it, printing the outputs
response = responses_dict["double_header_table"]



print("displaying row headers:")
display(dfs_dict['row_headers'])
print("displaying column headers:")
display(dfs_dict['col_headers'])
print("displaying objects:")
display(dfs_dict['body_cells'].head(30))

displaying row headers:


None

displaying column headers:


Unnamed: 0,text,column_index_begin,column_index_end,row_index_begin,row_index_end,cell_id,text_normalized
0,Reporting Country/ Territory/Area,0,0,0,0,colHeader-426382-426416,Reporting Country/ Territory/Area
1,Total confirmed,1,1,0,0,colHeader-426693-426709,Total confirmed
2,Total confirmed,2,2,0,0,colHeader-426987-427003,Total confirmed
3,Total deaths,3,3,0,0,colHeader-427282-427295,Total deaths
4,Total new deaths,4,4,0,0,colHeader-427573-427590,Total new deaths
5,Transmission,5,5,0,0,colHeader-427868-427881,Transmission
6,Days since last,6,6,0,0,colHeader-428159-428175,Days since last
7,,0,0,1,1,colHeader-428261-428262,
8,cases,1,1,1,1,colHeader-428522-428528,cases
9,new cases,2,2,1,1,colHeader-428807-428817,new cases


displaying objects:


Unnamed: 0,text,column_index_begin,column_index_end,row_index_begin,row_index_end,cell_id,column_header_ids,column_header_texts,row_header_ids,row_header_texts
0,Australia,0,0,2,2,bodyCell-429950-429960,"[colHeader-426382-426416, colHeader-428261-428...","[Reporting Country/ Territory/Area, ]",[],[]
1,9 980,1,1,2,2,bodyCell-430204-430211,"[colHeader-426693-426709, colHeader-428522-428...","[Total confirmed, cases]",[],[]
2,183,2,2,2,2,bodyCell-430454-430459,"[colHeader-426987-427003, colHeader-428807-428...","[Total confirmed, new cases]",[],[]
3,108,3,3,2,2,bodyCell-430702-430707,"[colHeader-427282-427295, colHeader-428894-428...","[Total deaths, ]",[],[]
4,0,4,4,2,2,bodyCell-430949-430952,"[colHeader-427573-427590, colHeader-428954-428...","[Total new deaths, ]",[],[]
5,Clusters of cases,5,5,2,2,bodyCell-431194-431212,"[colHeader-427868-427881, colHeader-429214-429...","[Transmission, classification i]",[],[]
6,0,6,6,2,2,bodyCell-431453-431455,"[colHeader-428159-428175, colHeader-429684-429...","[Days since last, reported case]",[],[]
7,Malaysia,0,0,3,3,bodyCell-431708-431717,"[colHeader-426382-426416, colHeader-428261-428...","[Reporting Country/ Territory/Area, ]",[],[]
8,8 725,1,1,3,3,bodyCell-431962-431969,"[colHeader-426693-426709, colHeader-428522-428...","[Total confirmed, cases]",[],[]
9,7,2,2,3,3,bodyCell-432212-432215,"[colHeader-426987-427003, colHeader-428807-428...","[Total confirmed, new cases]",[],[]


In [18]:
table_exploded, row_headers, col_headers = tp.make_exploded_df(dfs_dict)
display(table_exploded)

Unnamed: 0,text,row_index,column_header_texts_0,column_header_texts_1
0,Australia,2,Reporting Country/ Territory/Area,
1,9 980,2,Total confirmed,cases
2,183,2,Total confirmed,new cases
3,108,2,Total deaths,
4,0,2,Total new deaths,
...,...,...,...,...
128,196 775,20,Total confirmed,new cases
129,570 288,20,Total deaths,
130,3 634,20,Total new deaths,
131,,20,Transmission,classification i


In [19]:
dfs_dict = tp.watson_tables_parse_response(response)
table = tp.make_table(dfs_dict)
display(table)
display(HTML(response["document"]["html"]))




Unnamed: 0_level_0,Nine months ended setptember 30,Nine months ended setptember 30,Three months ended setptember 30,Three months ended setptember 30
Unnamed: 0_level_1,2004,2005,2004,2005
Dividends received,4.7%,15.4%,3.3%,13.2%
IRS audit settlement,15.2%,58%,35.5%,97%
Statatory tax rate,38%,37%,36%,35%
Total tax rate,15.1%,38.8%,4.3%,76.1%


0,1,2,3,4
,Three months ended setptember 30,Three months ended setptember 30,Nine months ended setptember 30,Nine months ended setptember 30
,2005,2004,2005,2004
Statatory tax rate,35%,36%,37%,38%
IRS audit settlement,97%,35.5%,58%,15.2%
Dividends received,13.2%,3.3%,15.4%,4.7%
Total tax rate,76.1%,4.3%,38.8%,15.1%


Once the data is in the pandas dataframe format it is extremely easy to manipulate: 

In [22]:
#change names of heading feilds
table= table.rename_axis(columns=["period","year"])
#display whole table
display(table)
#select just one set time period
display(table.loc[:,"Nine months ended setptember 30"])
# One specific type 
display(table[("Nine months ended setptember 30","2005")])
# Display data from just one year
display(table.swaplevel(axis=1)["2004"])
#select one specific field
display(table.loc["Dividends received",:])

period,Nine months ended setptember 30,Nine months ended setptember 30,Three months ended setptember 30,Three months ended setptember 30
year,2004,2005,2004,2005
Dividends received,4.7%,15.4%,3.3%,13.2%
IRS audit settlement,15.2%,58%,35.5%,97%
Statatory tax rate,38%,37%,36%,35%
Total tax rate,15.1%,38.8%,4.3%,76.1%


year,2004,2005
Dividends received,4.7%,15.4%
IRS audit settlement,15.2%,58%
Statatory tax rate,38%,37%
Total tax rate,15.1%,38.8%


Dividends received      15.4%
IRS audit settlement      58%
Statatory tax rate        37%
Total tax rate          38.8%
Name: (Nine months ended setptember 30, 2005), dtype: object

period,Nine months ended setptember 30,Three months ended setptember 30
Dividends received,4.7%,3.3%
IRS audit settlement,15.2%,35.5%
Statatory tax rate,38%,36%
Total tax rate,15.1%,4.3%


period                            year
Nine months ended setptember 30   2004     4.7%
                                  2005    15.4%
Three months ended setptember 30  2004     3.3%
                                  2005    13.2%
Name: Dividends received, dtype: object

## Application example: extracting data from 

In [11]:
response = responses_dict["who_covid_report_table"]
num_tabs = len(response.get("tables", []))

df = pd.DataFrame()

for i in range (num_tabs):
    dfs_dict = tp.watson_tables_parse_response(response, table_number=i)
    table = tp.make_table(dfs_dict,concat_with='')
    #display(table)
    df = df.append(table, ignore_index = True)
    

In [12]:
rows_to_keep = [r.to_list().count('') == 0 for _,r in df.iterrows()] 
df = df[rows_to_keep]
df.head(100)


Unnamed: 0_level_0,Days since last,Reporting Country/ Territory/Area,Total confirmed,Total confirmed,Total deaths,Total new deaths,Transmission
Unnamed: 0_level_1,reported case,Unnamed: 2_level_1,cases,new cases,Unnamed: 5_level_1,Unnamed: 6_level_1,classification i
1,0,South Africa,287 796,11 554,4 172,93,Community transmission
2,0,Nigeria,33153,595,744,4,Community transmission
3,0,Ghana,24988,470,139,0,Community transmission
4,0,Algeria,19689,494,1 018,7,Community transmission
5,1,Cameroon,15173,0,359,0,Community transmission
...,...,...,...,...,...,...,...
99,0,Curaçao,26,1,1,0,Sporadic cases
100,79,Falkland Islands (Malvinas),13,0,0,0,No cases
101,3,Montserrat,12,0,1,0,No cases
102,59,British Virgin Islands,8,0,1,0,No cases
