# Tables demo/ workbook 



In [1]:
# INITIALIZATION BOILERPLATE

# The Jupyter kernel for this notebook usually starts up inside the notebooks
# directory, but the text_extensions_for_pandas package code is in the parent
# directory. Add that parent directory to the front of the Python include path.
import sys
if (sys.path[0] != ".."):
    sys.path[0] = ".."

import json
import os
from ibm_watson import CompareComplyV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson import ApiException

import pandas as pd 
import text_extensions_for_pandas as tp
from IPython.core.display import HTML



In [2]:
def init_watson_table_api():
    # Retrieve the APIKEY for authentication
    apikey = os.environ.get("IBM_API_KEY_TABLES")
    if apikey is None:
        raise ValueError("Expected apikey in the environment variable 'IBM_API_KEY'")

    # Get the service URL for your IBM Cloud instance
    ibm_cloud_service_url = os.environ.get("IBM_SERVICE_URL_TABLES")
    if ibm_cloud_service_url is None:
        raise ValueError("Expected IBM cloud service URL in the environment variable 'IBM_SERVICE_URL'")

        #initialize the authenticator
    authenticator = IAMAuthenticator(apikey)
    compare_comply = CompareComplyV1(
        version = '2020-1-1',
        authenticator=authenticator
    )

    compare_comply.set_service_url(ibm_cloud_service_url)
    return compare_comply

In [3]:
base_example_path = "../resources/tables/"
file_names = [ "who_covid_report_table", "20-populous-countries", "cali-temp-chart", "california-population-chart", "double_header_table"]

archive_name = "archive"
#set to true to query watson
re_query_watson = False



In [4]:
#responses_dict = {}
if re_query_watson:
    print("This shouldn't happen")
    compare_comply = init_watson_table_api()
    for file_n in file_names:
        with open(f"{base_example_path}{file_n}.pdf", 'rb') as base_file:
            try:
                result = compare_comply.extract_tables(base_file).get_result()
                responses_dict[file_n] = result
            except ApiException as ex:
                print ("Method failed with status code " + str(ex.code) + ": " + ex.message)
     #archive file
    with open(f"{base_example_path}{archive_name}.json", 'w') as archive_file:
        json.dump(responses_dict, archive_file)
else:
    with open(f"{base_example_path}{archive_name}.json", 'r') as archive_file:
        responses_dict = json.load(archive_file)

In [5]:
#select a response and process it, printing the outputs
response = responses_dict["20-populous-countries"]

#most basic work flow for processing tables
dfs_dict = tp.watson_tables_parse_response(response)
table = tp.make_table(dfs_dict)
table

Unnamed: 0,% of worldpopulation,Country (or\ndependent\nterritory),Date,Population,Rank,Source
1,18.0%,China [b],21 Jul 2020,1403627360,1.0,National populationclock [3]
2,17.5%,India [c],21 Jul 2020,1364965498,2.0,National populationclock [4]
3,4.23%,United States [d],21 Jul 2020,329991308,3.0,National population\nclock [5]
4,3.46%,Indonesia,1 Jul 2020,269603400,4.0,National annualprojection [6]
5,2.83%,Pakistan [e],1 Jul 2020,220892331,5.0,UN Projection [2]
6,2.72%,Brazil,21 Jul 2020,211822143,6.0,National populationclock [7]
7,2.64%,Nigeria,1 Jul 2020,206139587,7.0,UN Projection [2]
8,2.17%,Bangladesh,21 Jul 2020,168990780,8.0,National populationclock [8]
9,1.88%,Russia [f],1 Jan 2020,146748590,9.0,National estimate [9]
10,1.64%,Mexico,1 Jul 2020,127792286,10.0,National annualprojection [10]


# Break down process of reconstructing table: 

The raw JSON output is omitted for brevity

The first stage we see is the pandas-ified deconstructed table. This gives a good idea of the schema of data we are dealing with. It can be reached by running the ``` tp.watson_tables_parse_response(...)``` command. 
It contains information about the row headers, column headers and body cells and their contents


Next we have the value-attribute correlated table. This phase correlates each value wits respective row and header cells using information about cell ID' row and column numbers and other information from the original table. 

In [6]:
#select a response and process it, printing the outputs
response = responses_dict["double_header_table"]



print("displaying row headers:")
display(dfs_dict['row_headers'])
print("displaying column headers:")
display(dfs_dict['col_headers'])
print("displaying objects:")
display(dfs_dict['body_cells'].head(30))

displaying row headers:


None

displaying column headers:


Unnamed: 0,text,column_index_begin,column_index_end,row_index_begin,row_index_end,cell_id,text_normalized
0,Rank,0,0,0,0,colHeader-1611-1616,Rank
1,Country (or\ndependent\nterritory),1,1,0,0,colHeader-1859-2250,Country (or\ndependent\nterritory)
2,Population,2,2,0,0,colHeader-2504-2515,Population
3,% of worldpopulation,3,3,0,0,colHeader-2758-2779,% of worldpopulation
4,Date,4,4,0,0,colHeader-3034-3039,Date
5,Source,5,5,0,0,colHeader-3284-3291,Source


displaying objects:


Unnamed: 0,text,column_index_begin,column_index_end,row_index_begin,row_index_end,cell_id,column_header_ids,column_header_texts,row_header_ids,row_header_texts
0,1,0,0,1,1,bodyCell-3552-3554,[colHeader-1611-1616],[Rank],[],[]
1,China [b],1,1,1,1,bodyCell-3799-3975,[colHeader-1859-2250],[Country (or\ndependent\nterritory)],[],[]
2,1403627360,2,2,1,1,bodyCell-4226-4240,[colHeader-2504-2515],[Population],[],[]
3,18.0%,3,3,1,1,bodyCell-4483-4489,[colHeader-2758-2779],[% of worldpopulation],[],[]
4,21 Jul 2020,4,4,1,1,bodyCell-4734-4746,[colHeader-3034-3039],[Date],[],[]
5,National populationclock [3],5,5,1,1,bodyCell-4991-5186,[colHeader-3284-3291],[Source],[],[]
6,2,0,0,2,2,bodyCell-5452-5454,[colHeader-1611-1616],[Rank],[],[]
7,India [c],1,1,2,2,bodyCell-5700-5875,[colHeader-1859-2250],[Country (or\ndependent\nterritory)],[],[]
8,1364965498,2,2,2,2,bodyCell-6127-6141,[colHeader-2504-2515],[Population],[],[]
9,17.5%,3,3,2,2,bodyCell-6385-6391,[colHeader-2758-2779],[% of worldpopulation],[],[]


In [8]:
table_exploded, row_headers, col_headers = tp.make_exploded_df(dfs_dict)
display(table_exploded)

Unnamed: 0,text,row_index,column_header_texts_0
0,1,1,Rank
1,China [b],1,Country (or\ndependent\nterritory)
2,1403627360,1,Population
3,18.0%,1,% of worldpopulation
4,21 Jul 2020,1,Date
...,...,...,...
121,World,21,Country (or\ndependent\nterritory)
122,7800767000,21,Population
123,100%,21,% of worldpopulation
124,21 Jul 2020,21,Date


In [9]:
dfs_dict = tp.watson_tables_parse_response(response)
table = tp.make_table(dfs_dict)
display(table)
display(HTML(response["document"]["html"]))


Unnamed: 0_level_0,Nine months ended setptember 30,Nine months ended setptember 30,Three months ended setptember 30,Three months ended setptember 30
Unnamed: 0_level_1,2004,2005,2004,2005
Dividends received,4.7%,15.4%,3.3%,13.2%
IRS audit settlement,15.2%,58%,35.5%,97%
Statatory tax rate,38%,37%,36%,35%
Total tax rate,15.1%,38.8%,4.3%,76.1%


0,1,2,3,4
,Three months ended setptember 30,Three months ended setptember 30,Nine months ended setptember 30,Nine months ended setptember 30
,2005,2004,2005,2004
Statatory tax rate,35%,36%,37%,38%
IRS audit settlement,97%,35.5%,58%,15.2%
Dividends received,13.2%,3.3%,15.4%,4.7%
Total tax rate,76.1%,4.3%,38.8%,15.1%


Once the data is in the pandas dataframe format it is extremely easy to manipulate: 

In [10]:
# TODO: make some nice examples of data manipulation for here 

## Application example: extracting data from WHO covid status report table

In [11]:
response = responses_dict["who_covid_report_table"]
num_tabs = len(response.get("tables", []))

df = pd.DataFrame()

for i in range (num_tabs):
    dfs_dict = tp.watson_tables_parse_response(response, table_number=i)
    table = tp.make_table(dfs_dict,concat_with='')
    #display(table)
    df = df.append(table, ignore_index = True)
    

In [12]:
rows_to_keep = [r.to_list().count('') == 0 for _,r in df.iterrows()] 
df = df[rows_to_keep]
df.head(100)


Unnamed: 0_level_0,Days since last,Reporting Country/ Territory/Area,Total confirmed,Total confirmed,Total deaths,Total new deaths,Transmission
Unnamed: 0_level_1,reported case,Unnamed: 2_level_1,cases,new cases,Unnamed: 5_level_1,Unnamed: 6_level_1,classification i
1,0,South Africa,287 796,11 554,4 172,93,Community transmission
2,0,Nigeria,33153,595,744,4,Community transmission
3,0,Ghana,24988,470,139,0,Community transmission
4,0,Algeria,19689,494,1 018,7,Community transmission
5,1,Cameroon,15173,0,359,0,Community transmission
...,...,...,...,...,...,...,...
99,0,Curaçao,26,1,1,0,Sporadic cases
100,79,Falkland Islands (Malvinas),13,0,0,0,No cases
101,3,Montserrat,12,0,1,0,No cases
102,59,British Virgin Islands,8,0,1,0,No cases
