In [1]:
# INITIALIZATION BOILERPLATE

# The Jupyter kernel for this notebook usually starts up inside the notebooks
# directory, but the text_extensions_for_pandas package code is in the parent
# directory. Add that parent directory to the front of the Python include path.
import sys
# Prepend the parent directory instead of overwriting the first entry, so that
# whatever was previously first on the path stays importable. Comparing the
# one-element slice also copes with an empty path list, and keeps the cell
# idempotent when it is re-run.
if sys.path[:1] != [".."]:
    sys.path.insert(0, "..")

import json
import os
from ibm_watson import CompareComplyV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson import ApiException

import pandas as pd 
import text_extensions_for_pandas as tp
from IPython.core.display import HTML


## Initialize data and pre-process through the Watson Compare & Comply API

Run some initialization code, then either process the tables through the Watson API or, for ease of use, load a cached version of the responses.

In [2]:
def init_watson_table_api():
    """Create a Watson Compare & Comply client configured from the environment.

    The API key is read from the environment variable ``IBM_API_KEY_TABLES``
    and the service URL from ``IBM_SERVICE_URL_TABLES``.

    Returns:
        A ``CompareComplyV1`` instance that uses an ``IAMAuthenticator`` built
        from the API key and has its service URL set.

    Raises:
        ValueError: if either environment variable is not set.
    """
    # Retrieve the API key for authentication
    apikey = os.environ.get("IBM_API_KEY_TABLES")
    if apikey is None:
        raise ValueError("Expected apikey in the environment variable 'IBM_API_KEY_TABLES'")

    # Get the service URL for your IBM Cloud instance
    ibm_cloud_service_url = os.environ.get("IBM_SERVICE_URL_TABLES")
    if ibm_cloud_service_url is None:
        raise ValueError("Expected IBM cloud service URL in the environment variable 'IBM_SERVICE_URL_TABLES'")

    # Initialize the authenticator
    authenticator = IAMAuthenticator(apikey)
    # NOTE(review): version dates are usually written zero-padded (YYYY-MM-DD);
    # confirm that the service accepts '2020-1-1' before changing it.
    compare_comply = CompareComplyV1(
        version='2020-1-1',
        authenticator=authenticator
    )

    compare_comply.set_service_url(ibm_cloud_service_url)
    return compare_comply

In [3]:
# Directory prefix, relative to this notebook, under which the example PDFs
# and the JSON archive of responses are looked up.
base_example_path = "../resources/tables/"

# Base names (without extension) of the example tables to process.
file_names = [
    "20-populous-countries",
    "cali-temp-chart",
    "california-population-chart",
    "double_header_table",
    "who_covid_report_table",
]

# Base name (without extension) of the JSON archive of responses.
archive_name = "archive"

# Set to True to query Watson; leave False to load the archived responses.
re_query_watson = False



In [4]:
# Load the Watson responses, either by querying the cloud service or from the
# JSON archive on disk. Either way, `responses_dict` ends up mapping each file
# name to its response.
if re_query_watson:
    # NOTE(review): this looks like a leftover debug message; the branch is
    # reached whenever re_query_watson is set to True.
    print("This shouldn't happen")
    compare_comply = init_watson_table_api()
    # Start from an empty mapping so this branch works on a fresh kernel and
    # never mixes in stale entries from an earlier run.
    responses_dict = {}
    for file_n in file_names:
        with open(f"{base_example_path}{file_n}.pdf", 'rb') as base_file:
            try:
                result = compare_comply.extract_tables(base_file).get_result()
                responses_dict[file_n] = result
            except ApiException as ex:
                # Report the failure and carry on with the remaining files.
                # The f-string formats ex.message even when it is not a str.
                print(f"Method failed with status code {ex.code}: {ex.message}")
    # Archive the collected responses so later runs can skip the query.
    with open(f"{base_example_path}{archive_name}.json", 'w') as archive_file:
        json.dump(responses_dict, archive_file)
else:
    with open(f"{base_example_path}{archive_name}.json", 'r') as archive_file:
        responses_dict = json.load(archive_file)

## Now demo table extraction on a number of different tables.
To show how this tool works in a variety of circumstances, we demo it here on a number of different tables.

For each table we first display the exploded DataFrame and the reconstructed table, then, for comparison, the HTML that is created as an intermediate step of the table extraction.

In [9]:
# For every example table: parse the stored response, show the exploded
# DataFrame and the reconstructed table, then render the raw HTML.
for name in file_names:
    print(f"\n\n Displaying table {name}:")
    print("\n Displaying Dataframe: \n")

    watson_response = responses_dict[name]
    parsed = tp.watson_tables_parse_response(watson_response)

    # Only the first element of make_exploded_df's result is displayed.
    exploded_df = tp.make_exploded_df(parsed)[0]
    display(exploded_df)

    # Build the table only after the exploded frame has been shown, so that
    # any messages emitted while building it appear between the two displays.
    reconstructed_table = tp.make_table(parsed, concat_with='')
    display(reconstructed_table)

    print("\n\n\n HTML: \n\n")
    raw_html = tp.get_raw_html(watson_response, parsed)
    display(HTML(raw_html))



 Displaying table 20-populous-countries:

 Displaying Dataframe: 



Unnamed: 0,text,attributes.type,row_index,column_header_texts_0
0,1,[Number],1,Rank
1,China [b],[Location],1,Country (or\ndependent\nterritory)
2,1403627360,[Number],1,Population
3,18.0%,[Percentage],1,% of worldpopulation
4,21 Jul 2020,[DateTime],1,Date
...,...,...,...,...
121,World,[],21,Country (or\ndependent\nterritory)
122,7800767000,[Number],21,Population
123,100%,[Percentage],21,% of worldpopulation
124,21 Jul 2020,[DateTime],21,Date


ERROR READING VALUE:""	 Filling with <NA>


Unnamed: 0,Rank,Country (or\ndependent\nterritory),Population,% of worldpopulation,Date,Source
1,1.0,China [b],1403627000.0,18.0,21 Jul 2020,National populationclock [3]
2,2.0,India [c],1364965000.0,17.5,21 Jul 2020,National populationclock [4]
3,3.0,United States [d],329991300.0,4.23,21 Jul 2020,National population\nclock [5]
4,4.0,Indonesia,269603400.0,3.46,1 Jul 2020,National annualprojection [6]
5,5.0,Pakistan [e],220892300.0,2.83,1 Jul 2020,UN Projection [2]
6,6.0,Brazil,211822100.0,2.72,21 Jul 2020,National populationclock [7]
7,7.0,Nigeria,206139600.0,2.64,1 Jul 2020,UN Projection [2]
8,8.0,Bangladesh,168990800.0,2.17,21 Jul 2020,National populationclock [8]
9,9.0,Russia [f],146748600.0,1.88,1 Jan 2020,National estimate [9]
10,10.0,Mexico,127792300.0,1.64,1 Jul 2020,National annualprojection [10]





 HTML: 




0,1,2,3,4,5
Rank,Country (or dependent territory),Population,% of worldpopulation,Date,Source
1,China[b],1403627360,18.0%,21 Jul 2020,National populationclock[3]
2,India[c],1364965498,17.5%,21 Jul 2020,National populationclock[4]
3,United States[d],329991308,4.23%,21 Jul 2020,National population clock[5]
4,Indonesia,269603400,3.46%,1 Jul 2020,National annualprojection[6]
5,Pakistan[e],220892331,2.83%,1 Jul 2020,UN Projection[2]
6,Brazil,211822143,2.72%,21 Jul 2020,National populationclock[7]
7,Nigeria,206139587,2.64%,1 Jul 2020,UN Projection[2]
8,Bangladesh,168990780,2.17%,21 Jul 2020,National populationclock[8]
9,Russia[f],146748590,1.88%,1 Jan 2020,National estimate[9]




 Displaying table cali-temp-chart:

 Displaying Dataframe: 



Unnamed: 0,text,attributes.type,row_header_texts_0,column_header_texts_0
0,83/64,,Los Angeles,August(°F)
1,29/18,,Los Angeles,August\n(°C)
2,66/48,,Los Angeles,January\n(°F)
3,20/8,,Los Angeles,January\n(°C)
4,377/15,,Los Angeles,Annual\nPrecipitation\n(mm/in)
...,...,...,...,...
60,77/45,,Mammoth Lakes,August(°F)
61,25/7,,Mammoth Lakes,August\n(°C)
62,40/15,,Mammoth Lakes,January\n(°F)
63,4/ −9,,Mammoth Lakes,January\n(°C)


Unnamed: 0,August(°F),August\n(°C),January\n(°F),January\n(°C),Annual\nPrecipitation\n(mm/in)
Los Angeles,83/64,29/18,66/48,20/8,377/15
LAX/LA Beaches,75/64,23/18,65/49,18/9,326/13
San Diego,76/67,24/19,65/49,18/9,262/10
San Jose,82/58,27/14,58/42,14/5,401/16
San Francisco,67/54,20/12,56/46,14/8,538/21
Fresno,97/66,34/19,55/38,12/3,292/11
Sacramento,91/58,33/14,54/39,12/3,469/18
Oakland,73/58,23/14,58/44,14/7,588/23
Bakersfield,96/69,36/21,56/39,13/3,165/7
Riverside,94/60,35/18,67/39,19/4,260/10





 HTML: 




0,1,2,3,4,5
Location,August(°F),August (°C),January (°F),January (°C),Annual Precipitation (mm/in)
Los Angeles,83/64,29/18,66/48,20/8,377/15
LAX/LA Beaches,75/64,23/18,65/49,18/9,326/13
San Diego,76/67,24/19,65/49,18/9,262/10
San Jose,82/58,27/14,58/42,14/5,401/16
San Francisco,67/54,20/12,56/46,14/8,538/21
Fresno,97/66,34/19,55/38,12/3,292/11
Sacramento,91/58,33/14,54/39,12/3,469/18
Oakland,73/58,23/14,58/44,14/7,588/23
Bakersfield,96/69,36/21,56/39,13/3,165/7




 Displaying table california-population-chart:

 Displaying Dataframe: 



Unnamed: 0,text,attributes.type,row_index,column_header_texts_0
0,1850,[Number],1,Census
1,92597,[Number],1,Pop.
2,-,[],1,%±
3,1860,[Number],2,Census
4,379994,[Number],2,Pop.
5,310.4%,[Percentage],2,%±
6,1870,[Number],3,Census
7,560247,[Number],3,Pop.
8,47.4%,[Percentage],3,%±
9,1880,[Number],4,Census


ERROR READING VALUE:"-"	 Filling with <NA>


Unnamed: 0,Census,Pop.,%±
1,1850.0,92597.0,
2,1860.0,379994.0,310.4
3,1870.0,560247.0,47.4
4,1880.0,864694.0,54.3
5,1890.0,1213398.0,40.3
6,1900.0,1485053.0,22.4
7,1910.0,2377549.0,60.1
8,1920.0,3426861.0,44.1
9,1930.0,5677251.0,65.7
10,1940.0,6907387.0,21.7





 HTML: 




0,1,2
Census,Pop.,%±
1850,92597,-
1860,379994,310.4%
1870,560247,47.4%
1880,864694,54.3%
1890,1213398,40.3%
1900,1485053,22.4%
1910,2377549,60.1%
1920,3426861,44.1%
1930,5677251,65.7%




 Displaying table double_header_table:

 Displaying Dataframe: 



Unnamed: 0,text,attributes.type,row_header_texts_0,column_header_texts_0,column_header_texts_1
0,35%,[Percentage],Statatory tax rate,Three months ended setptember 30,2005
1,36%,[Percentage],Statatory tax rate,Three months ended setptember 30,2004
2,37%,[Percentage],Statatory tax rate,Nine months ended setptember 30,2005
3,38%,[Percentage],Statatory tax rate,Nine months ended setptember 30,2004
4,97%,[Percentage],IRS audit settlement,Three months ended setptember 30,2005
5,35.5%,[Percentage],IRS audit settlement,Three months ended setptember 30,2004
6,58%,[Percentage],IRS audit settlement,Nine months ended setptember 30,2005
7,15.2%,[Percentage],IRS audit settlement,Nine months ended setptember 30,2004
8,13.2%,[Percentage],Dividends received,Three months ended setptember 30,2005
9,3.3%,[Percentage],Dividends received,Three months ended setptember 30,2004


Unnamed: 0_level_0,Nine months ended setptember 30,Nine months ended setptember 30,Three months ended setptember 30,Three months ended setptember 30
Unnamed: 0_level_1,2004,2005,2004,2005
Statatory tax rate,38.0,37.0,36.0,35.0
IRS audit settlement,15.2,58.0,35.5,97.0
Dividends received,4.7,15.4,3.3,13.2
Total tax rate,15.1,38.8,4.3,76.1





 HTML: 




0,1,2,3,4
,Three months ended setptember 30,Three months ended setptember 30,Nine months ended setptember 30,Nine months ended setptember 30
,2005,2004,2005,2004
Statatory tax rate,35%,36%,37%,38%
IRS audit settlement,97%,35.5%,58%,15.2%
Dividends received,13.2%,3.3%,15.4%,4.7%
Total tax rate,76.1%,4.3%,38.8%,15.1%




 Displaying table who_covid_report_table:

 Displaying Dataframe: 



Unnamed: 0,text,attributes.type,row_index,column_header_texts_0,column_header_texts_1
0,Africa,[Location],2,Reporting Country/ Territory/Area,
1,,[],2,Total confirmed,cases
2,,[],2,Total confirmed,new cases
3,,[],2,Total deaths,
4,,[],2,Total new deaths,
...,...,...,...,...,...
169,60,[Number],23,Total confirmed,new cases
170,26,[Number],23,Total deaths,
171,1,[Number],23,Total new deaths,
172,Community transmission,[],23,Transmission,classification i


ERROR READING VALUE:""	 Filling with <NA>
ERROR READING VALUE:""	 Filling with <NA>
ERROR READING VALUE:""	 Filling with <NA>
ERROR READING VALUE:""	 Filling with <NA>


Unnamed: 0_level_0,Days since last,Reporting Country/ Territory/Area,Total confirmed,Total confirmed,Total deaths,Total new deaths,Transmission
Unnamed: 0_level_1,reported case,Unnamed: 2_level_1,cases,new cases,Unnamed: 5_level_1,Unnamed: 6_level_1,classification i
2,,Africa,,,,,
3,0.0,South Africa,287 796,11554.0,4172.0,93.0,Community transmission
4,0.0,Nigeria,33153,595.0,744.0,4.0,Community transmission
5,0.0,Ghana,24988,470.0,139.0,0.0,Community transmission
6,0.0,Algeria,19689,494.0,1018.0,7.0,Community transmission
7,1.0,Cameroon,15173,0.0,359.0,0.0,Community transmission
8,0.0,Côƚe d͛Iǀoiƌe,12872,106.0,84.0,0.0,Community transmission
9,0.0,Kenya,10294,189.0,197.0,12.0,Community transmission
10,0.0,Senegal,8198,63.0,150.0,2.0,Community transmission
11,0.0,Democratic Republic of the Congo,8074,42.0,189.0,1.0,Community transmission





 HTML: 




0,1,2,3,4,5,6,7,8
Reporting Country/ Territory/Area,Total confirmed,Total confirmed,Total confirmed,Total confirmed,Total deaths,Total new deaths,Transmission,Days since last
,cases,cases,cases,new cases,,,classificationi,reported case
Africa,,,,,,,,
South Africa,287 796,287 796,287 796,11 554,4 172,93,Community transmission,0
Nigeria,33,33,153,595,744,4,Community transmission,0
Ghana,24,24,988,470,139,0,Community transmission,0
Algeria,19,19,689,494,1 018,7,Community transmission,0
Cameroon,15,15,173,0,359,0,Community transmission,1
Côƚe d͛Iǀoiƌe,12,12,872,106,84,0,Community transmission,0
Kenya,10,10,294,189,197,12,Community transmission,0
