In [5]:
# INITIALIZATION BOILERPLATE

# The Jupyter kernel for this notebook usually starts up inside the notebooks
# directory, but the text_extensions_for_pandas package code is in the parent
# directory. Add that parent directory to the front of the Python include path.
# Insert instead of overwriting sys.path[0], so the entry that was previously
# first stays importable. The guard keeps re-runs of this cell from stacking
# duplicate ".." entries at the front, and also tolerates an empty sys.path.
import sys
if sys.path[:1] != [".."]:
    sys.path.insert(0, "..")

import json
import os
from ibm_watson import CompareComplyV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson import ApiException

import pandas as pd 
import text_extensions_for_pandas as tp
from IPython.core.display import HTML



## Initialize the data and pre-process it through the Watson Compare & Comply API

In [2]:
def init_watson_table_api():
    """Create a Watson Compare & Comply client configured from the environment.

    The API key is read from the environment variable ``IBM_API_KEY_TABLES``
    and the service URL from ``IBM_SERVICE_URL_TABLES``.

    Returns:
        A ``CompareComplyV1`` instance built with an ``IAMAuthenticator`` for
        the API key, with its service URL set to the configured value.

    Raises:
        ValueError: If either environment variable is not set. The message
            names the variable that is missing.
    """
    # Retrieve the APIKEY for authentication
    apikey = os.environ.get("IBM_API_KEY_TABLES")
    if apikey is None:
        raise ValueError("Expected apikey in the environment variable 'IBM_API_KEY_TABLES'")

    # Get the service URL for your IBM Cloud instance
    ibm_cloud_service_url = os.environ.get("IBM_SERVICE_URL_TABLES")
    if ibm_cloud_service_url is None:
        raise ValueError("Expected IBM cloud service URL in the environment variable 'IBM_SERVICE_URL_TABLES'")

    # Initialize the authenticator
    authenticator = IAMAuthenticator(apikey)
    # NOTE(review): Watson version strings are usually written as YYYY-MM-DD;
    # confirm that the non-zero-padded '2020-1-1' is accepted by the service.
    compare_comply = CompareComplyV1(
        version = '2020-1-1',
        authenticator=authenticator
    )

    compare_comply.set_service_url(ibm_cloud_service_url)
    return compare_comply

In [7]:
# Directory prefix for the sample PDFs and the archived Watson responses
base_example_path = "../resources/tables/"

# Base names (without extension) of the sample documents used in this demo
file_names = [
    "20-populous-countries",
    "cali-temp-chart",
    "california-population-chart",
    "double_header_table",
    "who_covid_report_table",
]

# Base name of the JSON archive that stores the Watson responses
archive_name = "archive"

# Set to True to query Watson; leave False to load the archived responses
re_query_watson = False



In [14]:
# Load the Watson responses, either by querying the cloud service or by
# reading the local JSON archive.
if re_query_watson:
    print("This shouldn't happen")
    compare_comply = init_watson_table_api()
    # Start from an empty mapping so this branch works on a fresh kernel and
    # never mixes new results with responses left over from an earlier run.
    responses_dict = {}
    for file_n in file_names:
        with open(f"{base_example_path}{file_n}.pdf", 'rb') as base_file:
            try:
                result = compare_comply.extract_tables(base_file).get_result()
                responses_dict[file_n] = result
            except ApiException as ex:
                # Best effort: report the failure and continue with the next
                # file. A file that fails has no entry in responses_dict.
                print ("Method failed with status code " + str(ex.code) + ": " + ex.message)
    # Archive the responses so later runs can skip the cloud query
    with open(f"{base_example_path}{archive_name}.json", 'w') as archive_file:
        json.dump(responses_dict, archive_file)
else:
    with open(f"{base_example_path}{archive_name}.json", 'r') as archive_file:
        responses_dict = json.load(archive_file)

## Now demo table extraction on a number of different tables.
To show how this tool works in a variety of circumstances, we demo it here on a number of different tables.

For each table, we first display the reconstructed DataFrame, then the HTML that is created as an intermediate step of the table extraction, for comparison.

In [19]:
# For every sample document, show the DataFrame rebuilt from the Watson
# response, followed by the HTML stored in that same response.
for name in file_names:
    print(f"\n\n Displaying table {name}:")
    print("\n Displaying Dataframe: \n")

    # Look the response up once and reuse it for both views
    response = responses_dict[name]
    parsed = tp.watson_tables_parse_response(response)
    reconstructed_table = tp.make_table(parsed)
    display(reconstructed_table)

    print("\n\n\n HTML: \n\n")
    document_html = response["document"]["html"]
    display(HTML(document_html))




 Displaying table 20-populous-countries:

 Displaying Dataframe: 



Unnamed: 0,% of worldpopulation,Country (or\ndependent\nterritory),Date,Population,Rank,Source
1,18.0%,China [b],21 Jul 2020,1403627360,1.0,National populationclock [3]
2,17.5%,India [c],21 Jul 2020,1364965498,2.0,National populationclock [4]
3,4.23%,United States [d],21 Jul 2020,329991308,3.0,National population\nclock [5]
4,3.46%,Indonesia,1 Jul 2020,269603400,4.0,National annualprojection [6]
5,2.83%,Pakistan [e],1 Jul 2020,220892331,5.0,UN Projection [2]
6,2.72%,Brazil,21 Jul 2020,211822143,6.0,National populationclock [7]
7,2.64%,Nigeria,1 Jul 2020,206139587,7.0,UN Projection [2]
8,2.17%,Bangladesh,21 Jul 2020,168990780,8.0,National populationclock [8]
9,1.88%,Russia [f],1 Jan 2020,146748590,9.0,National estimate [9]
10,1.64%,Mexico,1 Jul 2020,127792286,10.0,National annualprojection [10]





 HTML: 




0,1,2,3,4,5
Rank,Country (or dependent territory),Population,% of worldpopulation,Date,Source
1,China[b],1403627360,18.0%,21 Jul 2020,National populationclock[3]
2,India[c],1364965498,17.5%,21 Jul 2020,National populationclock[4]
3,United States[d],329991308,4.23%,21 Jul 2020,National population clock[5]
4,Indonesia,269603400,3.46%,1 Jul 2020,National annualprojection[6]
5,Pakistan[e],220892331,2.83%,1 Jul 2020,UN Projection[2]
6,Brazil,211822143,2.72%,21 Jul 2020,National populationclock[7]
7,Nigeria,206139587,2.64%,1 Jul 2020,UN Projection[2]
8,Bangladesh,168990780,2.17%,21 Jul 2020,National populationclock[8]
9,Russia[f],146748590,1.88%,1 Jan 2020,National estimate[9]




 Displaying table cali-temp-chart:

 Displaying Dataframe: 



Unnamed: 0,Annual\nPrecipitation\n(mm/in),August\n(°C),August(°F),January\n(°C),January\n(°F)
Bakersfield,165/7,36/21,96/69,13/3,56/39
Death Valley,53/2,45/29,113/84,18/3,64/37
Eureka,960/38,16/11,62/53,12/5,54/41
Fresno,292/11,34/19,97/66,12/3,55/38
LAX/LA Beaches,326/13,23/18,75/64,18/9,65/49
Los Angeles,377/15,29/18,83/64,20/8,66/48
Mammoth Lakes,583/23,25/7,77/45,4/ −9,40/15
Oakland,588/23,23/14,73/58,14/7,58/44
Riverside,260/10,35/18,94/60,19/4,67/39
Sacramento,469/18,33/14,91/58,12/3,54/39





 HTML: 




0,1,2,3,4,5
Location,August(°F),August (°C),January (°F),January (°C),Annual Precipitation (mm/in)
Los Angeles,83/64,29/18,66/48,20/8,377/15
LAX/LA Beaches,75/64,23/18,65/49,18/9,326/13
San Diego,76/67,24/19,65/49,18/9,262/10
San Jose,82/58,27/14,58/42,14/5,401/16
San Francisco,67/54,20/12,56/46,14/8,538/21
Fresno,97/66,34/19,55/38,12/3,292/11
Sacramento,91/58,33/14,54/39,12/3,469/18
Oakland,73/58,23/14,58/44,14/7,588/23
Bakersfield,96/69,36/21,56/39,13/3,165/7




 Displaying table california-population-chart:

 Displaying Dataframe: 



Unnamed: 0,%±,Census,Pop.
1,-,1850,92597
2,310.4%,1860,379994
3,47.4%,1870,560247
4,54.3%,1880,864694
5,40.3%,1890,1213398
6,22.4%,1900,1485053
7,60.1%,1910,2377549
8,44.1%,1920,3426861
9,65.7%,1930,5677251
10,21.7%,1940,6907387





 HTML: 




0,1,2
Census,Pop.,%±
1850,92597,-
1860,379994,310.4%
1870,560247,47.4%
1880,864694,54.3%
1890,1213398,40.3%
1900,1485053,22.4%
1910,2377549,60.1%
1920,3426861,44.1%
1930,5677251,65.7%




 Displaying table double_header_table:

 Displaying Dataframe: 



Unnamed: 0_level_0,Nine months ended setptember 30,Nine months ended setptember 30,Three months ended setptember 30,Three months ended setptember 30
Unnamed: 0_level_1,2004,2005,2004,2005
Dividends received,4.7%,15.4%,3.3%,13.2%
IRS audit settlement,15.2%,58%,35.5%,97%
Statatory tax rate,38%,37%,36%,35%
Total tax rate,15.1%,38.8%,4.3%,76.1%





 HTML: 




0,1,2,3,4
,Three months ended setptember 30,Three months ended setptember 30,Nine months ended setptember 30,Nine months ended setptember 30
,2005,2004,2005,2004
Statatory tax rate,35%,36%,37%,38%
IRS audit settlement,97%,35.5%,58%,15.2%
Dividends received,13.2%,3.3%,15.4%,4.7%
Total tax rate,76.1%,4.3%,38.8%,15.1%




 Displaying table who_covid_report_table:

 Displaying Dataframe: 



Unnamed: 0_level_0,Days since last,Reporting Country/ Territory/Area,Total confirmed,Total confirmed,Total deaths,Total new deaths,Transmission
Unnamed: 0_level_1,reported case,Unnamed: 2_level_1,cases,new cases,Unnamed: 5_level_1,Unnamed: 6_level_1,classification i
2,,Africa,,,,,
3,0.0,South Africa,287 796,11 554,4 172,93.0,Community transmission
4,0.0,Nigeria,33 | 153,595,744,4.0,Community transmission
5,0.0,Ghana,24 | 988,470,139,0.0,Community transmission
6,0.0,Algeria,19 | 689,494,1 018,7.0,Community transmission
7,1.0,Cameroon,15 | 173,0,359,0.0,Community transmission
8,0.0,Côƚe d͛Iǀoiƌe,12 | 872,106,84,0.0,Community transmission
9,0.0,Kenya,10 | 294,189,197,12.0,Community transmission
10,0.0,Senegal,8 | 198,63,150,2.0,Community transmission
11,0.0,Democratic Republic of the Congo,8 | 074,42,189,1.0,Community transmission





 HTML: 




0,1,2,3,4,5,6,7,8
Reporting Country/ Territory/Area,Total confirmed,Total confirmed,Total confirmed,Total confirmed,Total deaths,Total new deaths,Transmission,Days since last
,cases,cases,cases,new cases,,,classificationi,reported case
Africa,,,,,,,,
South Africa,287 796,287 796,287 796,11 554,4 172,93,Community transmission,0
Nigeria,33,33,153,595,744,4,Community transmission,0
Ghana,24,24,988,470,139,0,Community transmission,0
Algeria,19,19,689,494,1 018,7,Community transmission,0
Cameroon,15,15,173,0,359,0,Community transmission,1
Côƚe d͛Iǀoiƌe,12,12,872,106,84,0,Community transmission,0
Kenya,10,10,294,189,197,12,Community transmission,0

0,1,2,3,4,5,6,7
Reporting Country/ Territory/Area,Total confirmed,Total confirmed,Total confirmed,Total deaths,Total new deaths,Transmission,Days since last
,cases,cases,new cases,,,classificationi,reported case
Cabo Verde,1,722,24,19,0,Clusters of cases,0
Sierra Leone,1,642,7,63,0,Community transmission,0
Eswatini,1,389,38,20,0,Clusters of cases,0
Benin,1,378,0,26,0,Community transmission,2
Rwanda,1,378,41,4,0,Sporadic cases,0
Mozambique,1,219,62,9,0,Clusters of cases,0
Niger,1,099,0,68,0,Community transmission,3
Equatorial Guinea,1,043,0,12,0,Community transmission,51

0,1,2,3,4,5,6,7,8,9
Reporting Country/ Territory/Area,Total confirmed,Total confirmed,Total confirmed,Total confirmed,Total deaths,Total deaths,Total new deaths,Transmission,Days since last
,cases,cases,cases,new cases,,,,classificationi,reported case
Eritrea,232,232,232,0,0,0,0,Sporadic cases,4
Seychelles,100,100,100,0,0,0,0,Clusters of cases,4
Gambia,64,64,64,0,3,3,0,Sporadic cases,4
Territoriesii,,,,,,,,,
Mayotte,2 724,2 724,2 724,13,40,40,0,Clusters of cases,0
Réunion,596,596,596,3,3,3,0,Clusters of cases,0
Americas,,,,,,,,,
United States of America,3 286 063,3 286 063,3 286 063,60 113,134 704,134 704,312,Community transmission,0

0,1,2,3,4,5,6
Reporting Country/ Territory/Area,Total confirmed,Total confirmed,Total deaths,Total new deaths,Transmission,Days since last
,cases,new cases,,,classificationi,reported case
Haiti,6 727,37,139,0,Community transmission,0
Paraguay,2 980,160,25,4,Community transmission,0
Cuba,2 428,2,87,0,Clusters of cases,0
Nicaragua,2 411,0,91,0,Community transmission,5
Uruguay,987,1,31,1,Clusters of cases,0
Suriname,762,21,18,0,Clusters of cases,0
Jamaica,758,0,10,0,Clusters of cases,1
Guyana,297,6,17,0,Clusters of cases,0

0,1,2,3,4,5,6,7,8
Reporting Country/ Territory/Area,Total confirmed,Total confirmed,Total confirmed,Total confirmed,Total deaths,Total new deaths,Transmission,Days since last
,cases,cases,cases,new cases,,,classificationi,reported case
Guadeloupe,190,190,190,0,14,0,Clusters of cases,3
Bermuda,150,150,150,0,9,0,Sporadic cases,2
Aruba,105,105,105,0,3,0,Sporadic cases,7
Sint Maarten,78,78,,0,15,0,No cases,10
Turks and Caicos Islands,72,72,,1,2,0,Clusters of cases,0
Saint Martin,44,44,,0,3,0,Sporadic cases,9
Curaçao,26,26,,1,1,0,Sporadic cases,0
Falkland Islands (Malvinas),13,13,,0,0,0,No cases,79

0,1,2,3,4,5,6,7,8,9
Reporting Country/ Territory/Area,Total confirmed,Total confirmed,Total confirmed,Total confirmed,Total deaths,Total deaths,Total new deaths,Transmission,Days since last
,cases,cases,cases,new cases,,,,classificationi,reported case
Afghanistan,34 740,34 740,34 740,289,1 045,1 045,35,Clusters of cases,0
Bahrain,33 476,33 476,33 476,535,109,109,1,Clusters of cases,0
Morocco,15 936,15 936,15 936,191,255,255,5,Clusters of cases,0
Sudan,10 316,10 316,10 316,66,657,657,7,Community transmission,0
Djibouti,4,977,977,5,56,56,0,Clusters of cases,0
Somalia,3,072,072,13,93,93,0,Sporadic cases,0
Lebanon,2,419,419,85,36,36,0,Clusters of cases,0
Libya,1,512,512,79,40,40,1,Clusters of cases,0

0,1,2,3,4,5,6,7,8
Reporting Country/ Territory/Area,Total confirmed,Total confirmed,Total confirmed,Total confirmed,Total deaths,Total new deaths,Transmission,Days since last
,cases,cases,cases,new cases,,,classificationi,reported case
Belgium,62,62,781,74,9 787,5,Community transmission,0
Kazakhstan,61,61,755,1 856,375,0,Clusters of cases,0
Ukraine,54,54,771,638,1 412,14,Community transmission,0
Netherlands,51,51,038,71,6 128,0,Community transmission,0
Portugal,46,46,818,306,1 662,2,Community transmission,0
Israel,39,39,294,1 279,364,7,Pending,0
Poland,38,38,190,299,1 576,5,Community transmission,0
Romania,32,32,948,413,1 901,17,Community transmission,0

0,1,2,3,4,5,6,7
Reporting Country/ Territory/Area,Total confirmed,Total confirmed,Total confirmed,Total deaths,Total new deaths,Transmission,Days since last
,cases,cases,new cases,,,classificationi,reported case
Tajikistan,6 595,6 595,44,55,0,Pending,0
Luxembourg,4,956,31,111,0,Community transmission,0
Hungary,4,247,13,595,0,Community transmission,0
Greece,3,826,23,193,0,Clusters of cases,0
Croatia,3,775,53,119,0,Clusters of cases,0
Albania,3,667,96,97,2,Clusters of cases,0
Estonia,2,014,0,69,0,Clusters of cases,2
Slovakia,1,902,1,28,0,Clusters of cases,0

0,1,2,3,4,5,6,7
Reporting Country/ Territory/Area,Total confirmed,Total confirmed,Total confirmed,Total deaths,Total new deaths,Transmission,Days since last
,cases,cases,new cases,,,classificationi,reported case
Isle of Man,336,336,0,24,0,No cases,53
Jersey,329,329,4,31,0,Community transmission,0
Guernsey,252,252,0,13,0,Community transmission,72
Faroe Islands,188,188,0,0,0,Pending,6
Gibraltar,180,180,0,0,0,Clusters of cases,3
Greenland,13,13,0,0,0,No cases,46
South-East Asia,,,,,,,
India,906 752,906 752,28 498,23 727,553,Clusters of cases,0

0,1,2,3,4,5,6
Reporting Country/ Territory/Area,Total confirmed,Total confirmed,Total deaths,Total new deaths,Transmission,Days since last
,cases,new cases,,,classificationi,reported case
Australia,9 980,183,108,0,Clusters of cases,0
Malaysia,8 725,7,122,0,Clusters of cases,0
New Zealand,1 195,1,22,0,Clusters of cases,0
Viet Nam,372,0,0,0,Clusters of cases,1
Mongolia,230,0,0,0,Sporadic cases,1
Cambodia,165,9,0,0,Sporadic cases,0
Brunei Darussalam,141,0,3,0,No cases,67
Fiji,26,0,0,0,Sporadic cases,2
