In [2]:
import transformers
import json

In [3]:
with open("../finqa_dataset/train.json") as input_file:
        train_data = json.load(input_file)

In [4]:
len(train_data)

6251

In [6]:
example = train_data[7]

In [7]:
example

{'pre_text': ['free cash flow conversion rate we believe this measure provides useful information to investors because it is important for assessing our efficiency in converting earnings to cash and returning cash to shareholders .',
  'the calculation of free cash flow conversion rate and net cash provided by operating activities conversion rate , its equivalent gaap measure , follows: .'],
 'post_text': ['( a ) see note 14 to the consolidated financial statements in item 8 of this report .',
  '( b ) see note 7 to the consolidated financial statements in item 8 of this report .',
  '( c ) see note 3 to the consolidated financial statements in item 8 of this report .',
  '( d ) see note 4 to the consolidated financial statements in item 8 of this report .',
  '( e ) impact of hyperinflationary accounting for our argentina subsidiary , which was sold in the third quarter of fiscal 2019 .',
  '( f ) valuation gains on certain corporate investments .',
  '( g ) legal recovery related to 

In [8]:
example["qa"]["question"]

'in 2019 what was the percent of the net earnings to the net cash provided by operating activities'

In [18]:
example["table"]

[['in millions', 'fiscal 2019'],
 ['net earnings including earnings attributable to redeemable and noncontrolling interests asreported',
  '$ 1786.2'],
 ['net tax benefit ( a )', '$ -7.2 ( 7.2 )'],
 ['tax item ( a )', '-72.9 ( 72.9 )'],
 ['mark-to-marketeffects net of tax ( b )', '27.7'],
 ['acquisition integration costs net of tax ( c )', '19.7'],
 ['divestitures loss net of tax ( c )', '16.4'],
 ['restructuring charges net of tax ( d )', '63.0'],
 ['project-related costs net of tax ( d )', '1.1'],
 ['asset impairments net of tax ( d )', '159.7'],
 ['hyperinflationary accounting net of tax ( e )', '3.2'],
 ['investment valuation adjustments net of tax ( f )', '-17.6 ( 17.6 )'],
 ['legal recovery net of tax ( g )', '-10.8 ( 10.8 )'],
 ['cpw restructuring costs net of tax ( h )', '11.1'],
 ['adjusted net earnings including earnings attributable to redeemable and noncontrollinginterests',
  '$ 1979.6'],
 ['net cash provided by operating activities', '$ 2807.0'],
 ['purchases of land buil

In [9]:
example["qa"]["model_input"]

[['table_1',
  'in millions the net earnings including earnings attributable to redeemable and noncontrolling interests asreported of fiscal 2019 is $ 1786.2 ;'],
 ['table_14',
  'in millions the adjusted net earnings including earnings attributable to redeemable and noncontrollinginterests of fiscal 2019 is $ 1979.6 ;'],
 ['table_15',
  'in millions the net cash provided by operating activities of fiscal 2019 is $ 2807.0 ;']]

In [13]:
def remove_space(text_in):
    res = []

    for tmp in text_in.split(" "):
        if tmp != "":
            res.append(tmp)

    return " ".join(res)

def table_row_to_text(header, row):
    '''
    use templates to convert table row to text
    '''
    res = ""
    
    if header[0]:
        res += (header[0] + " ")

    for head, cell in zip(header[1:], row[1:]):
        res += ("the " + row[0] + " of " + head + " is " + cell + " ; ")
    
    res = remove_space(res)
    return res.strip()

In [14]:
table = example["table"]
table_text = ""
for row in table[1:]:
        this_sent = table_row_to_text(table[0], row)
        table_text += this_sent

In [16]:
(table_text)

in millions the net earnings including earnings attributable to redeemable and noncontrolling interests asreported of fiscal 2019 is $ 1786.2 ;in millions the net tax benefit ( a ) of fiscal 2019 is $ -7.2 ( 7.2 ) ;in millions the tax item ( a ) of fiscal 2019 is -72.9 ( 72.9 ) ;in millions the mark-to-marketeffects net of tax ( b ) of fiscal 2019 is 27.7 ;in millions the acquisition integration costs net of tax ( c ) of fiscal 2019 is 19.7 ;in millions the divestitures loss net of tax ( c ) of fiscal 2019 is 16.4 ;in millions the restructuring charges net of tax ( d ) of fiscal 2019 is 63.0 ;in millions the project-related costs net of tax ( d ) of fiscal 2019 is 1.1 ;in millions the asset impairments net of tax ( d ) of fiscal 2019 is 159.7 ;in millions the hyperinflationary accounting net of tax ( e ) of fiscal 2019 is 3.2 ;in millions the investment valuation adjustments net of tax ( f ) of fiscal 2019 is -17.6 ( 17.6 ) ;in millions the legal recovery net of tax ( g ) of fiscal 201

In [19]:
context = " ".join(example["pre_text"]) + " " + " ".join(example["post_text"]) + " " + table_text

In [22]:
context

'free cash flow conversion rate we believe this measure provides useful information to investors because it is important for assessing our efficiency in converting earnings to cash and returning cash to shareholders . the calculation of free cash flow conversion rate and net cash provided by operating activities conversion rate , its equivalent gaap measure , follows: . ( a ) see note 14 to the consolidated financial statements in item 8 of this report . ( b ) see note 7 to the consolidated financial statements in item 8 of this report . ( c ) see note 3 to the consolidated financial statements in item 8 of this report . ( d ) see note 4 to the consolidated financial statements in item 8 of this report . ( e ) impact of hyperinflationary accounting for our argentina subsidiary , which was sold in the third quarter of fiscal 2019 . ( f ) valuation gains on certain corporate investments . ( g ) legal recovery related to our yoplait sas subsidiary . ( h ) the cpw restructuring charges are

In [25]:
original_question = example["qa"]["question"] + " " + context.strip()

In [33]:
[i for i,ex in enumerate(train_data) if len(ex["qa"]["answer"])>0][:10]

[0, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [39]:
example["qa"]["question"]

'in 2019 what was the percent of the net earnings to the net cash provided by operating activities'

In [42]:
example["qa"]["steps"]

[{'op': 'divide1-1', 'arg1': '1786.2', 'arg2': '2807.0', 'res': '63.6%'}]

In [40]:
example["qa"]["steps"]

[{'op': 'divide1-1', 'arg1': '1786.2', 'arg2': '2807.0', 'res': '63.6%'}]

In [62]:
possible_steps = [ex["qa"]["steps"] for ex in train_data for step in ex["qa"]["steps"] if "table" in step["op"]]
possible_steps

[]