In [25]:
from utils import *
import json
import pandas as pd

In [26]:
import torch
from torch.utils.data import Dataset, DataLoader

In [27]:
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, T5TokenizerFast

In [28]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
def _load_json(path):
    """Return the parsed contents of the JSON file at ``path``.

    The file is opened as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.
    """
    with open(path, encoding="utf-8") as input_file:
        return json.load(input_file)


# Raw FinQA splits: train / dev (validation) / test.
train_data = _load_json("../finqa_dataset/train.json")
valid_data = _load_json("../finqa_dataset/dev.json")
test_data = _load_json("../finqa_dataset/test.json")

In [30]:
# Number of raw examples in the train, validation and test splits.
print(*map(len, (train_data, valid_data, test_data)))

6251 883 1147


In [31]:
def ad_data(json_data, verbose=True):
    """Convert raw FinQA examples into flat context/question/answer records.

    For every example, each table row after the first is rendered with
    ``table_row_to_text`` (the first row is passed as its first argument) and
    the rendered rows are concatenated into the context string. The answer is
    ``format_steps`` applied to ``example["qa"]["steps"]``.

    Examples whose steps are missing or cannot be formatted are skipped and
    counted; the total is printed when it is non-zero.

    Args:
        json_data: Iterable of example dicts providing ``"table"`` and
            ``"qa"`` (with ``"question"`` and, normally, ``"steps"``).
            ``"filename"`` is only read when reporting a skipped example.
        verbose: When true, print the filename and raw steps of every
            skipped example.

    Returns:
        A list of dicts with the keys ``"context"``, ``"question"`` and
        ``"answer"``, one per successfully converted example.
    """
    records = []
    err_cnt = 0
    for example in json_data:
        question = example["qa"]["question"]
        table = example["table"]
        # table[0] is evaluated lazily, only when at least one data row
        # exists, so a header-only or empty table yields an empty context.
        table_text = "".join(
            table_row_to_text(table[0], row) for row in table[1:]
        )
        try:
            steps_text = format_steps(example["qa"]["steps"])
        except Exception:
            # Best-effort conversion: skip examples that cannot be formatted,
            # but let KeyboardInterrupt / SystemExit propagate.
            err_cnt += 1
            if verbose:
                print("-" * 25)
                # .get() keeps the report itself from raising when the
                # failure was caused by a missing key.
                print(example.get("filename"))
                print(example["qa"].get("steps"))
                print("-" * 25 + "\n")
            continue
        records.append(
            {"context": table_text, "question": question, "answer": steps_text}
        )
    if err_cnt > 0:
        print("Net Errors:", err_cnt)
    return records

In [32]:
# Flatten every split into a DataFrame of context/question/answer rows.
# verbose is off, so only the per-split error totals are printed.
train_pd, valid_pd, test_pd = (
    pd.DataFrame(ad_data(split, False))
    for split in (train_data, valid_data, test_data)
)

Net Errors: 198
Net Errors: 35
Net Errors: 39


In [33]:
# (rows, columns) of each split after unconvertible examples were skipped.
tuple(frame.shape for frame in (train_pd, valid_pd, test_pd))

((6053, 3), (848, 3), (1108, 3))

In [34]:
# Load the pretrained "t5-base" tokenizer once; the same instance is handed to
# every FinQA_Dataset split created further down.
TOKENIZER = T5Tokenizer.from_pretrained("t5-base")
# The matching seq2seq model is not loaded here; the line is kept for reference.
# MODEL = T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True)

In [35]:
# Size of the tokenizer's vocabulary, kept in a variable for later use.
vocab_size = TOKENIZER.vocab_size
print(f"Vocabulary Size: {vocab_size}")

Vocabulary Size: 32100


In [36]:
class FinQA_Dataset(Dataset):
    """Map-style dataset that tokenizes FinQA question/context/answer rows.

    Each item is a dict of ``torch.long`` tensors:

    * ``input_ids`` / ``attention_mask``: the question and context encoded as
      a text pair, padded/truncated to ``max_q_len`` tokens.
    * ``labels``: the answer encoded and padded/truncated to ``max_a_len``
      tokens, with padding positions replaced by ``-100``.
    * ``decoder_attention_mask``: the attention mask of the encoded answer.

    Args:
        tokenizer: Callable tokenizer returning ``"input_ids"`` and
            ``"attention_mask"`` lists; its ``pad_token_id`` (0 if absent)
            identifies padding in the labels.
        df: DataFrame with ``"question"``, ``"context"`` and ``"answer"``
            columns. Rows are addressed by position, so any index works.
        max_q_len: Fixed token length of the encoder input.
        max_a_len: Fixed token length of the labels.
    """

    def __init__(self, tokenizer, df, max_q_len, max_a_len):
        self.tokenizer = tokenizer
        self.q_len = max_q_len
        self.t_len = max_a_len
        self.data = df
        self.questions = self.data["question"]
        self.context = self.data["context"]
        self.answer = self.data["answer"]

    def __len__(self):
        """Return the number of examples."""
        return len(self.questions)

    def _encode(self, *texts, max_length):
        """Tokenize ``texts`` to exactly ``max_length`` tokens."""
        # padding="max_length" already fixes the length; the deprecated
        # pad_to_max_length flag was redundant and only triggered warnings.
        return self.tokenizer(
            *texts,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            add_special_tokens=True,
        )

    def __getitem__(self, idx):
        """Return the tokenized tensors for the example at position ``idx``."""
        # Positional access: label-based ``series[idx]`` raises KeyError when
        # the DataFrame index is not 0..n-1 (e.g. after filtering rows).
        question = self.questions.iloc[idx]
        context = self.context.iloc[idx]
        answer = self.answer.iloc[idx]

        question_tokenized = self._encode(question, context, max_length=self.q_len)
        answer_tokenized = self._encode(answer, max_length=self.t_len)

        labels = torch.tensor(answer_tokenized["input_ids"], dtype=torch.long)
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is None:
            pad_id = 0  # historical default used by this dataset
        # -100 is the index ignored by the cross-entropy loss.
        labels[labels == pad_id] = -100

        return {
            "input_ids": torch.tensor(question_tokenized["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(question_tokenized["attention_mask"], dtype=torch.long),
            "labels": labels,
            "decoder_attention_mask": torch.tensor(answer_tokenized["attention_mask"], dtype=torch.long),
        }

In [37]:
# Fixed token lengths passed to FinQA_Dataset: inputs (question + context) are
# padded/truncated to MAX_Q_LEN tokens, answers/labels to MAX_A_LEN tokens.
# NOTE(review): 1024 may exceed the tokenizer's advertised model_max_length for
# "t5-base" — confirm the intended model handles inputs of this length.
MAX_Q_LEN = 1024
MAX_A_LEN = 128 

In [38]:
# Wrap each split's DataFrame in a tokenizing dataset that shares the
# tokenizer and the length limits.
finqa_train, finqa_valid, finqa_test = (
    FinQA_Dataset(TOKENIZER, frame, MAX_Q_LEN, MAX_A_LEN)
    for frame in (train_pd, valid_pd, test_pd)
)

In [39]:
# Peek at the first flattened training record (context, question, answer).
train_pd.iloc[0]

context     the fair value of forward exchange contracts a...
question            what is the the interest expense in 2009?
answer      Step 1: Divide 100 by 100. This gives the resu...
Name: 0, dtype: object

In [40]:
# Inspect the dict of tensors the dataset yields for the first training example.
finqa_train[0]

{'input_ids': tensor([125,  19,   8,  ...,   0,   0,   0]),
 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0]),
 'labels': tensor([ 5021,   209,    10, 22390,   910,    57,   910,     5,   100,  1527,
             8,   741,    10,     3,  4704,  1713, 30345, 30345, 30345,  5021,
           204,    10, 22390,     3, 26195,    57,     3,  4704,     5,   100,
          1527,     8,   741,    10,     3, 22671,     1,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
        

In [41]:
# Show only the label tensor of the first example; padding positions were
# replaced by -100 inside FinQA_Dataset.__getitem__.
finqa_train[0]['labels']

tensor([ 5021,   209,    10, 22390,   910,    57,   910,     5,   100,  1527,
            8,   741,    10,     3,  4704,  1713, 30345, 30345, 30345,  5021,
          204,    10, 22390,     3, 26195,    57,     3,  4704,     5,   100,
         1527,     8,   741,    10,     3, 22671,     1,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100])

In [42]:
# Answer text of the first training record, for comparison with the labels
# decoded in the following cell.
train_pd.iloc[0]["answer"]

'Step 1: Divide 100 by 100. This gives the result: 1% ####### Step 2: Divide 3.8 by 1%. This gives the result: 380'

In [43]:
# Round-trip check: drop the -100 placeholders and decode the remaining label
# ids back to text.
TOKENIZER.decode(
    [token_id for token_id in finqa_train[0]["labels"] if token_id != -100],
    skip_special_tokens=True,
)

'Step 1: Divide 100 by 100. This gives the result: 1% ####### Step 2: Divide 3.8 by 1%. This gives the result: 380'

In [44]:
# Structured program steps of the first raw training example, i.e. the kind of
# input that format_steps turns into the answer text.
train_data[0]["qa"]["steps"]

[{'op': 'divide1-1', 'arg1': '100', 'arg2': '100', 'res': '1%'},
 {'op': 'divide1-2', 'arg1': '3.8', 'arg2': '#0', 'res': '380'}]

In [45]:
# Persist the three dataset objects next to the raw JSON files.
# NOTE(review): torch.save pickles each whole FinQA_Dataset instance (including
# its tokenizer and DataFrame), so loading presumably requires the
# FinQA_Dataset class to be defined/importable in the loading process, and
# recent torch.load defaults may refuse non-tensor pickles — confirm with the
# consumer of these files.
torch.save(finqa_train, '../finqa_dataset/finqa_train.pth')
torch.save(finqa_valid, '../finqa_dataset/finqa_valid.pth')
torch.save(finqa_test, '../finqa_dataset/finqa_test.pth')