In [1]:
%%capture
!pip install -q -U bitsandbytes
# !pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install transformers==4.31
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install evaluate
!pip install -qqq trl==0.7.1
!pip install torch

In [2]:
!pip install -q rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%ls

[0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [5]:
%cd /content/drive/MyDrive/DLT_Project/

/content/drive/MyDrive/DLT_Project


In [6]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import transformers
from torch.optim import Adam
from tqdm import tqdm
import utils
from utils import *
import json
import pandas as pd
import torch
# from torch.utils.data import Dataset, DataLoader
import time
import evaluate
from datasets import Dataset, load_dataset
import random
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


In [7]:
dir_path = '/content/drive/MyDrive/DLT_Project/'
os.chdir(dir_path)

In [7]:
with open(f"{dir_path}/train.json") as input_file:
        train_data = json.load(input_file)

with open(f"{dir_path}/dev.json") as input_file:
        valid_data = json.load(input_file)

with open(f"{dir_path}/test.json") as input_file:
        test_data = json.load(input_file)

In [8]:
assert len(train_data) == 6251, "The train data seems to be off, please check"
assert len(valid_data) == 883, "The validation data seems to be off, please check"
assert len(test_data) == 1147, "The test data seems to be off, please check"

In [9]:
df_train_1step_serialized =pd.read_csv(f"{dir_path}/df_train_1step_serialized.csv")

In [10]:
df_train_1step_serialized.shape

(3532, 9)

In [11]:
train_data=pd.DataFrame(train_data).merge(df_train_1step_serialized[['id','serialized_text_gpt35']],on='id',how='inner').to_dict(orient='records')

In [12]:
len(train_data)

3532

In [13]:
train_data[0].keys()

dict_keys(['pre_text', 'post_text', 'filename', 'table_ori', 'table', 'qa', 'id', 'table_retrieved', 'text_retrieved', 'table_retrieved_all', 'text_retrieved_all', 'serialized_text_gpt35'])

In [14]:
def prepare_data(json_data, verbose=True, split = 'train'):
    net=[]
    err_cnt=0
    for example in json_data:
        question = example["qa"]["question"]
        table = example["table"]
        id=example["id"]
        table_ori=example["table_ori"]
        answer_steps=example["qa"]["steps"]
        if split == 'train':
            serialized_text=example["serialized_text_gpt35"]
            table_text = ""
            for row in table[1:]:
                this_sent = table_row_to_text(table[0], row)
                table_text += this_sent
            try:
                steps_text = format_steps(example["qa"]["steps"])
                inputs = {"id":id, "context": serialized_text, "question": question, "answer": steps_text,\
                          "table":table_ori,"answer_steps":answer_steps}
                net.append(inputs)
            except:
                err_cnt+=1
                if verbose:
                    print ("-"*25)
                    print (example["filename"])
                    print (example["qa"]["steps"])
                    print ("-"*25+"\n")
        else:
            table_text = ""
            for row in table[1:]:
                this_sent = table_row_to_text(table[0], row)
                table_text += this_sent
            try:
                steps_text = format_steps(example["qa"]["steps"])
                inputs = {"id":id, "context": table_text, "question": question, "answer": steps_text,\
                          "table":table_ori,"answer_steps":answer_steps}
                net.append(inputs)
            except:
                err_cnt+=1
                if verbose:
                    print ("-"*25)
                    print (example["filename"])
                    print (example["qa"]["steps"])
                    print ("-"*25+"\n")

    if err_cnt>0:
        print ("Net Errors:",err_cnt)
    return net

In [15]:
## Preparing huggingface dataset
data_splits = {'train': train_data, 'valid': valid_data, 'test': test_data}
datasets = {split: Dataset.from_pandas(pd.DataFrame(prepare_data(data, False, split))) for split, data in data_splits.items()}

Net Errors: 35
Net Errors: 39


In [16]:
datasets

{'train': Dataset({
     features: ['id', 'context', 'question', 'answer', 'table', 'answer_steps'],
     num_rows: 3532
 }),
 'valid': Dataset({
     features: ['id', 'context', 'question', 'answer', 'table', 'answer_steps'],
     num_rows: 848
 }),
 'test': Dataset({
     features: ['id', 'context', 'question', 'answer', 'table', 'answer_steps'],
     num_rows: 1108
 })}

In [17]:
step_pattern = r"Step \d+"
import re

In [18]:
def num_steps_dict_fn(example):
    example["num_steps"] = len(re.findall(step_pattern, example["answer"]))
    return example

In [19]:
datasets['train']=datasets['train'].map(num_steps_dict_fn)
datasets['valid']=datasets['valid'].map(num_steps_dict_fn)
datasets['test']=datasets['test'].map(num_steps_dict_fn)

Map:   0%|          | 0/3532 [00:00<?, ? examples/s]

Map:   0%|          | 0/848 [00:00<?, ? examples/s]

Map:   0%|          | 0/1108 [00:00<?, ? examples/s]

In [20]:
datasets['train']=datasets['train'].filter(lambda example: example['num_steps']==1)
datasets['valid']=datasets['valid'].filter(lambda example: example['num_steps']==1)
datasets['test']=datasets['test'].filter(lambda example: example['num_steps']==1)

Filter:   0%|          | 0/3532 [00:00<?, ? examples/s]

Filter:   0%|          | 0/848 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1108 [00:00<?, ? examples/s]

In [21]:
datasets

{'train': Dataset({
     features: ['id', 'context', 'question', 'answer', 'table', 'answer_steps', 'num_steps'],
     num_rows: 3532
 }),
 'valid': Dataset({
     features: ['id', 'context', 'question', 'answer', 'table', 'answer_steps', 'num_steps'],
     num_rows: 489
 }),
 'test': Dataset({
     features: ['id', 'context', 'question', 'answer', 'table', 'answer_steps', 'num_steps'],
     num_rows: 616
 })}

In [22]:
import getpass

# Prompt for the Hugging Face token
hf_token = getpass.getpass("Enter your Hugging Face token: ")

import os
os.environ['HUGGINGFACE_TOKEN'] = hf_token

Enter your Hugging Face token: ··········


In [23]:
model_id =  "meta-llama/Llama-2-7b-hf"
# model_id = "meta-llama/Llama-2-7b-chat-hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto",use_auth_token=hf_token)

tokenizer = AutoTokenizer.from_pretrained(model_id,use_auth_token=hf_token)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]



tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [24]:
def format_instruction(id, context: str, question: str, answer: str,table , answer_steps, num_steps):
  prompt = f"""### Instruction: Act as a financial analyst adept in reading Earning Reports and Financial Documents.
  Answer the question in a single step by extracting relevant values from the context given below. Perform the necessary arithmetic operation.

  ### Context:
  {context}

  ### Question:
  {question}

  ### Answer:
  {answer}
  """.strip()
  return prompt

In [25]:
index = 5

In [26]:
print(format_instruction(**datasets['train'][index]))

### Instruction: Act as a financial analyst adept in reading Earning Reports and Financial Documents.
  Answer the question in a single step by extracting relevant values from the context given below. Perform the necessary arithmetic operation.

  ### Context:
  The table shows the various expenses incurred during a reorganization process. The expenses include a labor-related deemed claim of $1,733, aircraft and facility financing renegotiations and rejections costing $325, fair value of conversion discount amounting to $218, professional fees totaling $199, and other expenses amounting to $180. The total reorganization expenses come to $2,655.

  ### Question:
  what is the percent of the labor-related deemed claim as part of the total reorganization items net in 2013

  ### Answer:
  Step 1: Divide 1733 by 2655. This gives the result: 65.3%


In [27]:
def generate_instruction_dataset(data_point):

    return {
        "context": data_point["context"],
        "question": data_point["question"],
        "answer": data_point["answer"],
        "answer_steps": data_point["answer_steps"],
        "input_prompt": format_instruction(**data_point)
    }

In [28]:
def process_dataset(data: Dataset):
    return (
        data.shuffle(seed=42)
        .map(generate_instruction_dataset)
    )

In [29]:
## APPLYING PREPROCESSING ON WHOLE DATASET
datasets["train"] = process_dataset(datasets["train"])
datasets["test"] = process_dataset(datasets["test"])
datasets["valid"] = process_dataset(datasets["valid"])

Map:   0%|          | 0/3532 [00:00<?, ? examples/s]

Map:   0%|          | 0/616 [00:00<?, ? examples/s]

Map:   0%|          | 0/489 [00:00<?, ? examples/s]

In [30]:
# Select 2048 rows from the training split
train_data = datasets['train'].shuffle(seed=42).select([i for i in range(512)])

# Select 64 rows from the test and validation splits
validation_data = datasets['valid'].shuffle(seed=42).select([i for i in range(64)])
test_data = datasets['test'].shuffle(seed=42).select([i for i in range(64)])


train_data,validation_data,test_data

(Dataset({
     features: ['id', 'context', 'question', 'answer', 'table', 'answer_steps', 'num_steps', 'input_prompt'],
     num_rows: 512
 }),
 Dataset({
     features: ['id', 'context', 'question', 'answer', 'table', 'answer_steps', 'num_steps', 'input_prompt'],
     num_rows: 64
 }),
 Dataset({
     features: ['id', 'context', 'question', 'answer', 'table', 'answer_steps', 'num_steps', 'input_prompt'],
     num_rows: 64
 }))

In [31]:
index=0
print(datasets["train"]["input_prompt"][index])

### Instruction: Act as a financial analyst adept in reading Earning Reports and Financial Documents.
  Answer the question in a single step by extracting relevant values from the context given below. Perform the necessary arithmetic operation.

  ### Context:
  The table provides information on the payments due by period for various contractual obligations of a company. The first row of the table lists the categories of obligations, namely long-term debt (including capital leases), interest payments on long-term debt, operating leases, and purchase obligations. The second row indicates the total amount due for each category, in millions of dollars. The following rows provide a breakdown of these amounts by period, namely less than 1 year, 1-3 years, 3-5 years, and more than 5 years. 

For long-term debt, the total amount due is $2,750.1 million, with $34.5 million due in less than 1 year, $188.3 million due in 1-3 years, $367.1 million due in 3-5 years, and $2,160.2 million due in mor

In [32]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():

        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [33]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [34]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )


In [35]:
print_trainable_parameters(model)

trainable params: 0 || all params: 3500412928 || trainable%: 0.0


In [36]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,
    lora_alpha=64,
    # target_modules=["query_key_value"],
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"], #specific to Llama models.
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

trainable params: 16777216 || all params: 3517190144 || trainable%: 0.477006226934315


In [40]:
OUTPUT_DIR = "./llama2-7b-finqa-adapter"

%reload_ext tensorboard
%tensorboard --logdir llama2-7b-finqa-adapter/runs

ERROR: Failed to launch TensorBoard (exited with 1).
Contents of stderr:
2023-12-08 06:21:09.009873: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-08 06:21:09.009974: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-08 06:21:09.010017: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Address already in use
Port 6006 is in use by another program. Either identify and stop that pr

In [43]:
from transformers import TrainingArguments

training_arguments = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    logging_steps=1,
    learning_rate=2e-5,
    fp16=True,
    max_grad_norm=0.3,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=0.2,
    warmup_ratio=0.05,
    save_strategy="epoch",
    group_by_length=True,
    output_dir=OUTPUT_DIR,
    report_to="tensorboard",
    save_safetensors=True,
    lr_scheduler_type="cosine",
    seed=42,
)
model.config.use_cache = False

In [44]:
from trl import SFTTrainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=validation_data,
    peft_config=lora_config,
    dataset_text_field="input_prompt",
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_arguments,
)

trainer.train()

Map:   0%|          | 0/512 [00:00<?, ? examples/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
20,1.4207,1.132541
40,1.1714,0.874806
60,0.9689,0.77047
80,0.9228,0.756476




TrainOutput(global_step=96, training_loss=1.1073239588489134, metrics={'train_runtime': 674.7902, 'train_samples_per_second': 2.276, 'train_steps_per_second': 0.142, 'total_flos': 1.1216123562295296e+16, 'train_loss': 1.1073239588489134, 'epoch': 3.0})

In [45]:
peft_model_path="./finqa-answer-steps-llama-7b"

trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

('./finqa-answer-steps-llama-7b/tokenizer_config.json',
 './finqa-answer-steps-llama-7b/special_tokens_map.json',
 './finqa-answer-steps-llama-7b/tokenizer.json')

In [None]:
from transformers import TextStreamer
model.config.use_cache = True
model.eval()

In [47]:
import warnings
warnings.filterwarnings('ignore')
index = 20

context = datasets['test'][index]['context']
answer = datasets['test'][index]['answer']
question = datasets['test'][index]['question']

prompt = f"""### Instruction: Act as a financial analyst adept in reading Earning Reports and Financial Documents.
  Answer the question in a single step by extracting relevant values from the context given below. Perform the necessary arithmetic operation.
  Do not generate any new numbers for the calculation. Only refer to information present in the context.

  ### Context:
  {context}

  ### Question:
  {question}

  ### Answer:

  """.strip()

input_ids = tokenizer(prompt, return_tensors='pt',truncation=True).input_ids.cuda()
outputs = model.generate(input_ids=input_ids,
        temperature=0.1,
        do_sample=True,
        max_new_tokens=128,
    )

output= tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE ANSWER:\n{answer}\n')
print(dash_line)
print(f'Llama2 FINETUNED MODEL GENERATED TEXT :\n{output}')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


---------------------------------------------------------------------------------------------------
INPUT PROMPT:
### Instruction: Act as a financial analyst adept in reading Earning Reports and Financial Documents.
  Answer the question in a single step by extracting relevant values from the context given below. Perform the necessary arithmetic operation.
  Do not generate any new numbers for the calculation. Only refer to information present in the context.

  ### Context:
  commitment type the capital leases of 2011 is $ 18 ; the capital leases of 2012 is $ 19 ; the capital leases of 2013 is $ 19 ; the capital leases of 2014 is $ 20 ; the capital leases of 2015 is $ 21 ; the capital leases of after 2016 is $ 112 ; the capital leases of total is $ 209 ;commitment type the operating leases of 2011 is 348 ; the operating leases of 2012 is 268 ; the operating leases of 2013 is 205 ; the operating leases of 2014 is 150 ; the operating leases of 2015 is 113 ; the operating leases of after

In [None]:
# from peft import AutoPeftModelForCausalLM
# from transformers import AutoTokenizer

# peft_model_dir = "finqa-answer-steps-llama-chat"

# # load base LLM model and tokenizer
# llama_trained_model = AutoPeftModelForCausalLM.from_pretrained(
#     peft_model_dir,
#     low_cpu_mem_usage=True,
#     torch_dtype=torch.float16,
#     load_in_4bit=True,
#     use_auth_token=hf_token
# )
# llama_tokenizer = AutoTokenizer.from_pretrained(peft_model_dir, use_auth_token=hf_token)

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]



In [None]:
# import warnings
# warnings.filterwarnings('ignore')
# index = 5

# context = datasets['train'][index]['context']
# answer = datasets['train'][index]['answer']
# question = datasets['train'][index]['question']

# prompt = f"""### Instruction: Answer the question in a single step by extracting relevant values from the context given below. Perform the necessary arithmetic operation.

#   ### Context:
#   {context}

#   ### Question:
#   {question}

#   ### Answer:

#   """.strip()

# input_ids = llama_tokenizer(prompt, return_tensors='pt',truncation=True).input_ids.cuda()
# outputs = llama_trained_model.generate(input_ids=input_ids,
#         temperature=0.1,
#         do_sample=True,
#         max_new_tokens=128,
#     )

# output= llama_tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]

# dash_line = '-'.join('' for x in range(100))
# print(dash_line)
# print(f'INPUT PROMPT:\n{prompt}')
# print(dash_line)
# print(f'BASELINE ANSWER:\n{answer}\n')
# print(dash_line)
# print(f'Llama2 FINETUNED MODEL GENERATED TEXT :\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
### Instruction: Answer the question in a single step by extracting relevant values from the context given below. Perform the necessary arithmetic operation.
  
  ### Context:
  In the year ended December 31, 2005, the net income as reported was $838 million. An additional stock option compensation expense of $20 million, net of related taxes, was included in the reported net income. However, after deducting the total stock option compensation expense determined under fair value method for all awards, net of related taxes, which amounted to $27 million, the pro forma net income was $831 million. The earnings per share for the year were $2.53 for basic-as reported and $2.51 for basic-pro forma. The diluted earnings per share were $2.50 for diluted-as reported and $2.48 for diluted-pro forma.

  ### Question:
  what is the number of outstanding shares based on the eps , ( in 

INFERENCE AND EVALUATION

Post-processing

In [8]:
import re

def extract_info(input_text):
  try:
      # Regular expression to match the answer pattern
      answer_pattern = r"Step 1: ([A-Za-z]+) ([-]?[$]?[\d,.]+) .* ([-]?[$]?[\d,.]+)\.* This gives the result: ([-]?[$]?[\d.]+%?)"

      # Find the matching groups
      match = re.search(answer_pattern, input_text)
      if match:
          operation, arg1, arg2, result = match.groups()
          # Removing commas from the arguments and converting to appropriate types

          chars_to_replace=[',','$']
          for char in chars_to_replace:
              arg1=arg1.replace(char,'')
              arg2=arg2.replace(char,'')
              result=result.replace(char,'')
          # arg1 = float(arg1)
          # arg2 = float(arg2)
          # Checking if result is a percentage or a number
          if '%' in result:
              result = (result.replace('%', ''))
          else:
              result = (result)

          # Constructing the output dictionary
          output = {
              'Answer': match.group(0),
              'Operation': operation.capitalize(),
              'argument_1': arg1,
              'argument_2': arg2,
              'result': result
          }
          return output
  except:
      return "Pattern not found"

In [9]:
ops_dict={'Divide':'/', 'Multiply':'*', 'Add':'+', 'Subtract':'-'}

In [50]:
 ##Post-Processing the Output
split_text = re.split(r'-{5,}', output)
last_segment = split_text[-1].strip()
ans_dict=extract_info(last_segment)

In [51]:
ans_dict

{'Answer': 'Step 1: Divide 2963 by 23556. This gives the result: 12.3%',
 'Operation': 'Divide',
 'argument_1': '2963',
 'argument_2': '23556.',
 'result': '12.3'}

In [10]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [11]:
%%capture
!pip install langchain-experimental
from langchain.agents import Tool
from langchain_experimental.utilities import PythonREPL
python_repl = PythonREPL()

In [54]:
def output_finetuning(row):
  context = row['context']
  question = row['question']
  answer = row['answer']

  prompt = f"""### Instruction: Act as a financial analyst adept in reading Earning Reports and Financial Documents.
  Answer the question in a single step by extracting relevant values from the context given below. Perform the necessary arithmetic operation.
  Do not generate any new numbers for the calculation. Only refer to information present in the context.

  ### Context:
  {context}

  ### Question:
  {question}

  ### Answer:

  """.strip()
  try:
    input_ids = tokenizer(prompt, return_tensors='pt',truncation=True).input_ids.cuda()
    outputs = model.generate(input_ids=input_ids,
            temperature=0.1,
            do_sample=True,
            max_new_tokens=128,
        )

    output= tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]
    ##Post-Processing the Output
    split_text = re.split(r'-{5,}', output)
    last_segment = split_text[-1].strip()
    ans_dict=extract_info(last_segment)

    return ans_dict
  except:
    return {'answer':None}

In [12]:
from tqdm import tqdm
tqdm.pandas(desc="Progress:")

In [59]:
df_valid=pd.DataFrame(validation_data)

In [60]:
df_valid.shape

(64, 8)

In [58]:
import warnings
warnings.filterwarnings('ignore')
df_valid.head(2).progress_apply(lambda row:output_finetuning(row),axis=1)

Progress:: 100%|██████████| 2/2 [01:18<00:00, 39.22s/it]


0    {'Answer': 'Step 1: Divide 100.00 by 175.99. T...
1    {'Answer': 'Step 1: Divide 7367 by 60261. This...
dtype: object

In [61]:
df_valid.loc[:,"llama_finetuned_answers"]= df_valid.progress_apply(lambda row:output_finetuning(row),axis=1)

Progress:: 100%|██████████| 64/64 [16:09<00:00, 15.14s/it]


In [62]:
df_valid.to_csv("./llama2_7b_finetuned_answers_valid_dataset64.csv",index=False)

In [63]:
df_valid

Unnamed: 0,id,context,question,answer,table,answer_steps,num_steps,input_prompt,llama_finetuned_answers
0,APD/2014/page_39.pdf-1,the sales of 2014 is $ 450.4 ; the sales of 20...,what was the operating margin for 2014?,Step 1: Divide 88.2 by 450.4. This gives the r...,"[[, 2014, 2013, 2012], [Sales, $450.4, $451.1,...","[{'arg1': '88.2', 'arg2': '450.4', 'op': 'divi...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 88.2 by 450.4. This...
1,MRO/2017/page_111.pdf-3,beginning of year the revisions of previous es...,what would end of year proven reserves be with...,Step 1: Subtract 57 from 546. This gives the r...,"[[Beginning of year, 552], [Revisions of previ...","[{'arg1': '546', 'arg2': '57', 'op': 'minus2-1...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 546 by 552. This gi...
2,HUM/2017/page_133.pdf-3,paymentdate the 2015 of amountper share is $ 1...,what was the number of shares issued in 2015 i...,Step 1: Divide 170 by 1.14. This gives the res...,"[[PaymentDate, Amountper Share, TotalAmount (i...","[{'arg1': '170', 'arg2': '1.14', 'op': 'divide...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 170 by 1.14. This g...
3,SLB/2012/page_44.pdf-1,the 2012 of total cost of shares purchased is ...,as of december 312012 what was the outstanding...,Step 1: Subtract 7.12 from const_8. This gives...,"[[, Total cost of shares purchased, Total numb...","[{'arg1': 'const_8', 'arg2': '7.12', 'op': 'mi...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 971883 by 14087.8. ...
4,RE/2015/page_33.pdf-1,( dollars in millions ) the 2015 of december 3...,what is the book to market ratio of the commer...,Step 1: Divide 264.9 by 266.3. This gives the ...,"[[, December 31,], [(Dollars in millions), Ave...","[{'arg1': '264.9', 'arg2': '266.3', 'op': 'div...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 17430.8 by 16831.9....
...,...,...,...,...,...,...,...,...,...
59,HOLX/2007/page_128.pdf-1,net tangible assets acquired as of july 13 200...,what is the fair value of hologic common stock...,Step 1: Divide 205500 by 4400. This gives the ...,"[[Net tangible assets acquired as of July 13, ...","[{'arg1': '205500', 'arg2': '4400', 'op': 'div...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 220600 by 10200. Th...
60,AAPL/2014/page_38.pdf-2,the cash cash equivalents and marketable secur...,what was the change in property plant and equi...,Step 1: Subtract 16597 from 20624. This gives ...,"[[, 2014, 2013, 2012], [Cash, cash equivalents...","[{'arg1': '20624', 'arg2': '16597', 'op': 'min...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 20624 by 16597. Thi...
61,GPN/2017/page_77.pdf-2,cash the customer-related intangible assets of...,what portion of the total purchase considerati...,Step 1: Divide 203828 by 265982. This gives th...,"[[Cash, $45,826], [Customer-related intangible...","[{'arg1': '203828', 'arg2': '265982', 'op': 'd...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 203828 by 265982. T...
62,PNC/2018/page_171.pdf-1,in millions the commitments to extend credit o...,in 2018 what was the percent of the total comm...,Step 1: Divide 16944 by 181612. This gives the...,"[[In millions, December 31 2018, December 3120...","[{'arg1': '16944', 'arg2': '181612', 'op': 'di...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 16944 by 181612. Th...


In [64]:
df_valid['answer_formatted'] = df_valid['answer'].progress_apply(lambda x:extract_info(x))

Progress:: 100%|██████████| 64/64 [00:00<00:00, 46978.55it/s]


In [None]:
import ast

In [None]:
def convert_to_dict(string):
  try:
    return ast.literal_eval(string)
  except:
    return None

In [None]:
df_valid['finetuned_answers'] = df_valid['llama_finetuned_answers'].progress_apply(lambda x:convert_to_dict(x))

In [67]:
def clean_digits(ans):
  if ans is None:
    return None
  for i, j in ans.items():
    try:
      ans_cleaned = re.sub('((\d+)[\.])(?!([\d]+))','\g<2>',j)
      ans[i] = float(ans_cleaned)
    except:
      ans[i] = j
  return ans

In [68]:
df_valid['actual_answer_final'] = df_valid['answer_formatted'].progress_apply(lambda x:clean_digits(x))
df_valid['finetuned_answers_final'] = df_valid['llama_finetuned_answers'].progress_apply(lambda x:clean_digits(x))

Progress:: 100%|██████████| 64/64 [00:00<00:00, 17676.51it/s]
Progress:: 100%|██████████| 64/64 [00:00<00:00, 14940.47it/s]


In [69]:
def evaluation(question, actual_ans, model_ans):
  '''
  actual_ans, model_ans: dict of answer components (operation, arguments, result)
  '''
  if actual_ans is None or model_ans is None:
    return None
  operator_match = int(actual_ans['Operation'] == model_ans['Operation'])
  # arg1_match = int(actual_ans['argument_1'] == model_ans['argument_1'])
  if float(abs(actual_ans['argument_1'] - model_ans['argument_1'])) < 0.1:
    arg1_match = 1
  else:
    arg1_match = 0
  if float(abs(actual_ans['argument_2'] - model_ans['argument_2'])) < 0.1:
    arg2_match = 1
  else:
    arg2_match = 0
  # arg2_match = int(actual_ans['argument_2'] == model_ans['argument_2'])
  if float(abs(actual_ans['result'] - model_ans['result'])) < 0.1:
    result_match = 1
  else:
    result_match = 0
  # result_match = int(actual_ans['result'] == model_ans['result'])
  result_deviation = float(abs(abs(actual_ans['result']) - abs(model_ans['result'])))

  reverse_arg_match = int((actual_ans['argument_1'] == model_ans['argument_2'])&(actual_ans['argument_2'] == model_ans['argument_1']))
  benchmark_answer=actual_ans['result']
  model_answer=model_ans['result']
  # rounded_ans = None
  # computed_result_deviation = None
  ##Computing the answer based on model's returned argument and operator
  if (arg1_match ==1) and (arg2_match ==1):
    value_to_compute= str(model_ans['argument_1'])+(ops_dict[model_ans['Operation']])+ str(model_ans['argument_2'])
    if 'percent' in question.lower():
      computed_ans = python_repl.run(f"print({value_to_compute}*100.0)")
      rounded_ans = round(float(computed_ans.strip()),2)
      computed_result_deviation = abs(abs(actual_ans['result']) - abs(rounded_ans))
    else:
      computed_ans = python_repl.run(f"print({value_to_compute})")
      rounded_ans = round(float(computed_ans.strip()),2)
      computed_result_deviation = abs(abs(actual_ans['result']) - abs(rounded_ans))

  else:
    rounded_ans = None
    computed_result_deviation = None

  evaluation_dict = ['operator_match','arg1_match','arg2_match','result_match','result_deviation','model_program_ans','computed_result_deviation','reverse_arg_match','benchmark_answer','model_answer']
  evaluation_dict = {k:v for k,v in zip(evaluation_dict, [operator_match, arg1_match, arg2_match, result_match, result_deviation,rounded_ans, computed_result_deviation,reverse_arg_match,benchmark_answer,model_answer])}
  return evaluation_dict

In [70]:
df_valid['evaluation_dict'] = df_valid.progress_apply(lambda x: evaluation(x['question'],x['actual_answer_final'], x['finetuned_answers_final']), axis=1)

Progress:: 100%|██████████| 64/64 [00:00<00:00, 1515.21it/s]


In [71]:
df_valid_nonnull = df_valid[df_valid['evaluation_dict'].notna()]

In [72]:
df_valid_nonnull.shape

(57, 13)

In [73]:
np.mean(df_valid_nonnull['evaluation_dict'].progress_apply(lambda x: x['operator_match']))

Progress:: 100%|██████████| 57/57 [00:00<00:00, 78180.29it/s]


0.7719298245614035

In [74]:
np.mean(df_valid_nonnull['evaluation_dict'].progress_apply(lambda x: x['arg1_match']))

Progress:: 100%|██████████| 57/57 [00:00<00:00, 77046.51it/s]


0.43859649122807015

In [75]:
np.mean(df_valid_nonnull['evaluation_dict'].progress_apply(lambda x: x['arg2_match']))

Progress:: 100%|██████████| 57/57 [00:00<00:00, 75394.30it/s]


0.45614035087719296

In [76]:
np.mean(df_valid_nonnull['evaluation_dict'].progress_apply(lambda x: x['result_match']))

Progress:: 100%|██████████| 57/57 [00:00<00:00, 74455.10it/s]


0.17543859649122806

In [77]:
np.mean(df_valid_nonnull['evaluation_dict'].progress_apply(lambda x: x['result_deviation']))

Progress:: 100%|██████████| 57/57 [00:00<00:00, 74874.83it/s]


260949704.68999475

In [78]:
np.mean(df_valid_nonnull['evaluation_dict'].progress_apply(lambda x: x['computed_result_deviation']))

Progress:: 100%|██████████| 57/57 [00:00<00:00, 84538.66it/s]


5.099128571428572

In [79]:
np.median(df_valid_nonnull['evaluation_dict'].progress_apply(lambda x: x['result_deviation']))

Progress:: 100%|██████████| 57/57 [00:00<00:00, 97541.95it/s]


10.549999999999999

In [80]:
df_valid_nonnull[(df_valid_nonnull['evaluation_dict'].str.get('arg1_match')==1) & (df_valid_nonnull['evaluation_dict'].str.get('arg2_match')==1)\
                        & (df_valid_nonnull['evaluation_dict'].str.get('operator_match')==1)]['evaluation_dict'].str.get('computed_result_deviation').mean()

5.0991285714285715

In [19]:
df_valid_nonnull[(df_valid_nonnull['evaluation_dict'].str.get('arg1_match')==1) & (df_valid_nonnull['evaluation_dict'].str.get('arg2_match')==1)\
                        & (df_valid_nonnull['evaluation_dict'].str.get('operator_match')==1)]['evaluation_dict'].str.get('result_deviation').mean()

nan

In [82]:
import seaborn as sns

In [83]:
result_dev = df_valid_nonnull['evaluation_dict'].progress_apply(lambda x: x['result_deviation'])

Progress:: 100%|██████████| 57/57 [00:00<00:00, 65734.21it/s]


In [84]:
index= [i for i in df_valid_nonnull.sample(10, random_state = 0).index]
for idx in index:
  print(f"Index: {idx}, Llama Answer: {df_valid_nonnull.loc[idx,'finetuned_answers_final']}\n Actual Answer:{df_valid_nonnull.loc[idx,'actual_answer_final']}")
  print("-"*10)

Index: 40, Llama Answer: {'Answer': 'Step 1: Divide 100 by 2. This gives the result: 50', 'Operation': 'Divide', 'argument_1': 100.0, 'argument_2': 2.0, 'result': 50.0}
 Actual Answer:{'Answer': 'Step 1: Add 255.7 and 327.1. This gives the result: 582.8', 'Operation': 'Add', 'argument_1': 255.7, 'argument_2': 327.1, 'result': 582.8}
----------
Index: 39, Llama Answer: {'Answer': 'Step 1: Divide 10530 by 12.12. This gives the result: 869.1', 'Operation': 'Divide', 'argument_1': 10530.0, 'argument_2': 12.12, 'result': 869.1}
 Actual Answer:{'Answer': 'Step 1: Multiply 246 by 12.12. This gives the result: 2981.5', 'Operation': 'Multiply', 'argument_1': 246.0, 'argument_2': 12.12, 'result': 2981.5}
----------
Index: 48, Llama Answer: {'Answer': 'Step 1: Divide 3915795 by 100.26. This gives the result: 3915795', 'Operation': 'Divide', 'argument_1': 3915795.0, 'argument_2': 100.26, 'result': 3915795.0}
 Actual Answer:{'Answer': 'Step 1: Multiply 15340810 by 96.08. This gives the result: 1473

In [None]:
index= [i for i in df_valid_nonnull.sample(15).index]
for idx in index:
  print(f"Index: {idx}, Llama Answer: {df_valid_nonnull.loc[idx,'finetuned_answers_final']}\n Actual Answer:{df_valid_nonnull.loc[idx,'actual_answer_final']}")
  print("-"*10)

Index: 297, Llama Answer: {'Answer': 'Step 1: Divide 29.9 by 54.9. This gives the result: 55.3%', 'Operation': 'Divide', 'argument_1': 29.9, 'argument_2': 54.9, 'result': 55.3}
 Actual Answer:{'Answer': 'Step 1: Divide 29.9 by 32.2. This gives the result: 92.8%', 'Operation': 'Divide', 'argument_1': 29.9, 'argument_2': 32.2, 'result': 92.8}
----------
Index: 92, Llama Answer: {'Answer': 'Step 1: Divide 8.3 by 54.9. This gives the result: 15%', 'Operation': 'Divide', 'argument_1': 8.3, 'argument_2': 54.9, 'result': 15.0}
 Actual Answer:{'Answer': 'Step 1: Divide 8.3 by 54.9. This gives the result: 15%', 'Operation': 'Divide', 'argument_1': 8.3, 'argument_2': 54.9, 'result': 15.0}
----------
Index: 385, Llama Answer: {'Answer': 'Step 1: Divide 29.9 by 54.9. This gives the result: 55.3%', 'Operation': 'Divide', 'argument_1': 29.9, 'argument_2': 54.9, 'result': 55.3}
 Actual Answer:{'Answer': 'Step 1: Divide 46.6 by 54.9. This gives the result: 85%', 'Operation': 'Divide', 'argument_1': 46

In [13]:
rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [14]:
def rogue_metric(actual_answer, model_answer):
  model_answer = (str(model_answer['Answer'])+str(model_answer['argument_1'])+str(model_answer['argument_2'])+str(model_answer['Operation']))
  actual_answer = (str(actual_answer['Answer'])+str(actual_answer['argument_1'])+str(actual_answer['argument_2'])+str(actual_answer['Operation']))
  try:
    instruct_model_results = rouge.compute(
    predictions=model_answer,
    references=actual_answer,
    use_aggregator=True,
    use_stemmer=True,
    )
  except:
    instruct_model_results = None
  return instruct_model_results

In [134]:
df_valid_nonnull['rogue_scores'] = df_valid_nonnull.progress_apply(lambda x: rogue_metric(x['actual_answer_final'], x['finetuned_answers_final']), axis=1)

Progress:: 100%|██████████| 57/57 [00:04<00:00, 13.16it/s]


In [136]:
df_valid_nonnull[df_valid_nonnull['rogue_scores'].notna()].shape
# .progress_apply(lambda x: x['rouge1'].mid.fmeasure)

(7, 14)

In [140]:
df_valid_nonnull_rogue = df_valid_nonnull[df_valid_nonnull['rogue_scores'].notna()]
# .progress_apply(lambda x: x['rouge1'].mid.fmeasure)

In [146]:
df_valid_nonnull_rogue["rogue_scores"].str.get("rouge1").mean()

0.6878837290298012

In [147]:
df_valid_nonnull_rogue["rogue_scores"].str.get("rouge2").mean()

0.0

In [149]:
df_valid_nonnull_rogue["rogue_scores"].str.get("rougeL").mean()

0.6878837290298012

In [150]:
df_valid_nonnull_rogue["rogue_scores"].str.get("rougeLsum").mean()

0.6878837290298012

In [151]:
df_valid_nonnull_rogue

Unnamed: 0,id,context,question,answer,table,answer_steps,num_steps,input_prompt,llama_finetuned_answers,answer_formatted,actual_answer_final,finetuned_answers_final,evaluation_dict,rogue_scores
16,V/2008/page_17.pdf-2,company the visa inc. ( 1 ) of payments volume...,what is the average payment volume per transac...,Step 1: Divide 55 by 0.6. This gives the resul...,"[[Company, Payments Volume (billions), Total ...","[{'arg1': '55', 'arg2': '0.6', 'op': 'divide2-...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 55 by 61. This give...,{'Answer': 'Step 1: Divide 55 by 0.6. This giv...,{'Answer': 'Step 1: Divide 55 by 0.6. This giv...,{'Answer': 'Step 1: Divide 55 by 61. This give...,"{'operator_match': 1, 'arg1_match': 1, 'arg2_m...","{'rouge1': 0.3283582089552239, 'rouge2': 0.0, ..."
24,ABMD/2007/page_52.pdf-2,contractual obligations the operating lease ob...,how much of the total contractual commitments ...,Step 1: Divide 8381 by 14090. This gives the r...,"[[, Payments Due By Fiscal Year], [Contractual...","[{'arg1': '8381', 'arg2': '14090', 'op': 'divi...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 7669 by 14090. This...,{'Answer': 'Step 1: Divide 8381 by 14090. This...,{'Answer': 'Step 1: Divide 8381 by 14090. This...,{'Answer': 'Step 1: Divide 7669 by 14090. This...,"{'operator_match': 1, 'arg1_match': 0, 'arg2_m...","{'rouge1': 0.6493506493506493, 'rouge2': 0.0, ..."
41,MRO/2015/page_18.pdf-2,( in thousands ) the u.s . of net undeveloped ...,"in 2018 , what percentage of undeveloped acres...",Step 1: Divide 128 by 1018. This gives the res...,"[[, Net Undeveloped Acres Expiring Year Ended ...","[{'arg1': '128', 'arg2': '1018', 'op': 'divide...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 128 by 1018. This g...,{'Answer': 'Step 1: Divide 128 by 1018. This g...,{'Answer': 'Step 1: Divide 128 by 1018. This g...,{'Answer': 'Step 1: Divide 128 by 1018. This g...,"{'operator_match': 1, 'arg1_match': 1, 'arg2_m...","{'rouge1': 0.7671232876712328, 'rouge2': 0.0, ..."
46,BLK/2013/page_125.pdf-2,year the 2014 of amount is $ 135 ;year the 201...,what are the future minimum commitments under ...,Step 1: Divide 127 by 1286. This gives the res...,"[[Year, Amount], [2014, $135], [2015, 127], [2...","[{'arg1': '127', 'arg2': '1286', 'op': 'divide...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 127 by 1286. This g...,{'Answer': 'Step 1: Divide 127 by 1286. This g...,{'Answer': 'Step 1: Divide 127 by 1286. This g...,{'Answer': 'Step 1: Divide 127 by 1286. This g...,"{'operator_match': 1, 'arg1_match': 1, 'arg2_m...","{'rouge1': 0.726027397260274, 'rouge2': 0.0, '..."
49,PPG/2011/page_70.pdf-3,( millions ) the royalty income of 2011 is 55 ...,what was royalty income as a percentage of tot...,Step 1: Divide 45 by 150. This gives the resul...,"[[<i>(Millions)</i>, <i>2011</i>, <i>2010</i>,...","[{'arg1': '45', 'arg2': '150', 'op': 'divide1-...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 45 by 150. This giv...,{'Answer': 'Step 1: Divide 45 by 150. This giv...,{'Answer': 'Step 1: Divide 45 by 150. This giv...,{'Answer': 'Step 1: Divide 45 by 150. This giv...,"{'operator_match': 1, 'arg1_match': 1, 'arg2_m...","{'rouge1': 0.7611940298507462, 'rouge2': 0.0, ..."
51,AES/2016/page_185.pdf-2,"december 31, the ipalco common stock of 2016 i...",what was the change in millions of ipalco comm...,Step 1: Subtract 460 from 618. This gives the ...,"[[December 31,, 2016, 2015], [IPALCO common st...","[{'arg1': '618', 'arg2': '460', 'op': 'minus1-...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Subtract 460 from 618. Thi...,{'Answer': 'Step 1: Subtract 460 from 618. Thi...,{'Answer': 'Step 1: Subtract 460 from 618. Thi...,{'Answer': 'Step 1: Subtract 460 from 618. Thi...,"{'operator_match': 1, 'arg1_match': 1, 'arg2_m...","{'rouge1': 0.8, 'rouge2': 0.0, 'rougeL': 0.8, ..."
61,GPN/2017/page_77.pdf-2,cash the customer-related intangible assets of...,what portion of the total purchase considerati...,Step 1: Divide 203828 by 265982. This gives th...,"[[Cash, $45,826], [Customer-related intangible...","[{'arg1': '203828', 'arg2': '265982', 'op': 'd...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 203828 by 265982. T...,{'Answer': 'Step 1: Divide 203828 by 265982. T...,{'Answer': 'Step 1: Divide 203828 by 265982. T...,{'Answer': 'Step 1: Divide 203828 by 265982. T...,"{'operator_match': 1, 'arg1_match': 1, 'arg2_m...","{'rouge1': 0.7831325301204819, 'rouge2': 0.0, ..."


In [152]:
df_valid_nonnull.to_csv("./llama7b_notchat_finetuned_answers_evaluation.csv",index=False)

In [15]:
df_valid_nonnull = pd.read_csv("./llama7b_notchat_finetuned_answers_evaluation.csv",index_col=False)

In [18]:
df_valid_nonnull[(df_valid_nonnull['evaluation_dict'].str.get('arg1_match')==1) & (df_valid_nonnull['evaluation_dict'].str.get('arg2_match')==1)\
                        & (df_valid_nonnull['evaluation_dict'].str.get('operator_match')==1)]['evaluation_dict'].str.get('result_deviation').mean()

nan

In [17]:
df_valid_nonnull

Unnamed: 0,id,context,question,answer,table,answer_steps,num_steps,input_prompt,llama_finetuned_answers,answer_formatted,actual_answer_final,finetuned_answers_final,evaluation_dict,rogue_scores
0,APD/2014/page_39.pdf-1,the sales of 2014 is $ 450.4 ; the sales of 20...,what was the operating margin for 2014?,Step 1: Divide 88.2 by 450.4. This gives the r...,"[['', '2014', '2013', '2012'], ['Sales', '$450...","[{'arg1': '88.2', 'arg2': '450.4', 'op': 'divi...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 88.2 by 450.4. This...,{'Answer': 'Step 1: Divide 88.2 by 450.4. This...,{'Answer': 'Step 1: Divide 88.2 by 450.4. This...,{'Answer': 'Step 1: Divide 88.2 by 450.4. This...,"{'operator_match': 1, 'arg1_match': 1, 'arg2_m...",
1,MRO/2017/page_111.pdf-3,beginning of year the revisions of previous es...,what would end of year proven reserves be with...,Step 1: Subtract 57 from 546. This gives the r...,"[['Beginning of year', '552'], ['Revisions of ...","[{'arg1': '546', 'arg2': '57', 'op': 'minus2-1...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 546 by 552. This gi...,{'Answer': 'Step 1: Subtract 57 from 546. This...,{'Answer': 'Step 1: Subtract 57 from 546. This...,{'Answer': 'Step 1: Divide 546 by 552. This gi...,"{'operator_match': 0, 'arg1_match': 0, 'arg2_m...",
2,HUM/2017/page_133.pdf-3,paymentdate the 2015 of amountper share is $ 1...,what was the number of shares issued in 2015 i...,Step 1: Divide 170 by 1.14. This gives the res...,"[['PaymentDate', 'Amountper Share', 'TotalAmou...","[{'arg1': '170', 'arg2': '1.14', 'op': 'divide...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 170 by 1.14. This g...,{'Answer': 'Step 1: Divide 170 by 1.14. This g...,{'Answer': 'Step 1: Divide 170 by 1.14. This g...,{'Answer': 'Step 1: Divide 170 by 1.14. This g...,"{'operator_match': 1, 'arg1_match': 1, 'arg2_m...",
3,RE/2015/page_33.pdf-1,( dollars in millions ) the 2015 of december 3...,what is the book to market ratio of the commer...,Step 1: Divide 264.9 by 266.3. This gives the ...,"[['', 'December 31,'], ['(Dollars in millions)...","[{'arg1': '264.9', 'arg2': '266.3', 'op': 'div...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 17430.8 by 16831.9....,{'Answer': 'Step 1: Divide 264.9 by 266.3. Thi...,{'Answer': 'Step 1: Divide 264.9 by 266.3. Thi...,{'Answer': 'Step 1: Divide 17430.8 by 16831.9....,"{'operator_match': 1, 'arg1_match': 0, 'arg2_m...",
4,AMT/2005/page_105.pdf-1,"years ended december 31, the 2006 to 2010 of f...",in 2005 what was the percentage of the federal...,Step 1: Divide 397691 by 2157503. This gives t...,"[['Years ended December 31,', 'Federal', 'Stat...","[{'arg1': '397691', 'arg2': '2157503', 'op': '...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 10012 by 5248. This...,{'Answer': 'Step 1: Divide 397691 by 2157503. ...,{'Answer': 'Step 1: Divide 397691 by 2157503. ...,{'Answer': 'Step 1: Divide 10012 by 5248. This...,"{'operator_match': 1, 'arg1_match': 0, 'arg2_m...",
5,SPGI/2015/page_57.pdf-1,current assets the property plant and equipmen...,what was the net equity in the assets acquired,Step 1: Subtract 144 from 2378. This gives the...,"[['Current assets', '$23'], ['Property, plant ...","[{'arg1': '2378', 'arg2': '144', 'op': 'minus1...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 2234 by 23. This gi...,{'Answer': 'Step 1: Subtract 144 from 2378. Th...,{'Answer': 'Step 1: Subtract 144 from 2378. Th...,{'Answer': 'Step 1: Divide 2234 by 23. This gi...,"{'operator_match': 0, 'arg1_match': 0, 'arg2_m...",
6,LMT/2017/page_80.pdf-3,the weighted average common shares outstanding...,what was the change in millions of weighted av...,Step 1: Subtract 314.7 from 303.1. This gives ...,"[['', '2017', '2016', '2015'], ['Weighted aver...","[{'arg1': '303.1', 'arg2': '314.7', 'op': 'min...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 314.7 by 310.3. Thi...,{'Answer': 'Step 1: Subtract 314.7 from 303.1....,{'Answer': 'Step 1: Subtract 314.7 from 303.1....,{'Answer': 'Step 1: Divide 314.7 by 310.3. Thi...,"{'operator_match': 0, 'arg1_match': 1, 'arg2_m...",
7,ETR/2017/page_316.pdf-3,the 2016 net revenue of amount ( in millions )...,in 2016 what was the ratio of the net income i...,Step 1: Divide 92.9 by 1520.5. This gives the ...,"[['', 'Amount (In Millions)'], ['2016 net reve...","[{'arg1': '92.9', 'arg2': '1520.5', 'op': 'div...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 1520.5 by 1522.6. T...,{'Answer': 'Step 1: Divide 92.9 by 1520.5. Thi...,{'Answer': 'Step 1: Divide 92.9 by 1520.5. Thi...,{'Answer': 'Step 1: Divide 1520.5 by 1522.6. T...,"{'operator_match': 1, 'arg1_match': 0, 'arg2_m...",
8,IP/2006/page_32.pdf-2,in millions the sales of 2006 is $ 2455 ; the ...,in 2005 what percentage of consumer packaging ...,Step 1: Divide 437 by 2245. This gives the res...,"[['<i>In millions</i>', '2006', '2005', '2004'...","[{'arg1': '437', 'arg2': '2245', 'op': 'divide...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 121 by 2245. This g...,{'Answer': 'Step 1: Divide 437 by 2245. This g...,{'Answer': 'Step 1: Divide 437 by 2245. This g...,{'Answer': 'Step 1: Divide 121 by 2245. This g...,"{'operator_match': 1, 'arg1_match': 0, 'arg2_m...",
9,ZBH/2003/page_58.pdf-3,the finished goods of 2003 is $ 384.3 ; the fi...,what percent of inventory is ready for liquida...,Step 1: Divide 384.3 by 527.7. This gives the ...,"[['', '2003', '2002'], ['Finished goods', '$38...","[{'arg1': '384.3', 'arg2': '527.7', 'op': 'div...",1,### Instruction: Act as a financial analyst ad...,{'Answer': 'Step 1: Divide 527.7 by 384.3. Thi...,{'Answer': 'Step 1: Divide 384.3 by 527.7. Thi...,{'Answer': 'Step 1: Divide 384.3 by 527.7. Thi...,{'Answer': 'Step 1: Divide 527.7 by 384.3. Thi...,"{'operator_match': 1, 'arg1_match': 0, 'arg2_m...",
