In [1]:
%%capture
!pip install -q -U bitsandbytes
# !pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install transformers==4.31
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install evaluate
!pip install -qqq trl==0.7.1
!pip install torch
!pip install -q rouge_score

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%ls

[0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [4]:
%cd /content/drive/MyDrive/DLT_Project/

/content/drive/MyDrive/DLT_Project


In [5]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import transformers
from torch.optim import Adam
from tqdm import tqdm
import utils
from utils import *
import json
import pandas as pd
import torch
# from torch.utils.data import Dataset, DataLoader
import time
import evaluate
from datasets import Dataset, load_dataset
import random
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


In [6]:
dir_path = '/content/drive/MyDrive/DLT_Project/'
os.chdir(dir_path)

In [7]:
with open(f"{dir_path}/train.json") as input_file:
        train_data = json.load(input_file)

with open(f"{dir_path}/dev.json") as input_file:
        valid_data = json.load(input_file)

with open(f"{dir_path}/test.json") as input_file:
        test_data = json.load(input_file)

In [8]:
assert len(train_data) == 6251, "The train data seems to be off, please check"
assert len(valid_data) == 883, "The validation data seems to be off, please check"
assert len(test_data) == 1147, "The test data seems to be off, please check"

In [9]:
df_train_1step_serialized =pd.read_csv(f"{dir_path}/df_train_1step_serialized.csv")

In [10]:
train_data=pd.DataFrame(train_data).merge(df_train_1step_serialized[['id','serialized_text_gpt35']],on='id',how='inner').to_dict(orient='records')

In [11]:
def prepare_data(json_data, verbose=True, split = 'train'):
    net=[]
    err_cnt=0
    for example in json_data:
        question = example["qa"]["question"]
        table = example["table"]
        id=example["id"]
        table_ori=example["table_ori"]
        answer_steps=example["qa"]["steps"]
        if split == 'train':
            serialized_text=example["serialized_text_gpt35"]
            table_text = ""
            for row in table[1:]:
                this_sent = table_row_to_text(table[0], row)
                table_text += this_sent
            try:
                steps_text = format_steps(example["qa"]["steps"])
                inputs = {"id":id, "context": serialized_text, "question": question, "answer": steps_text,\
                          "table":table_ori,"answer_steps":answer_steps}
                net.append(inputs)
            except:
                err_cnt+=1
                if verbose:
                    print ("-"*25)
                    print (example["filename"])
                    print (example["qa"]["steps"])
                    print ("-"*25+"\n")
        else:
            table_text = ""
            for row in table[1:]:
                this_sent = table_row_to_text(table[0], row)
                table_text += this_sent
            try:
                steps_text = format_steps(example["qa"]["steps"])
                inputs = {"id":id, "context": table_text, "question": question, "answer": steps_text,\
                          "table":table_ori,"answer_steps":answer_steps}
                net.append(inputs)
            except:
                err_cnt+=1
                if verbose:
                    print ("-"*25)
                    print (example["filename"])
                    print (example["qa"]["steps"])
                    print ("-"*25+"\n")

    if err_cnt>0:
        print ("Net Errors:",err_cnt)
    return net

In [12]:
## Preparing huggingface dataset
data_splits = {'train': train_data, 'valid': valid_data, 'test': test_data}
datasets = {split: Dataset.from_pandas(pd.DataFrame(prepare_data(data, False, split))) for split, data in data_splits.items()}

Net Errors: 35
Net Errors: 39


In [13]:
step_pattern = r"Step \d+"
import re

In [14]:
def num_steps_dict_fn(example):
    example["num_steps"] = len(re.findall(step_pattern, example["answer"]))
    return example

In [15]:
datasets['train']=datasets['train'].map(num_steps_dict_fn)
datasets['valid']=datasets['valid'].map(num_steps_dict_fn)
datasets['test']=datasets['test'].map(num_steps_dict_fn)

Map:   0%|          | 0/3532 [00:00<?, ? examples/s]

Map:   0%|          | 0/848 [00:00<?, ? examples/s]

Map:   0%|          | 0/1108 [00:00<?, ? examples/s]

In [16]:
datasets['train']=datasets['train'].filter(lambda example: example['num_steps']==1)
datasets['valid']=datasets['valid'].filter(lambda example: example['num_steps']==1)
datasets['test']=datasets['test'].filter(lambda example: example['num_steps']==1)

Filter:   0%|          | 0/3532 [00:00<?, ? examples/s]

Filter:   0%|          | 0/848 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1108 [00:00<?, ? examples/s]

In [17]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [18]:
import getpass

# Prompt for the Hugging Face token
hf_token = getpass.getpass("Enter your Hugging Face token: ")

import os
os.environ['HUGGINGFACE_TOKEN'] = hf_token

Enter your Hugging Face token: ··········


In [19]:
# model_id =  "meta-llama/Llama-2-7b-hf"
model_id = "meta-llama/Llama-2-7b-chat-hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto",use_auth_token=hf_token)

tokenizer = AutoTokenizer.from_pretrained(model_id,use_auth_token=hf_token)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]



tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [20]:
import re

def extract_info(input_text):
  try:
      # Regular expression to match the answer pattern
      answer_pattern = r"Step 1: ([A-Za-z]+) ([-]?[$]?[\d,.]+) .* ([-]?[$]?[\d,.]+)\.* This gives the result: ([-]?[$]?[\d.]+%?)"

      # Find the matching groups
      match = re.search(answer_pattern, input_text)
      if match:
          operation, arg1, arg2, result = match.groups()
          # Removing commas from the arguments and converting to appropriate types

          chars_to_replace=[',','$']
          for char in chars_to_replace:
              arg1=arg1.replace(char,'')
              arg2=arg2.replace(char,'')
              result=result.replace(char,'')
          # arg1 = float(arg1)
          # arg2 = float(arg2)
          # Checking if result is a percentage or a number
          if '%' in result:
              result = (result.replace('%', ''))
          else:
              result = (result)

          # Constructing the output dictionary
          output = {
              'Answer': match.group(0),
              'Operation': operation.capitalize(),
              'argument_1': arg1,
              'argument_2': arg2,
              'result': result
          }
          return output
  except:
      return "Pattern not found"

In [21]:
%%capture
!pip install langchain-experimental
from langchain.agents import Tool
from langchain_experimental.utilities import PythonREPL
python_repl = PythonREPL()

In [22]:
df_valid =pd.DataFrame(datasets['valid'])
df_train =pd.DataFrame(datasets['train'])

Generating few-shot examples from the training dataset

In [23]:
list_indices=[15,45,21]

prompt=''
dash_line = '-'.join('' for x in range(100))
for i,j in enumerate(list_indices):
  context = datasets['train'][j]['context']
  answer = datasets['train'][j]['answer']
  question = datasets['train'][j]['question']

  prompt+= f"""### Instruction: Act as a financial analyst adept in reading Earning Reports and Financial Documents.
  Answer the question in a single step by extracting relevant values from the context given below. Perform the necessary arithmetic operation.
  Do not generate any new numbers for the calculation. Only refer to information present in the context.

  ### Context:
  {context}

  ### Question:
  {question}

  ### Answer:
  {answer}
  """.strip()

  prompt+="\n###SEP###\n"

In [24]:
print(prompt)

### Instruction: Act as a financial analyst adept in reading Earning Reports and Financial Documents.
  Answer the question in a single step by extracting relevant values from the context given below. Perform the necessary arithmetic operation.
  Do not generate any new numbers for the calculation. Only refer to information present in the context.

  ### Context:
  The table above provides information on the number of units and square footage of properties owned by a company. The first row indicates that the company owns multifamily communities consisting of 304 units in total. Of these, 303 units are consolidated and 1 unit is unconsolidated. The second row shows that the company also owns commercial properties totaling 260,000 square feet, all of which are consolidated.

  ### Question:
  what is the percentage of consolidated communities among the total communities?

  ### Answer:
  Step 1: Divide 303 by 304. This gives the result: 99.67%
###SEP###
### Instruction: Act as a financia

In [25]:
def extract_answer_fewshot_output(output):
    # Find the last '###SEP###' occurrence. If it exists, extract the text following it.
    sep = '###SEP###'
    parts = output.split(sep)
    if len(parts) > 1:
        # Remove the last part which is after the last '###SEP###'
        parts.pop()
        # Join the remaining parts back together
        text_without_last_sep = sep.join(parts)
    else:
        text_without_last_sep = output

    # Extract the text after the last answer before the last '###SEP###'
    answer_token = '### Answer:'
    last_answer_start = text_without_last_sep.rfind(answer_token) + len(answer_token)
    last_answer_end = text_without_last_sep.find('###', last_answer_start)
    text_following_last_answer = text_without_last_sep[last_answer_start:last_answer_end].strip()
    return {'answer':text_following_last_answer}

In [26]:
def output_fewshot(row,prompt=prompt):
  context = row['context']
  question = row['question']
  answer = row['answer']
  prompt+= f"""### Instruction: Act as a financial analyst adept in reading Earning Reports and Financial Documents.
  Answer the question in a single step by extracting relevant values from the context given below. Perform the necessary arithmetic operation.
  Do not generate any new numbers for the calculation. Only refer to information present in the context.

  ### Context:
  {context}

  ### Question:
  {question}

  ### Answer:

  """.strip()
  try:
    inputs = tokenizer(prompt, return_tensors='pt')
    output = tokenizer.decode(
        model.generate(
            inputs["input_ids"],
            temperature=0.1,
            do_sample=True,
            max_new_tokens=128,
        )[0],
        skip_special_tokens=True
    )
    # ##Post-Processing the Output
    ans_dict = extract_answer_fewshot_output(output)
    return ans_dict
  except:
    return {'answer':None}

In [27]:
from tqdm import tqdm
tqdm.pandas(desc="Progress:")

Testing few-shot for a couple examples

In [85]:
import warnings
warnings.filterwarnings('ignore')
sample_output = df_valid.head(2).progress_apply(lambda row:output_fewshot(row,prompt),axis=1)

Progress:: 100%|██████████| 2/2 [00:34<00:00, 17.23s/it]


In [86]:
sample_output

0    {'answer': 'Step 1: Divide 637 by 50.3. This g...
1    {'answer': 'Step 1: Subtract 536.7 from 555.3....
dtype: object

RUNNING FEW-SHOT INFERENCE ON THE VALIDATION DATASET

In [28]:
df_valid.shape

(489, 7)

In [29]:
df_valid = df_valid.sample(64, random_state = 42)

In [30]:
df_valid.loc[:,"fewshot_answers"]=df_valid.progress_apply(lambda row:output_fewshot(row,prompt),axis=1)

Progress:: 100%|██████████| 64/64 [09:49<00:00,  9.22s/it]


In [31]:
# df_valid.to_csv("./fewshot_answers_valid64_llama7b_nonchat.csv",index=False)
df_valid.to_csv("./fewshot_answers_valid64_llama7b_chat.csv",index=False)

In [32]:
# df_valid = pd.read_csv("fewshot_answers_valid64_llama7b_nonchat.csv",index_col=False)
df_valid= pd.read_csv("./fewshot_answers_valid64_llama7b_chat.csv",index_col=False)

ANSWER FORMATTING AND FURTHER POST-PROCESSING OF FEW-SHOT OUTPUTS

In [33]:
df_valid['answer_formatted'] = df_valid['answer'].progress_apply(lambda x:extract_info(x))

Progress:: 100%|██████████| 64/64 [00:00<00:00, 28339.89it/s]


In [34]:
df_valid['answer_formatted'][0]

{'Answer': 'Step 1: Divide 68389 by 144535. This gives the result: 0.4732',
 'Operation': 'Divide',
 'argument_1': '68389',
 'argument_2': '144535.',
 'result': '0.4732'}

In [35]:
import ast

In [36]:
def convert_to_dict(string):
  try:
    return ast.literal_eval(string)
  except:
    return None

In [37]:
df_valid['fewshot_answers'] = df_valid['fewshot_answers'].progress_apply(lambda x:convert_to_dict(x))

Progress:: 100%|██████████| 64/64 [00:00<00:00, 26056.64it/s]


In [38]:
df_valid['few_shot_answer_formatted'] = df_valid.progress_apply(lambda x:extract_info(x['fewshot_answers']['answer']), axis = 1)

Progress:: 100%|██████████| 64/64 [00:00<00:00, 19302.18it/s]


In [39]:
def clean_digits(ans):
  if ans is None:
    return None
  for i, j in ans.items():
    try:
      ans_cleaned = re.sub('((\d+)[\.])(?!([\d]+))','\g<2>',j)
      ans[i] = float(ans_cleaned)
    except:
      ans[i] = j
  return ans

In [40]:
df_valid['actual_answer_final'] = df_valid['answer_formatted'].progress_apply(lambda x:clean_digits(x))
df_valid['fewshot_answers_final'] = df_valid['few_shot_answer_formatted'].progress_apply(lambda x:clean_digits(x))

Progress:: 100%|██████████| 64/64 [00:00<00:00, 19237.17it/s]
Progress:: 100%|██████████| 64/64 [00:00<00:00, 18744.18it/s]


EVALUATION

In [41]:
ops_dict={'Divide':'/', 'Multiply':'*', 'Add':'+', 'Subtract':'-'}

In [42]:
##Checking sample for evaluation
sample_answers = df_valid[df_valid['fewshot_answers'].notna()].sample(1, random_state=0)
fewshot_answers_final, actual_answer_final = sample_answers['fewshot_answers_final'].values[0], sample_answers['actual_answer_final'].values[0]
ans_dict = fewshot_answers_final

In [43]:
ans_dict

{'Answer': 'Step 1: Divide 7680 by 4717. This gives the result: 160%',
 'Operation': 'Divide',
 'argument_1': 7680.0,
 'argument_2': 4717.0,
 'result': 160.0}

In [44]:
value_to_compute= str(ans_dict['argument_1'])+(ops_dict[ans_dict['Operation']])+ str(ans_dict['argument_2'])
computed_ans = python_repl.run(f"print({value_to_compute})")
python_repl.run(f"print({value_to_compute})")
rounded_ans = round(float(computed_ans.strip()),2)
rounded_ans



1.63

In [45]:
def evaluation(question, actual_ans, model_ans):
  '''
  actual_ans, model_ans: dict of answer components (operation, arguments, result)
  '''
  if actual_ans is None or model_ans is None:
    return None
  operator_match = int(actual_ans['Operation'] == model_ans['Operation'])
  # arg1_match = int(actual_ans['argument_1'] == model_ans['argument_1'])
  if float(abs(actual_ans['argument_1'] - model_ans['argument_1'])) < 0.1:
    arg1_match = 1
  else:
    arg1_match = 0
  if float(abs(actual_ans['argument_2'] - model_ans['argument_2'])) < 0.1:
    arg2_match = 1
  else:
    arg2_match = 0
  # arg2_match = int(actual_ans['argument_2'] == model_ans['argument_2'])
  if float(abs(actual_ans['result'] - model_ans['result'])) < 0.1:
    result_match = 1
  else:
    result_match = 0
  # result_match = int(actual_ans['result'] == model_ans['result'])
  result_deviation = float(abs(abs(actual_ans['result']) - abs(model_ans['result'])))

  reverse_arg_match = int((actual_ans['argument_1'] == model_ans['argument_2'])&(actual_ans['argument_2'] == model_ans['argument_1']))
  benchmark_answer=actual_ans['result']
  model_answer=model_ans['result']
  # rounded_ans = None
  # computed_result_deviation = None
  ##Computing the answer based on model's returned argument and operator
  if (arg1_match ==1) and (arg2_match ==1):
    value_to_compute= str(model_ans['argument_1'])+(ops_dict[model_ans['Operation']])+ str(model_ans['argument_2'])
    if 'percent' in question.lower():
      computed_ans = python_repl.run(f"print({value_to_compute}*100.0)")
      rounded_ans = round(float(computed_ans.strip()),2)
      computed_result_deviation = abs(abs(actual_ans['result']) - abs(rounded_ans))
    else:
      computed_ans = python_repl.run(f"print({value_to_compute})")
      rounded_ans = round(float(computed_ans.strip()),2)
      computed_result_deviation = abs(abs(actual_ans['result']) - abs(rounded_ans))

  else:
    rounded_ans = None
    computed_result_deviation = None

  evaluation_dict = ['operator_match','arg1_match','arg2_match','result_match','result_deviation','model_program_ans','computed_result_deviation','reverse_arg_match','benchmark_answer','model_answer']
  evaluation_dict = {k:v for k,v in zip(evaluation_dict, [operator_match, arg1_match, arg2_match, result_match, result_deviation,rounded_ans, computed_result_deviation,reverse_arg_match,benchmark_answer,model_answer])}
  return evaluation_dict

In [46]:
df_valid['evaluation_dict'] = df_valid.progress_apply(lambda x: evaluation(x['question'],x['actual_answer_final'], x['fewshot_answers_final']), axis=1)

Progress:: 100%|██████████| 64/64 [00:00<00:00, 5347.32it/s]


In [47]:
df_valid_nonnull = df_valid[df_valid['evaluation_dict'].notna()]

In [48]:
df_valid_nonnull.shape

(55, 13)

In [49]:
np.mean(df_valid_nonnull['evaluation_dict'].progress_apply(lambda x: x['operator_match']))

Progress:: 100%|██████████| 55/55 [00:00<00:00, 67749.40it/s]


0.7454545454545455

In [50]:
np.mean(df_valid_nonnull['evaluation_dict'].progress_apply(lambda x: x['arg1_match']))

Progress:: 100%|██████████| 55/55 [00:00<00:00, 76998.24it/s]


0.14545454545454545

In [51]:
np.mean(df_valid_nonnull['evaluation_dict'].progress_apply(lambda x: x['arg2_match']))

Progress:: 100%|██████████| 55/55 [00:00<00:00, 68250.51it/s]


0.16363636363636364

In [52]:
np.mean(df_valid_nonnull['evaluation_dict'].progress_apply(lambda x: x['result_match']))

Progress:: 100%|██████████| 55/55 [00:00<00:00, 76946.87it/s]


0.09090909090909091

In [53]:
np.mean(df_valid_nonnull['evaluation_dict'].progress_apply(lambda x: x['result_deviation']))

Progress:: 100%|██████████| 55/55 [00:00<00:00, 68189.99it/s]


215387517.02868733

In [54]:
np.mean(df_valid_nonnull['evaluation_dict'].progress_apply(lambda x: x['computed_result_deviation']))

Progress:: 100%|██████████| 55/55 [00:00<00:00, 71464.29it/s]


332.66999999999996

In [55]:
np.mean(df_valid_nonnull['evaluation_dict'].progress_apply(lambda x: x['result_deviation']))

Progress:: 100%|██████████| 55/55 [00:00<00:00, 74439.08it/s]


215387517.02868733

In [56]:
df_valid_nonnull[(df_valid_nonnull['evaluation_dict'].str.get('arg1_match')==1) & (df_valid_nonnull['evaluation_dict'].str.get('arg2_match')==1)\
                        & (df_valid_nonnull['evaluation_dict'].str.get('operator_match')==1)]['evaluation_dict'].str.get('computed_result_deviation').mean()

0.006666666666666672

In [57]:
df_valid_nonnull[(df_valid_nonnull['evaluation_dict'].str.get('arg1_match')==1) & (df_valid_nonnull['evaluation_dict'].str.get('arg2_match')==1)\
                        & (df_valid_nonnull['evaluation_dict'].str.get('operator_match')==1)].shape

(6, 13)

In [58]:
df_valid_nonnull[(df_valid_nonnull['evaluation_dict'].str.get('arg1_match')==1) & (df_valid_nonnull['evaluation_dict'].str.get('arg2_match')==1)\
                        & (df_valid_nonnull['evaluation_dict'].str.get('operator_match')==1)]['evaluation_dict'].str.get('result_deviation').mean()

25.06866666666667

In [59]:
result_dev = df_valid_nonnull['evaluation_dict'].progress_apply(lambda x: x['result_deviation'])

Progress:: 100%|██████████| 55/55 [00:00<00:00, 43402.96it/s]


In [60]:
index= [i for i in df_valid_nonnull.sample(10, random_state = 0).index]
for idx in index:
  print(f"Index: {idx}, Llama Answer: {df_valid_nonnull.loc[idx,'fewshot_answers_final']}\n Actual Answer:{df_valid_nonnull.loc[idx,'actual_answer_final']}")
  print("-"*10)

Index: 52, Llama Answer: {'Answer': 'Step 1: Divide 79 by 1058. This gives the result: 7.5%', 'Operation': 'Divide', 'argument_1': 79.0, 'argument_2': 1058.0, 'result': 7.5}
 Actual Answer:{'Answer': 'Step 1: Divide 397097677 by 28.85. This gives the result: 11456267981.5', 'Operation': 'Divide', 'argument_1': 397097677.0, 'argument_2': 28.85, 'result': 11456267981.5}
----------
Index: 39, Llama Answer: {'Answer': 'Step 1: Divide 6501 by 899. This gives the result: 7.3', 'Operation': 'Divide', 'argument_1': 6501.0, 'argument_2': 899.0, 'result': 7.3}
 Actual Answer:{'Answer': 'Step 1: Divide 38 by 188. This gives the result: .20', 'Operation': 'Divide', 'argument_1': 38.0, 'argument_2': 188.0, 'result': 0.2}
----------
Index: 46, Llama Answer: {'Answer': 'Step 1: Divide 79 by 1058. This gives the result: 7.5%', 'Operation': 'Divide', 'argument_1': 79.0, 'argument_2': 1058.0, 'result': 7.5}
 Actual Answer:{'Answer': 'Step 1: Divide 2 by 6. This gives the result: 33.3%', 'Operation': 'Di

In [61]:
rouge = evaluate.load("rouge")

In [62]:
def rouge_metric(actual_answer, model_answer):
  model_answer = (str(model_answer['Answer'])+str(model_answer['argument_1'])+str(model_answer['argument_2'])+str(model_answer['Operation']))
  actual_answer = (str(actual_answer['Answer'])+str(actual_answer['argument_1'])+str(actual_answer['argument_2'])+str(actual_answer['Operation']))
  try:
    instruct_model_results = rouge.compute(
    predictions=model_answer,
    references=actual_answer,
    use_aggregator=True,
    use_stemmer=True,
    )
  except:
    instruct_model_results = None
  return instruct_model_results

In [63]:
df_valid_nonnull['rogue_scores'] = df_valid_nonnull.progress_apply(lambda x: rouge_metric(x['actual_answer_final'], x['fewshot_answers_final']), axis=1)

Progress:: 100%|██████████| 55/55 [00:00<00:00, 66.40it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_valid_nonnull['rogue_scores'] = df_valid_nonnull.progress_apply(lambda x: rouge_metric(x['actual_answer_final'], x['fewshot_answers_final']), axis=1)


In [64]:
df_valid_nonnull[df_valid_nonnull['rogue_scores'].notna()].shape

(4, 14)

In [65]:
df_valid_nonnull["rogue_scores"].str.get("rougeLsum").mean()

0.6373363949483352

In [66]:
# df_valid_nonnull.to_csv("./df_valid_nonnull_llama7b_fewshot", index = False)

In [67]:
df_valid_nonnull.to_csv("./df_valid_nonnull_llama7bchat_fewshot", index = False)