# Import Libraries

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/gsm8k-dataset/test.jsonl
/kaggle/input/gsm8k-dataset/train.jsonl
/kaggle/input/gsm8k-dataset/test_prompts.csv


In [2]:
import json
import requests
import pandas as pd

# Settings

In [3]:
API_TOKEN = "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"

In [4]:
API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"

In [5]:
headers = {"Authorization": f"Bearer {API_TOKEN}"}

# Functions to run the generation

Using some functions from supplementary materials

In [6]:
import re

NUMBER_SET = [str(num) for num in range(0, 10)]

def _is_float(s):
    try:
        float(s)
        return True
    except:
        return False

FINAL_ANS = "answer is "

def clean_ans(ans):
    """
    Extract the number from the answer string
    """
    index = ans.find(".")
    if index >= 0:
        end_index = index + 1
        while end_index < len(ans) and ans[end_index] in NUMBER_SET:
            end_index += 1
        ans = ans[:end_index]
    while ans and ans.endswith("."):
        ans = ans[:-1]
    ans = ans.split("=")[-1].strip()
    for c in ["$", ",", "%", "€", '"']:
        ans = ans.replace(c, "")
    parts = ans.split(" ")
    for part in parts:
        if _is_float(part):
            return part
    ans = parts[0]  # default
    for part in parts:
        if not part.isalpha():  # take the 1st non-alpha token
            ans = part
            break
    while ans and ans[-1].isalpha():
        ans = ans[:-1]
    return ans.strip()

def get_ans(pred):
    """
    Method to get the string with The answer is ...
    """
    text = pred
    if text.rfind(FINAL_ANS) >= 0:
        pred_ans = text[text.rfind(FINAL_ANS) + len(FINAL_ANS) : len(text)].strip()
        return pred_ans
    else:
        return ""

Create some personal functions

In [7]:
def query(payload):
    """
    Function to send queries to HF Bloom model with your params
    
    Args:
    payload (dict): your prompt and params
    
    Returns:
    response (json object): generated text
    """
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()

In [8]:
def generate_answer(prompt):
    """
    Function to send queries until you get the answer
    
    Args:
    prompt (str): your prompt
    
    Returns:
    answer (float): numeric answer
    current_text (str): generated text without the prompt
    """
    # Delete \n, otherwise not working
    prompt = prompt.replace("\n", " ")
    # Variable to store the answer
    answer = None
    # Variable to count the iterations
    counter = 0
    # Variable to save the current generated text
    current_text = prompt
    
    # Set some limit to generate the answer
    while counter < 20:
        # Increase counter after another try
        counter += 1
        
        # Write the query
        payload = {"inputs": current_text}, {"max_new_tokens": 128, "wait_for_model": True}
        
        # Send the query and get the generated text
        output = query(payload)
        try:
            generated_text = output[0]["generated_text"]
        except:
            print("Tokens are dead.")
            break
        
        # Cut the prompt from the generated text
        new_text = generated_text[len(prompt):]
        # Update the current text
        current_text = generated_text
        
        # Extract the answer and check it
        answer = clean_ans(get_ans(new_text))
        if answer != "":
            # Stop generation if necessary
            break

    return answer, current_text[len(prompt):]

# Open the dataset

In [9]:
df = pd.read_csv("/kaggle/input/gsm8k-dataset/test_prompts.csv")

In [10]:
df.head(3)

Unnamed: 0,question,answer,numeric_answer,CoT_anot1,CoT_anot2,CoT_paths,CoT_directions
0,Janet’s ducks lay 16 eggs per day. She eats th...,Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eg...,18,Q: There are 15 trees in the grove. Grove work...,"Q: Shawn has five toys. For Christmas, he got ...",Q: Betty is saving money for a new wallet whic...,Use mathemathical operations and write step by...
1,A robe takes 2 bolts of blue fiber and half th...,It takes 2/2=<<2/2=1>>1 bolt of white fiber\nS...,3,Q: There are 15 trees in the grove. Grove work...,"Q: Shawn has five toys. For Christmas, he got ...",Q: Betty is saving money for a new wallet whic...,Use mathemathical operations and write step by...
2,Josh decides to try flipping a house. He buys...,The cost of the house and repairs came out to ...,70000,Q: There are 15 trees in the grove. Grove work...,"Q: Shawn has five toys. For Christmas, he got ...",Q: Betty is saving money for a new wallet whic...,Use mathemathical operations and write step by...


Decreasing the size (significantly) of a dataset to boost the experiment

In [11]:
df = df[:100]

Run the loop

In [12]:
# Create lists for numeric answers
COT_answer1_list = []
COT_answer2_list = []
CoT_directions_list = []

# Create lists for texts
COT_text1 = []
COT_text2 = []
COT_directions_text = []

# Run the loop with generation
for i in range(len(df)):
    question = df["CoT_anot1"][i]
    answer, generated_text = generate_answer(question)
    COT_answer1_list.append(answer)
    COT_text1.append(generated_text)
    
    question = df["CoT_anot2"][i]
    answer, generated_text = generate_answer(question)
    COT_answer2_list.append(answer)
    COT_text2.append(generated_text)
    
    question = df["CoT_directions"][i]
    answer, generated_text = generate_answer(question)
    CoT_directions_list.append(answer)
    COT_directions_text.append(generated_text)

Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dead.
Tokens are dea

In [13]:
df["CoT1 answers"] = COT_answer1_list
df["CoT2 answers"] = COT_answer2_list
df["Directions answers"] = CoT_directions_list

df["CoT1 generated"] = COT_text1
df["CoT2 generated"] = COT_text2
df["Directions generated"] = COT_directions_text

In [14]:
df.to_csv("GSM8K_100_version1.csv")