# Fill in the names

In [65]:
import pandas as pd
import numpy as np
import json
# Load the question templates; the file holds one JSON record per line.
templates = pd.read_json('../templates/name_number_templates.jsonl', lines=True)
# Load the pools of fill-in values. The context manager closes the file
# handle deterministically instead of leaving it open until garbage collection.
with open('name_tags.json') as tags_file:
    fillable_vals = json.load(tags_file)

Update person names

In [66]:
# Substitute the {person1}..{person5} placeholders in every question.
for i in range(len(templates['question'])):
    # there are at most 5 people; sampling without replacement keeps the
    # names used within a single question distinct
    names = np.random.choice(fillable_vals['persons'], 5, replace=False)
    q = templates.at[i, 'question']
    for j in range(1, 6):
        tag = '{person' + str(j) + '}'
        q = q.replace(tag, names[j-1])
    # Write back with one .at[] access. The chained form
    # templates['question'][i] = q assigns into an intermediate Series and is
    # not guaranteed to update the DataFrame (it is a no-op under pandas
    # Copy-on-Write).
    templates.at[i, 'question'] = q

Update places

In [67]:
# Substitute the {place1}..{place3} placeholders in every question.
for i in range(len(templates['question'])):
    # there are at most 3 places; sampling without replacement keeps the
    # places used within a single question distinct
    names = np.random.choice(fillable_vals['places'], 3, replace=False)
    q = templates.at[i, 'question']
    for j in range(1, 4):
        tag = '{place' + str(j) + '}'
        q = q.replace(tag, names[j-1])
    # Write back with one .at[] access. The chained form
    # templates['question'][i] = q assigns into an intermediate Series and is
    # not guaranteed to update the DataFrame (it is a no-op under pandas
    # Copy-on-Write).
    templates.at[i, 'question'] = q

Update foods

In [68]:
# Substitute the {food1}..{food4} placeholders in every question.
for i in range(len(templates['question'])):
    # there are at most 4 foods; sampling without replacement keeps the
    # foods used within a single question distinct
    names = np.random.choice(fillable_vals['foods'], 4, replace=False)
    q = templates.at[i, 'question']
    for j in range(1, 5):
        tag = '{food' + str(j) + '}'
        q = q.replace(tag, names[j-1])
    # Write back with one .at[] access. The chained form
    # templates['question'][i] = q assigns into an intermediate Series and is
    # not guaranteed to update the DataFrame (it is a no-op under pandas
    # Copy-on-Write).
    templates.at[i, 'question'] = q

Update currencies

In [69]:
# Substitute the {currencysymbol1} placeholder in every question.
for i in range(len(templates['question'])):
    # there is at most 1 currency symbol per question
    names = np.random.choice(fillable_vals['currencies'], 1, replace=False)
    q = templates.at[i, 'question']
    for j in range(1, 2):
        tag = '{currencysymbol' + str(j) + '}'
        q = q.replace(tag, names[j-1])
    # Write back with one .at[] access. The chained form
    # templates['question'][i] = q assigns into an intermediate Series and is
    # not guaranteed to update the DataFrame (it is a no-op under pandas
    # Copy-on-Write).
    templates.at[i, 'question'] = q

In [70]:
# Display the question stored under index label 0 as a quick visual check.
templates['question'][0]

"Lily’s ducks lay {x} eggs per day. Lily eats {y} for breakfast every morning and bakes muffins for her friends every day with {z}. Lily sells the remainder at the farmers' market daily for ₩{d} per fresh duck egg. How much in dollars does Lily make every day at the farmers' market?"

# Fill in the numbers

In [71]:
# note there are only 20 annotated number samples
# .copy() makes number_templates an independent frame, so writing the filled-in
# questions/answers into it neither raises SettingWithCopyWarning nor depends
# on whether head() happened to return a view of `templates`.
number_templates = templates.head(20).copy()

In [72]:
import os
from openai import OpenAI
import pandas as pd
import json

# Read the system prompt text; it is the default `prompt` of get_variables.
with open('prompt_files/fill_in_numbers_prompt.txt', 'r') as file:
    system_prompt = file.read()

# Output-format instruction; get_variables appends it to each user message.
suffix = "\n\nYour output should strictly be a json object mapping the inputs to outputs. It should be in the format of {variable1: value, variable2: value, ...}"

# Read the API key from a local file (surrounding whitespace/newline removed)
# rather than hardcoding it in the notebook.
with open('prompt_files/openai_key.txt', 'r') as file:
    API_KEY = file.read().strip()

def get_variables(target_question, API_KEY = API_KEY, prompt = system_prompt, suffix = suffix):
  """Ask the chat model for variable values for one template.

  Args:
    target_question: text describing the template; sent as the user message
      with `suffix` appended.
    API_KEY: key exported as the OPENAI_API_KEY environment variable before
      the client is built (presumably so OpenAI() picks it up).
    prompt: system message content.
    suffix: output-format instruction appended to the user message.

  Returns:
    The message content of the first choice in the completion response.
  """
  os.environ["OPENAI_API_KEY"] = API_KEY
  chat_api = OpenAI().chat.completions
  conversation = [
      {"role": "system", "content": prompt},
      {"role": "user", "content": target_question + suffix},
  ]
  completion = chat_api.create(model="gpt-4o-mini", messages=conversation)
  return completion.choices[0].message.content

In [73]:
def valid_output(constraints, vars):
    """Check that every constraint holds for the proposed variable values.

    Each constraint is a Python expression written in terms of the variable
    names (for example ``"a % f == 0"``). Every variable name is replaced by
    its value and the resulting expression is evaluated.

    Args:
        constraints: iterable of constraint expression strings.
        vars: mapping of variable name -> value.

    Returns:
        True when every constraint evaluates truthy. False as soon as one
        constraint evaluates falsy or cannot be evaluated at all (syntax
        error, division by zero, unknown name, ...); the failing constraint
        is printed in both its original and substituted form.
    """
    import re

    # Pre-render the values once; empty names are skipped because they would
    # match at every position.
    rendered = {key: str(vars[key]) for key in vars if key}

    # Longest names first so that, inside the alternation, "ab" wins over "a".
    sorted_keys = sorted(rendered, key=len, reverse=True)

    pattern = None
    if sorted_keys:
        # The lookarounds only accept a name that is not part of a longer
        # identifier, so substituting "a" leaves "abs(...)" or "max(...)"
        # intact. A single regex pass also prevents text inserted for one
        # variable from being rewritten again by a later variable.
        pattern = re.compile(
            r'(?<![A-Za-z0-9_])(?:'
            + '|'.join(re.escape(key) for key in sorted_keys)
            + r')(?![A-Za-z0-9_])'
        )

    for original_constraint in constraints:
        if pattern is None:
            constraint = original_constraint
        else:
            constraint = pattern.sub(lambda m: rendered[m.group(0)], original_constraint)
        try:
            # NOTE(security): eval executes arbitrary Python. The values come
            # from a model response, so only run this on trusted templates.
            satisfied = eval(constraint)
        except Exception as err:
            # An unevaluable constraint counts as "not satisfied" so the
            # caller can retry instead of aborting the whole run.
            print("Failed constraint: ", original_constraint, "with", constraint, "(" + repr(err) + ")")
            return False
        if not satisfied:
            print("Failed constraint: ", original_constraint, "with", constraint)
            return False
    return True

In [74]:
def replace_vars(query, answer, vars):
    """Fill ``{name}`` placeholders in a question/answer pair.

    Args:
        query: question text containing ``{name}`` placeholders.
        answer: answer text containing ``{name}`` placeholders.
        vars: mapping of variable name -> value; values are rendered with
            ``str``.

    Returns:
        A ``(query, answer)`` tuple with the placeholders substituted.
    """
    for name, value in vars.items():
        placeholder = '{' + name + '}'
        rendered = str(value)
        query = query.replace(placeholder, rendered)
        answer = answer.replace(placeholder, rendered)
    return query, answer

In [75]:
# Row positions already filled in; the generation loop skips them, so that
# cell can be re-run without redoing finished templates.
completed_cache = set() # useful for debugging

In [76]:
# For each annotated template, ask the model for variable values until they
# parse as a JSON object and satisfy the template's constraints, giving up
# after max_retries attempts.
max_retries = 10
for i in range(len(number_templates)):
    if i in completed_cache:
        continue
    print(i)
    row_label = number_templates.index[i]
    template_row = number_templates.iloc[i]
    query = str(template_row.to_dict())
    num_retries = 0
    while num_retries < max_retries:
        raw_reply = get_variables(query)
        try:
            payload = raw_reply.strip()
            # Remove a surrounding Markdown code fence and its leading "json"
            # tag only; a blanket .replace('json', '') would also corrupt any
            # key or value that happens to contain the text "json".
            if payload.startswith('```'):
                payload = payload.strip('`').strip()
                if payload.startswith('json'):
                    payload = payload[len('json'):]
            vars = json.loads(payload)
        except (AttributeError, TypeError, ValueError):
            # AttributeError/TypeError: the reply was not a string (e.g. None);
            # ValueError covers json.JSONDecodeError. Anything else, including
            # KeyboardInterrupt, is allowed to propagate.
            print("invalid json. retrying")
            num_retries += 1
            continue
        if not isinstance(vars, dict):
            # Valid JSON but not an object mapping names to values.
            print("invalid json. retrying")
            num_retries += 1
            continue
        if valid_output(template_row['constraints'], vars):
            q, ans = replace_vars(template_row['question'], template_row['answer'], vars)
            # Write through .at[] on the frame itself. Assigning into
            # number_templates.iloc[i][...] targets a temporary row object and
            # may never reach the DataFrame.
            number_templates.at[row_label, 'question'] = q
            number_templates.at[row_label, 'answer'] = ans
            completed_cache.add(i)
            break
        else:
            print("didn't meet constraints. retrying")
            num_retries += 1
    if num_retries == max_retries:
        print("Failed to generate valid output for template " + str(i))

0
1
2
3
4
5
6
7
8
9
10
11
Failed constraint:  a % f ==  0 with 100 % 30 ==  0
didn't meet constraints. retrying
12
13
14
15
Failed constraint:  d == a * 7 with 14 == 1 * 7
didn't meet constraints. retrying
Failed constraint:  d == a * 7 with 14 == 1 * 7
didn't meet constraints. retrying
Failed constraint:  d == a * 7 with 14 == 1 * 7
didn't meet constraints. retrying
16
Failed constraint:  i == h / g with 1 == 15 / 11
didn't meet constraints. retrying
Failed constraint:  i == h / g with 1 == 15 / 11
didn't meet constraints. retrying
Failed constraint:  g == d - b - b with 8 == 10 - 2 - 2
didn't meet constraints. retrying
17
Failed constraint:  f == c - e with 5 == 20 - 25
didn't meet constraints. retrying
18
Failed constraint:  c == d - a - b with 0 == 3 - 1 - 1
didn't meet constraints. retrying
19


In [77]:
number_templates.to_json('../datasets/gsm_name_numbers.jsonl', orient='records', lines=True)