Load in the values

In [3]:
import pandas as pd
number_templates = pd.read_json('../templates/number_templates.jsonl', lines=True)
number_templates = number_templates.head(20)

Helper function to query openai

In [4]:
import os
from openai import OpenAI
import pandas as pd
import json

# set prompt
with open('prompt_files/fill_in_numbers_prompt.txt', 'r') as file:
    system_prompt = file.read()

suffix = "\n\nYour output should strictly be a json object mapping the inputs to outputs. It should be in the format of {variable1: value, variable2: value, ...}"

# get API key
with open('prompt_files/openai_key.txt', 'r') as file:
    API_KEY = file.read().strip()

def get_variables(target_question, API_KEY = API_KEY, prompt = system_prompt, suffix = suffix):
  os.environ["OPENAI_API_KEY"] = API_KEY
  client = OpenAI()
  response = client.chat.completions.create(
      model="gpt-4o-mini",
      messages=[
          {"role": "system", "content": prompt},
          {"role": "user", "content": target_question + suffix},
      ]
  )
  answer = response.choices[0].message.content
  return answer

In [5]:
def valid_output(constraints, vars):
    keys = vars.keys()
    # go in this order to avoid partial matching of future keys
    sorted_keys = sorted(keys, key=lambda x: len(x), reverse=True)
    for constraint in constraints:
        original_constraint = constraint
        for key in sorted_keys:
            constraint = constraint.replace(key, str(vars[key]))
        if not eval(constraint):
            print("Failed constraint: ", original_constraint, "with", constraint)
            return False
    return True

In [6]:
def replace_vars(query, answer, vars):
    for key in vars:
        query = query.replace('{' + key + '}', str(vars[key]))
        answer = answer.replace('{' + key + '}', str(vars[key]))
        # answer = answer.replace(key, str(vars[key]))
    return query, answer

## Toy Example

In [55]:
query = str(number_templates.iloc[0].to_dict())
vars = get_variables(query)
vars = json.loads(vars)
print(vars)

{'a': 10, 'b': 3, 'c': 2, 'd': 5, 'e': 5, 'f': 25, 'x': 10, 'y': 3, 'z': 2}


In [56]:
valid_output(number_templates.iloc[0]['constraints'], vars)

True

In [57]:
replace_vars(number_templates.iloc[0]['question'], number_templates.iloc[0]['answer'], vars)

("Janet’s ducks lay 10 eggs per day. Janet eats 3 for breakfast every morning and bakes muffins for her friends every day with 2. Janet sells the remainder at the farmers' market daily for $5 per fresh duck egg. How much in dollars does Janet make every day at the farmers' market?",
 'Janet sells 10 - 3 - 2 = <<10 - 3 - 2 = 5>>e duck eggs a day.\nShe makes 5 * 5 = $<<5 * 5 = 25>>f every day at the farmer’s market.\n#### 25')

## Execute on all samples

#### Debugging Stuff

In [69]:
vars

{'a': 20, 'b': 25, 'c': 20}

In [42]:
number_templates.iloc[i]['constraints']

['a > 0',
 'b > 0',
 'x > 0',
 'y > 0',
 'p1 == a * (x / 100)',
 'p2 == b * (y / 100)',
 'max_profit == max(p1, p2)']

In [43]:
valid_output(number_templates.iloc[i]['constraints'], vars)

NameError: name 'm10020' is not defined

In [44]:
number_templates.iloc[i]['answer']

'If he purchases jewelry, he will make a profit of {x}% which is ${a}*({x}/100) = $<<{a}*({x}/100)={p1}>>{p1}\nIf he purchases electronic gadgets, he will make a profit of {y}% which is ${b}*({y}/100) = $<<{b}*({y}/100)={p2}>>{p2}\nIf he wants to maximize profit, since max({p1}, {p2}) determines the better choice, he will choose the option with the higher profit, thereby making a profit of ${max_profit}\n#### {max_profit}'

In [45]:
replace_vars(number_templates.iloc[i]['question'], number_templates.iloc[i]['answer'], vars)

('A merchant wants to make a choice of purchase between 2 purchase plans: jewelry worth $100 or electronic gadgets worth $150. His financial advisor speculates that the jewelry market will go up 20% while the electronic gadgets market will rise 30% within the same month. If the merchant is looking to maximize profit at the end of this month by making a choice, how much profit would this be?',
 'If he purchases jewelry, he will make a profit of 20% which is $100*(20/100) = $<<100*(20/100)=20.0>>20.0\nIf he purchases electronic gadgets, he will make a profit of 30% which is $150*(30/100) = $<<150*(30/100)=45.0>>45.0\nIf he wants to maximize profit, since max(20.0, 45.0) determines the better choice, he will choose the option with the higher profit, thereby making a profit of $45.0\n#### 45.0')

# Do runs

In [5]:
completed_cache = set() # useful for debugging

In [None]:
max_retries = 10
for i in range(len(number_templates)):
    if i in completed_cache:
        continue
    print(i)
    query = str(number_templates.iloc[i].to_dict())
    num_retries = 0
    while num_retries < max_retries:
        vars = get_variables(query)
        try:
            vars = json.loads(vars.strip('```').replace('json', '').strip())
        except:
            print("invalid json. retrying")
            num_retries += 1
            continue
        if valid_output(number_templates.iloc[i]['constraints'], vars):
            q, ans = replace_vars(number_templates.iloc[i]['question'], number_templates.iloc[i]['answer'], vars)
            number_templates.iloc[i]['question'] = q
            number_templates.iloc[i]['answer'] = ans
            completed_cache.add(i)
            break
        else:
            print("didn't meet constraints. retrying")
            num_retries += 1
    if num_retries == max_retries:
        print("Failed to generate valid output for template " + str(i))
number_templates.to_json('../datasets/gsm_numbers.jsonl', orient='records', lines=True)

16
17
Failed constraint:  f > 0 with 0 > 0
didn't meet constraints. retrying
18
Failed constraint:  c > 0 with 0 > 0
didn't meet constraints. retrying
Failed constraint:  c > 0 with 0 > 0
didn't meet constraints. retrying
19


# Do 5 variations per question

In [19]:
import numpy as np
big_df = pd.DataFrame(np.repeat(number_templates.values, 5, axis=0))
big_df.columns = number_templates.columns

In [20]:
completed_cache = set() # useful for debugging

In [None]:
max_retries = 10
for i in range(len(big_df)):
    if i in completed_cache:
        continue
    print(i)
    query = str(big_df.iloc[i].to_dict())
    num_retries = 0
    while num_retries < max_retries:
        vars = get_variables(query)
        try:
            vars = json.loads(vars.strip('```').replace('json', '').strip())
        except:
            print("invalid json. retrying")
            num_retries += 1
            continue
        if valid_output(big_df.iloc[i]['constraints'], vars):
            q, ans = replace_vars(big_df.iloc[i]['question'], big_df.iloc[i]['answer'], vars)
            big_df.iloc[i]['question'] = q
            big_df.iloc[i]['answer'] = ans
            completed_cache.add(i)
            break
        else:
            print("didn't meet constraints. retrying")
            num_retries += 1
    if num_retries == max_retries:
        print("Failed to generate valid output for template " + str(i))
big_df.to_json('../data/gsm_numbers_100.jsonl', orient='records', lines=True)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
Failed constraint:  f == c * n with 12 == 2 * 3
didn't meet constraints. retrying
16
Failed constraint:  f == c * n with 12 == 2 * 3
didn't meet constraints. retrying
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
Failed constraint:  h % 1 == 0 with 17.5 % 1 == 0
didn't meet constraints. retrying
Failed constraint:  h % 1 == 0 with 11.5 % 1 == 0
didn't meet constraints. retrying
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
Failed constraint:  a % f ==  0 with 100 % 15 ==  0
didn't meet constraints. retrying
Failed constraint:  a % f ==  0 with 100 % 30 ==  0
didn't meet constraints. retrying
56
Failed constraint:  a % f ==  0 with 100 % 15 ==  0
didn't meet constraints. retrying
57
Failed constraint:  a % f ==  0 with 40 % 15 ==  0
didn't meet constraints. retrying
58
59
60
61
Failed constraint:  f == d * g with 20 == 4 * 10
didn't meet constraints. retrying
62
63
64
65
66
67
68
69
70
71
72
Failed constraint:  g == b * e wi

OSError: Cannot save file into a non-existent directory: '../datasets'