In [None]:
import openai
import time
import copy
import json
import argparse
import tqdm
import os
import pandas as pd

from pal import interface
from pal.prompt import math_prompts
from pal.core.backend import call_chat_gpt

# Take the OpenAI credential from the environment; the result is None when the
# variable is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY')


In [None]:
# Delimiters used when formatting few-shot prompts.
# Each question is rendered as  PREFIX + text + SUFFIX, likewise for answers.
QUESTION_PREFIX, QUESTION_SUFFIX = 'Q: ', '\n'
ANSWER_PREFIX, ANSWER_SUFFIX = 'A: ', '\n'
# Blank-line separator constant for exemplars.
EXEMPLAR_PREFIX = '\n\n\n'

In [None]:
# Sample word problem used to try the pipeline on a single input.
question = (
    "John runs 60 miles a week. He runs 3 days a week. "
    "He runs 3 hours the first day and half as much the other two days he runs. "
    "How fast does he run?"
)

In [34]:
# Read the JSON-lines dataset (one record per line).
gsm = pd.read_json('../datasets/reduced_gsm.jsonl', lines=True)
# Both names refer to the same DataFrame object, not to separate copies.
reduced_gsm = gsm
reduced_gsm.head()

Unnamed: 0,input,target,prev_answer,prev_score,prev_generation,prev_code_error
0,Gloria is shoe shopping when she comes across ...,104,,0,"[```\ndef solution():\n """"""Gloria is shoe s...","cannot assign to operator (<string>, line 13)"
1,Marilyn's first record sold 10 times as many c...,8000,,0,[We can solve this problem using algebra. Let'...,"invalid syntax (<string>, line 2)"
2,"Emily has 4 kids named Amy, Jackson, Corey, an...",4,,0,[We can use algebra to solve this problem. Let...,"invalid syntax (<string>, line 1)"
3,Jerome had 4 friends who came to visit him on ...,175,,0,[We can solve this problem using Python as fol...,local variable 'friend4' referenced before ass...
4,Frankie watches TV after he finishes his homew...,3,,0,[We can solve this problem by using algebra. L...,No module named 'sympy'


In [None]:
# Load the few-shot prompt that demonstrates how to break a question into
# sub-questions. The encoding is pinned so the curly quotes (“ ”) in the prompt
# decode identically on every platform instead of depending on the locale
# default -- assumes the file is saved as UTF-8; confirm.
with open('../prompts/problem_reducing_least_to_most.txt', encoding='utf-8') as f:
    # `f.read()` already returns one string, so no join is needed.
    breakdown_prompt = f.read()

In [None]:
# Show the loaded prompt text (a cell displays the value of its last expression).
breakdown_prompt

'Q: Four years ago, Kody was only half as old as Mohamed. If Mohamed is currently twice 30 years old, how old is Kody?\nA: To answer the question “How old is Kody?”, we need to know: “How old is Mohamed?”, “How old was Mohamed four years ago?”, “How old was Kody four years ago?”.\n\n\nQ: If Pam is currently twice as young as Rena is, and in 10 years Rena will be 5 years older than her, how old is Pam now?\nA: To answer the question “How old is Pam now?”, we need to know: “How much older is Rena than Pam currently?”.\n\n\nQ: As a freelancer, Baylor is paid for every finished work of a client he does on a freelance marketplace. Currently, he has $4000 on his dashboard from previous work done. He is currently working for three clients, with the first client paying him half the amount of money he currently has on his dashboard once the job is done. The second client will pay him 2/5 times more money than the first client once Baylor finishes his work. The third client will pay him twice th

In [29]:
# Try the decomposition prompt on one hand-picked question.
question = (
    "John runs 60 miles a week. He runs 3 days a week. "
    "He runs 3 hours the first day and half as much the other two days he runs. "
    "How fast does he run?"
)
# Few-shot exemplars, then the new question, ending with the answer prefix so
# the model continues from there.
prompt = f'{breakdown_prompt}{QUESTION_PREFIX}{question}{QUESTION_SUFFIX}{ANSWER_PREFIX}'
messages = [{'role': 'user', 'content': prompt}]
gen = call_chat_gpt(messages, max_tokens=512)
print(gen)

To answer the question “How fast does he run?”, we need to know: “How many miles does John run on the first day?”, “How many miles does John run on the other two days?”, “How many hours does John run on the other two days?”, “What is John’s average speed?”


In [30]:
def remove_quotes(input_str):
    """Strip one pair of surrounding quotation marks from ``input_str``.

    ``"abcs"`` -> ``abcs``. Straight and curly quotes, single or double, are
    recognised. A sentence period that directly follows the closing quote
    (``“abcs”.``) is dropped together with it.

    Unlike a blind ``[1:-1]`` slice, a character is removed only when it
    really is a quote, so text that is unquoted (or quoted on one side only)
    does not lose its first or last letter.

    Args:
        input_str: The text to unquote.

    Returns:
        ``input_str`` without its leading and trailing quote, where present.
    """
    opening = ('"', "'", '“', '‘')
    closing = ('"', "'", '”', '’')

    text = input_str
    # Expose the closing quote when it is followed by a sentence period.
    if text.endswith('.') and text[:-1].endswith(closing):
        text = text[:-1]
    if text.startswith(opening):
        text = text[1:]
    if text.endswith(closing):
        text = text[:-1]
    return text
    

In [31]:
def extract_subquestions(gen: str, phrase='we need to know: ') -> list:
    """Parse a least-to-most decomposition into an ordered list of questions.

    The expected shape of ``gen`` is::

        To answer the question “<main>”, we need to know: “<sub 1>”, “<sub 2>”.

    Questions are recognised by their surrounding quotation marks rather than
    by splitting on commas, so a question that itself contains a comma is kept
    whole and a trailing ``”.`` leaves no stray quote behind. If the text has
    no quoted spans at all, it falls back to a comma split.

    Args:
        gen: The model generation to parse.
        phrase: The marker that introduces the list of sub-questions.

    Returns:
        The sub-questions in order of appearance, followed by the main
        question as the last element.

    Raises:
        IndexError: If ``gen`` lacks the main-question marker or ``phrase``.
    """
    import re  # local import: not provided by the notebook's import cell

    main_question_phrase = 'To answer the question'

    if main_question_phrase not in gen:
        raise IndexError(
            f'generation lacks {main_question_phrase!r}: {gen!r}')
    # Text between the first and (if any) second occurrence of the marker.
    segment = gen.split(main_question_phrase)[1]
    if phrase not in segment:
        raise IndexError(f'generation lacks {phrase!r}: {gen!r}')
    head, _, tail = segment.partition(phrase)

    def _quoted(text):
        # Curly quotes are tried first so that straight quotes nested inside
        # a curly-quoted question do not split it.
        for pattern in (r'“([^“”]*)”', r'"([^"]*)"'):
            found = re.findall(pattern, text)
            if found:
                return [item.strip() for item in found]
        return []

    main_candidates = _quoted(head)
    if main_candidates:
        main_question = main_candidates[0]
    else:
        # Unquoted fallback: everything before `phrase`, minus separators.
        main_question = head.strip(' \t\r\n,.')

    subquestions = _quoted(tail)
    if not subquestions:
        # Unquoted fallback: comma-separated list with an optional final period.
        subquestions = [piece.strip(' \t\r\n.') for piece in tail.split(',')]
    subquestions = [q for q in subquestions if q]

    # The main question is answered last, after all of its sub-questions.
    if main_question:
        subquestions.append(main_question)

    return subquestions

## Getting subquestions

In [38]:
# NOTE: this rebinds the name `tqdm` from the module imported in the first cell
# to the progress-bar class.
from tqdm import tqdm

# Decompose every dataset question into sub-questions, one model call each.
questions = tqdm(reduced_gsm['input'])
col_subquestions = []

for question in questions:
    # Few-shot exemplars, then the question, ending with the answer prefix so
    # the model continues from there.
    prompt = f'{breakdown_prompt}{QUESTION_PREFIX}{question}{QUESTION_SUFFIX}{ANSWER_PREFIX}'
    messages = [{'role': 'user', 'content': prompt}]
    gen = call_chat_gpt(messages, max_tokens=512)
    # One list of sub-questions per input row, in row order.
    col_subquestions.append(extract_subquestions(gen))

100%|██████████| 294/294 [37:11<00:00,  7.59s/it]  


In [39]:
# Store the collected sub-question lists as a new `subquestions` column.
# This modifies `reduced_gsm` in place; `gsm` was bound to the same object at
# load time, so it sees the new column as well.
reduced_gsm['subquestions'] = col_subquestions

In [40]:
# Display the frame to check that the `subquestions` column was added.
reduced_gsm

Unnamed: 0,input,target,prev_answer,prev_score,prev_generation,prev_code_error,subquestions
0,Gloria is shoe shopping when she comes across ...,104,,0,"[```\ndef solution():\n """"""Gloria is shoe s...","cannot assign to operator (<string>, line 13)",[How much do the two pairs of high heels cost ...
1,Marilyn's first record sold 10 times as many c...,8000,,0,[We can solve this problem using algebra. Let'...,"invalid syntax (<string>, line 2)","[How many copies did Marilyn sell?, What is th..."
2,"Emily has 4 kids named Amy, Jackson, Corey, an...",4,,0,[We can use algebra to solve this problem. Let...,"invalid syntax (<string>, line 1)","[How old is Corey?, How old is Amy?, How old i..."
3,Jerome had 4 friends who came to visit him on ...,175,,0,[We can solve this problem using Python as fol...,local variable 'friend4' referenced before ass...,[How many times did the second friend press th...
4,Frankie watches TV after he finishes his homew...,3,,0,[We can solve this problem by using algebra. L...,No module named 'sympy',[How many hours of TV did Frankie watch on Mon...
...,...,...,...,...,...,...,...
289,A cup of mushrooms weighs 100 grams and has 3 ...,42,42.0,1,"[```\ndef solution():\n """"""A cup of mushroo...",,[How many cups of mushrooms does John eat per ...
290,"While working at the restaurant, each of the f...",1520,1520.0,1,"[```\ndef solution():\n """"""While working at...",,"[How much money did Rafaela receive in tips?, ..."
291,John rents his car out 10 times a month for 3 ...,250,250.0,1,"[```\ndef solution():\n """"""John rents his c...",,[How much money does John make from renting ou...
292,Frankie and Binkie went bowling together. Fra...,195,195.0,1,"[```\ndef solution():\n """"""Frankie and Bink...",,"[How much higher than 90 was Frankie's score?,..."


In [41]:
# save to jsonl
# `DataFrame.to_json` returns None when it is given a path, so the result is
# not bound to a name.
reduced_gsm.to_json('../datasets/reduced_gsm_with_sub_qs.jsonl', orient='records', lines=True)

just_q 

    sq1 
    a1        - eval(given just_q, is a1 correct?)
                given {just_q} is {a1} correct? True / False
        True
        just_q += a1 
        sq2
        a2        - eval(given just_q, is a2 correct?)
            True
            just_q += a2 
            sq3
            a3
            .
            .
            .
            
 
# Tasks

# analysis of self-eval() part, no new runs
'it is correct' -> false positives
'it is not correct / remaining case' -> false negatives

# Design prompt for self-eval()  

# instruction -> "you are wrong..." / ablation of this

# see variation in gsm8k


# baseline (reduced_gsm) 
    - CoT (change the prompts)

    - Self-consistency (n=4, 5, 7, 10), applied to the final answer only (not to sub-questions) -> change the n= variable in the final call
    - Self-refine (score cheating, non-cheating)
   
    - Least to most

    - Least to most with self-refine (CoT in NL)  -> design the self-eval prompt

    - Least to most with self-refine (CoT in Python)

    - to make least to most more sample efficient, apply self-refine or self-consistency only in the last step/sub-question/main-question
