### Autoplan to get Strategies 

In [None]:
import os
import time
import json 
import regex
import random
import pickle 
import re
import sys
import numpy as np
import openai
from func_timeout import func_timeout, FunctionTimedOut

from tqdm import tqdm

import logging
 
# Route log records to Autoplan.log; filemode 'w' truncates the file left by
# a previous run. Each record is prefixed with its timestamp.
logging.basicConfig(
    filename="Autoplan.log",
    filemode='w',
    format='%(asctime)s %(message)s',
)

# The root logger, used by the rest of this file.
logger = logging.getLogger()

# Record INFO and above. Set explicitly rather than through basicConfig so
# the threshold is applied even when basicConfig is a no-op because the root
# logger already has handlers.
logger.setLevel(logging.INFO)



def pretty_print(role, text, verbose=False):
    """Format one transcript entry and optionally echo it to stdout.

    Args:
        role: Label placed in the banner line (e.g. 'Human' or 'Machine').
        text: Body of the entry.
        verbose: When true, the formatted entry is also written to stdout
            and stdout is flushed immediately.

    Returns:
        The formatted entry: a dashed banner containing ``role``, a newline,
        ``text`` and a trailing newline.
    """
    entry = '------------{}-----------\n{}\n'.format(role, text)
    if verbose:
        # Write without an extra newline; the entry already ends with one.
        sys.stdout.write(entry)
        sys.stdout.flush()
    return entry

def Calculate(expression):
    """Evaluate an arithmetic expression given as a string.

    A caret is treated as exponentiation, so ``2^3`` is evaluated as
    ``2**3``.

    Args:
        expression: Expression text to evaluate.

    Returns:
        Whatever Python's ``eval`` produces for the expression.

    Raises:
        Any exception ``eval`` raises for a malformed or failing expression
        (for example SyntaxError, NameError or ZeroDivisionError).
    """
    # Rewrite every '^' as Python's power operator.
    expression = "**".join(expression.split("^"))
    # SECURITY NOTE: eval executes arbitrary Python. Only pass text from a
    # trusted source; the expression is not sandboxed or validated here.
    return eval(expression)


class HotpotQA:
    """Thought/action agent that answers math questions and refines a task plan.

    The agent talks to an OpenAI chat model. For every question it alternates
    between a free-text thought and an action, where an action is one of
    ``calculator[expression]``, ``LLM[string]`` or ``finish[answer]``. Outside
    test mode it also asks the model to summarise each interaction, to point
    out flaws in the current plan and, finally, to rewrite the plan.

    NOTE(review): the class name looks inherited from a HotpotQA variant of
    this code; the prompts in this class describe a math task.
    """

    def __init__(self, lm, method, backend_args, quota_args, **kwargs):
        """Store the sampling settings and configure the OpenAI client.

        Args:
            lm: Engine/deployment name passed to the chat completion API.
            method: Prompting mode string; ``run`` checks it for the
                substrings 'direct' and 'step'.
            backend_args: Dict with keys 'name', 'top_p', 'temp',
                'max_token' and 'presence_penalty'.
            quota_args: Dict with key 'max_iter_per_instance'.
            **kwargs: Accepted for compatibility and ignored.

        Raises:
            KeyError: If a required key is missing from either dict.
            RuntimeError: If the backend is 'openai' and the
                ``OPENAI_API_KEY`` environment variable is not set.
        """
        self.lm = lm
        self.method = method  # step
        self.backend = backend_args['name']  # openai

        if self.backend == 'openai':
            # Credentials must never live in source control, so the key is
            # read from the environment. The endpoint keeps its previous
            # value as the default and can be overridden the same way.
            api_key = os.environ.get("OPENAI_API_KEY")
            if not api_key:
                raise RuntimeError(
                    "Set the OPENAI_API_KEY environment variable before "
                    "creating a HotpotQA instance with the 'openai' backend."
                )
            openai.api_type = "azure"
            openai.api_base = os.environ.get(
                "OPENAI_API_BASE",
                "https://mrityunjoypanday-gpt4.openai.azure.com/",
            )
            openai.api_version = "2023-03-15-preview"
            openai.api_key = api_key

        self.top_p = backend_args['top_p']
        self.temp = backend_args['temp']
        self.max_token = backend_args['max_token']
        self.presence_penalty = backend_args['presence_penalty']

        self.max_iter_per_instance = quota_args['max_iter_per_instance']

        # One transcript string is appended per call to run().
        self.history = []
        self.strategy = None

        # Token counters (updated by call_lm) and the running conversation.
        self.n_prompt_token = 0
        self.n_sample_token = 0
        self.messages = []

    def task_desription(self):
        """Return the fixed task description that opens every prompt."""
        return '''Solve a Math question answering task through actions(if required) which can be of following types: 
(1) calculator[expression]: Stricly use only for simple numerical calculations which can be calculated by Pythons eval function. (Do not use this function if any variable(alphabet) is encountered in the expression; Instead use LLM action). 
                            Correct Use: calculator[(3*4)/9]
                                         Calculator[4+5+9-2]
                            Wrong use: calculator[smallest positive integer that ends in 9 and is divisible by 7]
                                       calculator[20 divided by 2]
                            
(2) finish[answer]: return the answer and finish the task.
(3) LLM[string]: get the answer using LLM.'''

    def score(self, instance, prediction):
        """Ask the model whether ``prediction`` matches the gold answer.

        Args:
            instance: Dict with keys 'question' and 'answer'.
            prediction: Predicted answer to compare with the gold answer.

        Returns:
            True when the lower-cased reply contains 'yes', else False.

        Raises:
            AssertionError: If the reply contains neither 'yes' nor 'no'.
        """
        question, answer = instance['question'], instance['answer']
        query = """Question: "{}"\n
Gold Answer: "{}"\n
My Prediction: "{}"\n
Verify whether my prediction to the question is equivalent to gold answer. Respond with yes/no.""".format(question, answer, prediction)

        request = {
            "engine": "gpt-4",
            "messages": [{'role': 'user', 'content': query}],
            "temperature": 0.
        }

        # Timeouts are retried without limit; any other error propagates.
        while True:
            try:
                completion = func_timeout(
                    10, openai.ChatCompletion.create, kwargs=request
                )
            except FunctionTimedOut:
                print('OpenAI API call timeout')
                continue
            response = completion['choices'][0]['message']['content'].lower()
            break

        # Raised explicitly (not via `assert`) so the check survives `python -O`.
        if 'yes' not in response and 'no' not in response:
            raise AssertionError((question, answer, prediction, response))
        return 'yes' in response

    def eval(self, instances, predictions):
        """Score every prediction against its instance.

        Args:
            instances: Sequence of dicts accepted by ``score``.
            predictions: Sequence of predictions, indexed like ``instances``.

        Returns:
            Dict with 'n_correct' (number of correct predictions) and
            'correct_mask' (list of per-instance booleans).
        """
        correct_mask = []
        for idx, instance in enumerate(tqdm(instances, desc='Evaluating')):
            correct_mask.append(self.score(instance, predictions[idx]))

        n_correct = sum(int(flag) for flag in correct_mask)
        return {'n_correct': n_correct, 'correct_mask': correct_mask}

    def formalize(self, action):
        """Ask the model to rewrite a free-text action in a valid action format.

        Args:
            action: Free-text action produced by the model.

        Returns:
            The model's reply, cut at the first newline by the stop sequence.
        """
        message = """Valid action formats are as follows:
(1) calculator[expression]
(2) finish[answer]
(3) LLM[string]

Formalize the following action strictly into the above valid action formats. If there are multiple actions, formalize the first one.

Action: I want to do a calculation of  (45 * 95) / 5. 
Formalized First Action: calculator[(45*95)/5]

Action: i got the final answer as 90.
Formalized First Action: finish[90]

Action: get the answer for "n=5*k+3. find the possible values for n"
Formalized First Action: LLM[n=5*k+3. find the possible values for n]

Action: {}
Formalized First Action:
""".format(action)
        request = [{'role': 'user', 'content': message}]
        reply = self.call_openai_api(request, stop='\n')
        return reply['choices'][0]['message']['content']

    def recognize(self, response):
        """Find the action calls contained in ``response``.

        Action names are matched case-insensitively because the task
        description itself lists ``Calculator[...]`` as a correct use.

        Args:
            response: Text to scan, normally a formalized action.

        Returns:
            Tuple ``(match, calculator, solve, finish)``. The last three are
            the match objects (or None) for ``calculator[...]``, ``LLM[...]``
            and ``finish[...]``; ``match`` is whichever of them starts latest
            in ``response``, or None when none matched.
        """
        calculator = re.search(r'calculator\[(.*)\]', response, re.IGNORECASE)
        solve = re.search(r'LLM\[(.*)\]', response, re.IGNORECASE)
        finish = re.search(r'finish\[(.*)\]', response, re.IGNORECASE)

        found = [m for m in (calculator, solve, finish) if m is not None]
        match = max(found, key=lambda m: m.start(), default=None)
        return match, calculator, solve, finish

    def price(self):
        """Return the cost implied by the token counters.

        Returns:
            ``(prompt tokens * prompt rate + sampled tokens * sample rate) / 1000``.
        """
        # NOTE(review): PRICE is not defined in the visible part of this file;
        # presumably a module-level table keyed by engine name with 'prompt'
        # and 'sample' rates. Confirm it exists before calling this method.
        rates = PRICE[self.lm]
        total = (self.n_prompt_token * rates['prompt']
                 + self.n_sample_token * rates['sample'])
        return total / 1000

    def call_openai_api(self, messages, stop, lm=None, top_p=None):
        """Call the chat completion API, retrying up to ten times.

        Args:
            messages: List of chat message dicts to send.
            stop: Stop sequence(s) forwarded to the API, or None.
            lm: Engine override; defaults to ``self.lm``.
            top_p: Nucleus-sampling override; defaults to ``self.top_p``.

        Returns:
            The raw API response.

        Raises:
            Exception: After ten failed attempts, chained to the last error.
        """
        max_tries = 10
        request = {
            "engine": self.lm if lm is None else lm,
            "messages": messages,
            "top_p": self.top_p if top_p is None else top_p,
            "temperature": self.temp,
            "max_tokens": self.max_token,
            "presence_penalty": self.presence_penalty,
            "stop": stop,
        }

        last_error = None
        for attempt in range(1, max_tries + 1):
            time.sleep(1)  # fixed pause before every request
            try:
                return func_timeout(
                    90, openai.ChatCompletion.create, kwargs=request
                )
            except FunctionTimedOut as error:
                print('[LOG] OpenAI API call timeout')
                last_error = error
            except Exception as error:
                # Broad on purpose: any API failure is retried. The error is
                # logged so repeated failures are visible in the log file.
                logger.warning('OpenAI API call failed (attempt %d/%d): %s',
                               attempt, max_tries, error)
                last_error = error
                if attempt < max_tries:
                    time.sleep(15)  # back off before the next attempt

        raise Exception('Failed 10 retries.') from last_error

    def call_lm(self, prompt, add_response=True, stop=None, lm=None, top_p=None):
        """Send ``prompt`` as the next user turn of the running conversation.

        The prompt is appended to ``self.messages`` before the call, and the
        token counters are increased by the usage the API reports.

        Args:
            prompt: Text of the user message.
            add_response: When true, the reply is appended to
                ``self.messages`` as well.
            stop: Stop sequence(s) forwarded to the API.
            lm: Engine override forwarded to ``call_openai_api``.
            top_p: Nucleus-sampling override forwarded to ``call_openai_api``.

        Returns:
            The text content of the reply.
        """
        self.messages.append({'role': 'user', 'content': prompt})
        response = self.call_openai_api(self.messages, stop, lm=lm, top_p=top_p)
        reply = response['choices'][0]['message']
        if add_response:
            self.messages.append(reply)
        self.n_prompt_token += response['usage']['prompt_tokens']
        self.n_sample_token += response['usage']['completion_tokens']
        return reply['content']

    def LLM(self, string):
        """Answer ``string`` with a single-turn call outside the conversation.

        Neither ``self.messages`` nor the token counters are touched.

        Args:
            string: Query text; converted with ``str``.

        Returns:
            The text content of the reply.
        """
        request = [{'role': 'user', 'content': str(string)}]
        response = self.call_openai_api(request, stop=None, lm=None, top_p=None)
        return response['choices'][0]['message']['content']

    def init(self):
        """Ask the model for an initial task plan and return it."""
        prompt = "{}\n\nTask Plan:".format(self.task_desription())
        return self.call_lm(prompt)

    def _record(self, role, text, verbose):
        """Format one transcript entry, log it, and return the formatted text."""
        # Formatting once keeps verbose output from being printed twice.
        entry = pretty_print(role, text, verbose)
        logger.info(entry)
        return entry

    def _ask_and_record(self, prompt, verbose):
        """Send a follow-up prompt with top_p=0; return (reply, transcript text)."""
        transcript = self._record('Human', prompt, verbose)
        reply = self.call_lm(prompt, top_p=0.)
        transcript += self._record('Machine', reply, verbose)
        return reply, transcript

    def run(self, data, strategy=None, is_test=False, verbose=False, return_history=False):
        """Answer every question in ``data`` and, outside test mode, revise the plan.

        For each question the model alternates thought and action for at most
        ``self.max_iter_per_instance`` rounds. After every question the
        collected questions, answers, summaries and plan critiques are
        written to ``summaries.json``. Outside test mode the final prompt is
        also written to ``final_msg.txt``.

        Args:
            data: Sequence of dicts with keys 'question' and 'answer'.
            strategy: Current task plan text, or None to run without a plan.
            is_test: When true, skip summaries and plan revision.
            verbose: When true, echo the transcript to stdout.
            return_history: Kept for compatibility. The transcript is returned
                in either case, exactly as before.

        Returns:
            Tuple ``(result, history)``. ``result`` is the list of predictions
            when ``is_test`` is true (an empty string marks a question with no
            ``finish`` action), otherwise the rewritten plan. ``history`` is
            the transcript of this call.
        """
        questions, answers, predictions = [], [], []
        summaries, flawed_actions, flawed_plans = [], [], []
        history = ''

        for q_idx, instance in enumerate(data):
            self.messages = []  # every question starts a fresh conversation
            init_msg = self.task_desription()
            if 'direct' not in self.method and strategy is not None:
                init_msg += "\n\nTask Plan:\n{}\n".format(strategy)
            init_msg += '\n'

            question = instance['question']
            answer = instance['answer']
            questions.append(question)
            answers.append(answer)

            if 'step' in self.method and strategy is not None:
                thought_prompt = "Identify which step of plan you are at. Show your thought about the one next action. Your thought should be faithful to the plan step."
            else:
                thought_prompt = "Show your thought about the next action."

            input_msg = init_msg + '\nQuestion: ' + question + '\n' + thought_prompt
            history += self._record('Human', input_msg, verbose)

            has_answer = False
            for cur_itr in range(self.max_iter_per_instance):
                # Thought
                thought = self.call_lm(input_msg, stop='\n')
                history += self._record('Machine', thought, verbose)

                # Action: the raw reply is normalised by formalize(), and only
                # the normalised form is kept in the conversation.
                action = self.call_lm('Action:', add_response=False, stop='\n')
                history += pretty_print('Human', f'Action:{action}', verbose)
                formalized_action = self.formalize(action)
                self.messages.append({'role': 'assistant', 'content': formalized_action})
                history += self._record(
                    'Machine',
                    '{} (orig {})'.format(formalized_action, action),
                    verbose,
                )

                rec_res, m_calculator, m_solve, m_final = self.recognize(formalized_action)

                if rec_res is None:
                    input_msg = "No action in the format of 'Calculator[...]' or 'finish[...]'."
                else:
                    argument = rec_res.group(1).strip('"\'')
                    if rec_res is m_final:
                        predictions.append(argument)
                        has_answer = True
                        break

                    input_msg = 'Observation:\n'
                    if rec_res is m_calculator:
                        # SECURITY NOTE: Calculate() runs eval on text written
                        # by the model. A malformed expression must not abort
                        # the whole batch, so the error becomes the observation.
                        try:
                            input_msg += str(Calculate(argument))
                        except Exception as error:
                            logger.warning('Calculator failed on %r: %s', argument, error)
                            input_msg += 'Calculator error: {}: {}'.format(
                                type(error).__name__, error)
                    else:
                        input_msg += self.LLM(argument)

                # No further thought is requested after the last round.
                if cur_itr < self.max_iter_per_instance - 1:
                    input_msg += '\n' + thought_prompt
                history += self._record('Human', input_msg, verbose)

            if not has_answer:
                predictions.append('')

            if not is_test:
                if has_answer:
                    summary_msg = 'Task finished.'
                else:
                    summary_msg = 'Max number of iteration reached. No answer is found.'
                summary_msg += ' The ground truth answer is "{}". Summarize the interaction history concisely.'.format(answer)

                summary, transcript = self._ask_and_record(summary_msg, verbose)
                history += transcript
                summaries.append(summary)

                if strategy is not None:
                    failed_action, transcript = self._ask_and_record(
                        'Identify all flawed parts of the plan (not flawed action). Only the flawed part.',
                        verbose,
                    )
                    history += transcript
                    flawed_actions.append(failed_action)

                    suggest_rev, transcript = self._ask_and_record(
                        'Suggest revision to the current flawed part of the plan. Only the flawed part.',
                        verbose,
                    )
                    history += transcript
                    flawed_plans.append(suggest_rev)

            print("\n\n############\n\nID Done:", q_idx, "\n\n")

            # Rewritten after every question so partial progress survives a crash.
            with open("summaries.json", 'w', encoding='utf-8') as f:
                json.dump({"questions":questions, "answers":answers, "summaries": summaries, "flawed_actions": flawed_actions, "flawed_plans":flawed_plans}, f)

        if is_test:
            to_return = predictions
        else:
            self.messages = []
            banner = '=' * 10
            sections = [
                'Task Description:\n' + self.task_desription() + '\n\n',
                'Current Task Plan:\n{}\n\n'.format(strategy),
                banner + 'Task Experiences Begin' + banner + '\n\n',
            ]
            for q_idx, instance in enumerate(data):
                sections.append('Job {}: Answering the following question. {}\nSummary of Job {}:\n{}\n'.format(q_idx, instance['question'], q_idx, summaries[q_idx]))
                if strategy is not None:
                    sections.append('Flaws of Plan in Job {}:\n{}\n'.format(q_idx, flawed_actions[q_idx]))
                    sections.append('Suggested Revision of Plan from Job {}:\n{}\n'.format(q_idx, flawed_plans[q_idx]))
            sections.append(banner + 'Task Experiences End' + banner + '\n\n')
            sections.append('Based on the above {} experiences of the task, rewrite the current task plan. The plan should not be specific to one job but generalizable to all jobs. \nNew Task Plan:'.format(len(data)))
            final_msg = ''.join(sections)

            # Keep a copy of the final prompt; question text may be non-ASCII.
            with open("final_msg.txt", "w", encoding='utf-8') as f:
                f.write(final_msg)

            to_return, transcript = self._ask_and_record(final_msg, verbose)
            history += transcript

        self.history.append(history)

        # Both branches of the former return_history check returned this same
        # tuple, so the shape is kept for existing callers.
        return to_return, history
    

# Sampling settings read by HotpotQA.__init__.
backend_args = dict(
    name="openai",
    top_p=1,
    temp=0,
    max_token=3000,
    presence_penalty=1.5,
)

# Per-question limits. Only 'max_iter_per_instance' is read by the code
# visible in this file.
quota_args = dict(
    sleep_minutes=1,
    max_iter_per_instance=4,
)


import pandas as pd
# Load the problem set. The code below reads the 'level', 'problem' and
# 'label' columns of this CSV.
df = pd.read_csv("../MATH.csv")

# Draw five problems from each difficulty level with a fixed seed, so the
# same problems are selected on every run.
data = []
for level_number in range(1, 6):
    level = f"Level {level_number}"
    new = df[df['level'] == level].sample(5, random_state=42)
    print(level, new.index)
    for _, row in new.iterrows():
        data.append({"question": row['problem'], "answer": row['label']})

hotpot = HotpotQA("gpt-4", 'step', backend_args, quota_args)

# Refine the plan batch by batch: the plan returned for one batch is the
# starting plan for the next.
batch_size = 5
new_strategy = "Let's first understand the problem and devise a plan to solve the problem. Then, let's carry out the plan and solve the problem step by step. "
for batch in range(0, len(data), batch_size):
    new_strategy, history = hotpot.run(
        data[batch: batch + batch_size],
        strategy=new_strategy,
        is_test=False,
        verbose=True,
        return_history=True,
    )

    # One file per batch: the revised plan, a separator line, then the
    # transcript that produced it.
    separator = "\n\n" + "="*50 + "\n\n"
    with open(f'strategy_{batch}.txt', 'w') as f:
        f.write(new_strategy + separator + history)