In [None]:
import os
import json

from langchain_core.outputs import LLMResult
from agent.utils.tools.google_search import GoogleSearchTool
from langchain_core.prompt_values import PromptValue

from agent.utils.loader import load_prompt, load_processed_data
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())

In [None]:
# ---- Experiment configuration -------------------------------------------------
dataset_name = 'hotpot_qa'
mode = "critic"
model = "gpt-4o-mini-2024-07-18"
num_samples = 1000

# Always (re)bind the search tool so that re-running this cell with a different
# dataset never leaves a stale tool instance from a previous run in the kernel.
google_search = None
if dataset_name in ['hotpot_qa', 'ambig_qa', 'trivia_qa']:
    # Only these datasets instantiate the search tool.
    google_search = GoogleSearchTool()
    temperature = 0
    n = 1
elif dataset_name in ['gsm8k', 'svamp', 'tabmwp']:
    temperature = 0.5
    n = 1
else:
    # Any other dataset: sample many completions and switch to the unpinned model name.
    temperature = 0.9
    n = 25
    model = "gpt-4o-mini"
top_p = 0.95
seed = 42
batch_size = 100

# Input: chain-of-thought generations produced by an earlier inference run.
if "gpt-4o-mini" in model:
    processed_data_path = f"../../output/inference/gpt-4o-mini/{dataset_name}/cot/num_samples_{num_samples}_top_p_{top_p}_temperature_0_seed_{seed}.jsonl"
else:
    # NOTE(review): this branch spells the file name with "top_p-" while the branch
    # above and save_results_path use "top_p_" -- looks like a typo; confirm against
    # the files on disk before changing it.
    processed_data_path = f"../../output/inference/{model}/{dataset_name}/cot/num_samples_{num_samples}_top_p-{top_p}_temperature_0_seed_{seed}.jsonl"
save_results_path = f"../../output/inference/{model}/{dataset_name}/{mode}/num_samples_{num_samples}_top_p_{top_p}_temperature_{temperature}_seed_{seed}.jsonl"

prompt = load_prompt(dataset_name=dataset_name, mode=mode)
# The first three characters of every stored generation are dropped here.
# NOTE(review): presumably a fixed prefix written by the inference step -- confirm.
dataset = load_processed_data(dataset_name=dataset_name, file_path=processed_data_path).map(lambda x: {"generation": x["generation"][3:]})
if num_samples > 0:
    dataset = dataset.select(range(num_samples))
llm = ChatOpenAI(model=model, top_p=top_p, n=n, temperature=temperature, base_url="https://api.chsdw.top/v1", seed=seed)

# Sanity check: show the rendered prompt template and one example record.
prompt.pretty_print()
print(dataset[2])

In [None]:
def is_null_answer(text):
    """Return True when ``text`` should be treated as "no usable answer".

    An answer counts as null when it is falsy (``None``, ``""``), or when its
    stripped, lower-cased form is one of a fixed set of placeholder words or
    begins with "none".
    """
    if not text:
        return True
    normalized = text.strip().lower()
    placeholders = ("none", "", "no answer", "never", "null", "both", "neither")
    return normalized in placeholders or normalized.startswith("none")

In [None]:
MAX_ITERATION = 7  # maximum number of LLM rounds (tool interactions) per critique


async def critic_iter(item: dict):
    """Run one tool-assisted critique pass for ``item`` and return the revised answer text.

    The critic prompt is rendered from ``item`` and its last message is then extended,
    round by round, with the model output and with the evidence fetched for the
    model's search queries. Uses the notebook-level ``prompt``, ``llm`` and
    ``google_search`` objects.

    Args:
        item: Mapping handed to ``prompt.invoke``; ``item['question']`` is also read
            when the model is asked to commit to an answer.

    Returns:
        The stripped text that follows the first "most possible answer:" marker in
        the model output, or "" if no round produced that marker.
    """
    # Render the critic prompt for this item.
    prompt_critic: PromptValue = prompt.invoke(input=item)

    # Open the critique with the plausibility check; the model continues from here.
    context = "What's the problem with the above answer?\n\n1. Plausibility:\n\n"
    prompt_critic.messages[-1].content += context

    exist_query = []
    exist_evidence = set()
    revised_cot = ""
    for idx in range(MAX_ITERATION):  # max interaction with tool
        print("\n\n" + "=" * 30, "Round", idx, "=" * 30)
        # Generation stops before the model can write its own evidence, so that the
        # evidence appended below comes from the search tool.
        try:
            res = await llm.ainvoke(input=prompt_critic, stop=["> Evidence:", "---"])
            res = res.content
        except Exception as err:  # not a bare except: cancellation/interrupts must propagate
            print("LLM call failed:", repr(err))
            res = ""

        # case1: the model asks for a search
        if "> Search Query:" in res:
            print("CASE 1:")
            # The marker is known to be present, so element 1 always exists:
            # take the rest of the marker's line as the query.
            search_query = res.split("> Search Query:")[1].split("\n")[0].strip()

            prompt_critic.messages[-1].content += res
            print(res, end="")

            # Query the tool; a query that was already issued gets fewer attempts.
            exist_query.append(search_query)
            search_res = None  # never reuse a result left over from an earlier round
            for _ in range(exist_query.count(search_query), 8):
                try:
                    search_res = await google_search.arun(search_query)
                except Exception:
                    search_res = {'title': "None", 'snippet': "None"}
                    break
                # Stop at the first snippet that has not been shown to the model yet.
                if search_res['snippet'] not in exist_evidence:
                    exist_evidence.add(search_res['snippet'])
                    break
            try:
                context = f"""> Evidence: [title: {search_res['title']}] snippet:{search_res['snippet']}\n\n"""
            except Exception:
                context = f"""> Error when trying to search evidence.\n\n"""
            # On the second-to-last round, push the model to commit to an answer so
            # that the final round can produce it.
            if idx == MAX_ITERATION - 2:
                context += f"Let's give the most possible answer.\n\nQuestion: {item['question']}\nHere's "
            print(context, end="")
            prompt_critic.messages[-1].content += context

        # case2: the model gave its most possible answer
        elif "most possible answer:" in res:
            print("CASE 2:")
            print(res)
            # maxsplit=1: a repeated marker inside the answer must not break the
            # unpacking; everything after the first marker is kept.
            revised_cot = res.split("most possible answer:", 1)[1].strip()
            prompt_critic.messages[-1].content += revised_cot
            print(prompt_critic.to_string())
            break
        # case3: other output
        else:
            print("CASE 3:")
            if not res:
                print("NOT A RESPONSE.")
                break
            # Keep the model's text and ask it to commit to an answer next round.
            context = res
            context += f"\nLet's give the most possible answer.\n\nQuestion: {item['question']}\nHere's "
            print(context, end="")
            prompt_critic.messages[-1].content += context

    return revised_cot

In [30]:
async def critic(item: dict):
    """Run up to three critique rounds on ``item``, recording every round in place.

    ``item['cot']`` and ``item['pred']`` are created from ``item['generation']`` and
    ``item['prediction']`` and then receive one new entry per round. Nothing is
    returned; callers read the histories from ``item``.

    NOTE(review): ``critic_iter`` is called with ``item`` itself every round, so the
    baseline picked below is only used to decide whether a round changed the
    answer -- confirm that this is intended.
    """
    print(f"Question: {item['question']}")
    print(f"Gold answer: {item['answer']}")

    # Seed both histories with the initial chain of thought and its answer.
    first_cot, first_pred = item['generation'], item['prediction']
    item['cot'] = [first_cot]
    item['pred'] = [first_pred]

    last_round_changed = True
    for round_no in (1, 2, 3):
        # Baseline = most recent entry whose answer is not null (falls back to entry 0).
        anchor = round_no - 1
        while anchor > 0 and is_null_answer(item['pred'][anchor]):
            anchor -= 1
        baseline_cot = item['cot'][anchor]

        # One critique pass; the answer is whatever follows the last "FINAL ANSWER:".
        new_cot = await critic_iter(item)
        new_pred = new_cot.split("FINAL ANSWER:")[-1].strip()

        # A round counts as unchanged only if it is non-empty and equals the baseline.
        changed = not (new_cot and new_cot == baseline_cot)

        item['cot'].append(new_cot)
        item['pred'].append(new_pred)

        # Two unchanged rounds in a row end the loop early.
        if not changed and not last_round_changed:
            print("Stop.")
            break
        last_round_changed = changed

In [31]:
async def critic_pot(item: dict):
    """Critique and regenerate the program stored in ``item`` for three rounds.

    Each round shows the model the latest program with a non-None answer together
    with its execution report, asks what is wrong with it, asks for a better
    solution, executes that solution and appends code, report and answer to
    ``item['code']``, ``item['report']`` and ``item['pred']``. Nothing is returned.

    This is a coroutine (it awaits ``llm.ainvoke``) and must itself be awaited.

    NOTE(review): ``remove_comment``, ``floatify_ans``, ``safe_execute`` and
    ``finqa_equal`` are not defined or imported in the visible part of this
    notebook -- they must be in scope before this function is called.
    NOTE(review): ``item['pred']`` and ``item['report']`` are wrapped into lists on
    the first round, while ``item['code']`` is indexed as if it already were a
    list -- confirm the expected input schema.
    NOTE(review): ``prompt + context`` assumes the notebook-level ``prompt``
    supports concatenation with ``str`` and that the result is a valid input for
    ``llm.ainvoke`` -- confirm.
    """
    for itr in range(1, 4):
        if itr == 1:
            print("Is initial program correct:", item['answer'] == item['prediction'])
            # Turn the single initial answer/report into per-round histories.
            item['pred'] = [item['pred']]
            item['report'] = [item['report']]
        print("\n" + "-" * 20, "iteration", itr, "-" * 20)

        # Criticize the latest answer that is not None (falls back to round 0).
        base_idx = itr - 1
        while base_idx > 0 and item['pred'][base_idx] is None:
            base_idx -= 1
        print("Correct based on iter:", base_idx)

        previous_code = remove_comment(item['code'][base_idx])

        # Construct the critique prompt: question, code, execution report, output.
        context = f"Question: {item['question']}\n"
        context += f"```python\n{previous_code}\n```\n"
        context += f"Execution: {item['report'][base_idx]}\n"
        context += f"Output: answer = {floatify_ans(item['pred'][base_idx])}\n"
        context += "\nWhat's the problem with the above code?\n\n"
        prompt_critic = prompt + context
        # Printed banner reads "reflect on the existing result".
        print("="*10, "对已有结果进行反思", "="*10)
        print(context, end="")

        # Step 1: ask the model what is wrong with the previous code.
        try:
            result = await llm.ainvoke(input=prompt_critic, stop=["Here's", "---"])
            context = result.content
        except Exception as err:  # not a bare except: cancellation/interrupts must propagate
            print("LLM call failed:", repr(err))
            context = ""

        # Make sure the critique ends with a newline before appending to it.
        if context and context[-1] != "\n":
            context += "\n"

        # Printed banner reads "generate new code".
        print("="*10, "生成新代码", "="*10)
        # Step 2: ask for a better solution.
        context += "Here's a better solution:\n# Python code, return answer \n"
        prompt_critic += context
        print(context, end="")

        # Same call style as step 1. The legacy ``llm(model=args.model, ...)`` call
        # depended on an undefined ``args`` and on ``parse_api_result``; its
        # max_tokens/logprobs options are not carried over.
        try:
            result = await llm.ainvoke(input=prompt_critic, stop=["```", "---"])
            code = result.content.strip()
        except Exception as err:
            print("LLM call failed:", repr(err))
            code = ""

        # Execute the new code.
        pred, report = safe_execute(code)
        pred = floatify_ans(pred)
        corrected = True
        print("{}\n```".format(code))
        print("Execution:", report)
        print("Output: answer =", pred)

        if code.strip() == item['code'][base_idx].strip():  # no correction
            # Identical program: carry the previous round's results forward.
            corrected = False
            code = item['code'][base_idx]
            report = item['report'][base_idx]
            pred = item['pred'][base_idx]

        # Append the new result.
        item['code'].append(code)
        item['report'].append(report)
        item['pred'].append(pred)
        is_correct = finqa_equal(pred, item['gt'])

        print("Gold Answer:", item['gt'])
        print("Corrected:", "Yes" if corrected else "No")
        print("Is correct:", is_correct)

Question: Were Scott Derrickson and Ed Wood of the same nationality?
Gold answer: ['yes']


CASE 1:
The answer should be a clear "yes" or "no" as it addresses a yes/no question. The proposed answer appropriately concludes with "Yes," which is suitable.

2. Truthfulness:

Let's search the question in google:

> Search Query: Were Scott Derrickson and Ed Wood of the same nationality? 
> Evidence: [title: RECITATION-AUGMENTED LANGUAGE MODELS] snippet:May 14, 2013 ... Ed Wood is an American. So Scott Derrickson and Ed. Wood are of the same nationality. The answer is yes. Answer (chain-of-thought path- ...



CASE 2:
The evidence confirms that both Scott Derrickson and Ed Wood are American filmmakers, supporting the conclusion that they share the same nationality.

Overall, the proposed answer is plausible, and the evidence corroborates the claims made. Thus, the proposed answer is correct.

Question: Were Scott Derrickson and Ed Wood of the same nationality?
Here's the most possible answer