In [8]:
import getpass
import os


def _set_if_undefined(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"Please provide your {var}")

# Ask for the LangSmith key only when the environment does not already hold one.
_set_if_undefined("LANGCHAIN_API_KEY")

# Optional, add tracing in LangSmith: both settings are applied in one update.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_PROJECT": "Self Improvement V2",
    }
)

# Tools

In [3]:
from utils.tools import construct_tools, get_tools_descriptions
from langgraph.prebuilt import ToolNode

# Build the tool list once; the same list is handed to the ToolNode here and
# to the ReAct agent created later in the notebook.
tools = construct_tools()
# NOTE(review): presumably human-readable descriptions of each tool; the helper
# lives in utils.tools and is not visible here. Not referenced again in the
# visible part of the notebook.
tools_descriptions = get_tools_descriptions(tools)
# NOTE(review): tool_node is not referenced again in the visible notebook.
tool_node = ToolNode(tools)
from langchain_openai import ChatOpenAI

# Define State

In [4]:
from typing import Literal
from langgraph.graph import END, StateGraph, START
from langgraph.graph.message import add_messages
from typing import Annotated
from typing_extensions import TypedDict
from operator import add

# Upper bound on critic rounds: should_end stops the graph once the
# accumulated `iteration` counter reaches this value.
MAX_ITERATIONS = 4

class State(TypedDict):
    """State dictionary shared by the critic and react graph nodes.

    Fields annotated with ``add`` carry ``operator.add`` as metadata.
    NOTE(review): this relies on LangGraph treating that annotation as a
    reducer (node return values are combined with the stored value instead
    of replacing it) — confirm against the installed LangGraph version.
    """
    # Question text handed to the react agent.
    input: str
    # Prediction supplied with the initial state (not updated by the nodes shown here).
    prediction: str
    # Critique strings; criticize returns a one-element list each round.
    critiques: Annotated[list[str], add]
    # List of message lists; react returns one new message list per successful round.
    react_messages: Annotated[list, add]
    # Answers produced by react; the literal string "None" marks a failed round.
    predictions: Annotated[list[str], add]
    # Round counter; criticize returns 1 on every call.
    iteration: Annotated[int, add]

from langchain_core.messages import SystemMessage, HumanMessage
from langgraph.prebuilt import create_react_agent
from langchain_core.messages import SystemMessage,HumanMessage

# Instruction appended after the latest transcript when asking the model for a
# critique. The fragments are joined verbatim with no separator, so fragments
# that do not end in a space run directly into the next one.
critic_prompt = "".join(
    (
        "Inspect the previous messages and identify any potential issues or errors. ",
        "Reflect on the process of problem-solving. ",
        "What's the problem of the previous answer? ",
        "If there are tool calls, what are the issues with tool calls? (Tool selection, parameters, etc.)",
        "Is the final answer truthful?",
        "Your response should be as brief as possible . ",
    )
)

async def criticize(state):
    """Graph node: ask the LLM to critique the most recent react transcript.

    Args:
        state: Graph state; ``state["react_messages"]`` is a list of message
            lists, of which only the last one is critiqued.

    Returns:
        A state update with a one-element ``critiques`` list and
        ``iteration`` set to 1. When there is no transcript to inspect, the
        critique is a fixed retry message and the LLM is not called.
    """
    transcripts = state["react_messages"]
    if not transcripts:
        # Nothing to critique: emit the placeholder critique without an LLM call.
        return {
            "critiques": ["An error occurred. Please try again."],
            "iteration": 1
        }

    latest_transcript = transcripts[-1]
    review = await llm.ainvoke(latest_transcript + [HumanMessage(content=critic_prompt)])
    return {
        "critiques": [review.content],
        "iteration": 1
    }

# System prompt for the ReAct agent: encourages several tool calls in one
# message and fixes the "FINAL ANSWER:" format that should_end later parses.
react_prompt = (
    "Always remember that there may be multiple tools that can be used to complete a step! "
    + "So if there are more than one tool that can be used, your response should contain multiple tools and their parameters in one message! "
    + "When you are going to respond the final answer, you should follow the following format:\n\n"
    + "FINAL ANSWER: <Your final answer>\n\n"
    + "Always remember the final answer should be as concise as possible. (A single number, a phrase, etc. Not a sentence!)\n\n"
)


# ReAct agent used by the `react` node; it receives the shared tool list and
# the system prompt defined above.
# NOTE(review): `llm` is not defined in this cell or any cell above it. The only
# visible definition is in the dataset/LLM configuration cell further down, so
# on a fresh kernel this line raises NameError unless that cell is run first.
reacter = create_react_agent(model=llm, tools=tools, state_modifier=SystemMessage(content=react_prompt))

async def react(state):
    """Graph node: run one ReAct round and return the resulting state update.

    When a critique is available, the agent is given the latest transcript
    followed by the critique and a restated question. Without a critique the
    agent only receives the question.

    Args:
        state: Graph state. Reads ``input``, ``critiques`` and
            ``react_messages`` (a list of message lists).

    Returns:
        On success, a dict with a one-element ``react_messages`` list (the
        messages to carry into the next round) and a one-element
        ``predictions`` list (content of the agent's last message).
        If the agent call fails, only ``{"predictions": ["None"]}`` is
        returned and the error is printed.
    """
    # An empty or missing history is treated as "no prior transcript" instead
    # of raising IndexError.
    history = state.get("react_messages") or []
    previous = history[-1] if history else []
    try:
        if state["critiques"]:
            critique = state["critiques"][-1]
            # Single quotes inside the f-string keep this valid before Python 3.12.
            inputs = {
                "messages": previous
                + [
                    HumanMessage(content=critique),
                    HumanMessage(content=f"Question: {state['input']}"),
                ]
            }
            result = await reacter.ainvoke(input=inputs, config={'recursion_limit': 18})
            # Drop the replayed transcript and the critique message; keep the
            # restated question and everything the agent produced after it.
            keep_from = len(previous) + 1
        else:
            inputs = {"messages": [("user", state["input"])]}
            result = await reacter.ainvoke(input=inputs)
            # Only the question was sent, so the whole result is new. The old
            # offset was derived from history that was never sent and cut off
            # the question and the first agent message.
            keep_from = 0
    except Exception as e:
        # Best effort: record a failed round rather than aborting the graph,
        # but surface the cause instead of swallowing it silently.
        print(f"react step failed: {e!r}")
        return {
            "predictions": ["None"]
        }
    return {
        "react_messages": [result["messages"][keep_from:]],
        "predictions": [result["messages"][-1].content],
    }


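# Only the react node has a conditional edge: after each react round, should_end either stops the graph or routes back to the critic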
from typing import Literal

def should_end(state) -> Literal["critic", "__end__"]:
    """Decide whether the graph stops or goes back to the critic.

    The run ends when the iteration counter has reached ``MAX_ITERATIONS``,
    or when the two most recent predictions give the same text after their
    last ``"FINAL ANSWER:"`` marker (compared with surrounding whitespace
    stripped).

    Args:
        state: Graph state; reads ``iteration`` and ``predictions``.

    Returns:
        ``"__end__"`` to stop, ``"critic"`` to run another round.
    """
    if state["iteration"] >= MAX_ITERATIONS:
        return "__end__"

    answers = state["predictions"]
    if len(answers) > 1:
        newest = answers[-1].split("FINAL ANSWER:")[-1].strip()
        earlier = answers[-2].split("FINAL ANSWER:")[-1].strip()
        if newest == earlier:
            # Two consecutive rounds agree: treat the answer as settled.
            return "__end__"

    return "critic"
    

# Wire the two-node loop: START -> critic -> react -> (critic | END).
builder = StateGraph(State)

builder.add_node("critic", criticize)
builder.add_node("react", react)

# Every run starts with a critique of the latest entry in react_messages.
builder.add_edge(START, "critic")
builder.add_edge("critic", "react")

# should_end returns "critic" to loop again or "__end__" to stop.
builder.add_conditional_edges("react", should_end)

graph = builder.compile()

In [56]:
# Hand-built state for exercising react() directly, outside the graph: one
# prior transcript plus one critique, so react() takes its critique branch.
state = {"input": "What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?",
         "prediction": "Let's think step by step. The woman who portrayed Corliss Archer in the film \"Kiss and Tell\" is Janet Leigh. Janet Leigh served as a member of the California State Board of Parks and Recreation. FINAL ANSWER: member of the California State Board of Parks and Recreation.",
         "iteration": 0,
         "react_messages": [[("user","What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?\nLet's think step by step. The woman who portrayed Corliss Archer in the film \"Kiss and Tell\" is Janet Leigh. Janet Leigh served as a member of the California State Board of Parks and Recreation. FINAL ANSWER: member of the California State Board of Parks and Recreation.")]],
         'critiques': ['The previous answer incorrectly identifies Janet Leigh as the actress who portrayed Corliss Archer in the film "Kiss and Tell." The character was actually played by another actress, Annette Funicello, while Leigh is known for her roles in other classic films. This foundational error undermines the accuracy of the entire answer.\n\nRegarding the process of problem-solving, the issue arises from a failure to verify the identity of the actress and the corresponding film connection. Critical thinking should involve confirming basic facts before drawing conclusions or making assertions.\n\nThere were no tool calls in this interaction; the concerns stem solely from the provided information. However, if tool calls were involved, potential issues could include improper tool selection (e.g., using an unreliable source for film credits) or incorrect parameters (like not specifying the movie or character precisely).\n\nOverall, since the final answer is based on misinformation (attributing the role to the wrong actress), the statement regarding Janet Leigh being a member of the California State Board of Parks and Recreation is overshadowed by the initial error, resulting in an untruthful context. Effective problem-solving requires double-checking facts and maintaining a focus on accuracy.']
         }
# Top-level await: valid in a notebook cell, not in a plain Python script.
result = await react(state)

In [15]:
import json
import asyncio
# Accumulates one entry per processed item: the graph's final state on success,
# or the error/timeout string that process() returns on failure.
results = []

async def process(item, dataset_name:str="hotpot_qa", timeout: int = 240):
    """Run the critic/react graph on one dataset item.

    Args:
        item: Mapping with at least ``"question"`` and ``"prediction"`` keys.
        dataset_name: ``"hotpot_qa"``, ``"gsm8k"`` or ``"svamp"``. The two
            math datasets get a program-of-thought instruction prepended to
            the question.
        timeout: Maximum number of seconds to wait for the graph.

    Returns:
        The graph's final state on success. On timeout or any other error
        raised while running the graph, a descriptive string is returned
        (and printed) instead, so callers can tell failures apart with
        ``isinstance(result, str)``.

    Raises:
        ValueError: If ``dataset_name`` is not supported. This used to surface
            as a caught UnboundLocalError reported as a per-item error string
            for every item of the run.
    """
    # Validate before touching the item so a misconfigured run fails fast.
    if dataset_name not in ("hotpot_qa", "gsm8k", "svamp"):
        raise ValueError(
            f"Unsupported dataset_name {dataset_name!r}; expected 'hotpot_qa', 'gsm8k' or 'svamp'"
        )

    question = item["question"]
    prediction = item["prediction"]
    if dataset_name == "hotpot_qa":
        prompt = question
        transcript = f"{question}\n{prediction}"
    else:
        # gsm8k / svamp: ask for Python code whose `answer` variable is printed.
        prompt = f"Use python code solve the following problem, variable <answer> should contain the final answer. Use \"print(answer)\" to get the final answer.\n{question}\n"
        transcript = f"{prompt}{prediction}"

    # Named initial_state rather than `input` to avoid shadowing the builtin.
    initial_state = {
        "react_messages": [[HumanMessage(content=transcript)]],
        "input": prompt,
        "prediction": prediction,
        "iteration": 0,
    }
    try:
        return await asyncio.wait_for(graph.ainvoke(input=initial_state), timeout=timeout)
    except asyncio.TimeoutError:
        print(f"Timeout processing item {item}")
        return f"Timeout on {item}"
    except Exception as e:
        print(f"Error processing item {item}: {e}")
        return f"Error on {item}: {str(e)}"

# Evaluate on hotpot_qa, gsm8k, and svamp

In [20]:
from datasets import load_dataset
from tqdm.asyncio import tqdm_asyncio

# Run configuration for the batch evaluation below.
# NOTE(review): dataset_name is "gsm8k" while the data file below sits under
# .../inference/svamp/; with this combination the svamp-only Body/Question
# merge is skipped. Confirm which dataset is intended.
dataset_name = "gsm8k"
mode = "critic"
num_test_sample = -1
temperature = 0.5
top_p = 1
batch_size = 100
# NOTE(review): `llm` is used by cells earlier in the notebook (the critic node
# and the ReAct agent), so this cell has to run before them on a fresh kernel.
llm = ChatOpenAI(temperature=temperature, model="gpt-4o-mini", base_url="https://api.chsdw.top/v1", top_p=top_p)

# NOTE(review): absolute, user-specific path; consider a configurable data dir.
dataset = load_dataset("json", data_files="/Users/ariete/Projects/self-improve/output/inference/svamp/pot_1000_temperature_0_top-p_1.jsonl", split="train")
if dataset_name == "svamp":
    # Build the question from its two parts. Single quotes inside the f-string
    # keep this line valid on Python versions before 3.12.
    dataset = dataset.map(lambda x: {"question": f"{x['Body']} {x['Question']}"})
print(dataset)

Dataset({
    features: ['question', 'answer', 'prediction'],
    num_rows: 1319
})


In [None]:
# Smoke test: run the full critic/react graph on a single item before
# launching the batched evaluation. process() returns a string on failure.
result = await process(dataset[1], dataset_name)
print(result)

In [19]:
from tqdm import tqdm
for i in tqdm(range(0, 1000, batch_size)):
    batch = dataset.select(range(i, min(i + batch_size, len(dataset))))
    batch_results = await asyncio.gather(*(process(item, dataset_name) for item in batch))
    results.extend(batch_results)
    with open(f"/Users/ariete/Projects/self-improve/output/v2/{dataset_name}/{mode}_{len(dataset)}_temperature_{temperature}_top-p_{top_p}.jsonl", 'w') as f:
        for idx, item in enumerate(results):
            if isinstance(item, str):
                temp = {"idx": idx, "question":  dataset[idx]["question"], "predictions": [dataset[idx]["prediction"]], "answer": dataset[idx]["answer"]}
                f.write(json.dumps(temp) + "\n")
            else:
                temp = {"idx": idx, "question": item["input"], "predictions": item["predictions"], "answer": dataset[idx]["answer"], "prediction": dataset[idx]["prediction"]}
                f.write(json.dumps(temp) + "\n")




  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')
  result["title"] = BeautifulSoup(result["title"], "html.parser").get_text()


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup

Timeout processing item {'id': '5ac01bb8554299012d1db5a6', 'question': 'Which American character actor who starred on the television series "Stargate SG-1" (1997–2007) and appeared in "Episode 8" of "Twin Peaks" as a guest star?', 'answer': 'Don S. Davis', 'type': 'bridge', 'level': 'hard', 'supporting_facts': {'title': ['Episode 8 (Twin Peaks)', 'Episode 8 (Twin Peaks)', 'Don S. Davis'], 'sent_id': [0, 2, 0]}, 'context': {'title': ['Don S. Davis', 'Episode 8 (Twin Peaks)', 'Samantha Carter', 'Stargate: Continuum', 'Lost City (Stargate SG-1)', 'Stargate SG-1 (season 8)', 'Stargate SG-1', 'Redemption (Stargate SG-1)', 'Vala Mal Doran', 'George Hammond (Stargate)'], 'sentences': [['Don Sinclair Davis, PhD (August 4, 1942 – June 29, 2008) was an American character actor best-known for playing General Hammond in the television series "Stargate SG-1" (1997–2007), and earlier for playing Major Garland Briggs on the television series "Twin Peaks" (1990–1991).', ' He was also a theater profess



  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')
  result["title"] = BeautifulSoup(result["title"], "html.parser").get_text()


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')
  result["snippet"] = B

Timeout processing item {'id': '5a80ecea55429938b6142254', 'question': 'My Secret Hotel is a television series starring a South Korean DJ who rose to fame as the lead actress in what?', 'answer': "Queen In-hyun's Man", 'type': 'bridge', 'level': 'hard', 'supporting_facts': {'title': ['My Secret Hotel', 'Yoo In-na', 'Yoo In-na'], 'sent_id': [0, 0, 2]}, 'context': {'title': ['My Secret Hotel', 'Han Sun-hwa', 'Yoo Ah-in', 'Secret Love (TV series)', 'Yoo In-na', 'Nice Githinji', 'Park Myeong-su', 'The Secret of My Love', 'Kim Ji-han', 'Bang Eun-hee'], 'sentences': [['My Secret Hotel () is a 2014 South Korean mystery-romantic comedy television series starring Yoo In-na, Jin Yi-han, Namkoong Min and Lee Young-eun.', ' It aired on tvN from August 18 to October 14, 2014 on Mondays and Tuesdays at 23:00 for 16 episodes.'], ['Han Sun-hwa (born October 6, 1990), is a South Korean singer and actress.', ' She is a former member of the South Korean girl group Secret.', ' She made her television debu



  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')
  result["title"] = BeautifulSoup(result["title"], "html.parser").get_text()


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup

Timeout processing item {'id': '5a88bd1d554299206df2b357', 'question': 'When did the character on Grey\'s Anatomy, played by the same actor who portrayed Rev James Lawson in "Lee Daniel\'s The Butler", debut?', 'answer': 'season six, episode 5, "Invasion"', 'type': 'bridge', 'level': 'hard', 'supporting_facts': {'title': ['Jesse Williams (actor)', 'Jackson Avery'], 'sent_id': [1, 2]}, 'context': {'title': ['James Rae Forgan', 'Peter Lawson (cricketer)', 'Jackson Avery', 'Thomas J. Lawson', 'Jack Lawson', 'Jesse Williams (actor)', 'The Plants', 'John Lawson (Australian politician)', 'James Menteath', 'James Lapslie'], 'sentences': [['The Very Rev James Rae Forgan DD (1876-1966) was a Scottish minister who served as Moderator of the General Assembly of the Church of Scotland in 1940.'], ['Peter James Lawson (born 11 September 1981) is an English cricketer.', ' Lawson is a right-handed batsman who bowls right-arm medium-fast.', ' He was born in Barrow-in-Furness, Cumbria.'], ['Jackson Ave



  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')
  result["title"] = BeautifulSoup(result["title"], "html.parser").get_text()


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')
100%|██████████| 10/10 [36:28<00:00, 218.83s/it]


# GSM8K

In [14]:
from datasets import load_dataset
# GSM8K run configuration.
# NOTE(review): `temperature` and `top_p` set here are only used for the output
# file name in the visible code; the `llm` object is created in another cell
# and is not rebuilt here. Confirm the file name matches the model settings.
dataset_name, mode = "gsm8k", "critic"
num_test_sample = -1
temperature, top_p = 0, 1
batch_size = 100
dataset = load_dataset(
    "json",
    data_files="/Users/ariete/Projects/self-improve/output/inference/gsm8k/pot_1319_temperate_0_top-p_1.jsonl",
    split="train",
)
print(dataset)
# Start a fresh accumulator for this dataset.
results = []


Dataset({
    features: ['question', 'answer', 'prediction'],
    num_rows: 1319
})


In [None]:
from tqdm import tqdm
for i in tqdm(range(100, len(dataset), batch_size)):
    batch = dataset.select(range(i, min(i + batch_size, len(dataset))))
    batch_results = await asyncio.gather(*(process(item, dataset_name) for item in batch))
    results.extend(batch_results)
    with open(f"/Users/ariete/Projects/self-improve/output/v2/{dataset_name}/{mode}_{len(dataset)}_temperature_{temperature}_top-p_{top_p}.jsonl", 'w') as f:
        for idx, item in enumerate(results):
            if isinstance(item, str):
                temp = {"idx": idx, "question":  dataset[idx]["question"], "predictions": [dataset[idx]["prediction"]], "answer": dataset[idx]["answer"]}
                f.write(json.dumps(temp) + "\n")
            else:
                temp = {"idx": idx, "question": dataset[idx]["question"], "predictions": item["predictions"], "answer": dataset[idx]["answer"], "prediction": dataset[idx]["prediction"]}
                f.write(json.dumps(temp) + "\n")