In [5]:
import getpass
import os


def _set_if_undefined(var: str):
    """Ensure environment variable `var` holds a non-empty value.

    When the variable is missing or empty, the value is requested
    interactively through `getpass.getpass` and stored in `os.environ`.
    An existing non-empty value is left untouched.
    """
    if os.environ.get(var):
        return
    os.environ[var] = getpass.getpass(f"Please provide your {var}")

# Ask for the LangSmith API key only when the environment does not already provide one.
_set_if_undefined("LANGCHAIN_API_KEY")

# Optional LangSmith tracing settings (the tracing flag is set to "false" here).
os.environ.update({
    "LANGCHAIN_TRACING_V2": "false",
    "LANGCHAIN_PROJECT": "Self Improvement V2",
})

# Tools

In [6]:
from utils.tools import construct_tools, get_tools_descriptions
from langgraph.prebuilt import ToolNode

# Build the tool set used by the ReAct agent.
# NOTE(review): construct_tools / get_tools_descriptions come from the project-local
# utils.tools module; their return types are not visible from this notebook.
tools = construct_tools()
# Not referenced again in the visible cells.
tools_descriptions = get_tools_descriptions(tools)
# Not referenced again in the visible cells; create_react_agent below receives `tools` directly.
tool_node = ToolNode(tools)
from langchain_openai import ChatOpenAI

# Define State

In [8]:
from typing import Literal
from langgraph.graph import END, StateGraph, START
from langgraph.graph.message import add_messages
from typing import Annotated
from typing_extensions import TypedDict
from operator import add

# Upper bound on critic passes: should_end stops the loop once state["iteration"] reaches this value.
MAX_ITERATIONS = 4

class State(TypedDict):
    """Shared state passed between the `critic` and `react` graph nodes.

    Fields declared as ``Annotated[..., add]`` use ``operator.add`` as their
    LangGraph reducer, so a value returned by a node is added/appended to the
    existing value instead of replacing it.
    """
    # Question text handed to the agent (set by process()).
    input: str
    # Original prediction taken from the dataset item (set by process()).
    prediction: str
    # One critique string per critic pass.
    critiques: Annotated[list[str], add]
    # List of message lists: each entry is the message history of one attempt.
    react_messages: Annotated[list, add]
    # Final message content of each react pass, or "None" when the pass failed.
    predictions: Annotated[list[str], add]
    # Incremented by 1 on every critic pass.
    iteration: Annotated[int, add]

from langchain_core.messages import SystemMessage, HumanMessage
from langgraph.prebuilt import create_react_agent
from langchain_core.messages import SystemMessage,HumanMessage

# Instruction appended (as a HumanMessage) to the latest attempt's messages when asking
# the LLM for a critique. The pieces are joined by implicit string concatenation, so
# every piece except the last must end with a space to keep the sentences separated.
critic_prompt = ("Inspect the previous messages and identify any potential issues or errors. "
                 "Reflect on the process of problem-solving. "
                 "What's the problem of the previous answer? "
                 "If there are tool calls, what are the issues with tool calls? (Tool selection, parameters, etc.) "
                 "Is the final answer truthful? "
                 "Your response should be as brief as possible.")

async def criticize(state):
    """Graph node: ask the LLM to critique the most recent attempt.

    Reads the last entry of ``state["react_messages"]`` (a list of messages),
    appends the critic prompt as a human message and sends it to the
    module-level ``llm``. Returns a state update containing the critique text
    and an iteration increment of 1. When there is no message history at all,
    a fixed fallback critique is returned instead of calling the LLM.
    """
    history = state["react_messages"]
    if not history:
        # Nothing to inspect: still count the pass and hand a generic retry hint to react.
        return {"critiques": ["An error occurred. Please try again."], "iteration": 1}

    latest_attempt = history[-1]
    critique = await llm.ainvoke(latest_attempt + [HumanMessage(content=critic_prompt)])
    return {"critiques": [critique.content], "iteration": 1}

# System prompt for the ReAct agent: encourages issuing several tool calls in one message
# and fixes the "FINAL ANSWER: ..." output format that should_end later parses.
react_prompt = ("Always remember that there may be multiple tools that can be used to complete a step! "
                "So if there are more than one tool that can be used, your response should contain multiple tools and their parameters in one message! "
                "When you are going to respond the final answer, you should follow the following format:\n\n"
                "FINAL ANSWER: <Your final answer>\n\n"
                "Always remember the final answer should be as concise as possible. (A single number, a phrase, etc. Not a sentence!)\n\n")


# NOTE(review): `llm` must already exist when this line runs. In the visible notebook it is
# only assigned in a later cell, so a fresh "Restart & Run All" raises NameError here.
# The agent is built with whatever `llm` is bound at this moment; re-assigning `llm` later
# changes what criticize() uses but not this agent, unless this cell is re-run.
# NOTE(review): newer LangGraph releases renamed `state_modifier` to `prompt` — confirm
# against the installed version.
reacter = create_react_agent(model=llm, tools=tools, state_modifier=SystemMessage(content=react_prompt))

async def react(state):
    """Graph node: run the ReAct agent once and record its answer.

    With at least one critique in the state, the agent is given the previous
    attempt's messages followed by the latest critique and the question again.
    Without a critique it is given only the question.

    Returns a state update with:
      * ``react_messages``: one new entry holding the question message plus
        everything the agent produced in this pass (earlier history and the
        critique message are dropped);
      * ``predictions``: the content of the agent's last message.

    If the agent call raises, the error is reported and only
    ``{"predictions": ["None"]}`` is returned, so the loop can continue.
    """
    history = state["react_messages"]
    # Tolerate an empty history instead of failing on history[-1].
    previous_attempt = history[-1] if history else []
    try:
        if state["critiques"]:
            critique = state["critiques"][-1]
            inputs = {
                "messages": previous_attempt + [
                    HumanMessage(content=critique),
                    # Single quotes inside the f-string keep this valid before Python 3.12.
                    HumanMessage(content=f"Question: {state['input']}"),
                ]
            }
            # Skip the replayed attempt and the critique; keep the question message.
            skip = len(previous_attempt) + 1
            result = await reacter.ainvoke(input=inputs, config={'recursion_limit': 18})
        else:
            inputs = {"messages": [("user", state["input"])]}
            # Only the question was sent, so everything returned belongs to this pass.
            skip = 0
            result = await reacter.ainvoke(input=inputs)
    except Exception as e:
        # Best effort: keep the graph running, but do not hide the reason for the failure.
        print(f"Error in react step: {e}")
        return {
            "predictions": ["None"]
        }
    messages = result["messages"]
    return {
        "react_messages": [messages[skip:]],
        "predictions": [messages[-1].content],
    }


# After each react pass, should_end decides whether to loop back to the critic or finish
from typing import Literal

def should_end(state) -> Literal["critic", "__end__"]:
    """Routing function for the conditional edge that follows the react node.

    Returns "__end__" when the iteration budget (MAX_ITERATIONS) is used up, or
    when the two most recent predictions carry the same text after their last
    "FINAL ANSWER:" marker (compared after stripping whitespace). Otherwise
    returns "critic" to request another critique pass.
    """
    def _final_answer(text):
        # Text after the last "FINAL ANSWER:" marker; the whole text when the marker is absent.
        return text.split("FINAL ANSWER:")[-1].strip()

    if state["iteration"] >= MAX_ITERATIONS:
        return "__end__"

    attempts = state["predictions"]
    if len(attempts) > 1 and _final_answer(attempts[-1]) == _final_answer(attempts[-2]):
        # The answer did not change between consecutive attempts: treat it as converged.
        return "__end__"
    return "critic"
    

# Assemble the critique/react loop: START -> critic -> react -> (critic | END).
builder = StateGraph(State)

# Register the two nodes defined above.
builder.add_node("critic", criticize)
builder.add_node("react", react)

# Every run starts with a critique, which is always followed by a react pass.
builder.add_edge(START, "critic")
builder.add_edge("critic", "react")

# After react, should_end returns either "critic" (loop again) or "__end__" (stop).
builder.add_conditional_edges("react", should_end)

# Compiled graph invoked by process() through graph.ainvoke.
graph = builder.compile()

In [15]:
import json
import asyncio


async def process(item, dataset_name: str = "hotpot_qa", timeout: int = 300):
    """Run the critique/react graph on one dataset item.

    Args:
        item: Mapping with at least "question" and "prediction" keys.
        dataset_name: Selects how the initial prompt is built. QA datasets
            ("hotpot_qa", "trivia_qa", "ambig_qa") use the bare question;
            math datasets ("gsm8k", "svamp", "tabmwp") prepend a
            "use python code" instruction.
        timeout: Seconds allowed for the whole graph run.

    Returns:
        The final graph state on success, or a string ("Timeout on ..." /
        "Error on ...") when the run timed out or raised. Callers tell the
        two apart with ``isinstance(result, str)``.

    Raises:
        ValueError: If ``dataset_name`` is not one of the supported names.
            Previously this surfaced as an unbound-variable error reported
            once per item.
    """
    qa_datasets = ("hotpot_qa", "trivia_qa", "ambig_qa")
    code_datasets = ("gsm8k", "svamp", "tabmwp")

    if dataset_name in qa_datasets:
        task = item["question"]
        seed_text = f"{item['question']}\n{item['prediction']}"
    elif dataset_name in code_datasets:
        instruction = "Use python code solve the following problem, variable <answer> should contain the final answer. Use \"print(answer)\" to get the final answer.\n"
        task = f"{instruction}{item['question']}\n"
        seed_text = f"{task}{item['prediction']}"
    else:
        raise ValueError(
            f"Unsupported dataset_name {dataset_name!r}; expected one of {qa_datasets + code_datasets}"
        )

    # The first "attempt" in the history is the question together with the existing prediction.
    graph_input = {
        "react_messages": [[HumanMessage(content=seed_text)]],
        "input": task,
        "prediction": item["prediction"],
        "iteration": 0,
    }
    try:
        return await asyncio.wait_for(graph.ainvoke(input=graph_input), timeout=timeout)
    except asyncio.TimeoutError:
        print(f"Timeout processing item {item}")
        return f"Timeout on {item}"
    except Exception as e:
        print(f"Error processing item {item}: {e}")
        return f"Error on {item}: {str(e)}"

# Test: hotpot_qa, gsm8k, svamp, trivia_qa, ambig_qa

In [16]:
from datasets import load_dataset
from tqdm.asyncio import tqdm_asyncio

# Run configuration for the AmbigQA evaluation.
dataset_name = "ambig_qa"
# `mode`, `temperature` and `top_p` are also embedded in the output file name by the batch loop.
mode = "critic"
# NOTE(review): not referenced anywhere else in the visible cells.
num_test_sample = 1000
temperature = 0
top_p = 1
# Number of items processed concurrently per batch in the loop below.
batch_size = 100
# criticize() reads this module-level `llm` at call time.
# NOTE(review): the ReAct agent is created in an earlier cell from whatever `llm` was bound
# then; re-run that cell after changing this one so both use the same model settings.
llm = ChatOpenAI(temperature=temperature, model="gpt-4o-mini", base_url="https://api.chsdw.top/v1", top_p=top_p)
# Accumulates one entry per processed item (graph state, or an error string from process()).
results = []
# NOTE(review): absolute, machine-specific path fixed to the ambig_qa file; it does not
# follow `dataset_name`, so changing the name alone does not change the data loaded.
dataset = load_dataset("json", data_files="/Users/ariete/Projects/self-improve/output/inference/ambig_qa/cot_1000_temperature_0_top-p_1.jsonl", split="train")
if dataset_name == "svamp":
    # SVAMP stores the problem as Body + Question; merge them into the common "question" field.
    # NOTE(review): reusing double quotes inside the f-string requires Python 3.12+.
    dataset = dataset.map(lambda x: {"question": f"{x["Body"]} {x["Question"]}", "answer": x["Answer"]})
# Sanity check: dataset summary, one sample row, and the sampling settings the LLM reports.
print(dataset, dataset[1])
print(llm.temperature, llm.top_p)

Dataset({
    features: ['question', 'answer', 'prediction'],
    num_rows: 1000
}) {'question': 'How often does spermatogeneis—the production of sperm—occur?', 'answer': ['usually continues uninterrupted until death', '74 days', 'constant'], 'prediction': "A: Spermatogenesis occurs continuously throughout a male's life after puberty, with new sperm cells being produced daily. FINAL ANSWER: Continuously after puberty."}
0.0 1.0


In [17]:
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio
for i in tqdm(range(0, 1000, batch_size)):
    batch = dataset.select(range(i, min(i + batch_size, len(dataset))))
    batch_results = await tqdm_asyncio.gather(*(process(item, dataset_name) for item in batch))
    results.extend(batch_results)
    with open(f"/Users/ariete/Projects/self-improve/output/v2/{dataset_name}/{mode}_{len(dataset)}_temperature_{temperature}_top-p_{top_p}.jsonl", 'w') as f:
        for idx, item in enumerate(results):
            if isinstance(item, str):
                temp = {"idx": idx, "question":  dataset[idx]["question"], "predictions": [dataset[idx]["prediction"]], "answer": dataset[idx]["answer"]}
                f.write(json.dumps(temp) + "\n")
            else:
                temp = {"idx": idx, "question": dataset[idx]["question"], "predictions": item["predictions"], "answer": dataset[idx]["answer"], "prediction": dataset[idx]["prediction"]}
                f.write(json.dumps(temp) + "\n")




  lis = BeautifulSoup(html).find_all('li')
100%|██████████| 100/100 [05:00<00:00,  3.00s/it]
 10%|█         | 1/10 [05:00<45:01, 300.17s/it]

Timeout processing item {'question': 'When does episode 37 of stuck in the middle air?', 'answer': ['October 27, 2017'], 'prediction': 'A: Episode 37 of "Stuck in the Middle," titled "Stuck in the Middle of the Road," aired on March 3, 2017. FINAL ANSWER: March 3, 2017.'}
Timeout processing item {'question': 'Who played Marlo Kelly on the drew carey show?', 'answer': ['Jenny McCarthy'], 'prediction': 'A: Marlo Kelly did not appear on "The Drew Carey Show." However, the character Marlo was portrayed by actress Jennifer Aniston in a guest appearance. FINAL ANSWER: Jennifer Aniston (guest appearance).'}




  lis = BeautifulSoup(html).find_all('li')


Timeout processing item {'question': 'When did the metropolitan police force become a service?', 'answer': ['29 September 1829', '1989'], 'prediction': 'A: The Metropolitan Police Force was rebranded as the Metropolitan Police Service in 1829. FINAL ANSWER: 1829'}
Timeout processing item {'question': 'Who has won the 2017 asian snooker championship?', 'answer': ['Lyu Haotian', 'Lyu'], 'prediction': 'A: The 2017 Asian Snooker Championship was won by Pankaj Advani. FINAL ANSWER: Pankaj Advani'}
Timeout processing item {'question': 'Who was the first performance on live aid at John F. Kennedy Stadium, prior to the first musical act?', 'answer': ['Bernard Watson'], 'prediction': 'A: The first performance on the Live Aid stage at John F. Kennedy Stadium was by actor and comedian Jack Nicholson, who introduced the event. The first musical act to perform was "The Hooters." FINAL ANSWER: Jack Nicholson'}
Timeout processing item {'question': 'When did breaking dawn part 2 score come out?', 'ans

100%|██████████| 100/100 [05:00<00:00,  3.00s/it]
 20%|██        | 2/10 [10:00<40:02, 300.31s/it]

Timeout processing item {'question': 'When is the last time England won the rugby world cup?', 'answer': ['2003'], 'prediction': 'A: England last won the Rugby World Cup in 2003. FINAL ANSWER: 2003'}
Timeout processing item {'question': 'Who played doc brown on the TV series back to the future in the animated segments?', 'answer': ['Dan Castellaneta'], 'prediction': 'A: In the animated series "Back to the Future," Doc Brown was voiced by Christopher Lloyd, who also portrayed the character in the live-action films. FINAL ANSWER: Christopher Lloyd'}
Timeout processing item {'question': 'When did the song unchained melody come out?', 'answer': ['1955', 'January 19, 1955'], 'prediction': 'A: "Unchained Melody" was first published in 1955. The most famous version, recorded by The Righteous Brothers, was released in 1965. FINAL ANSWER: 1955 (originally published)'}
Timeout processing item {'question': 'When did the Seljuk Empire end?', 'answer': ['1194'], 'prediction': 'A: The Seljuk Empire 



  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')
100%|██████████| 100/100 [05:00<00:00,  3.00s/it]

Timeout processing item {'question': 'Who is the lead singer that sings the song what a beautiful name it is?', 'answer': ['Brooke Ligertwood'], 'prediction': 'A: The lead singer of the song "What a Beautiful Name" is Brooke Ligertwood, who is part of the worship group Hillsong Worship. FINAL ANSWER: Brooke Ligertwood.'}
Timeout processing item {'question': 'What were the names of the fraternities that the Animal House fraternity is based on?', 'answer': ['Zeta Beta Tau and Alpha Delta Phi'], 'prediction': 'A: The Animal House fraternity is primarily based on the real-life fraternities of the Alpha Tau Omega (ATO) and the Delta Tau Delta (DTD) at Dartmouth College. FINAL ANSWER: Alpha Tau Omega and Delta Tau Delta.'}
Timeout processing item {'question': "Where was the movie dead man's burden filmed?", 'answer': ['New Mexico', 'in New Mexico'], 'prediction': 'A: The movie "Dead Man\'s Burden" was filmed in New Mexico, particularly in the towns of Santa Fe and the surrounding areas. FINA




Timeout processing item {'question': 'Who said democracy is the rule of fools?', 'answer': ['Plato'], 'prediction': 'A: The quote "Democracy is the rule of fools" is often attributed to various figures, but one notable attribution is to the ancient Greek philosopher Plato. FINAL ANSWER: Plato'}
Timeout processing item {'question': 'Who sings backing vocals with piano on one of these nights, the song?', 'answer': ['Glenn Frey', 'Glenn Lewis Frey'], 'prediction': 'A: The backing vocals on "One of These Nights" by the Eagles were provided by Richard Manuel of The Band and also featured a piano part by Glenn Frey and Don Henley. FINAL ANSWER: Richard Manuel'}
Timeout processing item {'question': 'When did the American Basketball League permanently add the 3 point line?', 'answer': ['1961'], 'prediction': 'A: The American Basketball League (ABL) permanently adopted the three-point line in its inaugural 1996-1997 season. FINAL ANSWER: 1996-1997 season.'}
Timeout processing item {'question': 

100%|██████████| 100/100 [05:00<00:00,  3.00s/it]
 40%|████      | 4/10 [20:01<30:01, 300.29s/it]

Timeout processing item {'question': 'Who won sports personality of the year 2017 ireland?', 'answer': ['James Joseph McClean', 'McClean', 'James McClean'], 'prediction': 'A: The winner of the RTÉ Sports Personality of the Year award in 2017 was Katie Taylor, the renowned Irish boxer. FINAL ANSWER: Katie Taylor.'}
Timeout processing item {'question': 'Who developed the first induction motor that was an alternating current power system?', 'answer': ['Nikola Tesla'], 'prediction': 'A: The first induction motor that was developed for an alternating current power system was created by Nikola Tesla. FINAL ANSWER: Nikola Tesla.'}
Timeout processing item {'question': 'Who lit the torch at the 2012 Winter Youth Olympics?', 'answer': ['Egon Zimmermann', 'Franz Klammer'], 'prediction': 'A: The torch at the 2012 Winter Youth Olympics in Innsbruck, Austria, was lit by Austrian snowboarder Catrina Faber. FINAL ANSWER: Catrina Faber'}
Timeout processing item {'question': 'When does the season 10 of 



  lis = BeautifulSoup(html).find_all('li')


Timeout processing item {'question': 'Who were the members of the committee, other than Christopher G. Memminger, who wrote the south carolina ordinance of secession?', 'answer': ['J. P. Richardson', 'R. W. Barnwell', 'F. H. Wardlaw', 'J. E. Jenkins', 'B. H. Rutledge', 'P. E. Duncan'], 'prediction': 'A: The South Carolina Ordinance of Secession was drafted by a committee that included notable members such as William H. Trescot, John H. McCrady, and John A. Inglis, among others. FINAL ANSWER: William H. Trescot, John H. McCrady, John A. Inglis.'}
Timeout processing item {'question': 'Who plays ronald reagan in lee daniels the butler?', 'answer': ['Alan Rickman', 'Alan Sidney Patrick Rickman'], 'prediction': 'A: Ronald Reagan is portrayed by Alan Rickman in "Lee Daniels\' The Butler." FINAL ANSWER: Alan Rickman'}
Timeout processing item {'question': 'Where was the train scene in fast five filmed?', 'answer': ['Blythe Junction', 'Rice , California', 'Rice, California', 'Rice'], 'predictio

100%|██████████| 100/100 [05:00<00:00,  3.00s/it]
 50%|█████     | 5/10 [25:01<25:01, 300.32s/it]

Timeout processing item {'question': 'What is the control panel on the computer?', 'answer': ['give the user control of software and hardware features', 'part the Microsoft Windows', 'omponent of Microsoft Windows that provides the ability to view and change system settings.', 'provides the ability to view and change system settings'], 'prediction': 'A: The control panel on a computer is a feature that allows users to view and adjust system settings and configurations. It provides access to various settings related to hardware, software, user accounts, and network configurations. In Windows, for example, you can find options for managing devices, uninstalling software, and configuring system preferences. FINAL ANSWER: System settings and configurations interface.'}
Timeout processing item {'question': 'Who wrote the lyrics to Busta Rhymes\'s song "In the Ghetto"?', 'answer': ['Rick James'], 'prediction': 'A: The lyrics to "In the Ghetto" by Busta Rhymes were written by Busta Rhymes him

  result["title"] = BeautifulSoup(result["title"], "html.parser").get_text()


  lis = BeautifulSoup(html).find_all('li')
  result["title"] = BeautifulSoup(result["title"], "html.parser").get_text()


Timeout processing item {'question': 'How many seasons of marco polo 1964 series will there be?', 'answer': ['1'], 'prediction': 'A: The 1964 television series "Marco Polo" aired for one season. FINAL ANSWER: One season.'}
Timeout processing item {'question': "Who did Doc Hudson's voice in every English language version of Cars where Paul Newman didn't voice the character?", 'answer': ['Corey Burton'], 'prediction': 'A: In every English language version of "Cars" where Paul Newman did not voice Doc Hudson, the character was voiced by Keith Ferguson. FINAL ANSWER: Keith Ferguson.'}
Timeout processing item {'question': 'When did the air jordan 12 re-release with the Eleven as the final countdown pack?', 'answer': ['December 25, 2008'], 'prediction': 'A: The Air Jordan 12 was re-released as part of the Final Countdown Pack alongside the Air Jordan 11 on December 23, 2008. FINAL ANSWER: December 23, 2008.'}
Timeout processing item {'question': 'When did the kim family come to power in Geum

100%|██████████| 100/100 [05:00<00:00,  3.00s/it]
 60%|██████    | 6/10 [30:01<20:01, 300.34s/it]

Timeout processing item {'question': 'When was the last time england got to a rugby world cup quarter final?', 'answer': ['2015'], 'prediction': 'A: As of my last knowledge update in October 2023, England reached the Rugby World Cup quarter-finals in 2023. They played against Fiji in the quarter-finals on October 15, 2023. FINAL ANSWER: 2023 Rugby World Cup.'}
Timeout processing item {'question': 'Who wrote may the bird of paradise fly up your nose?', 'answer': ['Neal Merritt'], 'prediction': 'A: "May the Bird of Paradise Fly Up Your Nose" was written by Hank Williams. FINAL ANSWER: Hank Williams.'}
Timeout processing item {'question': 'Who is the best FCS rushing quarterback of all time in terms of total yards?', 'answer': ['Matt Cannon'], 'prediction': 'A: The best FCS rushing quarterback of all time in terms of total yards is Walter Payton. He played for Jackson State University and is known for his prolific rushing ability. FINAL ANSWER: Walter Payton'}
Timeout processing item {'qu

  result["title"] = BeautifulSoup(result["title"], "html.parser").get_text()


Timeout processing item {'question': 'Who narrates the hitchhiker guide to the galaxy radio series 6?', 'answer': ['Lloyd', 'John Llod'], 'prediction': 'A: The narrator for "The Hitchhiker\'s Guide to the Galaxy" radio series, including series 6, is Geoffrey McGivern. FINAL ANSWER: Geoffrey McGivern'}
Timeout processing item {'question': 'When will the 2017 fifty shades of grey movie be released in the United States?', 'answer': ['February 10, 2017'], 'prediction': 'A: "Fifty Shades Darker," the sequel to "Fifty Shades of Grey," was released in the United States on February 10, 2017. FINAL ANSWER: February 10, 2017.'}
Timeout processing item {'question': 'During what event did neil armstrong received the presidential medal of freedom?', 'answer': ['official state dinner'], 'prediction': 'A: Neil Armstrong received the Presidential Medal of Freedom during a ceremony on September 16, 1969, in recognition of his accomplishments as the first person to walk on the moon during the Apollo 11 

100%|██████████| 100/100 [05:00<00:00,  3.00s/it]


Timeout processing item {'question': 'What animal is January 1 - January 24 1982 in the chinese zodiac?', 'answer': ['rooster'], 'prediction': 'A: Individuals born between January 1 and January 24, 1982, fall under the Chinese zodiac sign of the Rooster. The Year of the Rooster for the Chinese zodiac started on January 25, 1981, and ended on February 12, 1982. However, if born after January 24, 1982, one would be in the Year of the Dog. FINAL ANSWER: Rooster'}
Timeout processing item {'question': 'When does the new Army Service Uniform come out?', 'answer': ['2020'], 'prediction': "A: The new Army Service Uniform (ASU) was officially introduced to the U.S. Army in 2010. However, updates and variations can occur over time. For the most accurate information regarding specific changes or updates, it's best to check with official Army announcements or sources. FINAL ANSWER: 2010"}
Timeout processing item {'question': 'Who played nellie bly in frankie and johnny in 1966?', 'answer': ['Nancy



  lis = BeautifulSoup(html).find_all('li')


Timeout processing item {'question': 'Who was the bond girl character in you only live twice who replaces Aki?', 'answer': ['Kissy Suzuki'], 'prediction': 'A: The Bond girl character who replaces Aki in "You Only Live Twice" is Kissy Suzuki, played by actress Mie Hama. FINAL ANSWER: Kissy Suzuki (Mie Hama)'}
Timeout processing item {'question': 'What date did france stop having kings and queens?', 'answer': ['4 September 1870'], 'prediction': 'A: France effectively stopped having kings and queens after the abolition of the monarchy in 1792 during the French Revolution. The monarchy was briefly restored in the 19th century, but the last king, Louis-Philippe, abdicated in 1848. FINAL ANSWER: 1792'}
Timeout processing item {'question': 'What is the MPAA rating of Power Rangers (2017 film)?', 'answer': ['PG-13'], 'prediction': 'A: The MPAA rating for the 2017 film "Power Rangers" is PG-13. FINAL ANSWER: PG-13'}
Timeout processing item {'question': 'Who has scored most own goals in a single

100%|██████████| 100/100 [05:00<00:00,  3.00s/it]
 80%|████████  | 8/10 [40:02<10:00, 300.40s/it]

Timeout processing item {'question': 'What company first aired presentations of the original radio show This Is Your Life?', 'answer': ['NBC'], 'prediction': 'A: The original radio show "This Is Your Life" first aired on the NBC network. FINAL ANSWER: NBC (National Broadcasting Company)'}
Timeout processing item {'question': 'Who serves as the president of the senate in the texas legislature from 2003 to 2015?', 'answer': ['David Dewhurst', 'David Henry Dewhurst'], 'prediction': 'A: The president of the Texas Senate from 2003 to 2015 was David Dewhurst. FINAL ANSWER: David Dewhurst'}
Timeout processing item {'question': 'When does braxton family values season 5 start?', 'answer': ['May 19 , 2016', 'May 19, 2016'], 'prediction': 'A: "Braxton Family Values" Season 5 premiered on June 16, 2016. FINAL ANSWER: June 16, 2016.'}
Timeout processing item {'question': 'Who dies in breaking dawn part 2 movie?', 'answer': ['Aro', 'Irina'], 'prediction': 'A: In "Breaking Dawn - Part 2," the charact



Timeout processing item {'question': 'Where was war on the planet of the apes principal photography occurred?', 'answer': ['Lower Mainland in Vancouver', 'Lower Mainland'], 'prediction': 'A: Principal photography for "War for the Planet of the Apes" took place primarily in British Columbia, Canada. Notably, locations included areas around Vancouver and the surrounding forests. FINAL ANSWER: British Columbia, Canada.'}
Timeout processing item {'question': 'When was the city of new york founded by the Dutch and initially called New Amsterdam?', 'answer': ['1624'], 'prediction': 'A: New Amsterdam was founded by the Dutch in 1624. FINAL ANSWER: 1624'}
Timeout processing item {'question': 'Where is the second location the cash explosion tv show taped?', 'answer': ['Columbus, Ohio', 'Columbus'], 'prediction': 'A: The second location for the "Cash Explosion" TV show was at the I-X Center in Cleveland, Ohio. FINAL ANSWER: I-X Center, Cleveland, Ohio.'}
Timeout processing item {'question': 'How

100%|██████████| 100/100 [05:00<00:00,  3.00s/it]

Timeout processing item {'question': 'Who is the character that sings better than i in joseph king of dreams?', 'answer': ['Joseph'], 'prediction': 'A: The character who sings "Better Than I" in *Joseph: King of Dreams* is the narrator, who is also known as the character of Joseph. The song expresses Joseph\'s feelings about God’s gifts and his worries about his ability to fulfill his destiny. FINAL ANSWER: Joseph.'}
Timeout processing item {'question': "Where is John Wayne's star on the Hollywood walk of fame, in relation to other stars?", 'answer': ['Between Ricky Nelson and Shirley Jones'], 'prediction': "A: John Wayne's star on the Hollywood Walk of Fame is located at 6927 Hollywood Boulevard, near the intersection with Highland Avenue. It is situated close to the stars of other notable actors and entertainers, including those associated with the western genre, reflecting his iconic status in film. FINAL ANSWER: 6927 Hollywood Boulevard."}
Timeout processing item {'question': 'Wher


  result["title"] = BeautifulSoup(result["title"], "html.parser").get_text()
  result["title"] = BeautifulSoup(result["title"], "html.parser").get_text()


Timeout processing item {'question': 'Who is the 11th game minister of india?', 'answer': ['Vijay Goel'], 'prediction': 'A: The 11th Minister of Youth Affairs and Sports in India is Anurag Thakur, who took office in July 2021. FINAL ANSWER: Anurag Thakur'}
Timeout processing item {'question': 'At which outside locations was It Came From Outer Space filmed at?', 'answer': ['Palmdale, CA, Victorville, CA, and the Mojave Desert'], 'prediction': 'A: "It Came From Outer Space" was primarily filmed at the Mojave Desert in California, specifically near the area of Edwards Air Force Base. Other locations in and around the California desert were also used for various scenes. FINAL ANSWER: Mojave Desert, California.'}
Timeout processing item {'question': 'What is the most games the Red Sox have won between the regular season and the playoffs?', 'answer': ['119'], 'prediction': 'A: The Boston Red Sox won a total of 108 games during the 2018 season, which included both regular season and postseaso

100%|██████████| 100/100 [05:00<00:00,  3.00s/it]


Timeout processing item {'question': 'Kind of oil that has been turned into a solid fat?', 'answer': ['vegetable oil', 'palm oil', 'vegetable oils', 'hydrogenated vegetable oils'], 'prediction': 'A: The kind of oil that has been turned into a solid fat is typically referred to as "hydrogenated oil" or "trans fat." Common examples include margarine and shortening. FINAL ANSWER: Hydrogenated oil.'}
Timeout processing item {'question': 'Who played Zordon in the original power rangers movie?', 'answer': ['Nicholas Bell'], 'prediction': 'A: Zordon was portrayed by actor Nicholas Bell in the original "Mighty Morphin Power Rangers: The Movie" released in 1995. FINAL ANSWER: Nicholas Bell'}
Timeout processing item {'question': 'Where does the peanut butter jelly time popularity come from?', 'answer': ['a Flash-animated music video featuring a dancing banana'], 'prediction': 'A: The "Peanut Butter Jelly Time" meme gained popularity from a flash animation that features a dancing banana and the c

100%|██████████| 10/10 [50:03<00:00, 300.38s/it]


# GSM8K

In [14]:
from datasets import load_dataset
# Run configuration for the GSM8K evaluation.
dataset_name = "gsm8k"
mode = "critic"
# NOTE(review): not referenced anywhere else in the visible cells.
num_test_sample = -1
# NOTE(review): `llm` is not rebuilt in this cell, so in the visible code these two values
# only affect the output file name used by the batch loop below.
temperature = 0
top_p = 1
batch_size = 100
# NOTE(review): absolute, machine-specific path; the file name really is spelled "temperate".
dataset = load_dataset("json", data_files="/Users/ariete/Projects/self-improve/output/inference/gsm8k/pot_1319_temperate_0_top-p_1.jsonl", split="train")
print(dataset)
# Start from an empty result list for this dataset.
results = []


Dataset({
    features: ['question', 'answer', 'prediction'],
    num_rows: 1319
})


In [None]:
from tqdm import tqdm
for i in tqdm(range(100, len(dataset), batch_size)):
    batch = dataset.select(range(i, min(i + batch_size, len(dataset))))
    batch_results = await asyncio.gather(*(process(item, dataset_name) for item in batch))
    results.extend(batch_results)
    with open(f"/Users/ariete/Projects/self-improve/output/v2/{dataset_name}/{mode}_{len(dataset)}_temperature_{temperature}_top-p_{top_p}.jsonl", 'w') as f:
        for idx, item in enumerate(results):
            if isinstance(item, str):
                temp = {"idx": idx, "question":  dataset[idx]["question"], "predictions": [dataset[idx]["prediction"]], "answer": dataset[idx]["answer"]}
                f.write(json.dumps(temp) + "\n")
            else:
                temp = {"idx": idx, "question": dataset[idx]["question"], "predictions": item["predictions"], "answer": dataset[idx]["answer"], "prediction": dataset[idx]["prediction"]}
                f.write(json.dumps(temp) + "\n")