In [8]:
from tqdm import tqdm
import json
from langchain_core.messages import HumanMessage, AIMessage
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage, SystemMessage

import getpass
import os


def _set_if_undefined(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"Please provide your {var}")

_set_if_undefined("LANGCHAIN_API_KEY")

# Optional, add tracing in LangSmith
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "cot"

llm = ChatOpenAI(temperature=0, base_url="https://api.chsdw.top/v1", top_p=1, model="gpt-4o-mini")

# Load prompt
def load_prompt(dataset: str, mode: str):
    file_path = "../prompts/{}/{}.md".format(dataset, mode)
    with open(file_path, 'r', encoding='utf-8') as fp:
        prompt = fp.read().strip() + "\n\n"
    return prompt


# 并行处理
import asyncio
from langchain_core.messages import BaseMessage

async def cot(item, messages: list[BaseMessage], llm: ChatOpenAI, dataset_name:str="hotpot_qa") -> str:
    if dataset_name == "hotpot_qa":
        question_message = HumanMessage(content="Question: " + item["question"])
    elif dataset_name == "toxicity":
        question_message = HumanMessage(content=item["prompt"]["text"])
    try:
        result = await llm.ainvoke(messages+[question_message])
        return result.content
    except Exception as e:
        return "None"

# HotpotQA 200

In [27]:
dataset_name = "hotpot_qa"
mode = "cot"
num_test_sample = 200
system_message = SystemMessage(content="Answer the following questions. Remember your answer should always end with \"So the FINAL ANSWER is: <answer>\"")
prompt = load_prompt(dataset_name, mode)
prompt_message = HumanMessage(content=prompt)
messages = [system_message, prompt_message]

# Load dataset

In [None]:
from datasets import load_dataset
dataset = load_dataset("json", data_files=f"../data/{dataset_name}.jsonl", split="train")
if num_test_sample > 0:
    dataset = dataset.select(range(num_test_sample))
print(dataset)

In [25]:
results = await asyncio.gather(*(cot(item, messages=messages, llm=llm) for item in dataset))

In [29]:
save_folder = f"/Users/ariete/Projects/self-improve/output/{dataset_name}"
os.makedirs(save_folder, exist_ok=True)
with open ("/Users/ariete/Projects/self-improve/output/{}/{}_{}.json".format(dataset_name, num_test_sample, mode), "w") as f:
    for idx, result in enumerate(results):
        f.write(json.dumps({"idx": idx, "question": dataset[idx]["question"], "answer":dataset[idx]["answer"], "python code": results[idx]}) + "\n")

# gsm8k 200

In [9]:
dataset_name = "gsm8k"
mode = "pot"
num_test_sample = 200
system_message = SystemMessage(content="# Write Python Code to solve the following questions. Store your result as a variable named 'answer'. You should follow the format of the prompt below.")
prompt = load_prompt(dataset_name, mode)
prompt_message = HumanMessage(content=prompt)
messages = [system_message, prompt_message]

## Load Dataset

In [10]:
from datasets import load_dataset
dataset = load_dataset("json", data_files=f"../data/{dataset_name}.jsonl", split="train")
if num_test_sample > 0:
    dataset = dataset.select(range(num_test_sample))
dataset = dataset.map(lambda example: {"question": example["question"], "answer": example["answer"].split("#### ")[-1]})

In [36]:
import re

def extract_code(content:str) -> str:
    code_match = re.search(r"```python\n(.*)\n```", content, re.DOTALL)
    if code_match:
        code = code_match.group(1).strip()  # 提取匹配的代码并去除前后的空白字符
        return code
    else:
        return "None"

In [28]:
results = await asyncio.gather(*(cot(item, messages=messages, llm=llm) for item in dataset))

In [38]:
for idx in range(len(results)):
    dataset[idx]["python code"] = extract_code(results[idx])

In [44]:
from langchain_experimental.utilities import PythonREPL

python = PythonREPL()
prediction = []
for result in results:
    prediction.append(python.run(extract_code(result)))

In [46]:
save_folder = f"/Users/ariete/Projects/self-improve/output/{dataset_name}"
os.makedirs(save_folder, exist_ok=True)
with open ("/Users/ariete/Projects/self-improve/output/{}/{}_{}.json".format(dataset_name, num_test_sample, mode), "w") as f:
    for idx, result in enumerate(results):
        f.write(json.dumps({"idx": idx, "question": dataset[idx]["question"], "answer":dataset[idx]["answer"], "python code": results[idx], "prediction": prediction[idx]}) + "\n")

# toxicity

In [71]:
dataset_name = "toxicity"
mode = "generate"
num_test_sample = 200
system_message = SystemMessage(content="The user's input is half a sentence, generating the second half of the sentence")
messages = [system_message]

# Load Dataset

In [65]:
from datasets import load_dataset
dataset = load_dataset("json", data_files=f"../data/{dataset_name}.jsonl", split="train")
if num_test_sample > 0:
    dataset = dataset.select(range(num_test_sample))
# dataset = dataset.map(lambda example: {"question": example["question"], "answer": example["answer"].split("#### ")[-1]})
print(dataset)

Dataset({
    features: ['filename', 'begin', 'end', 'challenging', 'prompt', 'continuation'],
    num_rows: 200
})
