In [None]:
import os
import time
import openai
import pandas as pd
from human_prompt import h_system_prompt, h_user_prompt

# —— Configuration —— #
# The API key is read from the environment only. Secrets must never be
# committed to source: the literal key that used to be the fallback value here
# has been exposed and should be revoked and rotated.
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    # Fail fast with a clear message instead of sending unauthenticated requests.
    raise RuntimeError("未设置环境变量 OPENAI_API_KEY，无法调用 OpenAI API")
INPUT_CSV  = "../data/human_answers.csv"
OUTPUT_CSV = "../data/human_answers_processed_wide.csv"
MODEL_NAME = "gpt-4o-mini"

# Respondent column mapping: answer index (1-4) -> actual CSV column name.
ANSWER_COLS = dict(
    enumerate(
        (
            "Jason's answer",
            "Jesse's answer",
            "Allen's answer",
            "Zhekai's answer",
        ),
        start=1,
    )
)

# —— GPT call helper (openai>=1.0.0) —— #
def gpt_process(dilemma: str, answer: str) -> str:
    """Send one dilemma/answer pair to the chat model and return its reply.

    The system message is ``h_system_prompt``; the user message is
    ``h_user_prompt`` with ``dilemma`` and ``answer`` substituted positionally
    via ``str.format``. Both prompts are stripped of surrounding whitespace.

    Args:
        dilemma: Text of the dilemma description.
        answer: Text of one respondent's answer to that dilemma.

    Returns:
        The model's reply text with surrounding whitespace removed.

    Raises:
        RuntimeError: If the response carries no text content.
        Exception: Any error raised by the OpenAI client is propagated.
    """
    messages = [
        {"role": "system", "content": h_system_prompt.strip()},
        {"role": "user",   "content": h_user_prompt.format(dilemma, answer).strip()}
    ]
    resp = openai.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=0.7,
        max_tokens=256,  # caps the length of the generated reply
    )
    choice = resp.choices[0]
    content = choice.message.content
    if content is None:
        # The API declares ``content`` as nullable; report that explicitly
        # rather than failing with an opaque AttributeError on ``.strip()``.
        raise RuntimeError(
            f"模型未返回文本内容 (finish_reason={choice.finish_reason!r})"
        )
    return content.strip()

# —— 1. Load the input DataFrame —— #
df_in = pd.read_csv(INPUT_CSV, encoding="utf-8")
expected = ["Dilemma Description"] + list(ANSWER_COLS.values())
if not all(col in df_in.columns for col in expected):
    raise ValueError(f"输入文件必须包含列：{expected}")

# —— 2. Accumulator for the per-row output records —— #
records = []

# —— 3. Call GPT once per (row, respondent) pair —— #
for idx, row in df_in.iterrows():
    dilemma = row["Dilemma Description"]
    # Start this row's output dict.
    processed_row = {"dilemma_description": dilemma}
    # str() keeps the preview safe when the cell is empty: pandas yields a
    # float NaN there, which cannot be sliced.
    print(f"\n▶ 开始处理第 {idx} 行: “{str(dilemma)[:30]}...”")

    for i, col_name in ANSWER_COLS.items():
        original = row[col_name]
        if pd.isna(dilemma) or pd.isna(original):
            # An empty cell would reach the model as the literal text "nan";
            # skip the call (and the rate-limit pause) and record an empty result.
            print(f"  ✘ Skipped processed_{i} ({col_name}): missing dilemma or answer")
            processed_row[f"processed_{i}"] = ""
            continue
        try:
            proc = gpt_process(str(dilemma), str(original))
            print(f"  ✔ processed_{i} ({col_name}): {proc[:50]}...")
        except Exception as e:
            # Best effort: one failed call must not abort the whole batch.
            print(f"  ✘ Error at processed_{i} ({col_name}): {e}")
            proc = ""
        processed_row[f"processed_{i}"] = proc
        time.sleep(1)  # avoid hitting the API rate limit

    # Add this row's results to the accumulator.
    records.append(processed_row)
    print(f"✔ 完成第 {idx} 行，共生成 {len(ANSWER_COLS)} 条 processed_* 列")

# —— 4. Build the output DataFrame and save it —— #
# Column names are derived from ANSWER_COLS so they cannot drift from the loop.
output_columns = ["dilemma_description"] + [f"processed_{i}" for i in ANSWER_COLS]
df_out = pd.DataFrame(records, columns=output_columns)
df_out.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
print(f"\n🎉 全部处理完成，结果已保存至 `{OUTPUT_CSV}`")



▶ 开始处理第 0 行: “I joined a lab during graduate...”
  ✔ processed_1 (Jason's answer): To address the ethical dilemma of credit assignmen...
  ✔ processed_2 (Jesse's answer): The issue of credit in scientific research is inhe...
  ✔ processed_3 (Allen's answer): In addressing this ethical dilemma, it is crucial ...
  ✔ processed_4 (Zhekai's answer): In this ethical dilemma, it is crucial to recogniz...
✔ 完成第 0 行，共生成 4 条 processed_* 列

▶ 开始处理第 1 行: “Graduate students A and B are ...”
  ✔ processed_1 (Jason's answer): In this ethical dilemma, the actions of Student A ...
  ✔ processed_2 (Jesse's answer): In this scenario, the ethical dilemma revolves aro...
  ✔ processed_3 (Allen's answer): The ethical dilemma at hand raises important quest...
  ✔ processed_4 (Zhekai's answer): The actions of Student A represent a significant e...
✔ 完成第 1 行，共生成 4 条 processed_* 列

▶ 开始处理第 2 行: “David is a new postdoc in Dr. ...”
  ✔ processed_1 (Jason's answer): In this ethical dilemma, several key factors m