In [3]:
import os
from dotenv import load_dotenv

# Read variables from the local "1.env" file into the process environment.
load_dotenv("1.env")

QWEN_API_KEY = os.environ.get("QWEN_API_KEY")

# Confirm the key was loaded; a missing or empty value aborts the cell.
if QWEN_API_KEY:
    print("loaded")
else:
    raise EnvironmentError("no QWEN_API_KEY")

loaded


In [16]:
import os
import json
import time
import requests
import jieba
import sacrebleu
from dotenv import load_dotenv

# Load environment variables and fail fast when the API key is missing or empty.
load_dotenv()
QWEN_API_KEY = os.getenv("QWEN_API_KEY")
if not QWEN_API_KEY:
    raise ValueError("请在 .env 文件中设置 QWEN_API_KEY")

# Prompt sent to the model for each user utterance. It asks for a short intent
# label plus exactly one clarification question, returned as a JSON object with
# the keys "intent" and "clarification".
# "{utterance}" is substituted through str.format; the doubled braces around
# the JSON example are format escapes that render as literal "{" and "}".
PROMPT_TEMPLATE = """你是一个智能对话助手。用户说了一句话，请完成以下任务：

1. **意图识别**：用中文简明概括用户的**核心意图类别**（如“寻找餐厅”、“预订酒店”等），不超过8个字。
2. **澄清提问**：针对该意图，**选择一个最关键的信息缺口**，提出一个自然、简洁的中文澄清问题。  
   - 可以问时间、地点、类型、偏好、联系方式等，**任选其一即可**。
   - 不需要覆盖所有缺失信息，只需问**一个问题**。

用户语句："{utterance}"

请严格按以下 JSON 格式输出，不要任何额外内容：
{{"intent": "意图", "clarification": "澄清问题"}}
"""


def call_qwen_model(utterance: str, model_name: str, max_retries=3):
    """Ask a DashScope model for the intent and one clarification question.

    The utterance is inserted into PROMPT_TEMPLATE and posted to the DashScope
    text-generation endpoint. The reply is expected to contain a JSON object
    with the keys "intent" and "clarification"; the outermost "{...}" span of
    the reply text is parsed.

    Args:
        utterance: The user sentence to analyse.
        model_name: Value of the "model" field in the request payload.
        max_retries: Maximum number of request attempts.

    Returns:
        A tuple ``(intent, clarification)`` taken from the model's JSON reply,
        or ``("解析失败", "无法生成澄清问题")`` when every attempt fails.
    """
    url = "https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation"
    headers = {
        "Authorization": f"Bearer {QWEN_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": model_name,
        "input": {
            "messages": [
                {"role": "user", "content": PROMPT_TEMPLATE.format(utterance=utterance)}
            ]
        },
        "parameters": {
            "result_format": "message",
            "max_tokens": 200,
            "temperature": 0.0
        }
    }

    for attempt in range(1, max_retries + 1):
        delay = 1
        try:
            response = requests.post(url, json=payload, headers=headers, timeout=30)
            if response.status_code == 200:
                data = response.json()
                text = data["output"]["choices"][0]["message"]["content"].strip()
                # Tolerate extra text around the JSON by slicing from the first
                # "{" to the last "}".
                start = text.find("{")
                end = text.rfind("}") + 1
                if start != -1 and end > start:
                    result = json.loads(text[start:end])
                    return result["intent"], result["clarification"]
                # Report unusable replies instead of retrying silently.
                print(f"{model_name} 返回内容中未找到 JSON (第 {attempt}/{max_retries} 次): {text[:200]}")
            else:
                # Surface HTTP errors (bad key, quota, rate limit) so a failing
                # batch can be diagnosed from the log.
                print(
                    f"{model_name} 请求失败 (第 {attempt}/{max_retries} 次): "
                    f"HTTP {response.status_code} {response.text[:200]}"
                )
        except Exception as e:
            # Best-effort boundary: network errors, malformed JSON and missing
            # keys are all logged and retried rather than aborting the batch.
            print(f"{model_name} 调用失败: {e}")
            delay = 2
        # Waiting after the final attempt would only slow the batch down.
        if attempt < max_retries:
            time.sleep(delay)
    return "解析失败", "无法生成澄清问题"


def _to_reference_streams(references_per_sample):
    """Transpose per-sample reference lists into per-stream lists for sacrebleu.

    Input is ``[[ref_a, ref_b], [ref_a], ...]`` (one list per sample). Output
    is ``[[ref_a_s1, ref_a_s2, ...], [ref_b_s1, ref_b_s2, ...]]`` where every
    stream has one entry per sample. Samples with fewer references are padded
    by repeating their first reference (a sample with none gets ""), which
    adds no new n-grams or reference lengths.
    """
    width = max((len(refs) for refs in references_per_sample), default=0)
    streams = []
    for k in range(width):
        stream = []
        for refs in references_per_sample:
            if k < len(refs):
                stream.append(refs[k])
            else:
                stream.append(refs[0] if refs else "")
        streams.append(stream)
    return streams


def main():
    """Run qwen-plus and qwen-max on every sample and score them.

    Reads "translated.jsonl" (one JSON object per line with at least
    "user_utterance", "gold_intent" and "gold_clarification"), queries both
    models through call_qwen_model, writes the per-sample predictions to
    "results.jsonl" and the aggregate metrics (exact-match intent accuracy and
    corpus BLEU of the clarification questions) to "evaluation.json".

    Raises:
        FileNotFoundError: If the input file does not exist.
    """
    input_file = "translated.jsonl"
    output_file = "results.jsonl"
    metrics_file = "evaluation.json"

    if not os.path.exists(input_file):
        raise FileNotFoundError(f"找不到 {input_file}")

    with open(input_file, "r", encoding="utf-8") as fin:
        # Skip blank lines (e.g. a trailing newline), which json.loads rejects.
        lines = [line for line in fin if line.strip()]

    print(f"开始对比 qwen-plus 与 qwen-max，共 {len(lines)} 条样本...")

    results = []
    for line in lines:
        item = json.loads(line.strip())
        utterance = item["user_utterance"]
        print(f"→ 处理: {utterance[:50]}...")

        plus_intent, plus_clar = call_qwen_model(utterance, "qwen-plus")
        time.sleep(0.5)
        max_intent, max_clar = call_qwen_model(utterance, "qwen-max")
        time.sleep(0.5)

        result = {
            "user_utterance": utterance,
            "dialog_id": item.get("dialog_id", ""),
            "domains": item.get("domains", []),
            "gold_intent": item["gold_intent"],
            "gold_clarification": item["gold_clarification"],  # may be a str or a list
            "qwen-plus": {
                "predicted_intent": plus_intent,
                "predicted_clarification": plus_clar
            },
            "qwen-max": {
                "predicted_intent": max_intent,
                "predicted_clarification": max_clar
            }
        }
        results.append(result)

    # Save the detailed per-sample results.
    with open(output_file, "w", encoding="utf-8") as fout:
        for res in results:
            fout.write(json.dumps(res, ensure_ascii=False) + "\n")

    plus_intent_correct = 0
    max_intent_correct = 0
    total_valid = 0

    plus_clarifications = []
    max_clarifications = []
    references_per_sample = []

    for item in results:
        gold_intent = item["gold_intent"]
        gold_clars = item["gold_clarification"]
        if isinstance(gold_clars, str):
            gold_clars = [gold_clars]

        # Samples labelled "no valid request" are excluded from every metric.
        if gold_intent == "无有效请求":
            continue

        total_valid += 1

        if item["qwen-plus"]["predicted_intent"] == gold_intent:
            plus_intent_correct += 1
        if item["qwen-max"]["predicted_intent"] == gold_intent:
            max_intent_correct += 1

        plus_clarifications.append(item["qwen-plus"]["predicted_clarification"])
        max_clarifications.append(item["qwen-max"]["predicted_clarification"])
        references_per_sample.append(list(gold_clars))

    if total_valid == 0:
        metrics = {
            "qwen-plus": {"Intent Accuracy": 0, "BLEU-4": 0},
            "qwen-max": {"Intent Accuracy": 0, "BLEU-4": 0}
        }
    else:
        plus_acc = round((plus_intent_correct / total_valid) * 100, 2)
        max_acc = round((max_intent_correct / total_valid) * 100, 2)

        # sacrebleu expects references as a list of streams, each stream
        # holding one reference per hypothesis, so the per-sample lists are
        # transposed before scoring.
        reference_streams = _to_reference_streams(references_per_sample)

        # BLEU-4 using sacrebleu's Chinese tokenizer.
        plus_bleu = sacrebleu.corpus_bleu(
            plus_clarifications,
            reference_streams,
            tokenize='zh'
        ).score
        max_bleu = sacrebleu.corpus_bleu(
            max_clarifications,
            reference_streams,
            tokenize='zh'
        ).score

        metrics = {
            "qwen-plus": {
                "Intent Accuracy": plus_acc,
                "BLEU-4": round(plus_bleu, 2)
            },
            "qwen-max": {
                "Intent Accuracy": max_acc,
                "BLEU-4": round(max_bleu, 2)
            }
        }

    # Save the aggregate metrics.
    with open(metrics_file, "w", encoding="utf-8") as f:
        json.dump(metrics, f, ensure_ascii=False, indent=2)

    print("\n" + "="*70)
    print("qwen-plus vs qwen-max 评估结果:")
    print(json.dumps(metrics, ensure_ascii=False, indent=2))
    print("="*70)
    print(f"结果已保存至:\n - {output_file}\n - {metrics_file}")


if __name__ == "__main__":
    main()

开始对比 qwen-plus 与 qwen-max，共 60 条样本...
→ 处理: Great. Thank you very much for your help today...
→ 处理: Actually I don't need it booked right now. Can I p...
→ 处理: Yes. I a place to stay. A upscale hotel please...
→ 处理: Yes I need to find a nice expensive British restau...
→ 处理: No, thanks, I just need to know the contact number...
→ 处理: I want something in the west area with a moderate ...
→ 处理: I would prefer one in the moderate price range...
→ 处理: No that's all I need. Thank you so much and bye...
→ 处理: Lets go with the Copper Kettle. I'd like their add...
→ 处理: A guesthouse is fine. I'm looking for a place in t...
→ 处理: I'm looking for European restaurant in the centre...
→ 处理: Yes. I need to travel from Cambridge to London Liv...
→ 处理: I need to book a train on Sunday as well...
→ 处理: Yes, I'd like to go to leicester after 19:45...
→ 处理: Thanks. Yes, I also need to find a guesthouse for ...
→ 处理: That is very helpful, thanks. That's all I need fo...
→ 处理: No, any will be fine. I only

In [17]:
import os
import json
import time
import re
import requests
from tqdm import tqdm
from dotenv import load_dotenv

# Configuration: load environment variables and fail fast when the API key is
# missing or empty.
load_dotenv()
QWEN_API_KEY = os.getenv("QWEN_API_KEY")
if not QWEN_API_KEY:
    raise ValueError("no QWEN_API_KEY")

# Prompt for the LLM judge. It takes five str.format placeholders
# (user_utterance, gold_intent, gold_clarification, predicted_intent,
# predicted_clarification) and asks for a JSON object with the keys
# "normalized_gold_intent", "normalized_pred_intent", "clarification_score"
# and "reason". The doubled braces around the JSON example are format escapes
# that render as literal "{" and "}".
EVALUATOR_PROMPT = """你是一个专业的对话系统评估专家。请根据以下输入，严格完成两项评估任务，并**仅输出一个合法的 JSON 对象**，不要包含任何解释、注释或 Markdown。

【输入信息】
- 用户语句: {user_utterance}
- 真实意图 (gold_intent): {gold_intent}
- 真实澄清语句 (gold_clarification): {gold_clarification}
- 模型预测意图 (predicted_intent): {predicted_intent}
- 模型预测澄清语句 (predicted_clarification): {predicted_clarification}

【任务说明】
1. **意图归一化**：
   将“真实意图”和“预测意图”分别映射到以下**标准意图类别之一**（必须完全匹配，不可新增）：
   - find_restaurant（寻找餐厅）
   - find_accommodation（寻找住宿/酒店）
   - find_attraction（寻找景点/娱乐场所）
   - book_restaurant（预订餐厅）
   - book_train（预订火车票）
   - book_taxi（预订出租车）
   - get_contact_info（获取联系方式/地址/邮编等）
   - query_train（查询火车班次/时间）
   - no_request（无有效请求）
   - end_conversation（结束对话/感谢）
   - other（无法归类）

2. **澄清语句评分**（0.0 ~ 1.0）：
   - 如果 gold_clarification 是“无需澄清”：
       - 若 predicted_clarification 为空字符串 "" 或表达结束/感谢（如“好的”、“谢谢”、“再见”），则评分为 1.0；
       - 否则评分为 0.0。
   - 否则（需要澄清）：
       - 若 predicted_clarification 在**语义上等价或合理覆盖**了 gold_clarification 的核心信息，则评 0.8~1.0；
       - 若部分相关但缺失关键信息，评 0.4~0.7；
       - 若完全无关或错误，评 0.0~0.3。

【输出格式要求】
- 必须是**纯 JSON**，无任何额外文本。
- 字段如下：
{{
  "normalized_gold_intent": "标准类别",
  "normalized_pred_intent": "标准类别",
  "clarification_score": 0.x,
  "reason": "简要中文说明（不超过50字）"
}}

现在请开始评估：
"""


def _normalize_evaluation(result):
    """Validate the evaluator's JSON and return a cleaned copy.

    Raises ValueError (or TypeError from float()) when the object is not a
    dict, lacks a required key, or carries a score that is not a usable number.
    """
    if not isinstance(result, dict):
        raise ValueError("评估结果不是 JSON 对象")
    required = ("normalized_gold_intent", "normalized_pred_intent", "clarification_score")
    missing = [key for key in required if key not in result]
    if missing:
        raise ValueError(f"评估结果缺少字段: {missing}")
    score = float(result["clarification_score"])
    if score != score:  # NaN is the only value that differs from itself
        raise ValueError("clarification_score 不是有效数字")
    cleaned = dict(result)
    # The prompt defines the score on [0.0, 1.0]; clamp anything outside it.
    cleaned["clarification_score"] = min(1.0, max(0.0, score))
    cleaned.setdefault("reason", "")
    return cleaned


def call_qwen_max(prompt: str, max_retries=3):
    """Send an evaluation prompt to qwen-max and return its JSON verdict.

    The reply text is searched for the outermost "{...}" span, which is parsed
    as JSON and validated, so callers can rely on the three metric keys being
    present and on "clarification_score" being a float in [0.0, 1.0].

    Args:
        prompt: The fully formatted evaluation prompt.
        max_retries: Maximum number of request attempts.

    Returns:
        A dict with "normalized_gold_intent", "normalized_pred_intent",
        "clarification_score" and "reason". When every attempt fails, a
        fallback dict with both intents set to "other", a score of 0.0 and the
        reason "评估失败" is returned.
    """
    url = "https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation"
    headers = {
        "Authorization": f"Bearer {QWEN_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "qwen-max",
        "input": {"messages": [{"role": "user", "content": prompt}]},
        "parameters": {
            "result_format": "message",
            "max_tokens": 300,
            "temperature": 0.0
        }
    }

    for attempt in range(1, max_retries + 1):
        delay = 1
        try:
            response = requests.post(url, json=payload, headers=headers, timeout=30)
            if response.status_code == 200:
                data = response.json()
                content = data["output"]["choices"][0]["message"]["content"].strip()
                # Greedy match across newlines: first "{" through last "}".
                match = re.search(r"\{.*\}", content, re.DOTALL)
                if match:
                    return _normalize_evaluation(json.loads(match.group(0)))
                # Report unusable replies instead of retrying silently.
                print(f"Qwen-Max 返回内容中未找到 JSON (第 {attempt}/{max_retries} 次): {content[:200]}")
            else:
                # Surface HTTP errors (bad key, quota, rate limit) in the log.
                print(
                    f"Qwen-Max 请求失败 (第 {attempt}/{max_retries} 次): "
                    f"HTTP {response.status_code} {response.text[:200]}"
                )
        except Exception as e:
            # Best-effort boundary: network, parsing and validation errors are
            # logged and retried rather than aborting the evaluation run.
            print(f"Qwen-Max 调用失败: {e}")
            delay = 2
        # Waiting after the final attempt would only slow the run down.
        if attempt < max_retries:
            time.sleep(delay)
    return {
        "normalized_gold_intent": "other",
        "normalized_pred_intent": "other",
        "clarification_score": 0.0,
        "reason": "评估失败"
    }


def main():
    """Score the saved predictions of both models with qwen-max as the judge.

    Reads "results.jsonl", builds one evaluation prompt per (sample, model)
    pair, sends it through call_qwen_max, and aggregates:

    * intent accuracy over samples whose normalized gold intent is not "other";
    * the mean clarification score over all predictions.

    The summary is written to "evaluation_by_qwen_max.json" and every
    individual verdict to "evaluation_details.json".

    Raises:
        FileNotFoundError: If "results.jsonl" does not exist.
    """
    input_file = "results.jsonl"
    output_summary = "evaluation_by_qwen_max.json"
    output_details = "evaluation_details.json"

    if not os.path.exists(input_file):
        raise FileNotFoundError(f"找不到 {input_file}")

    with open(input_file, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline), which json.loads rejects.
        data = [json.loads(line) for line in f if line.strip()]

    print(f"共加载 {len(data)} 条样本，将使用 Qwen-Max 进行评估...")

    all_evaluations = []
    metrics = {
        "qwen-plus": {"intent_correct": 0, "total": 0, "clar_scores": []},
        "qwen-max": {"intent_correct": 0, "total": 0, "clar_scores": []}
    }

    for item in tqdm(data, desc="Qwen-Max 评估中"):
        utterance = item["user_utterance"]
        gold_intent = item["gold_intent"]
        gold_clar = item["gold_clarification"]
        # Only the first gold clarification is shown to the judge; an empty
        # list is treated as "no clarification needed".
        if isinstance(gold_clar, list):
            gold_clar = gold_clar[0] if gold_clar else "无需澄清"

        for model_name in ["qwen-plus", "qwen-max"]:
            pred_intent = item[model_name]["predicted_intent"]
            pred_clar = item[model_name]["predicted_clarification"]

            prompt = EVALUATOR_PROMPT.format(
                user_utterance=utterance,
                gold_intent=gold_intent,
                gold_clarification=gold_clar,
                predicted_intent=pred_intent,
                predicted_clarification=pred_clar
            )

            eval_result = call_qwen_max(prompt)
            all_evaluations.append({
                "item_id": item.get("dialog_id", ""),
                "model": model_name,
                "input": {
                    "user_utterance": utterance,
                    "gold_intent": gold_intent,
                    "gold_clarification": gold_clar,
                    "pred_intent": pred_intent,
                    "pred_clarification": pred_clar
                },
                "evaluation": eval_result
            })

            # Update the metrics. The judge's JSON is read defensively: a
            # missing field or non-numeric score falls back to the same
            # values call_qwen_max uses for a failed evaluation, so one
            # malformed verdict cannot abort the run before anything is saved.
            gold_norm = eval_result.get("normalized_gold_intent", "other")
            pred_norm = eval_result.get("normalized_pred_intent", "other")
            try:
                clar_score = float(eval_result.get("clarification_score", 0.0))
            except (TypeError, ValueError):
                clar_score = 0.0
            if clar_score != clar_score:  # NaN would poison the average
                clar_score = 0.0

            # Samples whose gold intent maps to "other" are left out of the
            # intent-accuracy denominator.
            if gold_norm != "other":
                metrics[model_name]["total"] += 1
                if gold_norm == pred_norm:
                    metrics[model_name]["intent_correct"] += 1

            # NOTE(review): a failed judge call also lands here with a score
            # of 0.0, which lowers the model's average; confirm this is the
            # intended treatment.
            metrics[model_name]["clar_scores"].append(clar_score)

    # Compute the final metrics.
    final_metrics = {}
    for model in ["qwen-plus", "qwen-max"]:
        total = metrics[model]["total"]
        correct = metrics[model]["intent_correct"]
        clar_scores = metrics[model]["clar_scores"]

        intent_acc = correct / total if total > 0 else 0
        avg_clar = sum(clar_scores) / len(clar_scores) if clar_scores else 0

        final_metrics[model] = {
            "Intent Accuracy (%)": round(intent_acc * 100, 2),
            "Avg Clarification Score": round(avg_clar, 4),
            "Evaluated Samples": total,
            "Total Predictions": len(clar_scores)
        }

    # Save the summary and the per-prediction details.
    with open(output_summary, "w", encoding="utf-8") as f:
        json.dump(final_metrics, f, ensure_ascii=False, indent=2)

    with open(output_details, "w", encoding="utf-8") as f:
        json.dump(all_evaluations, f, ensure_ascii=False, indent=2)

    print("\n 评估完成")
    print(json.dumps(final_metrics, ensure_ascii=False, indent=2))
    print(f"\n详细结果已保存至: {output_details}")
    print(f"汇总指标已保存至: {output_summary}")


if __name__ == "__main__":
    main()

共加载 60 条样本，将使用 Qwen-Max 进行评估...


Qwen-Max 评估中: 100%|██████████████████████████| 60/60 [04:04<00:00,  4.08s/it]


 评估完成
{
  "qwen-plus": {
    "Intent Accuracy (%)": 64.91,
    "Avg Clarification Score": 0.495,
    "Evaluated Samples": 57,
    "Total Predictions": 60
  },
  "qwen-max": {
    "Intent Accuracy (%)": 66.67,
    "Avg Clarification Score": 0.605,
    "Evaluated Samples": 57,
    "Total Predictions": 60
  }
}

详细结果已保存至: evaluation_details.json
汇总指标已保存至: evaluation_by_qwen_max.json



