# In [9]:
import json
import re
import random
import os

def main(
    input_file: str = "dialogues_001.json",
    output_path: str = "raw_candidates.jsonl",
    sample_size: int = 60,
    seed: int = 42,
) -> None:
    """Extract vaguely-worded user requests from tourism dialogues.

    Loads a dialogue JSON file, keeps dialogues that involve at least one
    tourism service (attraction, hotel, restaurant), collects unique USER
    utterances of 8-120 characters that match a "fuzzy request" keyword
    pattern, shuffles them with a fixed seed and writes the first
    ``sample_size`` of them to ``output_path`` as JSON Lines.

    Args:
        input_file: Path of the dialogue JSON file. The top level may be a
            list of dialogues (each carrying a ``dialogue_id`` key) or a
            mapping from dialogue id to dialogue.
        output_path: Path of the JSONL file to write (overwritten).
        sample_size: Maximum number of utterances to keep after shuffling.
        seed: Seed for the shuffle, so repeated runs select the same rows.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(
            f"找不到{input_file}"
        )

    print(f"正在加载 {input_file}...")
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Normalise both accepted layouts to {dialogue_id: dialogue}.
    if isinstance(data, list):
        dialogs = {item["dialogue_id"]: item for item in data}
    else:
        dialogs = data

    TOURISM_SERVICES = {"attraction", "hotel", "restaurant"}

    # NOTE(review): these keyword heuristics are broad -- words such as
    # "great" or "like" also occur in thank-you / closing turns, so expect
    # false positives. Patterns are intentionally left unchanged here.
    FUZZY_PATTERNS = [
        r"\bsomewhere\b", r"\b(some )?place\b", r"\brecommend\b",
        r"\bsuggest\b", r"\bgood\b", r"\bnice\b", r"\bgreat\b",
        r"\binteresting\b", r"\blooking for\b", r"\bwant\b",
        r"\bneed\b", r"\bprefer\b", r"\blike\b", r"\bnot sure\b",
        r"\bdon't know\b", r"\bmaybe\b", r"\bany\b.*place"
    ]
    # Compile once instead of looking each pattern up again per utterance.
    fuzzy_regexes = [re.compile(pat) for pat in FUZZY_PATTERNS]

    def is_fuzzy(utt: str) -> bool:
        """Return True if the utterance matches any fuzzy-request pattern."""
        lowered = utt.lower()
        return any(rx.search(lowered) for rx in fuzzy_regexes)

    def clean_utterance(utt: str) -> str:
        """Collapse whitespace and strip non-word characters from both ends."""
        utt = re.sub(r"\s+", " ", utt.strip())
        utt = re.sub(r"^[^\w]+|[^\w]+$", "", utt)
        return utt

    candidates = []
    seen = set()

    for dialog_id, dialog in dialogs.items():
        # Lowercase here as well so the dialogue-level list is compared the
        # same way as the per-frame fallback below.
        services = {s.lower() for s in dialog.get("services") or ()}
        if not services:
            # No dialogue-level list: derive the services from turn frames.
            for turn in dialog.get("turns", []):
                for frame in turn.get("frames", []):
                    service = frame.get("service")
                    if service:
                        services.add(service.lower())

        # Keep only dialogues that touch the tourism domain.
        tourism_domains = services & TOURISM_SERVICES
        if not tourism_domains:
            continue

        # Walk every turn, looking at user utterances only.
        for turn in dialog.get("turns", []):
            if turn.get("speaker") != "USER":
                continue

            # A null utterance is treated as empty and dropped by the
            # length filter below.
            utterance = clean_utterance(turn.get("utterance") or "")
            if len(utterance) < 8 or len(utterance) > 120:
                continue
            if not is_fuzzy(utterance):
                continue
            if utterance in seen:
                continue
            seen.add(utterance)

            candidates.append({
                "user_utterance": utterance,
                "dialog_id": dialog_id,
                # Tourism-related domains only; sorted because set iteration
                # order for strings varies between interpreter runs.
                "domains": sorted(tourism_domains),
            })

    # Shuffle with a private RNG (same sequence as seeding the global one)
    # so the process-wide random state is left untouched, then keep the
    # first sample_size rows.
    random.Random(seed).shuffle(candidates)
    selected = candidates[:max(sample_size, 0)]

    # Save as JSON Lines.
    with open(output_path, "w", encoding="utf-8") as f:
        for item in selected:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"成功提取 {len(selected)} 条模糊用户请求")
    print(f"已保存至: {output_path}")
    print("\n示例输出：")
    # Read the first line back from disk, closing the handle afterwards.
    with open(output_path, encoding="utf-8") as f:
        print(f.readline())

# Run the extraction only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

# Captured output of the cell above:
# 正在加载 dialogues_001.json...
# 成功提取 60 条模糊用户请求
# 已保存至: raw_candidates.jsonl
#
# 示例输出：
# {"user_utterance": "Great. Thank you very much for your help today", "dialog_id": "MUL0208.json", "domains": ["restaurant"]}

