In [1]:
from prompts import (
    STATE_EXTRACTION_PROMPT,
    RESPONSE_GENERATION_PROMPT,
    DOMAIN_RECOGNITION_PROMPT
)

from multiwoz_utils.database import default_database
# results = default_database.query("hotel", {"area": "north", "parking": "yes"})
# print(results)

In [2]:
def process_examples(examples, input_keys, output_keys, num_examples=2):
    """Render the trailing retrieved examples as a few-shot prompt section.

    Each selected example becomes one section of the form::

        ---------------------Example <n>:
        <input key>: <value>      (one line per entry in input_keys)

        <output key>: <value>     (one line per entry in output_keys)

    The whole result starts with a newline and ends with an extra newline.

    Args:
        examples: Sequence of dict-like records; every key listed in
            ``input_keys`` and ``output_keys`` must be present in each
            selected record.
        input_keys: Keys rendered in the input part of a section. The key
            ``'full_state'`` is printed under the label ``'state'``.
        output_keys: Keys rendered in the output part of a section.
        num_examples: How many examples to take from the END of
            ``examples``. Defaults to 2. ``None`` uses every example;
            zero or a negative number uses none.

    Returns:
        The formatted few-shot block as a single string.

    Raises:
        KeyError: If a selected example lacks one of the requested keys.
    """
    if num_examples is None:
        selected = list(examples)
    elif num_examples <= 0:
        # Guard explicitly: examples[-0:] would return *all* examples.
        selected = []
    else:
        selected = examples[-num_examples:]

    parts = ["\n"]
    for n, ex in enumerate(selected):
        # 'full_state' is shown to the model under the plain label 'state'.
        input_str = '\n'.join(
            f"{key if key != 'full_state' else 'state'}: {ex[key]}" for key in input_keys
        )
        output_str = '\n'.join(f"{key}: {ex[key]}" for key in output_keys)
        parts.append(
            "---------------------"
            f"Example {n}:\n"
            f"{input_str}\n"
            f"\n{output_str}\n"
        )
    parts.append("\n")
    return "".join(parts)

In [5]:
import pickle
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from tqdm import tqdm

from multiwoz_utils.dialog_iterator import iterate_dialogues

from multiwoz_utils.data_loader import load_multiwoz

data = load_multiwoz()  # per the original note: loads the 'train' split by default

# Configuration
vec_file_path = "multiwoz-context-db.vec"
top_k = 5  # number of candidates retrieved per similarity query
max_preview_turns = 6  # number of turns to print before stopping the preview loop

# Load the embedding model.
# NOTE(review): `embeddings` is not referenced again in this cell; the vector
# store is used exactly as unpickled. Kept in case other cells rely on it.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Load the FAISS vector store from local disk.
# SECURITY: pickle.load can execute arbitrary code during deserialization.
# Only load a .vec file that you created yourself or otherwise trust.
with open(vec_file_path, "rb") as f:
    vector_store = pickle.load(f)

print("Loaded FAISS vector store from", vec_file_path)

last_dial_id = None
history = []

for it, turn in enumerate(tqdm(iterate_dialogues(data, default_database), desc="Retrieving", unit="turns")):
    dialog_id = turn['dialogue_id']

    # A new dialogue id means a new conversation: start with an empty history.
    if dialog_id != last_dial_id:
        history = []
        last_dial_id = dialog_id

    # History so far, NOT including the current utterance.
    history_text = "\n".join(history)

    # Similarity search for the current turn.
    query_text = turn['page_content']
    results = vector_store.similarity_search(query_text, k=top_k)

    # Build few-shot example records from the retrieved documents' metadata;
    # a missing metadata field falls back to an empty string.
    examples = [
        {
            key: doc.metadata.get(key, '')
            for key in ('context', 'state', 'full_state', 'response', 'database', 'domain')
        }
        for doc in results
    ]

    fewshot_examples = process_examples(
        examples, ['context'], ['state']
    )

    # Fill the prompt positionally: few-shot block, running dialogue history
    # (used in place of the retrieved context), then the current question.
    final_prompt = STATE_EXTRACTION_PROMPT.format(
        fewshot_examples,
        history_text,
        turn['question'].strip()
    )

    print(final_prompt)
    print("==" * 50)

    # Append the current exchange so the next turn sees it as history.
    history.append(f"Customer: {turn['question']}")
    history.append(f"Assistant: {turn['metadata']['response']}")

    # Preview only: stop once `max_preview_turns` turns have been printed.
    # The check runs after processing, so at least one turn is always shown.
    if it + 1 >= max_preview_turns:
        break

Loaded FAISS vector store from multiwoz-context-db.vec


Retrieving: 3turns [00:02,  1.67turns/s]

Capture entity values from last utterance of the conversation according to examples.
Focus only on the values mentioned in the last utterance.
Capture pair "entity:value" with no spaces.
Separate entity:value pairs by hyphens.

Values that should be captured are:
 - "pricerange": price range of the restaurant (cheap/moderate/expensive)
 - "area": area where the restaurant is located (north/east/west/south/centre)
 - "food": type of food the restaurant serves
 - "name": name of the restaurant
 - "bookday": day of the booking
 - "booktime": time of the booking
 - "bookpeople": how many people the booking is for

Do not capture any other values!
If a value is not specified, leave it empty.

------

---------------------Example 0:
context: Customer: I would like to eat in the Center of town at a expensive place.

state: {'restaurant': {'area': 'centre', 'pricerange': 'expensive'}}
---------------------Example 1:
context: Customer: I want to find an expensive restaurant in the centre of tow

Retrieving: 5turns [00:02,  2.05turns/s]

Capture entity values from last utterance of the conversation according to examples.
Focus only on the values mentioned in the last utterance.
Capture pair "entity:value" with no spaces.
Separate entity:value pairs by hyphens.

Values that should be captured are:
 - "pricerange": price range of the restaurant (cheap/moderate/expensive)
 - "area": area where the restaurant is located (north/east/west/south/centre)
 - "food": type of food the restaurant serves
 - "name": name of the restaurant
 - "bookday": day of the booking
 - "booktime": time of the booking
 - "bookpeople": how many people the booking is for

Do not capture any other values!
If a value is not specified, leave it empty.

------

---------------------Example 0:
context: Customer: I'm looking for a hotel called Kirkwood House please.
Assistant: I've located Kirkwood House. It's a guesthouse in the north. Would you like to book a room?
Customer: May I please have the phone number for the hotel?
Assistant: Absolutely. That


