In [None]:
import json
import os
from typing import List, Dict, Any

def load_multiwoz_dialogues(file_path: str) -> List[Dict[str, Any]]:
    """
    Load every dialogue stored in a MultiWOZ JSON file.

    Args:
        file_path: Path to the MultiWOZ JSON file (read as UTF-8).

    Returns:
        The dialogues as a list. A top-level JSON array is returned as-is;
        for a top-level JSON object only its values are returned.

    Raises:
        ValueError: If the top-level JSON value is neither an array nor an object.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    # Guard clause first: anything other than a list or a dict is unsupported.
    if not isinstance(payload, (list, dict)):
        raise ValueError(f"예상치 못한 데이터 형식입니다: {type(payload)}")

    # Dict layout (per the original author's note, possible in some MultiWOZ
    # versions): drop the keys and keep each dialogue.
    if isinstance(payload, dict):
        return list(payload.values())
    return payload

def get_all_utterances(dialogues: List[Dict[str, Any]]) -> List[str]:
    """
    Collect the raw utterance text of every turn across all dialogues.

    Args:
        dialogues: List of dialogue dictionaries.

    Returns:
        A flat list of utterances, in dialogue order and then turn order.
        Turns without an 'utterance' key are skipped.
    """
    collected = []

    for dialogue in dialogues:
        # A dialogue without a 'turns' key contributes nothing.
        turns = dialogue.get('turns', [])
        collected.extend(turn['utterance'] for turn in turns if 'utterance' in turn)

    return collected

def get_dialogue_services(dialogues: List[Dict[str, Any]]) -> Dict[str, List[str]]:
    """
    Map each dialogue ID to the services listed for that dialogue.

    Args:
        dialogues: List of dialogue dictionaries.

    Returns:
        A dictionary keyed by dialogue ID whose values are the service lists.
        A missing 'dialogue_id' maps to the key '' and a missing 'services'
        to an empty list; when IDs repeat, the last dialogue wins.
    """
    return {
        dialogue.get('dialogue_id', ''): dialogue.get('services', [])
        for dialogue in dialogues
    }

def filter_dialogues_by_service(dialogues: List[Dict[str, Any]], service: str) -> List[Dict[str, Any]]:
    """
    Keep only the dialogues whose 'services' entry contains the given service.

    Args:
        dialogues: List of dialogue dictionaries.
        service: Service name to look for (e.g. 'restaurant', 'hotel').

    Returns:
        The matching dialogues, in their original order. Dialogues without
        a 'services' key never match.
    """
    return [
        dialogue
        for dialogue in dialogues
        if service in dialogue.get('services', [])
    ]

# Usage example
if __name__ == "__main__":
    # Path to the MultiWOZ data file
    file_path = "dataset/train/dialogues_001.json"  # change this to the real file path

    # Load every dialogue from the file
    dialogues = load_multiwoz_dialogues(file_path)
    print(f"총 {len(dialogues)}개의 대화를 로드했습니다.")

    # Inspect the first dialogue. Everything that indexes dialogues[0] stays
    # inside this guard so an empty dataset does not raise IndexError.
    if dialogues:
        first_dialogue = dialogues[0]
        print(f"첫 번째 대화 ID: {first_dialogue.get('dialogue_id', '정보 없음')}")
        print(f"첫 번째 대화 서비스: {first_dialogue.get('services', [])}")
        print(f"첫 번째 대화 턴 수: {len(first_dialogue.get('turns', []))}")

        # Dump the raw first dialogue for manual inspection
        print(first_dialogue)

In [7]:
def get_utterances_from_dialogue(dialogue: Dict[str, Any]) -> List[str]:
    """
    Render every turn of one dialogue as a "speaker: utterance" line.

    Args:
        dialogue: A single dialogue dictionary.

    Returns:
        One string per turn that has an 'utterance' key, in turn order.
        A turn without a 'speaker' key gets an empty speaker prefix.
    """
    # The speaker prefix is kept on purpose; use turn['utterance'] alone in the
    # expression below if only the bare text is wanted.
    return [
        f"{turn.get('speaker', '')}: {turn['utterance']}"
        for turn in dialogue.get('turns', [])
        if 'utterance' in turn
    ]

In [None]:
# Show the "speaker: utterance" lines of the first loaded dialogue
# (bare expression, presumably left last so the notebook echoes its value).
get_utterances_from_dialogue(dialogues[0])

In [None]:
# Keep the first dialogue's "speaker: utterance" lines for the cells below,
# then print them one per line (nothing is printed for an empty dialogue).
first_utterance = get_utterances_from_dialogue(dialogues[0])
if first_utterance:
    print("\n".join(first_utterance))

# Annotation using LLM

In [10]:
from openai import OpenAI
import os
from dotenv import load_dotenv
# load_dotenv() comes from python-dotenv; presumably it fills the process
# environment from a local .env file before the key is read below.
load_dotenv()
# api_key is None when OPENAI_API_KEY is not set in the environment.
api_key = os.environ.get("OPENAI_API_KEY")
# Module-level client shared by annotate_text.
client = OpenAI(api_key=api_key)
def annotate_text(system_prompt, user_prompt, *, model='gpt-4o', temperature=0.1):
    """
    Send one system/user prompt pair to the chat-completions endpoint.

    Uses the module-level ``client``.

    Args:
        system_prompt: Content of the "system" message.
        user_prompt: Content of the "user" message.
        model: Model name passed to the API. Defaults to 'gpt-4o', the value
            that used to be hard-coded here.
        temperature: Sampling temperature passed to the API. Defaults to 0.1,
            the value that used to be hard-coded here.

    Returns:
        The message content of the first choice in the response.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    response = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=messages,
    )
    return response.choices[0].message.content
# First system prompt: asks the model to label each utterance with MDP-style
# elements (dialogue states, user/system actions, transitions, rewards,
# discount factor) and to answer with a JSON object keyed by utterance number.
system_prompt = """
You are an expert at extracting dialogue elements from natural language conversation texts, following a structure similar to the MultiWOZ dataset. 
Your task is to analyze each utterance one-by-one and assign a single keyword (using a variable name structure with underscores instead of spaces) to each Markov Decision Process (MDP) element. 
You should assign one utterance number to each utterance, regardless if it is a user utterance or system utterance.
For elements that are not applicable or not mentioned, output "NA".

Follow these detailed definitions and guidelines:

1. Dialogue States:
   - Definition: The dialogue state represents the current context of the conversation. It includes user intents, filled slots, mentioned entities, and any other relevant contextual clues that define the conversation's status.
   - Task: Extract and list all components that contribute to the dialogue state from each utterance.

2. User Actions:
   - Definition: User actions are the dialogue acts or intents expressed by the user. These may include acts like "request", "inform", "confirm", etc.
   - Task: Identify and list the dialogue acts performed by the user as single keywords.

3. System Actions:
   - Definition: System actions are the dialogue acts or responses executed by the system. They can include providing information, asking clarifying questions, offering options, etc.
   - Task: Identify and list the dialogue acts performed by the system as single keywords.

4. Transitions:
   - Definition: Dialogue transitions capture the changes in the dialogue state from one turn to the next as a result of actions taken by the user or the system.
   - Task: Infer how the state in one dialogue turn connects to the next. Represent a transition in the form "state_1", "action", "state_2" (all as single keywords).

5. Rewards:
   - Definition: Rewards are explicit or implicit signals indicating the quality or success of a dialogue turn. This may include feedback, task success indicators, or other performance signals.
   - Task: If a reward is identifiable from the utterance, extract it as a single keyword; otherwise, output "NA".

6. Discount Factor:
   - Definition: The discount factor represents the weighting of future rewards relative to immediate rewards. Although it may not be explicitly mentioned in natural conversations, if there is any indication of weighting future versus immediate outcomes, extract it as a single keyword; otherwise, output "NA".

Format your final answer as a JSON object with the following format for each utterance:
{
  "utterance_number": {
    "dialogue_states": [list_of_state_keywords],
    "user_actions": [list_of_user_action_keywords],
    "system_actions": [list_of_system_action_keywords],
    "transitions": ["state_1_keyword", "action_keyword", "state_2_keyword"],
    "rewards": "reward_keyword_or_NA",
    "discount_factor": "discount_factor_keyword_or_NA"
  },
  ...
}

For any element that is not mentioned or identifiable in the conversation, indicate it as "NA".

"""

In [None]:
# Grow the transcript one utterance at a time and annotate each prefix with
# system_prompt. annotate_text is called once per utterance.
# user_prompt is left at module level because later cells reuse its last value.
texts = ""
for idx, item in enumerate(first_utterance):
    texts+="\n"+item
    user_prompt = f"""
    Below is a natural language conversation between a user and a system. Please extract, define, and annotate the dialogue elements based on the guidelines provided.
    Text:
    {texts}
    """
    print("utterance", idx)
    print(texts)
    print(annotate_text(system_prompt,user_prompt))

In [None]:
# Annotate once more with system_prompt, using whatever user_prompt an earlier cell last assigned.
print(annotate_text(system_prompt,user_prompt))

In [13]:
# Second prompt: asks for one top-level dictionary keyed "turn_1", "turn_2", ...
# with per-turn dialogue acts, belief state, transitions and reward.
# The literal previously opened with four double quotes, which put a stray '"'
# at the very start of the prompt text; it now opens with a plain triple quote.
prompt2 = """You are an expert annotator for multi-domain dialogues. 
I will provide you with a multi-turn conversation between a user and a system. 
I want you to produce a **single top-level dictionary** with keys for each turn in the format "turn_1", "turn_2", and so on. 
The value for each key will be a dictionary containing the following fields:

1. "turn_idx"
   - The turn index (starting from 1).

2. "speaker"
   - Either "user" or "system".

3. "utterance"
   - The exact text of that turn.

4. "dialogue_acts"
   - An object whose keys are act types (e.g., "Inform", "Request", "Offer", "Confirm", "Recommend", "NoOffer", "Select"), 
     and whose values are arrays of [domain, slot, value] tuples.
     - domain: The domain of the act (hotel, restaurant, train, etc.).
     - slot: The slot name (e.g., "area", "stars", "price", etc.).
     - value: The value being informed or requested (use "?" if the slot is requested).

5. "belief_state"
   - An array of objects, each corresponding to a domain. Each object contains:
       - "domain": The name of the domain (e.g., "hotel", "restaurant", "train", etc.).
       - "slots": A list of [slot, value] pairs representing all information accumulated about that domain so far.

6. "transitions"
   - Show how the current turn's final state ("state_1") transitions to the next turn's state ("state_2") based on an "action".
   - For example:
     {
       "state_1": [...],
       "action": "Request",
       "state_2": [...]
     }
   - "state_1" = the current turn’s final belief_state.
   - "action" = the main dialogue act(s) causing the change.
   - "state_2" = the next turn’s final belief_state.

7. "reward"
   - A single keyword if the utterance indicates a clear success or dissatisfaction signal, e.g., "positive", "negative", "completed". Otherwise, "NA".

### Important Output Requirements

- **Output must be a valid dictionary**, not an array.
- The top-level structure: 
  {
    "turn_1": { ... },
    "turn_2": { ... },
    ...
  }
"""

In [None]:
# Grow the transcript one utterance at a time and annotate each prefix with
# prompt2. annotate_text is called once per utterance.
# user_prompt is left at module level because the next cell reuses its last value.
texts = ""
for idx, item in enumerate(first_utterance):
    texts+="\n"+item
    user_prompt = f"""
    Below is a natural language conversation between a user and a system. Please extract, define, and annotate the dialogue elements based on the guidelines provided.
    Text:
    {texts}
    """
    print("utterance", idx)
    print(texts)
    print(annotate_text(prompt2,user_prompt))

In [15]:
# Annotate with prompt2, using whatever user_prompt an earlier cell last assigned.
result = annotate_text(prompt2,user_prompt)

In [None]:
# Print the reply with triple backticks removed.
# NOTE(review): replace("json","") deletes every "json" substring in the reply,
# including any inside the payload, not just a code-fence language tag.
print(result.replace("```","").replace("json","").strip())

In [17]:
# Strip a surrounding Markdown code fence (and its optional "json" tag) from the reply.
# Only the fence itself is removed: a blanket replace("json", "") would also delete
# the substring "json" wherever it occurs inside the payload.
clean_result = result.strip()
if clean_result.startswith("```"):
    clean_result = clean_result[3:]
    if clean_result[:4].lower() == "json":
        clean_result = clean_result[4:]
if clean_result.endswith("```"):
    clean_result = clean_result[:-3]
clean_result = clean_result.strip()

In [None]:
import json

# Parse the cleaned reply; json.loads raises if the text is not valid JSON.
data = json.loads(clean_result)
print(type(data))
# prompt2 asks for a top-level dictionary, so data is expected to be a dict
# (the keys() call below relies on that).
print(data.keys())


In [None]:
# Show the parsed annotation (bare expression, presumably left last so the notebook echoes it).
data

For all dialogues

In [None]:
# Annotate the first 10 dialogues with prompt2 and keep the raw replies in results.
# The slice is taken once and enumerate supplies the progress counter.
results = []
subset = dialogues[:10]
total = len(subset)
for idx, dialogue in enumerate(subset):
    # Progress is the fraction completed before this dialogue (starts at 0.0).
    print("processing: ", str(idx/total))
    text = get_utterances_from_dialogue(dialogue)
    # NOTE(review): text is a list, so the f-string below embeds its Python
    # list repr rather than newline-joined lines; confirm this is intended.
    user_prompt = f"""
    Below is a natural language conversation between a user and a system. Please extract, define, and annotate the dialogue elements based on the guidelines provided.
    Text:
    {text}
    """
    annotation = annotate_text(prompt2,user_prompt)
    results.append(annotation)

In [None]:
# Show the collected raw annotations (bare expression, presumably left last so the notebook echoes it).
results

In [22]:
def refine_result(result):
    """
    Strip a Markdown code fence from a model reply so it can be fed to json.loads.

    Only the fence markers and an optional "json" language tag right after the
    opening fence are removed. The payload is left untouched, so the substring
    "json" inside it is no longer deleted.

    Args:
        result: Raw reply text, optionally wrapped as ```json ... ```.

    Returns:
        The text between the first and last fence, stripped of surrounding
        whitespace. If there is no fence, the stripped input is returned.
        A lone opening fence (e.g. a truncated reply) or a lone closing fence
        is removed as well.
    """
    fence = "```"
    text = result.strip()

    first = text.find(fence)
    if first == -1:
        # No fence at all: the reply is already bare.
        return text

    last = text.rfind(fence)
    after_first = text[first + len(fence):]
    if last > first:
        # Normal case: take what sits between the opening and closing fences.
        body = text[first + len(fence):last]
        has_opening = True
    elif after_first.strip():
        # Single fence followed by content: an unterminated opening fence.
        body = after_first
        has_opening = True
    else:
        # Single fence at the very end: a stray closing fence.
        body = text[:first]
        has_opening = False

    if has_opening:
        # Drop the language tag that directly follows the opening fence.
        body = body.lstrip()
        if body[:4].lower() == "json":
            body = body[4:]

    return body.strip()

In [None]:
# Print each of the first 10 dialogues' utterance lines followed by the
# cleaned annotation stored at the same position in results.
for raw_annotation, source_dialogue in zip(results, dialogues[:10]):
    utterance_lines = get_utterances_from_dialogue(source_dialogue)
    print(utterance_lines)
    print(refine_result(raw_annotation))

In [25]:
# Third prompt: same per-turn dictionary layout as before, and additionally asks
# the model to store user-level or global context under a special "context"
# domain inside belief_state.
prompt3="""
You are an expert annotator for multi-domain dialogues.

I will provide you with a multi-turn conversation between a user and a system. 
I want you to produce a **single top-level dictionary** with keys for each turn in the format 'turn_1', 'turn_2', etc. 

The value for each key must be a dictionary containing the following fields:

1. 'turn_idx'
   - The turn index (starting from 1).

2. 'speaker'
   - Either 'user' or 'system'.

3. 'utterance'
   - The exact text of that turn.

4. 'dialogue_acts'
   - An object whose keys are act types (e.g., 'Inform', 'Request', 'Offer', 'Confirm', 'Recommend', 'NoOffer', 'Select'), 
     and whose values are arrays of [domain, slot, value] tuples.

5. 'belief_state'
   - An array of objects, each corresponding to a domain. 
   - **If there is any user-level or global context** (e.g., user’s current location, time constraints, special requirements), 
     then it should be stored in a special domain named '**context**' (or 'user_info'). 
     For example:
       {
         "domain": "context",
         "slots": [
            ["current_location", "cambridge station"],
            ["time_constraint", "morning"]
         ]
       }
   - Other domains (hotel, restaurant, train, taxi, etc.) remain the same.

6. 'transitions'
   - Show how the current turn’s final state ('state_1') transitions to the next turn’s state ('state_2') based on an 'action'.

7. 'reward'
   - A single keyword if the utterance indicates a clear success or dissatisfaction signal, e.g., 'positive', 'negative', 'completed'. Otherwise, 'NA'.

### Additional Notes
- The top-level output must be a valid JSON dictionary with keys "turn_1", "turn_2", etc.
- If the user **moves location** or updates their constraints, then update the 'context' domain in the 'belief_state' accordingly.


"""

In [None]:
# Annotate the first 10 dialogues with prompt3 and keep the raw replies in results_prompt3.
# The slice is taken once and enumerate supplies the progress counter.
results_prompt3 = []
subset = dialogues[:10]
total = len(subset)
for idx, dialogue in enumerate(subset):
    # Progress is the fraction completed before this dialogue (starts at 0.0).
    print("processing: ", str(idx/total))
    text = get_utterances_from_dialogue(dialogue)
    # NOTE(review): text is a list, so the f-string below embeds its Python
    # list repr rather than newline-joined lines; confirm this is intended.
    user_prompt = f"""
    Below is a natural language conversation between a user and a system. Please extract, define, and annotate the dialogue elements based on the guidelines provided.
    Text:
    {text}
    """
    annotation = annotate_text(prompt3,user_prompt)
    results_prompt3.append(annotation)

In [None]:
# Print each of the first 10 dialogues' utterance lines followed by the
# cleaned annotation stored at the same position in results_prompt3.
for raw_annotation, source_dialogue in zip(results_prompt3, dialogues[:10]):
    utterance_lines = get_utterances_from_dialogue(source_dialogue)
    print(utterance_lines)
    print(refine_result(raw_annotation))

In [30]:
# Fourth prompt: restates the per-turn annotation rules (including the "context"
# domain) and adds a worked JSON example of the expected top-level dictionary.
prompt4="""
You are an expert annotator for multi-domain dialogues.
Below are the annotation rules you MUST follow when I provide a conversation.

I want you to produce a single top-level dictionary with keys for each turn:
  "turn_1", "turn_2", "turn_3", ...

Each "turn_x" is a dictionary containing:

1) "turn_idx"
   - The turn number (starting from 1).

2) "speaker"
   - Either "user" or "system".

3) "utterance"
   - The exact text for that turn.

4) "dialogue_acts"
   - An object whose keys are act types (e.g., "Inform", "Request", "Offer", etc.),
     and whose values are arrays of [domain, slot, value] tuples.

5) "belief_state"
   - An array of objects, each with:
       "domain": (e.g., "hotel", "restaurant", "train", "context", etc.)
       "slots": a list of [slot, value] pairs
   - **Important**: If user or system mentions any global context 
     (e.g., "I am currently at Cambridge station," "I only have 1 hour left," etc.),
     store that info under `{"domain": "context", "slots": [...]}`.
   - Keep domain-specific info in their respective domain objects (e.g. "hotel", "restaurant").

6) "transitions"
   - Show how the turn’s final state changes to the next turn’s state:
       {
         "state_1": [...],
         "action": "...",
         "state_2": [...]
       }
   - "state_1" = the final belief_state after processing this turn.
   - "action" = main dialogue act(s) that changed the state.
   - "state_2" = the belief_state for the next turn.

7) "reward"
   - "positive", "completed", or "negative" if the user or system indicates success/failure/dissatisfaction. Otherwise "NA".

### Output Must Be Valid JSON

Your final output must be a single JSON dictionary, not an array, with top-level keys "turn_1", "turn_2", etc.

Example (simplified):

{
  "turn_1": {
    "turn_idx": 1,
    "speaker": "user",
    "utterance": "...",
    "dialogue_acts": {
      "Request": [
        ["restaurant", "area", "?"]
      ]
    },
    "belief_state": [
      {
        "domain": "context",
        "slots": [
          ["current_location", "Cambridge station"]
        ]
      }
    ],
    "transitions": {
      "state_1": [],
      "action": "Request",
      "state_2": [
        {
          "domain": "context",
          "slots": [
            ["current_location", "Cambridge station"]
          ]
        }
      ]
    },
    "reward": "NA"
  },
  "turn_2": {
     ...
  }
}

Remember to always include "context" domain for user-level info. 

"""

In [None]:
# Annotate the first 10 dialogues with prompt4 and keep the raw replies in results_prompt4.
# The slice is taken once and enumerate supplies the progress counter.
results_prompt4 = []
subset = dialogues[:10]
total = len(subset)
for idx, dialogue in enumerate(subset, start=1):
    # Progress counts the current dialogue as done, so the last line printed is 1.0.
    print("processing: ", str(idx/total))
    text = get_utterances_from_dialogue(dialogue)
    # NOTE(review): text is a list, so the f-string below embeds its Python
    # list repr rather than newline-joined lines; confirm this is intended.
    user_prompt = f"""
    Below is a natural language conversation between a user and a system. Please extract, define, and annotate the dialogue elements based on the guidelines provided.
    Text:
    {text}
    """
    annotation = annotate_text(prompt4,user_prompt)
    results_prompt4.append(annotation)
    

In [None]:
# Print each of the first 10 dialogues' utterance lines followed by the
# cleaned annotation stored at the same position in results_prompt4.
for raw_annotation, source_dialogue in zip(results_prompt4, dialogues[:10]):
    utterance_lines = get_utterances_from_dialogue(source_dialogue)
    print(utterance_lines)
    print(refine_result(raw_annotation))

In [None]:
# Check what refine_result returns for one raw annotation.
# refine_result calls str methods on its argument, so it must receive a single
# element of results_prompt4; passing the whole list raises AttributeError.
type(refine_result(results_prompt4[0]))

In [47]:
# Parse every cleaned prompt4 reply into a Python object, keyed by its position.
# json.loads raises on the first reply that is not valid JSON.
results = {}
for i, raw_annotation in enumerate(results_prompt4):
    results[i] = json.loads(refine_result(raw_annotation))

In [53]:
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of relying on the platform default.
with open('./results_prompt4_full.json','w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

In [None]:
# Annotate every loaded dialogue with prompt4, then parse and save the replies.
results_prompt4_full = []
total = len(dialogues)
for idx, dialogue in enumerate(dialogues, start=1):
    # Progress counts the current dialogue as done, so the last line printed is 1.0.
    print("processing: ", str(idx/total))
    text = get_utterances_from_dialogue(dialogue)
    # NOTE(review): text is a list, so the f-string below embeds its Python
    # list repr rather than newline-joined lines; confirm this is intended.
    user_prompt = f"""
    Below is a natural language conversation between a user and a system. Please extract, define, and annotate the dialogue elements based on the guidelines provided.
    Text:
    {text}
    """
    annotation = annotate_text(prompt4,user_prompt)
    results_prompt4_full.append(annotation)

# Parse each reply on its own so one malformed reply does not abort the export.
# Skipped positions are reported and kept in failed_indices for a later retry.
results = {}
failed_indices = []
for i, raw_annotation in enumerate(results_prompt4_full):
    try:
        results[i] = json.loads(refine_result(raw_annotation))
    except json.JSONDecodeError as err:
        failed_indices.append(i)
        print(f"skipping annotation {i}: invalid JSON ({err})")

# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of relying on the platform default.
with open('./results_prompt4_full.json','w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)