In [20]:
import json
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering
import plotly.express as px
import numpy as np

In [21]:
#load data
filepath = "results_parallel.json"
with open(filepath) as f:
    data = json.load(f)


In [22]:
len(data)

512

In [40]:
import json

# Load data
filepath = "results_parallel.json"
with open(filepath) as f:
    data = json.load(f)

def extract_state_action_sequences(data):
    all_sequences = []
    
    # Process each dialogue
    for dialogue_id, dialogue in data.items():
        dialogue_sequence = []
        seen_actions = {}  # Dictionary to track first occurrence of each action by domain
        
        # Get all turns sorted by index
        turns = []
        for key in dialogue:
            if key.startswith('turn_'):
                turn_idx = int(key.split('_')[1])
                turns.append((turn_idx, dialogue[key]))
        turns.sort()
        
        for turn_idx, turn in turns:
            transitions = turn.get('transitions', {})
            action = transitions.get('action', 'NONE')
            state_2 = transitions.get('state_2', [])
            
            # Extract all slots from all domains
            turn_state = {}
            for domain_data in state_2:
                domain = domain_data.get('domain')
                if domain and domain != 'context':  # Skip context domain
                    slots = domain_data.get('slots', [])
                    
                    # Handle different slot formats
                    if isinstance(slots, list):
                        # Check if slots is a list of lists or a simple list
                        if slots and isinstance(slots[0], list):
                            # Format: [[slot_name, slot_value], ...]
                            for slot_item in slots:
                                if len(slot_item) == 2:
                                    slot_name, slot_value = slot_item
                                    key = f"{domain}-{slot_name}"
                                    turn_state[key] = slot_value
                        else:
                            # Maybe a simple list of values
                            for i, value in enumerate(slots):
                                key = f"{domain}-slot_{i}"
                                turn_state[key] = value
                    elif isinstance(slots, dict):
                        # Format: {slot_name: slot_value, ...}
                        for slot_name, slot_value in slots.items():
                            key = f"{domain}-{slot_name}"
                            turn_state[key] = slot_value
            
            # Format based on action and state
            if turn_state:
                formatted_parts = []
                for slot_key, slot_value in sorted(turn_state.items()):
                    domain = slot_key.split('-')[0]
                    
                    # Track the first action encountered for each domain
                    domain_action_key = f"{domain}-{action}"
                    if action != 'NONE' and domain_action_key not in seen_actions:
                        seen_actions[domain_action_key] = f"FIND_{domain.upper()}"
                    
                    # Use the first encountered action or NONE
                    if action == 'NONE':
                        action_prefix = "NONE"
                    else:
                        action_prefix = seen_actions.get(domain_action_key, f"FIND_{domain.upper()}")
                    
                    formatted_parts.append(f"{action_prefix}({slot_key}={slot_value})")
                
                state_str = " + ".join(formatted_parts)
                if state_str and (not dialogue_sequence or dialogue_sequence[-1] != state_str):
                    dialogue_sequence.append(state_str)
        
        if dialogue_sequence:
            all_sequences.append(" → ".join(dialogue_sequence))
    
    return all_sequences

# Try extracting sequences with improved error handling
try:
    sequences = extract_state_action_sequences(data)
    
    # Print results
    for i, sequence in enumerate(sequences):
        if sequence:
            print(f"Dialogue {i+1}:")
            print(sequence)
            print()
    
    print(f"Total sequences extracted: {len([s for s in sequences if s])}")
except Exception as e:
    print(f"Error during extraction: {e}")
    import traceback
    traceback.print_exc()

Dialogue 1:
FIND_RESTAURANT(restaurant-food=european) + FIND_RESTAURANT(restaurant-price range=expensive) → FIND_RESTAURANT(restaurant-address=St. Michael's Church Trinity Street City Centre) + FIND_RESTAURANT(restaurant-area=centre) + FIND_RESTAURANT(restaurant-food=european) + FIND_RESTAURANT(restaurant-name=Eraina) + FIND_RESTAURANT(restaurant-phone number=01223 355166) + FIND_RESTAURANT(restaurant-price range=expensive)

Dialogue 2:
FIND_TAXI(taxi-destination=Avalon) + FIND_TAXI(taxi-time=19:15) → FIND_TAXI(taxi-departure=Primavera) + FIND_TAXI(taxi-destination=Avalon) + FIND_TAXI(taxi-time=19:15) → FIND_TAXI(taxi-car=black volvo) + FIND_TAXI(taxi-departure=Primavera) + FIND_TAXI(taxi-destination=Avalon) + FIND_TAXI(taxi-time=19:15) → FIND_TAXI(taxi-car=black volvo) + FIND_TAXI(taxi-contact=07698022958) + FIND_TAXI(taxi-departure=Primavera) + FIND_TAXI(taxi-destination=Avalon) + FIND_TAXI(taxi-time=19:15)

Dialogue 3:
FIND_RESTAURANT(restaurant-booking_day=Thursday) + FIND_RESTAURA

In [24]:
sequences

["FIND_RESTAURANT(restaurant-food=european) + FIND_RESTAURANT(restaurant-price range=expensive) → FIND_RESTAURANT(restaurant-address=St. Michael's Church Trinity Street City Centre) + FIND_RESTAURANT(restaurant-area=centre) + FIND_RESTAURANT(restaurant-food=european) + FIND_RESTAURANT(restaurant-name=Eraina) + FIND_RESTAURANT(restaurant-phone number=01223 355166) + FIND_RESTAURANT(restaurant-price range=expensive)",
 'FIND_TAXI(taxi-destination=Avalon) + FIND_TAXI(taxi-time=19:15) → FIND_TAXI(taxi-departure=Primavera) + FIND_TAXI(taxi-destination=Avalon) + FIND_TAXI(taxi-time=19:15) → FIND_TAXI(taxi-car=black volvo) + FIND_TAXI(taxi-departure=Primavera) + FIND_TAXI(taxi-destination=Avalon) + FIND_TAXI(taxi-time=19:15) → FIND_TAXI(taxi-car=black volvo) + FIND_TAXI(taxi-contact=07698022958) + FIND_TAXI(taxi-departure=Primavera) + FIND_TAXI(taxi-destination=Avalon) + FIND_TAXI(taxi-time=19:15)',
 'FIND_RESTAURANT(restaurant-booking_day=Thursday) + FIND_RESTAURANT(restaurant-booking_people

In [25]:
import json

# Load data
filepath = "results_prompt4.json"
with open(filepath) as f:
    data = json.load(f)

def extract_state_action_sequences(data):
    all_sequences = []
    
    # Process each dialogue
    for dialogue_id, dialogue in data.items():
        dialogue_sequence = []
        current_state = {}
        
        # Get all turns sorted by index
        turns = []
        for key in dialogue:
            if key.startswith('turn_'):
                turn_idx = int(key.split('_')[1])
                turns.append((turn_idx, dialogue[key]))
        turns.sort()
        
        for turn_idx, turn in turns:
            transitions = turn.get('transitions', {})
            action = transitions.get('action', 'NONE')
            state_2 = transitions.get('state_2', [])
            
            # Extract all slots from all domains
            turn_state = {}
            for domain_data in state_2:
                domain = domain_data.get('domain')
                if domain and domain != 'context':  # Skip context domain
                    for slot_name, slot_value in domain_data.get('slots', []):
                        key = f"{domain}-{slot_name}"
                        turn_state[key] = slot_value
            
            # Format based on action and state
            if turn_state:
                formatted_parts = []
                for slot_key, slot_value in sorted(turn_state.items()):
                    domain = slot_key.split('-')[0]
                    action_prefix = f"FIND_{domain.upper()}" if action in ['Request', 'Inform'] else "NONE"
                    formatted_parts.append(f"{action_prefix}({slot_key}={slot_value})")
                
                state_str = " + ".join(formatted_parts)
                if state_str and (not dialogue_sequence or dialogue_sequence[-1] != state_str):
                    dialogue_sequence.append(state_str)
        
        if dialogue_sequence:
            all_sequences.append(" → ".join(dialogue_sequence))
    
    return all_sequences

# Extract all sequences
sequences = extract_state_action_sequences(data)

# Print results
for i, sequence in enumerate(sequences):
    if sequence:
        print(f"Dialogue {i+1}:")
        print(sequence)
        print()

print(f"Total sequences extracted: {len([s for s in sequences if s])}")

Dialogue 1:
FIND_RESTAURANT(restaurant-area=center) + FIND_RESTAURANT(restaurant-price=expensive) → NONE(restaurant-area=center) + NONE(restaurant-price=expensive) → FIND_RESTAURANT(restaurant-area=center) + FIND_RESTAURANT(restaurant-food=any) + FIND_RESTAURANT(restaurant-price=expensive) → FIND_RESTAURANT(restaurant-area=center) + FIND_RESTAURANT(restaurant-food=African) + FIND_RESTAURANT(restaurant-name=Bedouin) + FIND_RESTAURANT(restaurant-price=expensive) → FIND_HOTEL(hotel-price=expensive) + FIND_RESTAURANT(restaurant-area=center) + FIND_RESTAURANT(restaurant-food=African) + FIND_RESTAURANT(restaurant-name=Bedouin) + FIND_RESTAURANT(restaurant-price=expensive) → FIND_HOTEL(hotel-area=center) + FIND_HOTEL(hotel-name=University Arms Hotel) + FIND_HOTEL(hotel-price=expensive) + FIND_RESTAURANT(restaurant-area=center) + FIND_RESTAURANT(restaurant-food=African) + FIND_RESTAURANT(restaurant-name=Bedouin) + FIND_RESTAURANT(restaurant-phone=01223367660) + FIND_RESTAURANT(restaurant-price

In [52]:
try:
    sequences = extract_state_action_sequences(data)
    
    # Prepare data for DataFrame
    dialogue_ids = []
    sequence_list = []
    
    for i, sequence in enumerate(sequences):
        if sequence:
            dialogue_ids.append(i+1)
            sequence_list.append(sequence)
    
    # Create DataFrame
    df = pd.DataFrame({"dialogue_id": dialogue_ids, "sequence": sequence_list})
    
    # Now you can use df as needed
    # For example, you can print it or save it to a file
    print(df)
    
    # Optional: Save to CSV
    # df.to_csv("sequences.csv", index=False)
    
except Exception as e:
    print(f"Error occurred: {e}")

     dialogue_id                                           sequence
0              1  FIND_RESTAURANT(restaurant-food=european) + FI...
1              2  FIND_TAXI(taxi-destination=Avalon) + FIND_TAXI...
2              3  FIND_RESTAURANT(restaurant-booking_day=Thursda...
3              4  FIND_TRAIN(train-destination=Ely) → FIND_TRAIN...
4              5  FIND_RESTAURANT(restaurant-food=Chinese) + FIN...
..           ...                                                ...
485          486  FIND_RESTAURANT(restaurant-area=centre) + FIND...
486          487  FIND_TRAIN(train-departure=Cambridge) + FIND_T...
487          488  FIND_TRAIN(train-destination=Cambridge) → FIND...
488          489  FIND_HOTEL(hotel-price=moderate) → FIND_HOTEL(...
489          490  FIND_RESTAURANT(restaurant-cuisine=chinese) + ...

[490 rows x 2 columns]


In [None]:
#embedding
sequences = extract_state_action_sequences(data)
dialogue_id = 0
for i in data:
    df = pd.DataFrame({"dialogue_id": dialogue_id, "sequence": sequences})

In [51]:
df

Unnamed: 0,dialogue_id,sequence
0,0,FIND_RESTAURANT(restaurant-food=european) + FI...
1,0,FIND_TAXI(taxi-destination=Avalon) + FIND_TAXI...
2,0,FIND_RESTAURANT(restaurant-booking_day=Thursda...
3,0,FIND_TRAIN(train-destination=Ely) → FIND_TRAIN...
4,0,FIND_RESTAURANT(restaurant-food=Chinese) + FIN...
...,...,...
485,0,FIND_RESTAURANT(restaurant-area=centre) + FIND...
486,0,FIND_TRAIN(train-departure=Cambridge) + FIND_T...
487,0,FIND_TRAIN(train-destination=Cambridge) → FIND...
488,0,FIND_HOTEL(hotel-price=moderate) → FIND_HOTEL(...


In [53]:
model = SentenceTransformer("all-mpnet-base-v2")
embeddings = model.encode(df["sequence"].tolist(), convert_to_tensor=True)


In [39]:
embeddings.size()

torch.Size([10, 768])

In [54]:
#similarity calculation
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

cos_sim_matrix = cosine_similarity(embeddings.cpu())
distance_matrix = 1 - cos_sim_matrix

clustering = AgglomerativeClustering(n_clusters=20, linkage='average', metric='precomputed')
labels = clustering.fit_predict(distance_matrix)
df["cluster"] = labels

In [55]:
#t-SNE
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
reduced = tsne.fit_transform(embeddings.cpu().numpy())

In [56]:
#visualize
def shorten_sequence(seq, max_len=80):
    return "<br>".join([seq[i:i+max_len] for i in range(0, len(seq), max_len)])

plot_df = pd.DataFrame({
    "x": reduced[:, 0],
    "y": reduced[:, 1],
    "dialogue_id": df["dialogue_id"],
    "cluster": df["cluster"].astype(str),
    "sequence": df["sequence"]
})
plot_df["short_sequence"] = plot_df["sequence"].apply(shorten_sequence)

fig = px.scatter(
    plot_df,
    x="x",
    y="y",
    color="cluster",
    hover_data={"dialogue_id": True, "short_sequence": True, "sequence": False},
    title="t-SNE of MDP Dialogue Sequences from MultiWOZ (Interactive)"
)

# 8. 저장
fig.write_html("tsne_mdp_multiwoz.html")
