이 파일에서 할 일:

1. 원본 MultiWOZ 데이터를 가공하지 않고 그대로 불러옴

2. State-Action을 추출해서 각각에 대한 MDP String Sequence를 만듦

3. 그 결과를 Sentence Transformer(all-mpnet-base-v2)로 임베딩

4. 코사인 거리 기반 계층적 군집화 후 t-SNE로 시각화

In [13]:
import os
import json
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering
import plotly.express as px

# 1. Load the JSON files
# Every *.json file in data_dir is parsed and its contents are appended to
# `dialogues` via extend(), so each file is expected to hold a JSON array.
data_dir = "/Users/hyegang/Desktop/졸업논문/multiwoz/data/MultiWOZ_2.2/train"
dialogues = []

# os.listdir() returns entries in arbitrary order. Sorting keeps the dialogue
# order (and therefore the row order fed to clustering and t-SNE) stable
# across runs and machines. Filtering before tqdm also makes the progress bar
# count only the JSON files that are actually read.
json_files = sorted(name for name in os.listdir(data_dir) if name.endswith(".json"))

for file_name in tqdm(json_files):
    with open(os.path.join(data_dir, file_name), "r", encoding="utf-8") as f:
        dialogues.extend(json.load(f))

# 2. MDP sequence extraction
def extract_sequence(dialogue):
    """Render the USER turns of one dialogue as a single MDP-style string.

    For every turn whose ``speaker`` is ``"USER"``, each frame's
    ``state["slot_values"]`` is expanded into ``INTENT(slot=value)`` tokens,
    where ``INTENT`` is the frame's upper-cased ``state["active_intent"]``.
    Tokens of one turn are joined with ``" + "``; turns that yield no token
    are dropped; the remaining turns are joined with ``" → "``.

    Args:
        dialogue: Mapping with an optional ``"turns"`` list. Every turn must
            carry a ``"speaker"`` key; ``"frames"``, ``"state"``,
            ``"active_intent"`` and ``"slot_values"`` are optional.

    Returns:
        The joined sequence string, or ``""`` when no USER turn has slot
        values.

    Raises:
        KeyError: If a turn has no ``"speaker"`` key.
    """
    steps = []
    for turn in dialogue.get("turns", []):
        if turn["speaker"] != "USER":
            continue

        tokens = []
        for frame in turn.get("frames", []):
            state = frame.get("state", {})
            active_intent = state.get("active_intent", "")
            slot_values = state.get("slot_values", {})
            # The intent is upper-cased lazily, only when a slot value exists.
            tokens.extend(
                f"{active_intent.upper()}({name}={val})"
                for name, vals in slot_values.items()
                for val in vals
            )

        if not tokens:
            continue
        steps.append(" + ".join(tokens))
    return " → ".join(steps)

# 3. Build the sequence list
# Pair each dialogue with its extracted sequence and keep only dialogues whose
# sequence is non-empty; the id is read only for the dialogues that are kept.
labelled_sequences = [
    (item["dialogue_id"], item_sequence)
    for item, item_sequence in ((d, extract_sequence(d)) for d in dialogues)
    if item_sequence
]
dialogue_ids = [dialogue_id for dialogue_id, _ in labelled_sequences]
sequences = [sequence for _, sequence in labelled_sequences]

df = pd.DataFrame({"dialogue_id": dialogue_ids, "sequence": sequences})

# 4. Sentence-BERT embeddings (one vector per sequence string)
model = SentenceTransformer("all-mpnet-base-v2")
sequence_texts = df["sequence"].tolist()
embeddings = model.encode(sequence_texts, convert_to_tensor=True)

# 5. Distance matrix + clustering
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Cosine distance = 1 - cosine similarity, handed to the clusterer as a
# precomputed pairwise matrix.
cos_sim_matrix = cosine_similarity(embeddings.cpu())
distance_matrix = 1 - cos_sim_matrix

clustering = AgglomerativeClustering(
    n_clusters=20,
    linkage='average',
    metric='precomputed',
)
labels = clustering.fit_predict(distance_matrix)
df["cluster"] = labels

# 6. t-SNE projection of the raw embeddings down to 2 dimensions
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
embedding_array = embeddings.cpu().numpy()
reduced = tsne.fit_transform(embedding_array)

# 7. Plotly visualization
def shorten_sequence(seq, max_len=80):
    """Break ``seq`` into ``max_len``-character pieces joined by ``"<br>"``.

    The split is purely positional (it may fall in the middle of a token);
    the ``<br>`` separators let the hover text wrap instead of rendering as
    one long line.

    Args:
        seq: The string to wrap.
        max_len: Maximum number of characters per piece.

    Returns:
        The wrapped string; ``""`` for an empty input.
    """
    starts = range(0, len(seq), max_len)
    return "<br>".join(seq[begin:begin + max_len] for begin in starts)

# Assemble the plotting table: 2-D t-SNE coordinates plus per-dialogue metadata.
plot_columns = {"x": reduced[:, 0], "y": reduced[:, 1]}
plot_columns["dialogue_id"] = df["dialogue_id"]
# Cluster ids are turned into strings before being used as the colour column.
plot_columns["cluster"] = df["cluster"].astype(str)
plot_columns["sequence"] = df["sequence"]
plot_df = pd.DataFrame(plot_columns)
plot_df["short_sequence"] = plot_df["sequence"].map(shorten_sequence)

# Show the wrapped sequence in the hover box and hide the unwrapped one.
hover_columns = {"dialogue_id": True, "short_sequence": True, "sequence": False}
fig = px.scatter(
    plot_df,
    x="x",
    y="y",
    color="cluster",
    hover_data=hover_columns,
    title="t-SNE of MDP Dialogue Sequences from MultiWOZ (Interactive)",
)

# 8. Save the figure as an HTML file
fig.write_html("tsne_mdp_sequences_multiwoz.html")


100%|██████████| 17/17 [00:02<00:00,  7.93it/s]
  sf: grouped.get_group(s if len(s) > 1 else s[0])


In [14]:
import os
import json
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering
import plotly.express as px

# 1. Load only one specific JSON file (dialogues_001.json)
file_path = "dataset/train/dialogues_001.json"  # path updated
dialogues = []

# Only the open/parse step sits inside the try block, so an unrelated error
# cannot be mistaken for a missing file.
try:
    with open(file_path, "r", encoding="utf-8") as f:
        dialogues = json.load(f)
except FileNotFoundError:
    print(f"File not found: {file_path}")
    # The bare `exit()` helper is injected by the `site` module for
    # interactive shells and is not meant for programs; raising SystemExit
    # stops execution with the same exit status without relying on it.
    raise SystemExit(1) from None
else:
    print(f"Loaded {len(dialogues)} dialogues from {file_path}")

# 2. MDP sequence extraction
def extract_sequence(dialogue):
    """Build one string describing the slot values seen in each USER turn.

    Each ``(slot, value)`` pair found under a frame's
    ``state["slot_values"]`` becomes ``INTENT(slot=value)``, with ``INTENT``
    taken from that frame's ``state["active_intent"]`` and upper-cased.
    Pairs within a turn are joined by ``" + "`` and turns by ``" → "``.
    Non-USER turns and USER turns without any slot value contribute nothing.

    Args:
        dialogue: Mapping that may contain a ``"turns"`` list; each turn must
            have a ``"speaker"`` key, all other keys are optional.

    Returns:
        The sequence string (``""`` if nothing was extracted).

    Raises:
        KeyError: If a turn has no ``"speaker"`` key.
    """
    per_turn = []
    user_turns = (t for t in dialogue.get("turns", []) if t["speaker"] == "USER")
    for turn in user_turns:
        parts = []
        for frame in turn.get("frames", []):
            state = frame.get("state", {})
            intent = state.get("active_intent", "")
            for slot, values in state.get("slot_values", {}).items():
                parts += [f"{intent.upper()}({slot}={value})" for value in values]
        if parts:
            per_turn.append(" + ".join(parts))
    return " → ".join(per_turn)

# 3. Build the sequence list (dialogues with an empty sequence are skipped)
dialogue_ids, sequences = [], []
for d in dialogues:
    seq = extract_sequence(d)
    if seq:
        dialogue_ids.append(d["dialogue_id"])
        sequences.append(seq)

df = pd.DataFrame({"dialogue_id": dialogue_ids, "sequence": sequences})
print(f"Created dataframe with {len(df)} dialogues")

# The steps below need at least two samples (2 clusters, and a t-SNE
# perplexity that must be smaller than the sample count). Fail here with a
# clear message rather than deep inside the libraries.
n_samples = len(df)
if n_samples < 2:
    raise ValueError(
        f"Need at least 2 non-empty sequences to cluster and project, got {n_samples}"
    )

# 4. Sentence-BERT embeddings
print("Generating embeddings...")
model = SentenceTransformer("all-mpnet-base-v2")
embeddings = model.encode(df["sequence"].tolist(), convert_to_tensor=True)

# 5. Distance matrix + clustering
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

print("Computing distance matrix...")
# Cosine distance = 1 - cosine similarity, used as a precomputed metric.
cos_sim_matrix = cosine_similarity(embeddings.cpu())
distance_matrix = 1 - cos_sim_matrix

# Scale the cluster count with the data: one cluster per 5 samples, capped at
# 20 and never below 2. (The previous expression produced 1 cluster for 6-9
# samples, contradicting the intended minimum of 2.)
n_clusters = max(2, min(20, n_samples // 5))
print(f"Clustering with {n_clusters} clusters...")

clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage='average', metric='precomputed')
labels = clustering.fit_predict(distance_matrix)
df["cluster"] = labels

# 6. t-SNE
print("Applying t-SNE...")
# Perplexity is about a third of the sample count, capped at 30. For small
# inputs it falls back to 3, additionally capped at n_samples - 1 because
# t-SNE requires perplexity < n_samples (the old fixed fallback of 3 broke
# that for 3 or fewer samples).
perplexity = min(30, n_samples // 3) if n_samples > 9 else min(3, n_samples - 1)
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
reduced = tsne.fit_transform(embeddings.cpu().numpy())

# 7. Plotly visualization
def shorten_sequence(seq, max_len=80):
    """Wrap ``seq`` for hover display by inserting ``"<br>"`` between chunks.

    Chunks are consecutive slices of at most ``max_len`` characters, cut by
    position only.

    Args:
        seq: Text to wrap.
        max_len: Chunk width in characters.

    Returns:
        The chunks joined with ``"<br>"`` (an empty string for empty input).
    """
    pieces = []
    for offset in range(0, len(seq), max_len):
        pieces.append(seq[offset:offset + max_len])
    return "<br>".join(pieces)

# Plotting table: t-SNE coordinates next to the dialogue metadata.
plot_columns = {
    "x": reduced[:, 0],
    "y": reduced[:, 1],
    "dialogue_id": df["dialogue_id"],
    # Cluster ids are turned into strings before being used as the colour column.
    "cluster": df["cluster"].astype(str),
    "sequence": df["sequence"],
}
plot_df = pd.DataFrame(plot_columns)
plot_df["short_sequence"] = plot_df["sequence"].map(shorten_sequence)

# Hover shows the wrapped sequence; the unwrapped one is hidden.
scatter_options = {
    "x": "x",
    "y": "y",
    "color": "cluster",
    "hover_data": {"dialogue_id": True, "short_sequence": True, "sequence": False},
    "title": "t-SNE of MDP Dialogue Sequences from dialogues_001.json (Interactive)",
}
fig = px.scatter(plot_df, **scatter_options)

# 8. Save the figure as an HTML file
output_file = "tsne_mdp_sequences_dialogues_001.html"
fig.write_html(output_file)
print(f"Visualization saved to {output_file}")

Loaded 512 dialogues from dataset/train/dialogues_001.json
Created dataframe with 490 dialogues
Generating embeddings...
Computing distance matrix...
Clustering with 20 clusters...
Applying t-SNE...
Visualization saved to tsne_mdp_sequences_dialogues_001.html




