In [10]:
import json
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering
import plotly.express as px
import numpy as np

# 데이터 로딩
# with open("dialogues_001.json", "r", encoding="utf-8") as f:
#     data = json.load(f)

# 시퀀스 추출 함수
with open("/Users/hyegang/Desktop/졸업논문/multiwoz/data/MultiWOZ_2.2/train/dialogues_00.json", "r", encoding="utf-8") as f:
    data = json.load(f)
def extract_sequence(dialogue):
    sequence = []
    for turn in dialogue.get("turns", []):
        if turn["speaker"] == "USER":
            actions = []
            for frame in turn.get("frames", []):
                intent = frame.get("state", {}).get("active_intent", "")
                slots = frame.get("state", {}).get("slot_values", {})
                for slot, values in slots.items():
                    for value in values:
                        actions.append(f"{intent.upper()}({slot}={value})")
            if actions:
                sequence.append(" + ".join(actions))
    return " → ".join(sequence)

dialogue_ids, sequences = [], []
for d in data:
    seq = extract_sequence(d)
    if seq:
        dialogue_ids.append(d["dialogue_id"])
        sequences.append(seq)

df = pd.DataFrame({"dialogue_id": dialogue_ids, "sequence": sequences})

# 임베딩
model = SentenceTransformer("all-mpnet-base-v2")
embeddings = model.encode(df["sequence"].tolist(), convert_to_tensor=True)

# 유사도 기반 거리행렬 → 클러스터링
cos_sim_matrix = (embeddings @ embeddings.T).cpu().numpy()
distance_matrix = 1 - cos_sim_matrix
clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=0.3, linkage='average', metric='precomputed')
labels = clustering.fit_predict(distance_matrix)
df["cluster"] = labels

# t-SNE
tsne = TSNE(n_components=2, random_state=42, perplexity=5)
reduced = tsne.fit_transform(embeddings.cpu().numpy())

# plotly 시각화
plot_df = pd.DataFrame({
    "x": reduced[:, 0],
    "y": reduced[:, 1],
    "dialogue_id": df["dialogue_id"],
    "cluster": df["cluster"].astype(str),
    "sequence": df["sequence"]
})

fig = px.scatter(
    plot_df,
    x="x",
    y="y",
    color="cluster",
    hover_data=["dialogue_id", "sequence"],
    title="t-SNE of MDP Dialogue Sequences (Interactive)"
)

# HTML 저장
fig.write_html("tsne_mdp_sequences.html")


In [11]:
import plotly.express as px
import pandas as pd
from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer

# 시퀀스 줄바꿈 함수
def shorten_sequence(seq, max_len=60):
    return "<br>".join([seq[i:i+max_len] for i in range(0, len(seq), max_len)])

# 1. 시퀀스 임베딩
model = SentenceTransformer("all-mpnet-base-v2")
embeddings = model.encode(df["sequence"].tolist(), convert_to_tensor=True)

# 2. t-SNE 차원 축소
tsne = TSNE(n_components=2, random_state=42, perplexity=5)
reduced = tsne.fit_transform(embeddings.cpu().numpy())

# 3. plot용 데이터프레임 구성
plot_df = pd.DataFrame({
    "x": reduced[:, 0],
    "y": reduced[:, 1],
    "dialogue_id": df["dialogue_id"],
    "cluster": df["cluster"].astype(str),
    "sequence": df["sequence"]
})
plot_df["short_sequence"] = plot_df["sequence"].apply(lambda x: shorten_sequence(x))

# 4. Plotly 시각화
fig = px.scatter(
    plot_df,
    x="x",
    y="y",
    color="cluster",
    hover_data={"dialogue_id": True, "short_sequence": True, "sequence": False},
    title="t-SNE of MDP Dialogue Sequences (Interactive)"
)

# 5. HTML로 저장
fig.write_html("tsne_mdp_sequences_wrapped.html")


In [20]:
import os
import json
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering
import plotly.express as px

# 1. JSON 파일 불러오기
data_dir = "/Users/hyegang/Desktop/졸업논문/multiwoz/data/MultiWOZ_2.2/train"
dialogues = []

for file_name in tqdm(os.listdir(data_dir)):
    if file_name.endswith(".json"):
        with open(os.path.join(data_dir, file_name), "r", encoding="utf-8") as f:
            dialogues.extend(json.load(f))

# 2. MDP 시퀀스 추출 함수
def extract_sequence(dialogue):
    sequence = []
    for turn in dialogue.get("turns", []):
        if turn["speaker"] == "USER":
            actions = []
            for frame in turn.get("frames", []):
                intent = frame.get("state", {}).get("active_intent", "")
                slots = frame.get("state", {}).get("slot_values", {})
                for slot, values in slots.items():
                    for value in values:
                        actions.append(f"{intent.upper()}({slot}={value})")
            if actions:
                sequence.append(" + ".join(actions))
    return " → ".join(sequence)

# 3. 시퀀스 리스트 생성
dialogue_ids, sequences = [], []
for d in dialogues:
    seq = extract_sequence(d)
    if seq:
        dialogue_ids.append(d["dialogue_id"])
        sequences.append(seq)

df = pd.DataFrame({"dialogue_id": dialogue_ids, "sequence": sequences})

# 4. Sentence-BERT 임베딩
model = SentenceTransformer("all-mpnet-base-v2")
embeddings = model.encode(df["sequence"].tolist(), convert_to_tensor=True)

# 5. 거리 행렬 + 클러스터링
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

cos_sim_matrix = cosine_similarity(embeddings.cpu())
distance_matrix = 1 - cos_sim_matrix

clustering = AgglomerativeClustering(n_clusters=20, linkage='average', metric='precomputed')
labels = clustering.fit_predict(distance_matrix)
df["cluster"] = labels

# 6. t-SNE
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
reduced = tsne.fit_transform(embeddings.cpu().numpy())

# 7. Plotly 시각화
def shorten_sequence(seq, max_len=80):
    return "<br>".join([seq[i:i+max_len] for i in range(0, len(seq), max_len)])

plot_df = pd.DataFrame({
    "x": reduced[:, 0],
    "y": reduced[:, 1],
    "dialogue_id": df["dialogue_id"],
    "cluster": df["cluster"].astype(str),
    "sequence": df["sequence"]
})
plot_df["short_sequence"] = plot_df["sequence"].apply(shorten_sequence)

fig = px.scatter(
    plot_df,
    x="x",
    y="y",
    color="cluster",
    hover_data={"dialogue_id": True, "short_sequence": True, "sequence": False},
    title="t-SNE of MDP Dialogue Sequences from MultiWOZ (Interactive)"
)

# 8. 저장
fig.write_html("tsne_mdp_sequences_multiwoz.html")


100%|██████████| 17/17 [00:01<00:00, 14.11it/s]


In [21]:
# 클러스터별 대표 시퀀스 보기
top_sequences_per_cluster = (
    plot_df.groupby("cluster")["sequence"]
    .apply(lambda x: x.value_counts().head(3))
    .reset_index()
    .rename(columns={"level_1": "sequence", "sequence": "count"})
)

# 결과 출력
top_sequences_per_cluster


Unnamed: 0,cluster,sequence,count
0,0,FIND_ATTRACTION(attraction-name=people's portr...,2
1,0,FIND_ATTRACTION(attraction-type=museum) → FIND...,2
2,0,FIND_ATTRACTION(attraction-area=centre) → FIND...,2
3,1,FIND_ATTRACTION(attraction-type=museum) → FIND...,2
4,1,FIND_ATTRACTION(attraction-area=centre) → FIND...,2
5,1,FIND_ATTRACTION(attraction-type=museum) → FIND...,2
6,10,FIND_TAXI(taxi-destination=golden house) + FIN...,1
7,10,FIND_TAXI(taxi-departure=ali baba) + FIND_TAXI...,1
8,10,FIND_TAXI(taxi-destination=cambridge lodge res...,1
9,11,FIND_HOTEL(hotel-name=kirkwood house) → FIND_H...,2


In [22]:
top_sequences_per_cluster.to_excel('top_sequences2.xlsx')

In [25]:
from pathlib import Path

save_path = Path("/Users/hyegang/Desktop/mdp_classification_results")
def save_examples_by_group(df, group_col, filename, max_groups=10, samples_per_group=5):
    examples = []
    group_counts = df[group_col].value_counts().head(max_groups)

    for group in group_counts.index:
        group_rows = df[df[group_col] == group]
        sample_rows = group_rows.sample(min(samples_per_group, len(group_rows)), random_state=42)
        for _, row in sample_rows.iterrows():
            examples.append({
                group_col: group,
                "dialogue_id": row["dialogue_id"],
                "sequence": row["sequence"]
            })

    example_df = pd.DataFrame(examples)
    example_df.to_excel(filename, index=False)

# 예시 저장 경로
example_path = save_path / "examples"

example_path.mkdir(parents=True, exist_ok=True)

# intent 구조 예시 저장
df["intent_sequence"] = df["intents"].apply(lambda x: " → ".join(x))
save_examples_by_group(df, "intent_sequence", example_path / "intent_examples.xlsx")

# 행동 흐름 예시 저장 (sequence 자체 기준)
save_examples_by_group(df, "sequence", example_path / "action_flow_examples.xlsx")

# 슬롯 구조 예시 저장
df["slot_signature"] = df["slot_types"].apply(lambda x: ", ".join(sorted(x)))
save_examples_by_group(df, "slot_signature", example_path / "slot_structure_examples.xlsx")

# 도메인 구조 예시 저장
df["domain_signature"] = df["domains"].apply(lambda x: ", ".join(sorted(x)))
save_examples_by_group(df, "domain_signature", example_path / "domain_examples.xlsx")

# 대화 길이 그룹 예시 저장
df["turn_length_bucket"] = pd.cut(df["turn_length"], bins=[0, 4, 8, 15, 100], labels=["short", "medium", "long", "very long"])
save_examples_by_group(df, "turn_length_bucket", example_path / "turn_length_examples.xlsx")

# 성공 여부 예시 저장
df["success_label"] = df["task_success"].apply(lambda x: "success" if x else "failure")
save_examples_by_group(df, "success_label", example_path / "task_success_examples.xlsx")



KeyError: 'intents'

ModuleNotFoundError: No module named 'pydotplus'

In [18]:
import openpyxl
top_sequences_per_cluster.to_excel("top_sequences.xlsx")

In [17]:
!pip3 install openpyxl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


In [7]:
!pip3 install plotly

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting plotly
  Downloading plotly-6.0.1-py3-none-any.whl.metadata (6.7 kB)
Collecting narwhals>=1.15.1 (from plotly)
  Downloading narwhals-1.32.0-py3-none-any.whl.metadata (9.2 kB)
Downloading plotly-6.0.1-py3-none-any.whl (14.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.8/14.8 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading narwhals-1.32.0-py3-none-any.whl (320 kB)
Installing collected packages: narwhals, plotly
Successfully installed narwhals-1.32.0 plotly-6.0.1


In [2]:
!pip3 install sentence-transformers scikit-learn matplotlib seaborn

Collecting sentence-transformers
  Using cached sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp313-cp313-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.50.1-py3-none-any.whl.metadata (39 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Using cached huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Collecting pyyaml>=5.1 (from huggingface-hub>=0.20.0->sentence-transformers)
  Using cached PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting requests (from huggingface-hub>=0.20.0->sentence-transformers)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collec