In [10]:
# ============================================
# 4개 predicate(Confirmshaming / Trick Questions / Pressured Selling / Activity Notifications)
# 유사도 & 분포 한방 분석 스크립트
# - TF-IDF(word 1~3-gram + char 3~5-gram)
# - Centroid 코사인/유클리드 + 히트맵
# - Intra/Inter 코사인 분포(히스토그램)
# - 거리(샘플↔자기 클래스 centroid) 분포(히스토그램/박스)
# - Silhouette 점수 분포(히스토그램)
# - 2D 임베딩(TruncatedSVD) 산점도
# - Centroid 덴드로그램(계층 군집)
# - (옵션) 샘플링 pairwise cosine 히트맵
# ============================================

import os, re, itertools, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform

# ------------------ 설정 ------------------
FILE_PATH = "/Users/soyoung/404DNF_AI/data/processed/template_predicate.csv"      # 필요시 경로 변경
OUTPUT_DIR = "dp_dist"
os.makedirs(OUTPUT_DIR, exist_ok=True)

TARGET_PREDICATES = [
    "confirmshaming",
    "trick questions",
    "pressured selling",
    "activity notifications",
]

# ------------------ 1) 데이터 로드/필터 ------------------
df = pd.read_csv(FILE_PATH)
assert {"String", "predicate"} <= set(df.columns), "CSV에 String, predicate 컬럼이 필요합니다."
df = df.dropna(subset=["String", "predicate"]).copy()
df["predicate"] = df["predicate"].str.lower().str.strip()

mask = df["predicate"].isin([p.lower() for p in TARGET_PREDICATES])
data = df.loc[mask].reset_index(drop=True)
if data.empty:
    raise ValueError("선택한 predicate가 데이터에 없습니다.")

print("=== 클래스별 샘플 수 ===")
print(data["predicate"].value_counts(), "\n")

# ------------------ 2) 텍스트 정제 ------------------
def clean_keep_nums(s: str) -> str:
    s = s.lower()
    s = re.sub(r"http\S+|www\.\S+", " ", s)
    # 숫자/%, $ 유지(가격/카운트류 신호 보존)
    s = re.sub(r"[^a-z0-9\s%$]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

data["text_clean"] = data["String"].astype(str).apply(clean_keep_nums)

# ------------------ 3) TF-IDF (word + char) ------------------
wvec = TfidfVectorizer(stop_words="english", ngram_range=(1,3), min_df=1, max_df=0.95, norm="l2")
cvec = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5), min_df=2, norm="l2")
Xw = wvec.fit_transform(data["text_clean"])
Xc = cvec.fit_transform(data["text_clean"])
X  = hstack([Xw, Xc])  # csr_matrix

labels = data["predicate"].values
classes = sorted(pd.unique(labels))

# ------------------ 4) Centroid 유사도/거리 + 히트맵 ------------------
centroids = []
for c in classes:
    rows = X[labels == c]
    centroids.append(np.asarray(rows.mean(axis=0)).ravel())
centroids = np.vstack(centroids)

centroid_cos = cosine_similarity(centroids)
centroid_euc = euclidean_distances(centroids)

centroid_cos_df = pd.DataFrame(centroid_cos, index=classes, columns=classes)
centroid_euc_df = pd.DataFrame(centroid_euc, index=classes, columns=classes)
centroid_cos_df.to_csv(os.path.join(OUTPUT_DIR, "centroid_cosine.csv"))
centroid_euc_df.to_csv(os.path.join(OUTPUT_DIR, "centroid_euclidean.csv"))

def heatmap(mat, ticks, title, path):
    plt.figure(figsize=(6,5))
    im = plt.imshow(mat, interpolation="nearest")
    plt.colorbar(im, fraction=0.046, pad=0.04)
    plt.xticks(range(len(ticks)), ticks, rotation=45, ha="right")
    plt.yticks(range(len(ticks)), ticks)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(path, dpi=150)
    plt.close()

heatmap(centroid_cos_df.values, classes, "Centroid Cosine Similarity", os.path.join(OUTPUT_DIR, "centroid_cosine_heatmap.png"))
heatmap(centroid_euc_df.values, classes, "Centroid Euclidean Distance", os.path.join(OUTPUT_DIR, "centroid_euclidean_heatmap.png"))

# ------------------ 5) Intra/Inter 분포(히스토그램) ------------------
# Intra: 같은 클래스 내 pairwise cosine
for c in classes:
    idx = np.where(labels == c)[0]
    if len(idx) < 2:
        continue
    sub = X[idx]
    S = cosine_similarity(sub)
    # 상삼각(대각 제외)만 취해 분포 생성
    triu = np.triu_indices_from(S, k=1)
    vals = S[triu]
    plt.figure(figsize=(6,4))
    plt.hist(vals, bins=30)
    plt.title(f"Intra-class Cosine Distribution: {c}")
    plt.xlabel("cosine")
    plt.ylabel("count")
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, f"intra_cosine_hist_{c.replace(' ','_')}.png"), dpi=150)
    plt.close()

# Inter: 서로 다른 클래스 간 pairwise cosine
for a, b in itertools.combinations(classes, 2):
    Sa = X[labels == a]; Sb = X[labels == b]
    S = cosine_similarity(Sa, Sb).ravel()
    plt.figure(figsize=(6,4))
    plt.hist(S, bins=30)
    plt.title(f"Inter-class Cosine Distribution: {a} vs {b}")
    plt.xlabel("cosine")
    plt.ylabel("count")
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, f"inter_cosine_hist_{a.replace(' ','_')}__{b.replace(' ','_')}.png"), dpi=150)
    plt.close()

# ------------------ 6) 샘플↔자기 centroid 거리 분포 ------------------
# (1 - cosine(sample, class_centroid)) 히스토그램 + 박스
for c in classes:
    Xi = X[labels == c]
    ci = centroids[classes.index(c)].reshape(1,-1)
    d = 1 - cosine_similarity(Xi, ci).ravel()
    # 히스토그램
    plt.figure(figsize=(6,4))
    plt.hist(d, bins=30)
    plt.title(f"Distance to Own Centroid (1 - cosine): {c}")
    plt.xlabel("1 - cosine")
    plt.ylabel("count")
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, f"dist_to_centroid_hist_{c.replace(' ','_')}.png"), dpi=150)
    plt.close()
    # 박스플롯
    plt.figure(figsize=(4,5))
    plt.boxplot(d, vert=True, labels=[c])
    plt.ylabel("1 - cosine")
    plt.title(f"Box: Dist to Own Centroid")
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, f"dist_to_centroid_box_{c.replace(' ','_')}.png"), dpi=150)
    plt.close()

# ------------------ 7) Silhouette (cosine) 분포 ------------------
le = LabelEncoder(); y = le.fit_transform(labels)
sil = silhouette_score(X, y, metric="cosine")
print("Silhouette (cosine):", round(sil, 3))
s_each = silhouette_samples(X, y, metric="cosine")
plt.figure(figsize=(6,4))
plt.hist(s_each, bins=30)
plt.title("Silhouette Score Distribution (cosine)")
plt.xlabel("silhouette score")
plt.ylabel("count")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "silhouette_hist.png"), dpi=150)
plt.close()

# ------------------ 8) 2D 임베딩 산점도 (TruncatedSVD) ------------------
svd = TruncatedSVD(n_components=2, random_state=42)
X2 = svd.fit_transform(X)
print("SVD explained variance ratio (2D sum):", svd.explained_variance_ratio_.sum().round(3))

plt.figure(figsize=(6,5))
for c in classes:
    m = (labels == c)
    plt.scatter(X2[m,0], X2[m,1], s=18, alpha=0.85, label=c)
plt.legend(loc="best", fontsize=8)
plt.title("TruncatedSVD (TF-IDF word+char, 2D)")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "svd_tfidf_scatter.png"), dpi=150)
plt.close()

# ------------------ 9) Centroid 덴드로그램 ------------------
# 코사인 거리 = 1 - 코사인 유사도
D = 1 - centroid_cos_df.values
np.fill_diagonal(D, 0.0)
condensed = squareform(D, checks=False)
Z = linkage(condensed, method="average")
plt.figure(figsize=(6,4))
dendrogram(Z, labels=classes, orientation="top")
plt.title("Dendrogram of Class Centroids (cosine distance)")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "centroid_dendrogram.png"), dpi=150)
plt.close()

# ------------------ 10) (옵션) 샘플링 pairwise cosine 히트맵 ------------------
S_PER_CLS = 25  # 클래스당 최대 샘플 수
idx_all = []
for c in classes:
    ids = np.where(labels == c)[0].tolist()
    np.random.shuffle(ids)
    idx_all.extend(ids[:S_PER_CLS])
X_samp = X[idx_all]; labs_samp = labels[idx_all]
S = cosine_similarity(X_samp)

plt.figure(figsize=(7,6))
im = plt.imshow(S, interpolation="nearest")
plt.colorbar(im, fraction=0.046, pad=0.04)
ticks = range(len(idx_all))
plt.xticks(ticks, labs_samp, rotation=90)
plt.yticks(ticks, labs_samp)
plt.title(f"Pairwise Cosine (sampled up to {S_PER_CLS}/class)")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "pairwise_cosine_sampled_heatmap.png"), dpi=150)
plt.close()

# ------------------ 11) 간단 분류 성능(참고) ------------------
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
clf = LogisticRegression(max_iter=2000, solver="liblinear", multi_class="ovr")
clf.fit(X_tr, y_tr)
from sklearn.metrics import classification_report
print("\n=== Logistic Regression (OVR) on TF-IDF ===")
print(classification_report(y_te, clf.predict(X_te), target_names=classes))

print(f"\n완료! 결과 이미지/CSV는 '{OUTPUT_DIR}/' 폴더에 저장되었습니다.")


=== 클래스별 샘플 수 ===
predicate
activity notifications    499
confirmshaming            298
pressured selling         140
trick questions            62
Name: count, dtype: int64 



  plt.boxplot(d, vert=True, labels=[c])
  plt.boxplot(d, vert=True, labels=[c])
  plt.boxplot(d, vert=True, labels=[c])
  plt.boxplot(d, vert=True, labels=[c])


Silhouette (cosine): 0.062
SVD explained variance ratio (2D sum): 0.076

=== Logistic Regression (OVR) on TF-IDF ===
                        precision    recall  f1-score   support

activity notifications       0.89      1.00      0.94       100
        confirmshaming       0.98      1.00      0.99        60
     pressured selling       1.00      0.71      0.83        28
       trick questions       1.00      0.58      0.74        12

              accuracy                           0.94       200
             macro avg       0.97      0.82      0.88       200
          weighted avg       0.94      0.94      0.93       200


완료! 결과 이미지/CSV는 'dp_dist/' 폴더에 저장되었습니다.




In [13]:
# ============================================
# SBERT 기반 4개 predicate 유사도/분포 분석 (올인원)
# 대상: confirmshaming / trick questions / pressured selling / activity notifications
# - SBERT 임베딩(all-MiniLM-L6-v2, 384D, fast)
# - Centroid 코사인 유사도/거리, Intra/Inter 코사인 분포
# - Silhouette(코사인) 지표, 2D 임베딩(UMAP→SVD fallback)
# - 대표 문장(centroid 근접 Top-K) 추출
# - 샘플링 pairwise cosine 히트맵
# 산출물: sb_out/ 폴더에 PNG/CSV/TXT 저장
# ============================================

import os, re, numpy as np, pandas as pd, matplotlib.pyplot as plt
from collections import defaultdict

from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

# ---------- 경로/모델 설정 ----------
FILE_PATH = "/Users/soyoung/404DNF_AI/data/processed/template_predicate.csv"   # 필요 시 절대경로로 변경: /mnt/data/paraphrase_predicate.csv
OUTPUT_DIR = "sb_out"; os.makedirs(OUTPUT_DIR, exist_ok=True)

TARGET_PREDICATES = [
    "confirmshaming",
    "trick questions",
    "pressured selling",
    "activity notifications",
]

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# 다국어 문장 섞여 있으면:
# MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

BATCH_SIZE = 64
TOPK_REPR = 20
SAMPLE_PER_CLASS_FOR_HEATMAP = 25

# ---------- 1) 데이터 로드/필터 ----------
df = pd.read_csv(FILE_PATH)
assert {"String","predicate"} <= set(df.columns), "CSV에 String, predicate 컬럼이 필요합니다."
df = df.dropna(subset=["String","predicate"]).copy()
df["predicate"] = df["predicate"].str.lower().str.strip()

mask = df["predicate"].isin([p.lower() for p in TARGET_PREDICATES])
data = df.loc[mask].reset_index(drop=True)
if data.empty:
    raise ValueError("선택한 predicate가 데이터에 없습니다.")

print("=== 클래스별 샘플 수 ===")
print(data["predicate"].value_counts(), "\n")

texts = data["String"].astype(str).tolist()
labels = data["predicate"].values
classes = sorted(pd.unique(labels))

# ---------- 2) SBERT 임베딩 ----------
print("Embedding with SBERT:", MODEL_NAME)
model = SentenceTransformer(MODEL_NAME)
emb = model.encode(
    texts,
    batch_size=BATCH_SIZE,
    normalize_embeddings=True,   # L2 정규화 -> 코사인 = 내적
    convert_to_numpy=True,
    show_progress_bar=True
)  # shape: [N, D], 각 벡터는 단위 벡터

# ---------- 3) 클래스 centroid 및 유사도 ----------
centroids = []
for c in classes:
    ci = emb[labels == c].mean(axis=0)
    # 평균 후 다시 정규화 (단위 벡터로)
    norm = np.linalg.norm(ci) + 1e-12
    centroids.append(ci / norm)
centroids = np.vstack(centroids)  # [C, D]

# centroid 코사인/거리
centroid_cos = centroids @ centroids.T  # 단위벡터 -> 코사인 = 내적
centroid_dist = 1 - centroid_cos

centroid_cos_df = pd.DataFrame(centroid_cos, index=classes, columns=classes)
centroid_cos_df.to_csv(os.path.join(OUTPUT_DIR, "centroid_cosine.csv"))
pd.DataFrame(centroid_dist, index=classes, columns=classes).to_csv(
    os.path.join(OUTPUT_DIR, "centroid_cosine_distance.csv")
)

def heatmap(mat, ticks, title, path):
    plt.figure(figsize=(6,5))
    im = plt.imshow(mat, interpolation="nearest")
    plt.colorbar(im, fraction=0.046, pad=0.04)
    plt.xticks(range(len(ticks)), ticks, rotation=45, ha="right")
    plt.yticks(range(len(ticks)), ticks)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(path, dpi=150); plt.close()

heatmap(centroid_cos, classes, "Centroid Cosine Similarity (SBERT)", os.path.join(OUTPUT_DIR, "centroid_cosine_heatmap.png"))
heatmap(centroid_dist, classes, "Centroid Cosine Distance (1-cos)", os.path.join(OUTPUT_DIR, "centroid_distance_heatmap.png"))

print("=== (요약) SBERT 기반 centroid 코사인 유사도 ===")
print(centroid_cos_df.round(3), "\n")

# ---------- 4) Intra/Inter 코사인 분포 ----------
# Intra: 같은 클래스 내 문장-문장 코사인 분포 (히스토그램)
for c in classes:
    idx = np.where(labels == c)[0]
    if len(idx) < 2: 
        continue
    S = cosine_similarity(emb[idx])
    iu = np.triu_indices_from(S, k=1)
    vals = S[iu]
    plt.figure(figsize=(6,4))
    plt.hist(vals, bins=30)
    plt.title(f"Intra-class Cosine (SBERT): {c}")
    plt.xlabel("cosine"); plt.ylabel("count")
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, f"intra_cosine_hist_{c.replace(' ','_')}.png"), dpi=150)
    plt.close()

# Inter: 서로 다른 클래스 간 문장-문장 코사인 분포 (히스토그램)
import itertools
for a, b in itertools.combinations(classes, 2):
    Sa = emb[labels == a]; Sb = emb[labels == b]
    sims = cosine_similarity(Sa, Sb).ravel()
    plt.figure(figsize=(6,4))
    plt.hist(sims, bins=30)
    plt.title(f"Inter-class Cosine (SBERT): {a} vs {b}")
    plt.xlabel("cosine"); plt.ylabel("count")
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, f"inter_cosine_hist_{a.replace(' ','_')}__{b.replace(' ','_')}.png"), dpi=150)
    plt.close()

# ---------- 5) Silhouette (cosine) 전반 분리도 ----------
le = LabelEncoder(); y = le.fit_transform(labels)
sil = silhouette_score(emb, y, metric="cosine")
print("Silhouette (cosine):", round(sil, 3), "(~0.2↑ 꽤 분리, 0.4↑ 매우 잘 분리)\n")

# 분포 그림
s_each = silhouette_samples(emb, y, metric="cosine")
plt.figure(figsize=(6,4))
plt.hist(s_each, bins=30)
plt.title("Silhouette Score Distribution (cosine, SBERT)")
plt.xlabel("silhouette score"); plt.ylabel("count")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "silhouette_hist.png"), dpi=150)
plt.close()

# ---------- 6) 2D 임베딩 산점도 (UMAP → SVD fallback) ----------
def scatter_2d(X2, labels_str, title, path):
    plt.figure(figsize=(6,5))
    for c in classes:
        m = (labels_str == c)
        plt.scatter(X2[m,0], X2[m,1], s=18, alpha=0.85, label=c)
    plt.legend(loc="best", fontsize=8)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(path, dpi=150); plt.close()

try:
    import umap
    reducer = umap.UMAP(n_components=2, random_state=42, metric="cosine")
    X2 = reducer.fit_transform(emb)
    scatter_2d(X2, labels, "UMAP (SBERT, cosine)", os.path.join(OUTPUT_DIR, "umap_scatter.png"))
    print("UMAP 2D 산점도 생성 완료.")
except Exception as e:
    print("UMAP 사용 불가(fallback to TruncatedSVD):", e)
    svd = TruncatedSVD(n_components=2, random_state=42)
    X2 = svd.fit_transform(emb)  # 단위벡터지만 SVD로 투영
    scatter_2d(X2, labels, "TruncatedSVD (SBERT)", os.path.join(OUTPUT_DIR, "svd_scatter.png"))
    print("SVD 2D 산점도 생성 완료.")

# ---------- 7) 샘플링 pairwise cosine 히트맵 ----------
np.random.seed(42)
idx_all = []
for c in classes:
    ids = np.where(labels == c)[0].tolist()
    np.random.shuffle(ids)
    idx_all.extend(ids[:SAMPLE_PER_CLASS_FOR_HEATMAP])
emb_s = emb[idx_all]; labs_s = labels[idx_all]
S = cosine_similarity(emb_s)

plt.figure(figsize=(7,6))
im = plt.imshow(S, interpolation="nearest")
plt.colorbar(im, fraction=0.046, pad=0.04)
ticks = range(len(idx_all))
plt.xticks(ticks, labs_s, rotation=90)
plt.yticks(ticks, labs_s)
plt.title(f"Pairwise Cosine (SBERT, sampled up to {SAMPLE_PER_CLASS_FOR_HEATMAP}/class)")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "pairwise_cosine_sampled_heatmap.png"), dpi=150)
plt.close()

# ---------- 8) 클래스별 대표 문장(centroid 근접 Top-K) ----------
with open(os.path.join(OUTPUT_DIR, "representative_examples.txt"), "w", encoding="utf-8") as f:
    for i, c in enumerate(classes):
        sim_to_c = emb[labels==c] @ centroids[i]  # 단위벡터 내적 = 코사인
        idx_local = np.argsort(-sim_to_c)[:TOPK_REPR]
        ex_texts = data.loc[labels==c, "String"].iloc[idx_local].tolist()
        f.write(f"[{c}] centroid-near examples (Top {TOPK_REPR})\n")
        for t in ex_texts:
            f.write("- " + str(t).replace("\n"," ") + "\n")
        f.write("\n")

print(f"완료! 결과는 '{OUTPUT_DIR}/' 폴더에 저장되었습니다.")
print("- centroid_cosine.csv / centroid_*_heatmap.png / centroid_cosine_distance.csv")
print("- intra_cosine_hist_*.png / inter_cosine_hist_*__*.png")
print("- silhouette_hist.png")
print("- umap_scatter.png (또는 svd_scatter.png)")
print("- pairwise_cosine_sampled_heatmap.png")
print("- representative_examples.txt")


=== 클래스별 샘플 수 ===
predicate
activity notifications    499
confirmshaming            298
pressured selling         140
trick questions            62
Name: count, dtype: int64 

Embedding with SBERT: sentence-transformers/all-MiniLM-L6-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

=== (요약) SBERT 기반 centroid 코사인 유사도 ===
                        activity notifications  confirmshaming  \
activity notifications                   1.000           0.312   
confirmshaming                           0.312           1.000   
pressured selling                        0.586           0.591   
trick questions                          0.497           0.552   

                        pressured selling  trick questions  
activity notifications              0.586            0.497  
confirmshaming                      0.591            0.552  
pressured selling                   1.000            0.836  
trick questions                     0.836            1.000   

Silhouette (cosine): 0.15 (~0.2↑ 꽤 분리, 0.4↑ 매우 잘 분리)

UMAP 사용 불가(fallback to TruncatedSVD): Numba needs NumPy 2.2 or less. Got NumPy 2.3.
SVD 2D 산점도 생성 완료.
완료! 결과는 'sb_out/' 폴더에 저장되었습니다.
- centroid_cosine.csv / centroid_*_heatmap.png / centroid_cosine_distance.csv
- intra_cosine_hist_*.png / inter_cosine_hist_*__*.png
- s