任务2总体思路

特征：
E0..E9（语义嵌入 10 维） + 7 个音频特征（0–1 归一）
支持数值块/嵌入块的加权（避免某一块支配）

降维（可选但推荐）：用 PCA 把 17 维特征降到 10 维，降噪、便于聚类

聚类：KMeans，在候选 K 中用轮廓系数选出最优 K（肘部法可作为辅助参考）。

检索：给定歌曲 id → 找到其簇 C → 只在簇内做相似度排序（余弦），取 Top-K。
这样主题一致性更强、效率也更高。

训练脚本（聚类模型产出 + 工件保存）

In [15]:
import os

# Cap the native thread pools used by joblib/loky and OpenMP.
# These variables are set BEFORE numpy / scikit-learn are imported on purpose:
# native runtimes may read OMP_NUM_THREADS only once, when they are first
# loaded, so assigning it after the imports can silently have no effect.
# NOTE(review): in a notebook whose kernel has already imported numpy, restart
# the kernel for the cap to apply.
os.environ["LOKY_MAX_CPU_COUNT"] = "4"
os.environ["OMP_NUM_THREADS"] = "4"

from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Step 1: load the preprocessed catalogue produced by Task 1.
CSV = Path(r"C:\Users\Akari\OneDrive\Desktop\SEM 1\AML\Final CW\datas\spotify_preprocess.csv")
df = pd.read_csv(CSV)

# 2️⃣ 拆分 Embedding_10d 为 10 列
def split_embed(s, dim=10):
    """Parse a comma-separated embedding string into a fixed-length float list.

    Args:
        s: Cell value holding the embedding, e.g. ``"0.1,0.2,0.3"``. Values
            that are not strings are converted with ``str`` before parsing.
        dim: Length of the returned vector. Defaults to 10.

    Returns:
        A list of exactly ``dim`` floats. Inputs with fewer than ``dim``
        values are right-padded with ``0.0``; longer inputs are truncated.
        A missing value (``None`` or a float NaN) yields an all-zero vector.

    Raises:
        ValueError: If a non-blank token is not a valid float literal.
    """
    # A missing cell would otherwise be stringified ("nan" / "None") and either
    # inject a NaN into the feature matrix or raise; treat it as "no embedding".
    # `s != s` is true only for NaN.
    if s is None or (isinstance(s, float) and s != s):
        return [0.0] * dim

    # Skip empty and whitespace-only tokens (e.g. trailing commas, ", ,").
    parts = [float(tok) for tok in str(s).split(",") if tok.strip()]
    parts += [0.0] * (dim - len(parts))  # no-op when already long enough
    return parts[:dim]

# Step 2: parse every Embedding_10d cell and stack the parsed rows into one
# float matrix with 10 columns, then expose each column as E0..E9.
embeds = np.vstack(df['Embedding_10d'].apply(split_embed))
embed_cols = [f'E{i}' for i in range(10)]
for name, column in zip(embed_cols, embeds.T):
    df[name] = column

# Step 3 (optional): the raw string column is no longer needed.
del df['Embedding_10d']

# Step 4: df now holds only scalar columns and can feed PCA / KMeans.
print("✅ 预处理完成，可直接用于聚类")
print(df.head(3))

# Column groups that make up the clustering feature vector.
EMB_COLS = [f"E{i}" for i in range(10)]
NUM_COLS = ['Danceability','Energy','Valence','Loudness','Speechiness','Acousticness','Instrumentalness']

# --- Per-block weights (tunable) ---
# Each block is multiplied by its weight before the two are concatenated.
W_EMB = 1.0
W_NUM = 1.0
emb_block = df[EMB_COLS].values * W_EMB
num_block = df[NUM_COLS].values * W_NUM
X = np.hstack([emb_block, num_block])

# --- PCA (optional) ---
# When disabled, `pca` is None and the raw weighted matrix is used as-is.
USE_PCA = True
PCA_DIM = 10
pca = PCA(n_components=PCA_DIM, random_state=42) if USE_PCA else None
Xp = X if pca is None else pca.fit_transform(X)

# --- Choose K: coarse scan scored by the silhouette coefficient ---
candidates = [6, 8, 10, 12, 14]
scores = {}
for n_clusters in candidates:
    trial_labels = KMeans(n_clusters=n_clusters, n_init=20, random_state=42).fit_predict(Xp)
    scores[n_clusters] = silhouette_score(Xp, trial_labels)

# Highest silhouette wins; on a tie the earliest candidate is kept.
best_k = max(candidates, key=scores.__getitem__)
print("Silhouette by K:", scores, "→ best_k =", best_k)

# --- Refit with the selected K (more restarts than during the scan) ---
kmeans = KMeans(n_clusters=best_k, n_init=50, random_state=42)
labels = kmeans.fit_predict(Xp)

# --- Persist artifacts ---
out_dir = Path(r"C:\Users\Akari\OneDrive\Desktop\SEM 1\AML\Final CW\datas\artifacts_task2")
# parents=True so the export also works when an intermediate folder is missing.
out_dir.mkdir(parents=True, exist_ok=True)

# Catalogue with one extra "cluster" column holding the final KMeans label.
df_out = df.copy()
df_out["cluster"] = labels
df_out.to_csv(out_dir / "clustered_catalog.csv", index=False)

# Everything needed to rebuild the feature space at serving time: the fitted
# models plus the weights and column lists used to assemble X.
joblib.dump(
    {
        "kmeans": kmeans,
        "pca": pca,
        "W_EMB": W_EMB,
        "W_NUM": W_NUM,
        "EMB_COLS": EMB_COLS,
        "NUM_COLS": NUM_COLS,
    },
    out_dir / "cluster_model.pkl",
)
print("Saved:", out_dir)


✅ 预处理完成，可直接用于聚类
                       id                  Title  Danceability    Energy  \
0  003vvx7Niy0yvhvHt4a68B         Mr. Brightside      0.216321  0.906459   
1  00Ga884hbpVvCNyeQdle1U       Violet Chemistry      0.591969  0.688196   
2  02FaKXXL7KUtRc7K0k54tL  Cozy Little Christmas      0.762953  0.569042   

    Valence  Loudness  Speechiness  Acousticness  Instrumentalness      E0  \
0  0.211477  0.999725     0.084691      0.001006               0.0  0.1148   
1  0.585547  0.746225     0.026059      0.003018               0.0  0.0688   
2  0.555792  0.721656     0.171010      0.128773               0.0  0.0579   

       E1      E2      E3      E4      E5      E6      E7      E8      E9  
0  0.0401  0.1039  0.0970 -0.0332 -0.0449  0.2025  0.1399 -0.2235 -0.1483  
1  0.0480  0.0838  0.0890 -0.0333 -0.0059  0.2176  0.1298 -0.2488 -0.1638  
2  0.0626  0.0661  0.0605 -0.0436 -0.0274  0.1789  0.1501 -0.2526 -0.1877  




Silhouette by K: {6: 0.18599577377188012, 8: 0.1606890969199683, 10: 0.15231824195291613, 12: 0.15040373195291148, 14: 0.14468193768355264} → best_k = 6




Saved: C:\Users\Akari\OneDrive\Desktop\SEM 1\AML\Final CW\datas\artifacts_task2


在线推荐函数（簇内 Top-K）

In [21]:
import pandas as pd, numpy as np, joblib
from sklearn.metrics.pairwise import cosine_similarity

# Load the clustered catalogue and the model bundle (presumably the files
# written by the training cell; the path differs only in letter case).
# NOTE: joblib.load unpickles its input, so only load files you trust.
df = pd.read_csv(r"C:\Users\Akari\OneDrive\Desktop\SEM 1\AML\Final CW\Datas\artifacts_task2\clustered_catalog.csv")
model = joblib.load(r"C:\Users\Akari\OneDrive\Desktop\SEM 1\AML\Final CW\Datas\artifacts_task2\cluster_model.pkl")

# Unpack the bundle into the module-level names used by the helpers below.
kmeans, pca = model["kmeans"], model["pca"]
EMB_COLS = model["EMB_COLS"]
NUM_COLS = model["NUM_COLS"]
W_EMB = model["W_EMB"]
W_NUM = model["W_NUM"]

def _build_features(subdf):
    """Return the feature matrix for the rows of ``subdf``.

    The embedding columns and the numeric columns are scaled by ``W_EMB`` and
    ``W_NUM`` respectively and concatenated column-wise. If a PCA model was
    loaded, the result is projected with it; otherwise it is returned as-is.
    """
    weighted_blocks = [
        subdf[EMB_COLS].values * W_EMB,
        subdf[NUM_COLS].values * W_NUM,
    ]
    X = np.hstack(weighted_blocks)
    if pca is None:
        return X
    return pca.transform(X)

# Precompute the feature matrix for the whole catalogue (could be cached).
Xp = _build_features(df)

# Map each song id to its row position; for a repeated id the last row wins.
ID2IDX = dict(zip(df["id"], range(len(df))))

def _artist_tokens(value):
    """Return the set of whitespace-separated tokens of one Artists cell."""
    # A missing cell must not become the literal token "nan", otherwise every
    # pair of songs without artist information would look like the same artist.
    if pd.isna(value):
        return set()
    # split() without an argument never yields empty tokens, so repeated
    # spaces cannot create a spurious "" match between unrelated artists.
    # NOTE(review): tokens are single words, so different artists sharing a
    # word still match; split on the real delimiter once the Artists format
    # is confirmed.
    return set(str(value).split())


def recommend_cluster(song_id, k=10, diversify=False):
    """Recommend the songs most similar to ``song_id`` within its own cluster.

    Candidates are the other rows of ``df`` carrying the same ``cluster``
    label as the query; they are ranked by cosine similarity between their
    rows of ``Xp`` and the query's row.

    Args:
        song_id: Value of the ``id`` column identifying the query song.
        k: Maximum number of recommendations. Values below 1 give an empty
            result; values above the number of candidates are clamped.
        diversify: If true and ``df`` has an ``Artists`` column, the score of
            every candidate sharing an artist token with the query is
            multiplied by 0.9 and the list is re-ranked.

    Returns:
        A DataFrame with columns ``rank``, ``id``, ``Title``, ``score``,
        ordered by descending score. The index is reset only when the
        diversification step actually runs.

    Raises:
        ValueError: If ``song_id`` is not present in ``ID2IDX``.
    """
    if song_id not in ID2IDX:
        raise ValueError(f"id '{song_id}' 不存在")
    q = ID2IDX[song_id]

    # ID2IDX stores row positions, so use positional access throughout rather
    # than relying on df's index labels being 0..n-1.
    clusters = df["cluster"].values
    c = clusters[q]

    # Score the whole cluster (query included, so the matrix is never empty),
    # then drop the query itself.
    idxs = np.flatnonzero(clusters == c)
    sims = cosine_similarity(Xp[q:q + 1], Xp[idxs]).ravel()
    keep = idxs != q
    idxs, sims = idxs[keep], sims[keep]

    # Clamp k to [0, n_candidates]. Slicing the descending order with [:k]
    # avoids the `[-0:]` trap, where k == 0 would select every candidate.
    k = max(0, min(k, len(idxs)))
    order = np.argsort(sims)[::-1][:k]
    idx_top, sim_top = idxs[order], sims[order]

    out = df.iloc[idx_top][["id", "Title"]].copy()
    out["score"] = sim_top
    out.insert(0, "rank", np.arange(1, len(out) + 1))

    if diversify and "Artists" in df.columns:
        # Simple same-artist down-weighting.
        artists = df["Artists"]
        artists_q = _artist_tokens(artists.iloc[q])
        penalty = np.array(
            [0.9 if artists_q & _artist_tokens(artists.iloc[i]) else 1.0 for i in idx_top],
            dtype=float,
        )
        out["score"] = out["score"] * penalty
        out = out.sort_values("score", ascending=False).reset_index(drop=True)
        out["rank"] = np.arange(1, len(out) + 1)

    return out

# Usage: print up to 10 in-cluster recommendations for one song id, with the
# diversify option switched on.
print(recommend_cluster("003vvx7Niy0yvhvHt4a68B", k=10, diversify=True))


     rank                      id                                   Title  \
48      1  0QfZ8OHFnFzLe66iFBww2U                              Lux Æterna   
367     2  2nLtzopw4rPReszdYBJU6h                                    Numb   
83      3  0kAZ3H6G9Zac4PMpmobMkj  If This Was A Movie (Taylor's Version)   
651     4  4qzEjmuz380jeiBJp31oDY                                Chemical   
545     5  456WNXWhDwYOSf5SpTuqxd                       Dog Days Are Over   
105     6  0rc1HCVoReqzzXF9jssqZk                     Special (feat. SZA)   
750     7  5TOgxgZrIZzvaKg9r2bvc2                                Chemical   
787     8  5w40ZYhbBMAlHYNDaVJIUu                                Chemical   
800     9  60a0Rd6pjrkxjPbaKzXjfq                              In the End   
736    10  5NzfnUyVaNrxa2VtoyBWlR                       Tennessee Numbers   

        score  
48   0.963233  
367  0.934037  
83   0.924499  
651  0.921819  
545  0.913799  
105  0.912106  
750  0.909570  
787  0.908711  
800  0.9

读取 cluster_model.pkl 和 clustered_catalog.csv
打印出模型里有哪些对象、KMeans/PCA 关键信息、簇大小分布、每簇示例，以及一个快速的 silhouette score 复核

In [32]:
import os
import joblib
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score

# === Step 1: artifact locations ===
ART_DIR = r"C:\Users\Akari\OneDrive\Desktop\SEM 1\AML\Final CW\Datas\artifacts_task2"
model_path = os.path.join(ART_DIR, "cluster_model.pkl")
catalog_path = os.path.join(ART_DIR, "clustered_catalog.csv")

# Echo both paths before touching the disk so a failed load is easy to trace.
print("加载模型文件:", model_path)
print("加载数据文件:", catalog_path)

# === Step 2: load the model bundle and the clustered catalogue ===
model = joblib.load(model_path)
df = pd.read_csv(catalog_path)

# === Step 3: unpack the bundle ===
# .get() is used so a missing key falls back to a default instead of raising.
kmeans, pca = model.get("kmeans"), model.get("pca")
W_EMB, W_NUM = model.get("W_EMB", 1.0), model.get("W_NUM", 1.0)
EMB_COLS, NUM_COLS = model.get("EMB_COLS", []), model.get("NUM_COLS", [])

# Model summary: cluster count, whether PCA is present, weights and columns.
print("\n=== 模型概要 ===")
n_clusters = getattr(kmeans, "n_clusters", "未知")
uses_pca = pca is not None
print("KMeans 聚类数:", n_clusters)
print("使用 PCA:", uses_pca)
print("嵌入权重:", W_EMB, " 数值特征权重:", W_NUM)
print("嵌入列:", EMB_COLS)
print("数值列:", NUM_COLS)

# === Step 4: cluster size distribution ===
# The rest of the report needs the label column, so fail early without it.
if "cluster" not in df.columns:
    raise ValueError("数据中没有 'cluster' 列，请确认文件是否正确。")

print("\n=== 簇大小分布 ===")
counts = df["cluster"].value_counts().sort_index()
for cluster_id, size in counts.items():
    print(f"Cluster {cluster_id}: {size} 首歌")

# === Step 5: first five songs of every cluster ===
# Only the identifying columns that actually exist in df are shown.
show_cols = [col for col in ("id", "Title") if col in df.columns]
print("\n=== 每簇示例（前 5 首） ===")
for cluster_id in counts.index:
    members = df.loc[df["cluster"] == cluster_id, show_cols].head(5)
    print(f"\n[Cluster {cluster_id}]")
    print(members.to_string(index=False))

# === Step 6: recompute the silhouette score as a sanity check ===
# Best-effort: any failure is reported and the cell still completes.
try:
    blocks = [df[EMB_COLS].values * W_EMB, df[NUM_COLS].values * W_NUM]
    X = np.hstack(blocks)
    # Project with the stored PCA, if there is one, before scoring.
    X = X if pca is None else pca.transform(X)
    labels = df["cluster"].values
    sil = silhouette_score(X, labels)
    print(f"\n模型整体轮廓系数 (silhouette score): {sil:.4f}")
except Exception as err:
    print("\n无法计算 silhouette：", err)


加载模型文件: C:\Users\Akari\OneDrive\Desktop\SEM 1\AML\Final CW\Datas\artifacts_task2\cluster_model.pkl
加载数据文件: C:\Users\Akari\OneDrive\Desktop\SEM 1\AML\Final CW\Datas\artifacts_task2\clustered_catalog.csv

=== 模型概要 ===
KMeans 聚类数: 6
使用 PCA: True
嵌入权重: 1.0  数值特征权重: 1.0
嵌入列: ['E0', 'E1', 'E2', 'E3', 'E4', 'E5', 'E6', 'E7', 'E8', 'E9']
数值列: ['Danceability', 'Energy', 'Valence', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness']

=== 簇大小分布 ===
Cluster 0: 110 首歌
Cluster 1: 92 首歌
Cluster 2: 244 首歌
Cluster 3: 115 首歌
Cluster 4: 308 首歌
Cluster 5: 137 首歌

=== 每簇示例（前 5 首） ===

[Cluster 0]
                    id                 Title
02FaKXXL7KUtRc7K0k54tL Cozy Little Christmas
0EgLxY52mpGsXETyEsgVlP                  HOPE
0FirgnvrpCkkhdaq64Gfen                YANKEE
0T2pB7P1VdXPhLdQZ488uH                Normal
0WoVfPjxsf9MTvPt9ptqeE           SLUT ME OUT

[Cluster 1]
                    id                                               Title
0B7wvvmu9EISAwZnOpjhNI                          