In [None]:
# --- Movie embeddings: export from the leakage-safe training graph, then L2-normalize ---
# Unit-length rows make dot products equal to cosine similarity downstream.
z = torch.nn.functional.normalize(
    export_embeddings(
        model,
        splits['train_graph'],
        primary_ntype='movie',
        layers=2,
        batch_size=4096,
    ),
    p=2,
    dim=1,
)  # [N, d]

In [None]:
def knn_graph_cosine(z, k=30, chunk=8192):
    """
    Exact cosine k-nearest-neighbour search, computed in row chunks.

    Args:
      z:     [N, d] float tensor of embeddings. Rows are re-normalized to unit
             L2 norm here, so already-normalized input is fine.
      k:     neighbours per node. Clamped to N - 1 because a node is never
             returned as its own neighbour.
      chunk: number of query rows per similarity block (each block is chunk x N).

    Returns (indices, sims), both on CPU, where:
      indices: [N, k] int64 neighbor IDs (excluding self)
      sims:    [N, k] cosine sims corresponding to indices, descending per row
    """
    # Neighbour search needs no autograd history; detaching also keeps the
    # returned tensors convertible with .numpy().
    z = torch.nn.functional.normalize(z.detach(), p=2, dim=1)
    N = z.shape[0]
    k = max(0, min(int(k), N - 1))
    if N == 0:
        # torch.vstack([]) would raise; return well-formed empty results instead.
        return torch.empty((0, 0), dtype=torch.long), torch.empty((0, 0), dtype=z.dtype)

    all_idx = []
    all_sim = []
    for start in range(0, N, chunk):
        end = min(start + chunk, N)
        sims = z[start:end] @ z.T                          # [B, N]
        # Mask ONLY the self-similarity (the diagonal of this block). Masking the
        # whole start:end column range would forbid neighbours inside the same chunk.
        rows = torch.arange(end - start, device=sims.device)
        sims[rows, rows + start] = float('-inf')
        vals, idx = torch.topk(sims, k=k, dim=1)           # [B, k]
        all_idx.append(idx.cpu())
        all_sim.append(vals.cpu())
    return torch.vstack(all_idx), torch.vstack(all_sim)

# Neighbour table for the movie embeddings; k between 15 and 50 is typical.
knn_idx, knn_sim = knn_graph_cosine(
    z,
    k=30,
)


In [None]:
# pip install hdbscan (once)
import numpy as np
try:
    import hdbscan
except ImportError:
    raise RuntimeError("Please `pip install hdbscan` to use HDBSCAN.")

def cluster_hdbscan(z, min_cluster_size=15, min_samples=None, metric='euclidean'):
    """
    Cluster embeddings with HDBSCAN.

    Args:
      z:                embeddings, either a torch.Tensor (any device, with or
                        without grad) or an array-like that HDBSCAN accepts.
      min_cluster_size: forwarded to hdbscan.HDBSCAN.
      min_samples:      forwarded to hdbscan.HDBSCAN.
      metric:           forwarded to hdbscan.HDBSCAN.

    Returns (labels, probs, clf):
      labels: per-point cluster labels from fit_predict; -1 are noise points
      probs:  clf.probabilities_ after fitting
      clf:    the fitted hdbscan.HDBSCAN object
    """
    # HDBSCAN works in distance space; use euclidean on normalized z (cosine≈euclid on unit vectors)
    # detach().cpu() first: Tensor.numpy() raises for CUDA tensors and for tensors that require grad.
    Z = z.detach().cpu().numpy() if isinstance(z, torch.Tensor) else z
    clf = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples, metric=metric)
    labels = clf.fit_predict(Z)    # -1 are noise points
    probs = clf.probabilities_
    return labels, probs, clf

# Density-based clustering of the normalized embeddings; the fitted model is kept as `hdb`.
labels_hdb, probs_hdb, hdb = cluster_hdbscan(
    z,
    min_cluster_size=20,
    min_samples=10,
)


In [None]:
# pip install igraph leidenalg
import igraph as ig
import leidenalg as la

def leiden_from_knn(knn_idx, knn_sim=None, resolution=1.0, weighted=True):
    """
    Build an undirected graph from kNN edges and run Leiden.

    Args:
      knn_idx:    [N, k] integer array; row i lists the neighbours of node i.
      knn_sim:    optional [N, k] array of similarities aligned with knn_idx.
      resolution: passed to Leiden as resolution_parameter.
      weighted:   if True and knn_sim is given, edges carry similarity weights.

    Each directed pair (i -> j) is collapsed to one undirected edge; when both
    directions are present the larger similarity is used as the weight.
    Self-pairs (i -> i) are dropped so a kNN table that includes self does not
    add self-loops to the graph.

    Returns (membership, g, part): the per-node community ids as a numpy array,
    the igraph Graph, and the partition object returned by leidenalg.
    """
    knn_idx = np.asarray(knn_idx)
    N, k = knn_idx.shape

    src = np.repeat(np.arange(N), k)
    dst = knn_idx.reshape(-1)
    keep = src != dst                                   # discard self-pairs
    # Order each pair as (low, high) so i->j and j->i become the same row.
    pairs = np.sort(np.stack([src[keep], dst[keep]], axis=1), axis=1)
    edge_pairs, inverse = np.unique(pairs, axis=0, return_inverse=True)
    g = ig.Graph(n=N, edges=edge_pairs.tolist(), directed=False)

    weights = None
    if weighted and knn_sim is not None:
        sims = np.asarray(knn_sim, dtype=np.float64).reshape(-1)[keep]
        # Scatter-max: for every unique edge keep the largest similarity among
        # the directed pairs that map onto it (replaces a Python dict loop).
        best = np.full(len(edge_pairs), -np.inf)
        # reshape(-1): some NumPy releases return a non-flat inverse when axis is given.
        np.maximum.at(best, inverse.reshape(-1), sims)
        weights = best.tolist()

    part = la.find_partition(g, la.RBConfigurationVertexPartition, weights=weights, resolution_parameter=resolution)
    return np.array(part.membership), g, part

# Graph-based clustering on the similarity-weighted kNN graph.
labels_lei, g_lei, part_lei = leiden_from_knn(
    knn_idx=knn_idx.numpy(),
    knn_sim=knn_sim.numpy(),
    resolution=1.0,
    weighted=True,
)


In [None]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

def intrinsic_metrics(z, labels, max_points=20000):
    """
    Label-free cluster quality scores on the embeddings.

    Args:
      z:          [N, d] embeddings (torch.Tensor on any device, or numpy array).
      labels:     length-N cluster labels (array, list or tensor); -1 marks noise
                  and is excluded from all scores.
      max_points: if more non-noise points exist, a fixed-seed random subsample
                  of this size is scored instead.

    Returns a dict with 'silhouette', 'calinski_harabasz' and 'davies_bouldin'.
    All three are NaN when the scored points do not contain a usable number of
    clusters (fewer than 2, or as many clusters as points).
    """
    undefined = {'silhouette': np.nan, 'calinski_harabasz': np.nan, 'davies_bouldin': np.nan}

    # detach().cpu(): Tensor.numpy() raises for CUDA tensors and tensors requiring grad.
    z_np = z.detach().cpu().numpy() if isinstance(z, torch.Tensor) else z
    # A plain list compared with -1 yields a single bool, not a mask; force an array.
    labels = np.asarray(labels)

    use = np.flatnonzero(labels != -1)
    if len(use) < 2 or len(np.unique(labels[use])) < 2:
        return undefined
    # downsample for speed
    if len(use) > max_points:
        use = np.random.RandomState(0).choice(use, size=max_points, replace=False)

    X, y = z_np[use], labels[use]
    # Re-check on the points actually scored: silhouette_score needs between
    # 2 and n_samples - 1 distinct labels and raises otherwise.
    n_labels = len(np.unique(y))
    if n_labels < 2 or n_labels >= len(y):
        return undefined

    s = silhouette_score(X, y, metric='euclidean')
    ch = calinski_harabasz_score(X, y)
    db = davies_bouldin_score(X, y)
    return {'silhouette': float(s), 'calinski_harabasz': float(ch), 'davies_bouldin': float(db)}

# Score both partitions on the same embeddings, then report them side by side.
m_in_hdb, m_in_lei = (
    intrinsic_metrics(z, cluster_labels)
    for cluster_labels in (labels_hdb, labels_lei)
)
print("HDBSCAN intrinsic:", m_in_hdb)
print("Leiden   intrinsic:", m_in_lei)


In [None]:
def cluster_purity_movie_director(labels, data, primary_ntype='movie'):
    """
    Director purity of a clustering of the primary node type.

    For each cluster, every (node, director) pair of its members is counted and
    the most frequent director is the cluster's majority. Purity of a cluster is
    majority_count / total_pairs.

    Args:
      labels:        per-node cluster ids, indexed by node id; -1 (noise) is skipped.
      data:          graph container indexable by node type (must expose
                     .num_nodes) and by the edge type
                     (primary_ntype, 'to', 'director') (must expose .edge_index
                     as a [2, E] tensor).
      primary_ntype: node type being clustered; also selects the source side of
                     the '-> director' edge type.

    Returns a dict:
      micro_purity: sum of majority counts / sum of pair counts over clusters
                    that have at least one director (NaN if there are none)
      macro_purity: mean of per-cluster purities; a cluster with no director
                    edges contributes 0.0
      n_clusters:   number of non-noise clusters
    """
    from collections import Counter, defaultdict

    # Use primary_ntype for the edge type too, so the node count and the edges
    # always refer to the same node type.
    ei = data[(primary_ntype, 'to', 'director')].edge_index  # [2, E]
    m, d = ei[0].cpu().numpy(), ei[1].cpu().numpy()
    N = data[primary_ntype].num_nodes
    labels = np.asarray(labels)

    # directors attached to each primary node
    dirs_by_node = defaultdict(list)
    for mi, di in zip(m, d):
        dirs_by_node[int(mi)].append(int(di))

    # members of each non-noise cluster
    cluster_to_nodes = defaultdict(list)
    for mi in range(N):
        c = int(labels[mi])
        if c != -1:
            cluster_to_nodes[c].append(mi)
    if not cluster_to_nodes:
        return {'micro_purity': np.nan, 'macro_purity': np.nan, 'n_clusters': 0}

    purities = []
    total = 0
    correct = 0
    for members in cluster_to_nodes.values():
        # count directors across all members of the cluster
        cnt = Counter()
        for mi in members:
            cnt.update(dirs_by_node.get(mi, []))
        if not cnt:
            # no director information at all: scored as impure for the macro average
            purities.append(0.0)
            continue
        maj_count = cnt.most_common(1)[0][1]
        # total assignments = number of (node, director) pairs in cluster
        cluster_total = sum(cnt.values())
        purities.append(maj_count / cluster_total)
        total += cluster_total
        correct += maj_count

    micro = correct / total if total > 0 else np.nan
    macro = float(np.mean(purities)) if purities else np.nan
    return {'micro_purity': micro, 'macro_purity': macro, 'n_clusters': len(cluster_to_nodes)}

# Director purity for each partition, computed before either result is printed.
pur_hdb, pur_lei = (
    cluster_purity_movie_director(cluster_labels, data, primary_ntype='movie')
    for cluster_labels in (labels_hdb, labels_lei)
)
print("HDBSCAN movie→director purity:", pur_hdb)
print("Leiden   movie→director purity:", pur_lei)


In [None]:
# pip install scikit-learn (usually present)
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

def stability_ari_nmi(z, cluster_fn, n_runs=3):
    """
    Agreement between repeated clusterings, as mean pairwise ARI and NMI.

    Args:
      z:          unused by this function; kept for call-site compatibility.
      cluster_fn: zero-argument callable returning a sequence whose first item
                  is the label vector of one clustering run.
      n_runs:     number of runs. Before run s, the global numpy and torch RNGs
                  are seeded with 42 + s (this changes global RNG state).

    Returns {'ARI_mean': float, 'NMI_mean': float}. With fewer than two runs
    there is no pair to compare and both values are NaN.
    """
    from itertools import combinations

    parts = []
    for s in range(n_runs):
        np.random.seed(42 + s)
        torch.manual_seed(42 + s)
        labels, *_ = cluster_fn()
        parts.append(np.asarray(labels))

    # pairwise ARI/NMI
    pairs = list(combinations(parts, 2))
    if not pairs:
        # np.mean([]) would emit a RuntimeWarning; state the undefined result explicitly.
        return {'ARI_mean': float('nan'), 'NMI_mean': float('nan')}
    aris = [adjusted_rand_score(a, b) for a, b in pairs]
    nmis = [normalized_mutual_info_score(a, b) for a, b in pairs]
    return {'ARI_mean': float(np.mean(aris)), 'NMI_mean': float(np.mean(nmis))}

# HDBSCAN stability: each run draws min_samples at random from [5, 15) with min_cluster_size fixed at 20
stab_hdb = stability_ari_nmi(z, lambda: cluster_hdbscan(z, 20, np.random.randint(5, 15)), n_runs=3)
print("HDBSCAN stability:", stab_hdb)


In [None]:
# pip install umap-learn
import umap
def umap_embed(z, n_neighbors=30, min_dist=0.1, random_state=42):
    """
    Project embeddings with UMAP for visualization.

    Args:
      z:            embeddings, either a torch.Tensor (any device, with or
                    without grad) or an array-like that UMAP accepts.
      n_neighbors:  forwarded to umap.UMAP.
      min_dist:     forwarded to umap.UMAP.
      random_state: forwarded to umap.UMAP.

    Returns the result of UMAP.fit_transform on the embeddings.
    """
    # detach().cpu() first: Tensor.numpy() raises for CUDA tensors and for tensors that require grad.
    Z = z.detach().cpu().numpy() if isinstance(z, torch.Tensor) else z
    reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, random_state=random_state)
    return reducer.fit_transform(Z)  # [N, 2]

# 2-D UMAP layout of the movie embeddings
um = umap_embed(
    z,
    n_neighbors=30,
    min_dist=0.05,
)

# Scatter the layout, one colour per HDBSCAN label (swap in labels_lei to view Leiden instead)
import matplotlib.pyplot as plt
plt.figure(figsize=(7, 6))
plt.scatter(
    um[:, 0],
    um[:, 1],
    c=labels_hdb,
    s=5,
    cmap='tab20',
    alpha=0.9,
)
plt.title("UMAP of movie embeddings — HDBSCAN clusters")
plt.show()


In [None]:
# Step 1: leakage-safe embedding export, L2-normalized in the same expression
z = torch.nn.functional.normalize(
    export_embeddings(model, splits['train_graph'], primary_ntype='movie', layers=2, batch_size=4096),
    p=2,
    dim=1,
)

# Step 2: cosine k-NN graph
knn_idx, knn_sim = knn_graph_cosine(z, k=30)

# Step 3: two clusterings, density-based (HDBSCAN) and graph-based (Leiden)
labels_hdb, probs_hdb, _ = cluster_hdbscan(
    z,
    min_cluster_size=20,
    min_samples=10,
)
labels_lei, g_lei, part_lei = leiden_from_knn(
    knn_idx.numpy(),
    knn_sim.numpy(),
    resolution=1.0,
    weighted=True,
)

# Step 4: intrinsic scores, then movie→director purity
print("Intrinsic (HDBSCAN):", intrinsic_metrics(z, labels_hdb))
print("Intrinsic (Leiden):  ", intrinsic_metrics(z, labels_lei))
print("Purity (movie→director, HDBSCAN):", cluster_purity_movie_director(labels_hdb, data))
print("Purity (movie→director, Leiden): ", cluster_purity_movie_director(labels_lei, data))

# Step 5 (optional): HDBSCAN stability under a randomly drawn min_samples
stab_hdb = stability_ari_nmi(
    z,
    lambda: cluster_hdbscan(z, 20, np.random.randint(5, 15)),
    n_runs=3,
)
print("HDBSCAN stability:", stab_hdb)

# Step 6 (optional): UMAP scatter coloured by HDBSCAN label
um = umap_embed(z, n_neighbors=30, min_dist=0.05)
plt.figure(figsize=(7, 6))
plt.scatter(
    um[:, 0],
    um[:, 1],
    c=labels_hdb,
    s=5,
    cmap='tab20',
    alpha=0.9,
)
plt.title("UMAP of movie embeddings — HDBSCAN clusters")
plt.show()
