In [80]:
from torch_geometric.datasets import Planetoid
import torch
from torch_geometric.data import HeteroData
from torch_geometric.nn import MetaPath2Vec
import torch_geometric.transforms as T
import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm import trange
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [1]:
def vector_distances(v, x):
    """
    计算向量v与矩阵X中每个样本向量之间的距离
    
    参数:
    v (array-like): 输入向量，形状为(n,)
    x (array-like): 输入矩阵，形状为(m, n)，每行表示一个样本向量
    
    返回:
    distances (array-like): 距离数组，形状为(m,)，表示向量v与矩阵X中每个样本向量之间的距离
    """
    
    diff = x - v  # 计算差值
    squared_diff = diff ** 2  # 计算平方差值
    sum_squared_diff = np.sum(squared_diff, axis=1)  # 沿着列方向求和
    distances = np.sqrt(sum_squared_diff)  # 开方
    return distances

In [None]:
def Silhouette(data, labels, centers):
    '''计算轮廓系数
    
    参数 Parameters
    -------
    - data : ndarray
        完整数据
    - labels : ndarray
        一维的，标签表，长度等于样本数，指示对应样本的类
    - centers : ndarray
        各个聚类中心点
    '''
    
    kinds = list(range(len(centers)))  # 聚类数表
    s_i = []
    for cluster in kinds:
        homo_data = data[np.where(labels == cluster)[0]]  # 同类别数据
        hetero_index = [x for x in kinds if x != cluster]  # 非同类别数据类型表
        n = 0
        average_a = 0
        average_b = 0
        for s in homo_data:
            a = 0 if n == 0 else average_a
            a_new = vector_distances(s, homo_data).mean()
            average_a = n * (n + 1) * a + (1 / n) * a_new  # 增量更新a_i
            n += 1
        for n_kind in hetero_index:
            for s in homo_data:
                b = 0 if n == 0 else average_b

In [81]:
data = np.random.rand(170000, 30)

In [82]:
# 调用k均值聚类方法
k = 5
kmeans = KMeans(n_clusters=k, n_init=10)
kmeans.fit(data)

In [83]:
centers = kmeans.cluster_centers_
labels = kmeans.labels_

In [90]:
def calculate_CH_score(data, labels):
    """
    计算CH系数
    Args:
    - data: 输入数据，PyTorch张量形式，每一行表示一个样本
    - labels: 数据对应的簇标签，PyTorch张量形式，每个元素表示对应样本的簇标签
    Returns:
    - ch_score: CH系数
    """

    n_samples = data.shape[0]
    n_clusters = labels.max() + 1

    centroids = torch.zeros(n_clusters, data.shape[1], dtype=data.dtype, device='cuda')

    for i in range(n_clusters):
        cluster_data = data[labels == i]
        centroids[i] = torch.mean(cluster_data, dim=0)

    total_mean = torch.mean(data, dim=0)

    # 计算类内离散度
    within_cluster_dispersion = torch.zeros(n_clusters, dtype=data.dtype, device=data.device)
    for i in range(n_clusters):
        cluster_data = data[labels == i]
        if len(cluster_data) > 1:
            within_cluster_dispersion[i] = torch.sum(torch.pow(cluster_data - centroids[i], 2)) / (len(cluster_data) - 1)

    # 计算类间离散度
    between_cluster_dispersion = torch.sum(torch.pow(centroids - total_mean, 2), dim=1) * (len(data) - n_clusters) / (n_clusters - 1)

    # 计算CH系数
    ch_score = torch.mean(between_cluster_dispersion / within_cluster_dispersion)

    return ch_score.item()

In [92]:
ch_score = calculate_CH_score(torch.tensor(data), labels)

In [93]:
ch_score

2797.4879365110532