In [2]:
from torch_geometric.datasets import Planetoid
import torch
from torch_geometric.data import HeteroData
from torch_geometric.nn import MetaPath2Vec
import torch_geometric.transforms as T
import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm import trange
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [1]:
def vector_distances(v, x):
    """
    Euclidean distance from one vector to every row of a matrix.

    Parameters:
    v (array-like): reference vector, shape (n,)
    x (array-like): sample matrix, shape (m, n); each row is one sample vector

    Returns:
    distances (array-like): shape (m,); entry i is the distance between
        ``v`` and row i of ``x``
    """

    offsets = x - v  # v is broadcast against every row of x
    # Add up the squared per-coordinate offsets within each row, then take the root.
    row_totals = np.sum(np.square(offsets), axis=1)
    return np.sqrt(row_totals)

In [None]:
def Silhouette(data, labels, centers):
    '''Compute the mean silhouette coefficient of a clustering.

    For each sample i, a(i) is its mean Euclidean distance to the *other*
    members of its own cluster and b(i) is the smallest mean distance to the
    members of any other cluster.  The sample's silhouette value is
    (b(i) - a(i)) / max(a(i), b(i)); a sample that is alone in its cluster
    scores 0.  The function returns the average over all scored samples.

    Parameters
    -------
    - data : ndarray
        Full data set, shape (n_samples, n_features).
    - labels : ndarray
        1-D label table with one entry per sample giving that sample's
        cluster id.  Ids are expected to be 0 .. len(centers) - 1; samples
        carrying any other id are ignored.
    - centers : ndarray
        Cluster centres.  Only their count is used, to enumerate cluster ids.

    Returns
    -------
    float
        Mean silhouette coefficient, between -1 and 1.

    Raises
    ------
    ValueError
        If ``data`` is not 2-D, if ``labels`` and ``data`` disagree on the
        number of samples, or if fewer than two clusters contain samples.

    Notes
    -----
    Every sample is compared with every other sample, so the running time
    grows quadratically with the number of samples.
    '''

    data = np.asarray(data, dtype=float)
    labels = np.asarray(labels).ravel()
    if data.ndim != 2:
        raise ValueError("data must be a 2-D array of shape (n_samples, n_features)")
    if labels.shape[0] != data.shape[0]:
        raise ValueError("labels must contain exactly one entry per row of data")

    # Slice the data once per cluster instead of once per sample.
    members = {cluster: data[labels == cluster] for cluster in range(len(centers))}
    populated = [cluster for cluster, rows in members.items() if len(rows) > 0]
    if len(populated) < 2:
        raise ValueError("the silhouette coefficient needs at least 2 non-empty clusters")

    scores = []
    for cluster in populated:
        homo_data = members[cluster]  # samples sharing this cluster id
        hetero_data = [members[other] for other in populated if other != cluster]
        size = len(homo_data)
        for s in homo_data:
            if size == 1:
                # By convention a singleton cluster contributes a score of 0.
                scores.append(0.0)
                continue
            # The distance from s to itself is 0, so dividing the sum by
            # (size - 1) yields the mean over the *other* members only.
            a = np.linalg.norm(homo_data - s, axis=1).sum() / (size - 1)
            # Mean distance to the nearest neighbouring cluster.
            b = min(np.linalg.norm(rows - s, axis=1).mean() for rows in hetero_data)
            scale = max(a, b)
            # scale == 0 means every compared point coincides with s.
            scores.append((b - a) / scale if scale > 0 else 0.0)

    return float(np.mean(scores))

In [74]:
# Synthetic benchmark data: 170000 samples x 30 features, uniform on [0, 1).
# A seeded generator is used so that a fresh kernel reproduces the same matrix.
data = np.random.default_rng(42).random((170000, 30))

In [75]:
# Cluster the samples with k-means.
k = 5
# n_init=10 runs the algorithm from 10 different centroid seeds and keeps the
# best run; random_state pins those seeds so the fit is reproducible.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
kmeans.fit(data)

In [76]:
# Pull the fitted cluster centres and the per-sample cluster ids off the estimator.
centers, labels = kmeans.cluster_centers_, kmeans.labels_

In [12]:
# Show the coordinates of the first cluster centre.
centers[0]

array([0.50477903, 0.511962  , 0.49100136, 0.51025126, 0.49314494,
       0.5053705 , 0.50325588, 0.5002258 , 0.49971705, 0.49978261,
       0.49382889, 0.4881789 , 0.48440911, 0.49702173, 0.49927294,
       0.50480186, 0.5157705 , 0.50249172, 0.54990527, 0.49395949,
       0.50321589, 0.22671222, 0.48617976, 0.49148147, 0.49010772,
       0.50046862, 0.50062196, 0.49562734, 0.27815578, 0.33247882])

In [38]:
# Row positions of every sample whose cluster label is not 1.
indices = np.where(labels != 1)[0]

In [39]:
# NOTE(review): despite its name, c1 holds the samples that are NOT labelled 1
# (indices is built from `labels != 1`).
c1 = data[indices]

In [40]:
# Check how many rows were selected.
c1.shape

(136404, 30)

In [23]:
# Distance from the first cluster centre to every row of c1.
distance = vector_distances(centers[0], c1)

In [27]:
# Average of the distances held in `distance`.
distance.mean()

1.6509786851144894

In [32]:
# Peek at the first selected sample.
c1[0]

array([0.42099324, 0.04934486, 0.74219446, 0.04108663, 0.95841704,
       0.96061358, 0.03587082, 0.16117396, 0.14344126, 0.28826802,
       0.98629476, 0.07415029, 0.87999125, 0.05442717, 0.34257853,
       0.46207655, 0.16307596, 0.11289187, 0.22432044, 0.37797177,
       0.21192205, 0.77724597, 0.9271301 , 0.39756361, 0.23843577,
       0.87947782, 0.25024118, 0.05653372, 0.64692756, 0.14535374])

In [77]:
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score

# Score the clustering with two metrics that need only the data and the assigned labels.
ch_score = calinski_harabasz_score(data, labels)
db_score = davies_bouldin_score(data, labels)

In [78]:
# Display the Davies-Bouldin score.
db_score

5.065332311664916

In [79]:
# Display the Calinski-Harabasz score.
ch_score

2790.6033000996727