In [67]:
from numpy import random
import numpy as np
import pandas as pd

# 不用dataframe，只用ndarray
# 计算速度：ndarray > Series > list > DataFrame

pd.reset_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
feature = pd.read_excel(r'./input/feature.xls')
col = feature.columns[10:-1]
feature_use = feature[col]
k = 4

In [68]:
# 生成一个随机矩阵
a = random.randint(1, 100, size=(3, 4))


In [69]:
feature_use = feature_use.to_numpy()

In [70]:
print(type(feature_use))
feature_use

<class 'numpy.ndarray'>


array([[ 4.9880e+01,  1.9300e+00,  1.3900e+00, ..., -2.8448e-01,
        -4.1390e-01,  3.7501e-01],
       [ 4.9710e+01,  2.6800e+00,  1.6400e+00, ...,  6.8212e-01,
        -5.4530e-01,  1.1233e-01],
       [ 5.0530e+01,  9.3600e+00,  3.0600e+00, ...,  9.0050e-02,
        -8.8356e-01,  2.6500e-02],
       ...,
       [ 2.0240e+01,  1.2620e+01,  3.5500e+00, ..., -5.5231e-01,
        -3.6670e-01,  2.3189e-01],
       [ 1.8030e+01,  4.1600e+00,  2.0400e+00, ..., -5.5231e-01,
        -3.6670e-01,  2.3189e-01],
       [ 2.9180e+01,  2.6570e+01,  5.1500e+00, ..., -5.5231e-01,
        -3.6670e-01,  2.3189e-01]])

In [71]:
feature_use.shape[0], feature_use.shape[1]

(6747, 121)

In [72]:
# # 特征总数，即列数
# feature_num = feature_use.shape[1]
# # 声音片段总数，即行数
# voice_num = feature_use.shape[0]

In [73]:
a = np.array([1, 2, 3])
b = np.array([3, 4, 7])
(a - b) ** 2

array([ 4,  4, 16], dtype=int32)

In [74]:
# 计算两点的欧氏距离
def Euclidean_distance(x, y):
    distance = np.sqrt(np.sum((x - y) ** 2))
    return distance

In [75]:
# 聚类取平均值，用于计算更新聚类点
def cluster_mean(cluster):
    return np.mean(cluster, axis=0)

In [76]:
# 找到初始聚类点
def init_centroid(dataset):
    # 特征总数，即列数
    feature_num = dataset.shape[1]
    # 声音片段总数，即行数
    voice_num = dataset.shape[0]

    # 第一个聚类点随机选取
    # 之后每个聚类点取到之前距离最大的点
    centroid = np.zeros((4, feature_num))

    index = int(np.random.uniform(0, voice_num))
    centroid[0, :] = dataset[index, :]
    centroid_0 = centroid[0, :]

    centroid_1 = centroid_0
    centroid_2 = centroid_0
    centroid_3 = centroid_0
    max_distance = 0

    for i in range(voice_num):
        dataset_i = dataset[i, :]
        distance = Euclidean_distance(centroid_0, dataset_i)
        if distance > max_distance:
            centroid_1 = dataset_i
            max_distance = distance

    centroid[1, :] = centroid_1
    max_distance = 0

    for i in range(voice_num):
        dataset_i = dataset[i, :]
        distance = Euclidean_distance(centroid_0, dataset_i) + Euclidean_distance(centroid_1, dataset_i)
        if distance > max_distance:
            centroid_2 = dataset_i
            max_distance = distance

    centroid[2, :] = centroid_2
    max_distance = 0

    for i in range(voice_num):
        dataset_i = dataset[i, :]
        distance = Euclidean_distance(centroid_0, dataset_i) + Euclidean_distance(centroid_1,
                                                                                  dataset_i) + Euclidean_distance(
            centroid_2, dataset_i)
        if distance > max_distance:
            centroid_3 = dataset_i
            max_distance = distance

    centroid[3, :] = centroid_3

    return centroid

In [77]:
def Kmeans(dataset):
    # 特征总数，即列数
    feature_num = dataset.shape[1]
    # 声音片段总数，即行数
    voice_num = dataset.shape[0]

    # 初始化一个矩阵来储存每个点的簇分配结果
    # clusterAssment包含两列：1列记录簇索引值，2列存储当前点到簇质心的距离（用来评价聚类的效果）
    # 全部置为-1，因为有0类点
    cluster_assessment = np.ones((voice_num, 2))
    cluster_assessment = -cluster_assessment
    cluster_change = True

    centroid = init_centroid(dataset)

    # 初始化标志变量，用于判断迭代是否继续，如果True，则迭代继续
    while cluster_change:
        cluster_change = False

        # 遍历所有行
        for i in range(voice_num):
            min_distance = 10000
            # 距离最近的聚类点的索引（0,1,2,3)
            min_index = -1

            # 遍历寻找距离每个点最近的质心
            for j in range(k):
                distance = Euclidean_distance(centroid[j, :], dataset[i, :])
                # 如果距离小于minDist，更新minDist和index索引值
                if distance < min_distance:
                    min_distance = distance
                    min_index = j

            # 如果有任意一点的簇分配结果改变，则更新cluster_change为True
            if cluster_assessment[i, 0] != min_index:
                cluster_change = True
                cluster_assessment[i, :] = min_index, min_distance

        # 遍历所有质心，更新取值
        for i in range(k):
            # 获取相同簇质心的所有点，取均值更新簇质心
            same_cluster_index = (cluster_assessment[:, 0] == i)
            points_same_cluster = dataset[same_cluster_index]
            centroid[i, :] = cluster_mean(points_same_cluster)

    print("Cluster complete!")
    return centroid, cluster_assessment


In [78]:
feature_test = feature_use[:20]
centroids, clusterAssment = Kmeans(feature_test)

print(centroids)
print(clusterAssment)

Cluster complete!
[[ 5.94000000e+01  2.25416667e+01  4.36666667e+00  7.07483333e+01
   5.42600000e+01  5.61333333e+01  5.90816667e+01  6.21583333e+01
   6.50650000e+01 -1.08050000e+01  1.21983333e+01  1.79016667e+01
   3.59666667e+00  2.43816667e+01  8.23000000e+00  9.58666667e+00
   1.15066667e+01  1.40250000e+01  1.71833333e+01 -8.95333333e+00
   1.87833333e+00  1.10000000e-01  3.16666667e-01  2.86000000e+00
   1.53833333e+00  1.65333333e+00  1.84500000e+00  2.07166667e+00
   2.30000000e+00 -7.61666667e-01  2.54833333e+00  6.16666667e-02
   2.36666667e-01  3.25833333e+00  2.25000000e+00  2.39666667e+00
   2.54500000e+00  2.70333333e+00  2.86333333e+00 -6.13333333e-01
   1.50000000e-02  0.00000000e+00  1.33333333e-02  8.83333333e-02
   5.00000000e-03  6.66666667e-03  8.33333333e-03  1.66666667e-02
   2.83333333e-02 -2.33333333e-02  7.50000000e-02  3.33333333e-03
   6.50000000e-02  3.88333333e-01  8.33333333e-03  2.66666667e-02
   6.50000000e-02  1.05000000e-01  1.63333333e-01 -1.55000

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(


In [79]:
# # 主成分分析
# def pca(data):
#     mean = cluster_mean(data)
#     # np.tile()默认复制x轴，如果有两个参数如(2,1)，则前者为复制y轴，后者为复制x轴



In [80]:
help(np.tile)

Help on function tile in module numpy:

tile(A, reps)
    Construct an array by repeating A the number of times given by reps.
    
    If `reps` has length ``d``, the result will have dimension of
    ``max(d, A.ndim)``.
    
    If ``A.ndim < d``, `A` is promoted to be d-dimensional by prepending new
    axes. So a shape (3,) array is promoted to (1, 3) for 2-D replication,
    or shape (1, 1, 3) for 3-D replication. If this is not the desired
    behavior, promote `A` to d-dimensions manually before calling this
    function.
    
    If ``A.ndim > d``, `reps` is promoted to `A`.ndim by pre-pending 1's to it.
    Thus for an `A` of shape (2, 3, 4, 5), a `reps` of (2, 2) is treated as
    (1, 1, 2, 2).
    
    Note : Although tile may be used for broadcasting, it is strongly
    recommended to use numpy's broadcasting operations and functions.
    
    Parameters
    ----------
    A : array_like
        The input array.
    reps : array_like
        The number of repetitions of `A`

In [81]:
type(np.zeros((3, 1)))

numpy.ndarray

In [82]:
a = np.zeros((2, 3))
print(type(a))
print(a)
b = np.mat(np.zeros((2, 3)))
print((type(b)))
print(b)


<class 'numpy.ndarray'>
[[0. 0. 0.]
 [0. 0. 0.]]
<class 'numpy.matrix'>
[[0. 0. 0.]
 [0. 0. 0.]]


In [83]:
a = np.ones((3, 2))
a[0, 0] = -1
a = -a
print(a)
a[:, 0] == 1

[[ 1. -1.]
 [-1. -1.]
 [-1. -1.]]


array([ True, False, False])

In [91]:
mean_points = cluster_mean(feature_use)
m = feature_use.shape[0]
points_distance_to_center = np.zeros(feature_use.shape[0])

for i in range(m):
    temp_dist = Euclidean_distance(feature_use[i], mean_points)
    points_distance_to_center[i] = temp_dist

points_distance_to_center = pd.DataFrame(points_distance_to_center)
points_distance_to_center.describe()

points_distance_to_center.to_csv(r"./output/points_distance_to_center.csv")

In [94]:
a = np.ones((3, 4))
a[0, 0] = 0
a[2, 2] = 4
print(a)
a = np.delete(a, 1, axis=0)
a

[[0. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 4. 1.]]


array([[0., 1., 1., 1.],
       [1., 1., 4., 1.]])

In [110]:
a = feature.columns[10:-1]
b = feature.columns[1]
a = np.insert(a, 0, b)
name = feature[b]
# name = np.delete(name,[1])
name.values

array(['1-L-1.wav', '1-L-2.wav', '2-L-1.wav', ..., '702-L-7.wav',
       '702-L-8.wav', '702-L-9.wav'], dtype=object)