In [1]:
import random
from collections import defaultdict
from math import sqrt
from random import uniform

import numpy as np
import pandas as pd
from pandas.core.frame import DataFrame

In [2]:
def point_avg(points):
    """Return the centroid of ``points`` as a list of per-dimension means.

    The number of dimensions is read from the first point, so ``points`` must
    be non-empty and every point must be indexable up to that length.
    """
    n_dims = len(points[0])
    n_points = float(len(points))

    # Average each coordinate independently across all points.
    return [
        sum(pt[axis] for pt in points) / n_points
        for axis in range(n_dims)
    ]

In [3]:
# Recompute each cluster's center as the per-dimension mean of its points.
def update_centers(data_set, assignments):
    """Return the new cluster centers for the given point-to-cluster labels.

    Args:
        data_set: sequence of points (each an indexable sequence of numbers).
        assignments: one cluster label per point, paired with ``data_set``
            positionally.

    Returns:
        A list of centers (each a list of per-dimension means), ordered by
        ascending cluster label.
    """
    members = defaultdict(list)
    for label, point in zip(assignments, data_set):
        members[label].append(point)

    # Emit centers in ascending label order rather than in the order labels
    # happen to appear in the data. When every label 0..k-1 is in use this
    # keeps centers[i] tied to label i, so cluster ids are not renumbered from
    # one iteration to the next.
    # NOTE: a label that received no points has no entry here, so the result
    # can be shorter than the number of clusters originally requested.
    return [point_avg(members[label]) for label in sorted(members)]

In [4]:
def assign_points(data_points, centers):
    """Label every point with the index of its closest center.

    Closeness is measured with ``distance``. Ties go to the lowest index
    because only a strictly smaller distance replaces the current best. A
    point for which no center beats the initial infinite distance (for
    example when ``centers`` is empty) is labelled 0.

    Returns:
        A list with one center index per point, in the order of
        ``data_points``.
    """
    labels = []
    for pt in data_points:
        best_idx, best_dist = 0, float("inf")
        for idx, center in enumerate(centers):
            candidate = distance(pt, center)
            if candidate < best_dist:
                best_idx, best_dist = idx, candidate
        labels.append(best_idx)
    return labels

In [5]:
# Compute the distance between two points.
def distance(a, b):
    """Return the Euclidean distance between points ``a`` and ``b``.

    Only the first ``len(a)`` coordinates are compared, so ``b`` must have at
    least as many entries as ``a``.
    """
    squared_gaps = ((a[axis] - b[axis]) ** 2 for axis in range(len(a)))
    return sqrt(sum(squared_gaps))

In [6]:
# Pick k initial centroids.
def generate_k(data_set, k):
    """Draw ``k`` random starting centers inside the data's bounding box.

    For every dimension (the count is taken from the first point) the minimum
    and maximum coordinate over ``data_set`` are found, and each center
    coordinate is sampled with ``uniform`` between those two values.

    Returns:
        A list of ``k`` centers, each a list with one float per dimension.
    """
    n_dims = len(data_set[0])

    # (lowest, highest) coordinate seen in each dimension.
    bounds = [
        (min(pt[axis] for pt in data_set), max(pt[axis] for pt in data_set))
        for axis in range(n_dims)
    ]

    # One uniform draw per dimension, center by center.
    return [
        [uniform(low, high) for low, high in bounds]
        for _ in range(k)
    ]

In [7]:
def k_means(dataset, k):
    """Cluster ``dataset`` into at most ``k`` groups and return the labels.

    Starts from centers produced by ``generate_k``, then alternates between
    recomputing centers (``update_centers``) and reassigning points
    (``assign_points``) until two consecutive assignments are equal. The
    initial centers and the final assignments are printed.

    Returns:
        A list with one cluster index per point of ``dataset``.
    """
    seeds = generate_k(dataset, k)
    print("初始点：\n", seeds)

    labels = assign_points(dataset, seeds)
    while True:
        # Move the centers to the cluster means, then relabel every point.
        refreshed = assign_points(dataset, update_centers(dataset, labels))
        converged = refreshed == labels
        labels = refreshed
        if converged:
            break

    print("迭代终止后数据属于的簇：\n", labels)
    return labels

In [8]:
# Fixed seed so the random initial centroids, and therefore the clustering
# result, are the same on every Restart & Run All.
RANDOM_SEED = 42

# header=None: the file has no header row, so the first line is data.
# Columns: sepal length, sepal width, petal length, petal width, class.
iris_df = pd.read_csv("../iris.data", header=None)

data = iris_df.to_numpy()
# Take 10 rows from each of three row ranges, 30 rows in total.
train = np.vstack((data[0:10, :], data[50:60, :], data[100:110, :]))
# Slicing past the end of a short file silently yields fewer rows; fail loudly.
assert train.shape[0] == 30, "expected 30 training rows; is ../iris.data complete?"

X = train[:, 0:4]  # data: the four measurement columns
print("训练数据：\n", X)
Y = train[:, 4]  # target: the class column
print("训练数据：\n", Y)

# generate_k draws from the global `random` generator, so seed it right
# before clustering.
random.seed(RANDOM_SEED)
assign = k_means(X, 3)

print('\n')
# Original columns side by side with the cluster index assigned to each row.
result_columns = {
    "[sepal length]": train[:, 0],
    "[sepal width]": train[:, 1],
    "[petal length]": train[:, 2],
    "[petal width]": train[:, 3],
    "[Class]": train[:, 4],
    "[聚类结果]": assign,
}

results = DataFrame(result_columns)
print(results)

训练数据：
 [[5.1 3.5 1.4 0.2]
 [4.9 3.0 1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.0 3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.0 3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [7.0 3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.9 1.5]
 [5.5 2.3 4.0 1.3]
 [6.5 2.8 4.6 1.5]
 [5.7 2.8 4.5 1.3]
 [6.3 3.3 4.7 1.6]
 [4.9 2.4 3.3 1.0]
 [6.6 2.9 4.6 1.3]
 [5.2 2.7 3.9 1.4]
 [6.3 3.3 6.0 2.5]
 [5.8 2.7 5.1 1.9]
 [7.1 3.0 5.9 2.1]
 [6.3 2.9 5.6 1.8]
 [6.5 3.0 5.8 2.2]
 [7.6 3.0 6.6 2.1]
 [4.9 2.5 4.5 1.7]
 [7.3 2.9 6.3 1.8]
 [6.7 2.5 5.8 1.8]
 [7.2 3.6 6.1 2.5]]
训练数据：
 ['Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iri