# K-means 算法实现

In [1]:
# 导入库
from sklearn import datasets
import numpy as np
import random
from sklearn.model_selection import train_test_split

In [2]:
#加载数据并显示
iris = datasets.load_iris()
[length, width] = iris.data.shape
print('iris dataset length:%d, width:%d' % (length,width))

iris dataset length:150, width:4


In [3]:
#定义Initial_Center类包含五种初始化方法
class Initial_Center():
    #初始化函数记录用于类别个数k
    #Random_Center对应随机选取数据中心的方法 整个数据集作为输入参数
    #Empircal_Center 对应随机选取数据中心的方法 以指定的数据中心作为输入参数
    #Random_MeanCenter 对应随机将数据分为K类取这K类数据的均值向量作为初始数据中心的方法 整个数据集作为输入参数
    #FarDistance_Center 选择批次距离尽可能远的点作为初始数据中心 整个数据集作为输入参数
    #Dense_Center 对应密度法求初始数据中心 
    #输入整个数据集， r表示计算密度的半径 Dt是用于选择高密度集的阈值
    #经过对数据的计算分析， iris数据集r取0.4，Dt取6较合适
    def __init__(self,k):
        self.n_class = k
        
    def Random_Center(self,dataset):
        [length, width] = dataset.shape
        indices = random.sample(range(0, length),self.n_class)
        self.initial_center  = dataset[indices]
      
    
    def Empirical_Center(self,Centers):
        self.initial_center = Centers
    
    def Random_MeanCenter(self,dataset):
        [length, width] = dataset.shape
        length = length - length % self.n_class
        
        indices = random.sample(range(0, length),length)
        self.initial_center  = np.mean(dataset[indices].reshape((self.n_class,-1,width)), 1)
        
    
    def FarDistance_Center(self,dataset):
        [length, width] = dataset.shape
        self.initial_center = np.zeros((self.n_class,width))
        self.indices = random.sample(range(0, length),self.n_class)
        self.indices[0] = np.random.randint(0,length)
        self.initial_center[0] = dataset[self.indices[0]]
        self.distance = np.sum(np.square(dataset - self.initial_center[0]),1)
            
        for i in range(self.n_class-1):
            self.distance += np.sum(np.square(dataset - self.initial_center[i]),1)
            indice = np.argmax(self.distance)
            #print(np.max(self.distance))
            #print(np.min(self.distance))
            if indice in self.indices:
                self.distance[indice] = 0
                
            self.indices[i+1] = np.argmax(self.distance)
            
        #print(sef.indices)   
        self.initial_center = dataset[self.indices]
       
    
    def Calcul_Dense(self,dataset,r):
        [length,width] = dataset.shape
        self.dense = np.zeros((length,))
        
        for i in range(length):
            decision = np.zeros((length,))
            temp_distance = np.sum(np.square(dataset - dataset[i]),1)
            decision[temp_distance <= r*r] = 1
            self.dense[i] = np.sum(decision)
            #print(self.dense[i]-1)
            
            
    def Dense_Center(self,dataset,r,Dt):
        [length, width] = dataset.shape
        self.initial_center = np.zeros((self.n_class,width))
        self.Calcul_Dense(dataset,r)
        dataset = dataset[self.dense > Dt]
        temp_matrix = self.dense[self.dense > Dt]
        #print(temp_matrix)
        self.indices = random.sample(range(0, length),self.n_class)
        self.indices[0] = np.argmax(temp_matrix)
        
        self.initial_center[0] = dataset[self.indices[0]]
        self.distance = np.sum(np.square(dataset - self.initial_center[0]),1)
            
        for i in range(self.n_class-1):
            self.distance += np.sum(np.square(dataset - self.initial_center[i]),1)
            indice = np.argmax(self.distance)
            #print(np.max(self.distance))
            #print(np.min(self.distance))
            if indice in self.indices:
                self.distance[indice] = 0
                
            self.indices[i+1] = np.argmax(self.distance)
            
        #print(sef.indices)   
        self.initial_center = dataset[self.indices]
    

In [4]:
class Kmeans():
    #__init__ 函数初始化K-means类，传入初始聚类中心initial_center 类数K
    #Redistribution 函数按照最小距离准则对样本数据进行一次重新分配，并计算返回新的样本中心
    #Clustering 函数根据新返回的样本中心与上一步样本中心的关系决定是否结束聚类过程
    #其中iteration 定义最大迭代次数 误差为1e-6是认为样本中心点不在改变
    def __init__(self,initial_center,k):
        self.centers = initial_center
        self.n_class = k

        
    def Redistribution(self,dataset):
        [length,width] = dataset.shape
        self.labels = np.zeros((length,))
        centers = np.zeros((self.n_class,width))
        
        for i in range(length):
            self.labels[i] = np.argmin(np.sum(np.square(np.ones((self.n_class,1)) * dataset[i,:] - self.centers),1))
        
        for i in range(self.n_class):
            centers[i] = np.mean(dataset[self.labels == i],0)
            [length,width] = dataset[self.labels == i].shape
            
            if length == 0:
                centers[i] = self.centers[i]
        
        return centers 
    
    def Clustering(self,dataset,iteration):
        for i in range(iteration):
            centers = self.Redistribution(dataset)
            changes = np.sum(np.square(centers - self.centers))
            #print('centers：',centers)
            #print('self.centers: ',self.centers)
            if changes < 1e-6 :
                print('iteration: %d, Algorithm converge -> stop update'%(i))
                break
            
            self.centers = centers
        
            
            if i == iteration-1 :
                print('Algorithm disperse -> stop clustering')
                print('changes: ',changes)
        

In [5]:
#实验数据可视化 判断是否适用于kmeans算法
import matplotlib.pyplot as plt

def plot_features(a,b):
    plt.scatter(iris.data[:50,a],iris.data[:50,b],c='r')
    plt.scatter(iris.data[50:100,a],iris.data[50:100,b],c='g')
    plt.scatter(iris.data[100:150,a],iris.data[100:150,b],c='b')
    
plt.subplot(321)
plot_features(0,1)

plt.subplot(322)
plot_features(0,2)

plt.subplot(323)
plot_features(0,3)

plt.subplot(324)
plot_features(1,2)

plt.subplot(325)
plot_features(1,3)

plt.subplot(326)
plot_features(2,3)

plt.show()

<Figure size 640x480 with 6 Axes>

In [22]:
#定义重要数据
# k 类数
# r 用于密度法的半径
# Dt 用于密度法的阈值
# iteration kmeans算法的最大迭代次数
k = 2
r = 0.4
Dt = 3
iteration = 100

In [25]:
#初始化数据中心
#dataset = iris.data[0:100,:]
#dataset = np.vstack((iris.data[0:50,:],iris.data[100:150,:]))
dataset = iris.data[50:150,:]
#dataset = iris.data
[length,width] = dataset.shape

center = Initial_Center(k)
#center.Random_Center(iris.data)
#center.FarDistance_Center(dataset)
center.Dense_Center(dataset,r,Dt)
print('initial_centers indices: ',center.indices)
print('initial_centers: \n',center.initial_center)
#Centers = np.zeros((k,width))
#center.Empirical_Center(Centers)

initial_centers indices:  [29, 55]
initial_centers: 
 [[5.6 2.7 4.2 1.3]
 [6.8 3.2 5.9 2.3]]


In [26]:
#kmeans算法对数据进行分类
kmeans1 = Kmeans(center.initial_center,k)
kmeans1.Clustering(dataset,iteration)
print('centers: \n',kmeans1.centers)
print('labels: \n',kmeans1.labels.reshape(2,50))

iteration: 2, Algorithm converge -> stop update
centers: 
 [[5.88360656 2.74098361 4.38852459 1.43442623]
 [6.85384615 3.07692308 5.71538462 2.05384615]]
labels: 
 [[1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 1. 0. 1. 0.
  1. 1. 0. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 1.
  1. 0.]]
