In [1]:
import numpy as np
import random

In [2]:
# return matrix shape m*n：m-样本数，n-特征数
def loadDataSet(fileName):
    dataMat = []
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = list(map(float, curLine))
        dataMat.append(fltLine)
    return np.mat(dataMat)

In [3]:
# 用欧式距离计算向量间的距离
# return double
def distEclud(vecA, vecB):
    dist = np.sum(np.power(vecA-vecB,2))**0.5
    return dist

In [4]:
# 随机生成簇质点
# return matrix shape k*n：k-质点数，n-特征数
def randCent(dataSet, k):
    # 数据集的维数
    n = np.shape(dataSet)[1]
    # 生成多组 k*n 的随机组合，便于之后生成簇质点
    # 用 set来当容器，方便保证组合没有重复
    rand = set()
    while len(rand)<k:
        rand.add(tuple([random.random() for _ in range(n)]))
    randMat = np.mat(list(map(list,list(rand))))
    # 各维度的统计性数据
    minMat = np.apply_along_axis(arr=dataSet, axis=0, func1d=np.min)
    maxMat = np.apply_along_axis(arr=dataSet, axis=0, func1d=np.max)
    rangeMat = maxMat - minMat
    
    centroids = minMat + np.multiply(randMat, rangeMat)
    return centroids
    

In [5]:
path = "F:/for learn/MLiA/Ch10/"
fileName = path + "testSet.txt"
dataSet = loadDataSet(fileName)

In [6]:
centroids = randCent(dataSet, 5)
centroids

matrix([[-0.21492701,  4.83478898],
        [ 1.5062499 , -3.48224949],
        [ 1.71389623, -1.02928711],
        [ 2.4093189 , -3.36526945],
        [ 1.00531852,  4.84027164]])

In [8]:
dataSet[:10,]

matrix([[ 1.658985,  4.285136],
        [-3.453687,  3.424321],
        [ 4.838138, -1.151539],
        [-5.379713, -3.362104],
        [ 0.972564,  2.924086],
        [-3.567919,  1.531611],
        [ 0.450614, -3.302219],
        [-3.487105, -1.724432],
        [ 2.668759,  1.594842],
        [-3.156485,  3.191137]])

In [41]:
def kMeans(dataSet, k, distFun=distEclud, createCentFun=randCent):
    centroids = createCentFun(dataSet, k)
    totalSSE = changeSSE = np.inf    
    # 给出每个样本最近的簇质点序号以及 SE
    # return (index:Int, dist^2:Double)
    def getIndex(vecA, centroids, distFun):
        distEval = np.array([distFun(vecA, x) for x in centroids])
        index = distEval.argmin()
        dist = distEval[index]
        return index,dist**2
    
    # 根据生成的簇，计算新的簇质心
    # return matrix shape k*n: k-质点数，n-特征数
    def generateNewCentroids(dataSet, index, k):
        newCentroids = np.mat([np.apply_along_axis(
                               arr = dataSet[index==i],
                               axis = 0,
                               func1d=np.mean) for i in range(k)])
        return newCentroids
        
    # 收敛条件：改变的 SSE≤1
    while changeSSE>1:
        clusterAssment = np.array([getIndex(x, centroids,distFun) for x in dataSet])
        index = clusterAssment[:,0]
        sse = np.sum(clusterAssment[:,1])
        changeSSE = totalSSE - sse
        print("now SSE is:", sse, "\t", "change SSE is:", changeSSE)
        totalSSE = sse
        centroids = generateNewCentroids(dataSet, index, k)
    
    return centroids, clusterAssment
    

In [59]:
result = kMeans(dataSet, k=5)

now SSE is: 441.458289833 	 change SSE is: inf
now SSE is: 184.017208876 	 change SSE is: 257.441080957
now SSE is: 140.233228481 	 change SSE is: 43.7839803954
now SSE is: 135.825506074 	 change SSE is: 4.40772240752
now SSE is: 135.134888793 	 change SSE is: 0.690617280683


In [None]:
def biKmeans(dataSet, k, distFun=distEclud):
    m = dataSet.shape[0]
    # 最开始将所有数据看成是一个簇，初始化簇类别（0）和距离矩阵 clusterAssment
    centroids = np.mean(dataSet, axis=0)
    clusterAssment = np.mat([(0,distFun(x, centroids)) for x in dataSet])
    
    
    

In [69]:
distEval

array([ 4.58330836,  4.75184468,  5.08674313,  6.28562276,  3.06493429,
        3.76614642,  3.40196938,  3.82255261,  3.1716421 ,  4.37718688,
        5.20799242,  4.14068002,  5.21346225,  3.5250503 ,  0.96823758,
        4.02836769,  3.30536066,  3.35973568,  4.57409687,  3.87678203,
        3.72972168,  3.0135229 ,  3.01728091,  4.34878565,  4.2641849 ,
        2.69187763,  3.423168  ,  5.08674265,  4.08492369,  2.67708207,
        4.20916698,  4.2557841 ,  3.43650877,  3.53574277,  4.56127559,
        4.32954223,  5.50909737,  2.24178376,  4.00049716,  4.98882339,
        4.41852373,  3.87591797,  4.74732748,  4.73805701,  3.96090665,
        4.52556189,  5.01811579,  4.4234735 ,  4.55126049,  4.49401392,
        3.76899068,  5.80943401,  6.40297196,  5.73124909,  4.2929766 ,
        4.944687  ,  3.22350258,  3.53629097,  3.78283225,  4.75008729,
        3.86204421,  3.74913194,  5.37922984,  3.21435622,  5.21971538,
        3.83044563,  3.53782534,  4.05711072,  3.93635838,  3.29

In [62]:
distEclud(vecA, centroids)

12.668953910823756