# 无监督学习
## K均值聚类算法(K-Means)
通过K-Means算法对MNIST数据集中的图片进行类型标注

In [5]:
# 加载数据
from collections import Counter
from random import randint
import tensorflow as tf
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
print("number of train data is %d"%(mnist.train.num_examples))
print("number of test data is %d"%(mnist.test.num_examples))
# 使用训练集的图片数据作为输入数据
# shape(55000,784)
X=mnist.train.images
# 样本数目
N=mnist.train.num_examples

Extracting MNIST_data\train-images-idx3-ubyte.gz
Extracting MNIST_data\train-labels-idx1-ubyte.gz
Extracting MNIST_data\t10k-images-idx3-ubyte.gz
Extracting MNIST_data\t10k-labels-idx1-ubyte.gz
number of train data is 55000
number of test data is 10000


### 实现K均值聚类算法

对于MNIST数据集来说，最终被分为0~9一共10类，因此K值取10

In [2]:
k=10
# 最大迭代次数
MAX_ITERS=100
# 对于初始质心的选取，是在样本数据的边界通过随机选取的方式来实现的
start_pos=tf.Variable(X[np.random.randint(X.shape[0],size=k),:],dtype=tf.float32)
centroids=tf.Variable(start_pos.initialized_value(),'S',dtype=tf.float32)
# 输入值
points=tf.Variable(X,'X',dtype=tf.float32)
ones_like=tf.ones((points.get_shape()[0],1))
prev_assignments=tf.Variable(tf.zeros((points.get_shape()[0],),dtype=tf.int64))
# 获取距离
p1=tf.matmul(
    tf.expand_dims(tf.reduce_sum(tf.square(points),1),1),
    tf.ones(shape=(1,k))
)
p2=tf.transpose(tf.matmul(
    tf.reshape(tf.reduce_sum(tf.square(centroids),1),shape=[-1,1]),
    ones_like,
    transpose_b=True
))
# 计算距离
distance=tf.sqrt(tf.add(p1,p2)-2*tf.matmul(points,centroids,transpose_b=True))
# 划分该点的簇
point_to_centroid_assignment=tf.argmin(distance,axis=1)
# 计算均值
total=tf.unsorted_segment_sum(points,point_to_centroid_assignment,k)
count=tf.unsorted_segment_sum(ones_like,point_to_centroid_assignment,k)
means=total/count
# 中心店是否变化
is_continue=tf.reduce_any(tf.not_equal(point_to_centroid_assignment,prev_assignments))
# 循环迭代
with tf.control_dependencies([is_continue]):
    loop=tf.group(centroids.assign(means),prev_assignments.assign(point_to_centroid_assignment))

W1115 11:07:25.460860 12592 deprecation.py:323] From <ipython-input-2-b8e6a17e4a75>:6: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.


### 进行数据训练

In [3]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    changed=True
    iterNum=0
    while changed and iterNum < MAX_ITERS:
        iterNum+=1
        # 数据训练
        [changed,_]=sess.run([is_continue,loop])
        res=sess.run(point_to_centroid_assignment)
        print(iterNum)
    print("Train finished.")

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
Train finished.


### 模型评估
对每一个簇中的样本正确的标签进行统计，显示数量排在前三的标签及对应数量

In [7]:
# 记录训练集的真实标签数据，为测试号准备率
y_=mnist.train.labels
y=[]
for m in range(N):
    for n in range(10):
        if y_[m][n]==1:
            y.append(n)

# 评估。获取每个簇所有的点，按照真实标签的前三数量显示
nums_in_clusters=[[] for i in range(10)]
for i in range(N):
    nums_in_clusters[res[i]].append(y[i])
for i in range(10):
    print(Counter(nums_in_clusters[i]).most_common(3))


[(3, 3679), (5, 1669), (8, 1071)]
[(7, 2449), (9, 1441), (4, 1416)]
[(8, 3165), (5, 1279), (3, 983)]
[(1, 2673), (5, 645), (2, 514)]
[(4, 2044), (9, 1495), (7, 604)]
[(7, 2220), (9, 2158), (4, 1391)]
[(1, 3459), (3, 339), (6, 288)]
[(6, 4516), (2, 3454), (4, 151)]
[(0, 2295), (5, 253), (2, 153)]
[(0, 2609), (6, 80), (5, 54)]
