In [105]:
import numpy as np

class KMeans:
    
    def __init__(self, data, K):
    
        self.data = data
        self.K = K
        
    def train(self, max_iter):
        
        centroids = self.init_centroids(self.data, self.K)
        ids = np.empty((self.data.shape[0], 1))
        
        for _ in range(max_iter):
            ids = self.find_ids(self.data, centroids)            
            centroids = self.new_centroid(self.data, ids, self.K)
            
        return centroids, ids
        
    
    def init_centroids(self, data, num_centroids):
        # pick random K data in dataset as centroids initiated
        ind = np.random.choice(data.shape[0], num_centroids, replace=False)
        centroids = data[ind, :]
        print('centroid init ' + str(centroids))
        return centroids
    
    def find_ids(self, data, centroids):
        
        num_data = data.shape[0]
        num_centroids = centroids.shape[0]
#        ids = np.zeros((num_data,1))
#        this will make ids a 2d-array, lead to 'flatten()' being use in new_centroid() 
        ids = np.zeros((num_data))
        
        for i in range(num_data):
#             print('find id for point ' + str(i))
            distances = np.zeros((num_centroids, 1))
            min_distance = float('inf')
            min_id = 0
            for j in range(num_centroids):
                diff = data[i, :] - centroids[j, :]
                distance = np.sum(diff ** 2)
                if distance < min_distance:
                    min_distance = distance
                    min_id = j
#                     print('min_distance for point '+ str(i) + ' ' + str(min_distance))
#                     print('min_id for point '+ str(i) + ' ' + str(min_id))
                    ids[i] = j
        return ids
    
    def new_centroid(self, data, ids, K):
        
        centroids = np.zeros((K, data.shape[1]))
        
        for i in range(K):
            
            i_ids = ids == i
#             print(i_ids)
            # centroids[i] = np.mean(data[i_ids.flatten(), :], axis = 0)            
            # i_ids is a 2d array of True and False, which need to be flatten
            centroids[i] = np.mean(data[i_ids, :], axis = 0)
        print(centroids)
            
        return centroids
            
                
        
    
    
    
        

In [106]:
import numpy as np
import pandas as pd
data = pd.read_csv('data/iris.csv')
num_data = data.shape[0]
#print(data)
x_train = data.values.reshape((num_data,5))[:,:-1]
print(x_train)
print(type(x_train))
# ind = np.array([1,2,3])

kmeans = KMeans(x_train,3)
kmeans.train(1000)

[[5.1 3.5 1.4 0.2]
 [4.9 3.0 1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.0 3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.0 3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.0 1.4 0.1]
 [4.3 3.0 1.1 0.1]
 [5.8 4.0 1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.0 0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.0 3.0 1.6 0.2]
 [5.0 3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.0 3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.1 1.5 0.1]
 [4.4 3.0 1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.0 3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.0 3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.0 1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.0 3.3 1.4 0.2]
 [7.0 3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.

(array([[5.006     , 3.418     , 1.464     , 0.244     ],
        [5.88360656, 2.74098361, 4.38852459, 1.43442623],
        [6.85384615, 3.07692308, 5.71538462, 2.05384615]]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2.,
        1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1.,
        2., 2., 2., 2., 1., 2., 2., 2., 2., 2., 2., 1., 1., 2., 2., 2., 2.,
        1., 2., 1., 2., 1., 2., 2., 1., 1., 2., 2., 2., 2., 2., 1., 2., 2.,
        2., 2., 1., 2., 2., 2., 1., 2., 2., 2., 1., 2., 2., 1.]))

In [45]:
d = np.array([[1,1,2,3],[2,2,3,4],[3,3,4,5]])
a = np.array([[True], [True], [False]])
np.mean(d[a.flatten(),:], axis = 0)


array([1.5, 1.5, 2.5, 3.5])

In [41]:
d.flatten()

array([1, 1, 2, 3, 2, 2, 3, 4, 3, 3, 4, 5])

In [102]:
np.zeros((10))

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])