In [1]:
import numpy as np

In [2]:
# Vanilla initialization method for KMeans
def get_lloyd_k_means(n, n_cluster, x, generator):
    return generator.choice(n, size=n_cluster)

# doc string for class

        Class KMeans:
        Attr:
            n_cluster - Number of clusters for kmeans clustering (Int)
            max_iter - maximum updates for kmeans clustering (Int)
            e - error tolerance (Float)
            
        
## doc string for def fit
        Finds n_cluster in the data x
            ==> nc: k clusters
        params:
            x - N X D numpy array
                ==> input; properly classify these points
        returns:
            A tuple in the following order:
              - final centroids (fc) w/ dims n_clusters (nc) by D which is a numpy array, 
              - a length (N,) numpy array where cell i is the ith sample's 
                  assigned cluster's index (start from 0),
                  ==> a list of what point is assigned to which cluster
              - number of times you update the assignment, an Int (at most self.max_iter)
              
              tuple 
                  ==> fc : self.centers
       
        ###################################################################
        # TODO: Update means and membership until convergence 
        #   (i.e., average K-mean objective changes less than self.e)
        #   or until you have made self.max_iter updates.
        ###################################################################

In [344]:
class KMeans():


    def __init__(self, n_cluster, max_iter=100, e=0.0001, generator=np.random):
        self.n_cluster = n_cluster
        self.max_iter = max_iter
        self.e = e
        self.generator = generator

    def fit(self, x, centroid_func=get_lloyd_k_means):

        assert len(x.shape) == 2, "fit function takes 2-D numpy arrays as input"
        self.generator.seed(42)
        N, D = x.shape
        
#         get_lloyd_k_means(n, n_cluster, x, generator):
#         [expression for item in iterable if condition == True]
        self.centers = [x[i] for i in centroid_func(len(x), self.n_cluster, x, self.generator)]
    
#          objective func = how close each point is to the cluster center; if close enough stop running
#         drop if obj func doesn't change much

        prev_obj_func = None
        for c in range(self.max_iter) :
            
            points_in_clusters = [[] for _ in range(len(self.centers))]
            
#             create empty list; use to assign points to specific clusters
            assignment = np.zeros([N])
            res = []
            obj_func = 0
             
            for d_p in range(len(x)) : 
#                 numpy handles the mapping of 1 single data point (x[d_p]) to each center
#                 axis = 1 to treat every cluster...
                dist = np.square(np.linalg.norm(x[d_p] - self.centers, axis=1))
#                 print("dist : ", dist)
               
#                 select the INDEX of the shortest distance 
                s_d_idx = np.argmin(dist)
#                 print("s_d_idx : ", s_d_idx)
                
#                 add the points (.append(x[d_p])) to points_in_clusters list @ this INDEX ([s_d_idx])
                points_in_clusters[s_d_idx].append(x[d_p])
#                 print(">>>>>>>>>>>> points in cluster ", s_d_idx , points_in_clusters[s_d_idx])
        
#                 assign @ this point this INDEX
                assignment[d_p] = s_d_idx
#                 print("assignment : ", assignment, "\n")

#               take the current ojb_func & add it to the distance at this INDEX
#               print("dist[s_d_idx] : ", dist[s_d_idx], "@ index : ", s_d_idx)
                obj_func = obj_func + dist[s_d_idx]
#             print("obj_func : ", obj_func)
#             print("N : ", N)
#             mean
            obj_func = obj_func/N
#             print("obj_func : ", obj_func, "\n")

#           show points in each cluster
            for pt in range(len(points_in_clusters)) :                
                if len(points_in_clusters[pt]) > 0 :
#                     print("points_in_clusters[pt] : ", points_in_clusters[pt], pt)
                    cluster_mean = np.mean(points_in_clusters[pt], axis=0)
#                     print("mean in cluster ", pt, cluster_mean)
#                     print(" self.centers[pt] : ",  self.centers[pt], pt)
#                     update the center @ this point
                    self.centers[pt] = np.array(cluster_mean, dtype=float)
                
#             check convergence
            if prev_obj_func != None :
                if np.abs(obj_func - prev_obj_func) < self.e :
                    break
            prev_obj_func = obj_func
            
        return self.centers, assignment, c

In [345]:
n = 12
n_cluster = 4

x = np.array([
    [2, 80],
    [5, 67],
    [90, 42],
    [32, 50],
    [34, 1],
    [10, 100],
    [2, 20],
    [9, 9],
    [45, 37],
    [78, 150],
    [100, 13],
    [78, 140],
])
# make sure self.centers has the same # of columns as x.shape
# print(x.shape)


In [346]:
k_m = KMeans(n_cluster)

In [347]:
k_m.fit(x)

([array([15., 10.]),
  array([ 78., 145.]),
  array([95. , 27.5]),
  array([18.8, 66.8])],
 array([3., 3., 2., 3., 0., 3., 0., 0., 3., 1., 2., 1.]),
 7)