### Finding Euclidean distance using numpy.linalg.norm

In [1]:
import numpy as np

# Two points in the plane; the Euclidean distance between them is the
# L2 norm of their difference vector.
a = np.array([1, 2])
b = np.array([2, 3])

dist = np.linalg.norm(b - a)
print("Distance: ", dist)

Distance:  1.4142135623730951


## KMeans Clustering

In [2]:
import numpy as np

class Kmeans:
    '''
    Minimal k-means clustering built on numpy.

    After fit():
      centroids = dict mapping cluster index -> centroid
      clusters  = dict mapping cluster index -> list of the rows assigned to it
    '''

    def __init__(self, k = 5, centroids = None, tolerance = 0.0001, n_iter = 500):
        '''
        k  = number of clusters, default is 5
        centroids = optional sequence of k starting centroids;
                    if not given, the first k rows of the data passed to fit()
                    are used as the starting centroids.
        tolerance = largest percentage movement of a centroid (summed over its
                    coordinates) between two iterations that still counts as
                    converged, default is 0.0001
        n_iter = maximum number of iterations, default is 500
        '''

        self.k = k
        self.centroids = centroids
        self.tolerance = tolerance
        self.n_iter = n_iter
        self.clusters = {}

    #Helper: index of the centroid nearest to a single point
    def _nearest(self, point):
        '''
        Return the index of the centroid with the smallest Euclidean distance
        to point. Ties go to the lowest cluster index.
        '''
        dist = [np.linalg.norm(point - self.centroids[c]) for c in self.centroids]
        return dist.index(min(dist))

    #Helper: how far a centroid moved, as a percentage of its old position
    @staticmethod
    def _percent_shift(prev, curr):
        '''
        Return the absolute movement from prev to curr, expressed in percent of
        prev and summed over all coordinates.
        '''
        prev = np.asarray(prev, dtype=float)
        curr = np.asarray(curr, dtype=float)
        # A coordinate that was exactly 0 has no meaningful relative change;
        # use a unit denominator there instead of dividing by zero.
        scale = np.where(prev == 0, 1.0, np.abs(prev))
        return float(np.sum(np.abs(curr - prev) / scale * 100.0))

    #Function to fit the data
    def fit(self, data):
        '''
        Cluster data into k groups, updating self.centroids and self.clusters.

        data = indexable, iterable collection of rows (scalars or vectors).
        Stops when every centroid moves by at most `tolerance` percent, or
        after n_iter iterations, whichever comes first.

        Raises AssertionError if starting centroids were given but their count
        differs from k. When no centroids were given, data[i] is read for
        i in range(k), so data must hold at least k rows.
        '''
        if self.centroids is None:
            self.centroids = {idx: data[idx] for idx in range(self.k)}
        else:
            #Check if given centroids array has k centroids
            assert self.k == len(self.centroids), "len of centroids array must be equal to k"
            self.centroids = {idx: self.centroids[idx] for idx in range(len(self.centroids))}

        #KMeans loop
        for _ in range(self.n_iter):
            # Start every iteration with empty clusters.
            for idx in range(self.k):
                self.clusters[idx] = []

            #Assign each row to its nearest centroid
            for row in data:
                self.clusters[self._nearest(row)].append(row)

            prev_centroids = dict(self.centroids)

            #Calculating new centroid of each cluster by averaging
            for idx, members in self.clusters.items():
                # An empty cluster has no mean (np.average would give nan),
                # so its centroid stays where it was.
                if members:
                    self.centroids[idx] = np.average(members, axis=0)

            # Converged only when each centroid, compared with ITS OWN previous
            # position, moved by no more than the tolerance.
            converged = all(
                self._percent_shift(prev_centroids[idx], self.centroids[idx]) <= self.tolerance
                for idx in self.centroids
            )
            if converged:
                break

    #Function to predict the cluster
    def predict(self, data):
        '''
        Return the index of the cluster whose centroid is nearest to data.
        fit() must have been called first (or centroids supplied as a dict/sequence).
        '''
        return self._nearest(data)

    #Function to show cluster no. and cluster data
    def clusters_(self):
        '''Print each cluster index followed by the rows assigned to it.'''
        print("Cluster # \t Cluster")
        for c, cluster in self.clusters.items():
            print(c,"\t\t",cluster)
            

### <i>Demo</i>

In [3]:
# Small 1-D dataset shared by the demo cells.
x = np.array([1, 2, 3, 5, 6, 8, 9, 23, 33])

# No starting centroids are passed, so fit() seeds them from the first k rows of x.
km = Kmeans(k=3)
km.fit(x)

# Report the cluster chosen for a new point, the final centroids and the members.
print("Pred: ", km.predict(55))
print("Centroids: ", km.centroids)
km.clusters_()

Pred:  2
Centroids:  {0: 2.0, 1: 7.0, 2: 28.0}
Cluster # 	 Cluster
0 		 [1, 2, 3]
1 		 [5, 6, 8, 9]
2 		 [23, 33]


### <i>Providing centroids</i>

In [4]:
# Same data as the demo above, but starting from explicit centroids
# instead of the first k rows.
km1 = Kmeans(k=3, centroids=[2, 3, 22])
km1.fit(x)

print("Pred: ", km1.predict(4))
print("Centroids: ", km1.centroids)
km1.clusters_()

Pred:  0
Centroids:  {0: 2.0, 1: 7.0, 2: 28.0}
Cluster # 	 Cluster
0 		 [1, 2, 3]
1 		 [5, 6, 8, 9]
2 		 [23, 33]


### <i>Timeit</i>

In [5]:
%%timeit
# Times one full cycle: build a model, fit it on x, then predict a single point.
km2 = Kmeans(k=3)
km2.fit(x)
km2.predict(15)

1.15 ms ± 17.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
