In [1]:
#Unsupervised K-means Clustering
import numpy as np
import random 
import pandas as pd
def initialize_centroids(X,k):
    """Pick k distinct entries of X to serve as the starting centroids.

    Parameters
    ----------
    X : indexable sequence of points (anything supporting len() and X[i]).
    k : number of centroids to draw.

    Returns
    -------
    list
        The k selected entries of X, in the order they were sampled.

    Raises
    ------
    ValueError
        Raised by random.sample when k is negative or larger than len(X).
    """
    # Use the `k` argument; the earlier body read a global `K`, which is
    # either undefined or unrelated to what the caller asked for.
    random_indices = random.sample(range(len(X)),k)
    centroids = [X[i] for i in random_indices]
    return centroids
def assign_clusters(X,centroids):
    """Group every point of X under the centroid it lies closest to.

    Distance is the Euclidean norm of (point - centroid). The result is a
    list with one inner list per centroid, in centroid order; each point
    is appended to the inner list of its nearest centroid. Ties go to the
    lowest centroid index, as np.argmin returns the first minimum.
    """
    groups = [[] for _ in range(len(centroids))]
    for sample in X:
        gaps = []
        for centre in centroids:
            gaps.append(np.linalg.norm(sample - centre))
        nearest = np.argmin(gaps)
        groups[nearest].append(sample)
    return groups
def update_centroids(clusters):
    """Return the mean of each cluster, taken along axis 0.

    The output list has one entry per cluster, in the same order.
    Note: np.mean of an empty cluster evaluates to NaN (with a
    RuntimeWarning), so callers should not pass empty clusters if they
    need finite centroids.
    """
    return [np.mean(members, axis=0) for members in clusters]
def Kmeans(X,K,max_iteration = 100):
    """Cluster X into K groups with Lloyd-style K-means iterations.

    Parameters
    ----------
    X : indexable sequence of points.
    K : number of clusters.
    max_iteration : upper bound on assignment/update rounds (default 100).

    Returns
    -------
    (clusters, centroids)
        clusters is a list of K lists of points; centroids is the list of
        K centres those clusters were assigned against.
    """
    centroids = initialize_centroids(X,K)
    for _ in range(max_iteration):
        clusters = assign_clusters(X,centroids)
        # A cluster that received no points keeps its previous centre.
        # Averaging an empty cluster would produce NaN, and np.argmin then
        # routes every point to the NaN centroid on the next round.
        new_centroids = [
            update_centroids([cluster])[0] if len(cluster) > 0 else previous
            for cluster, previous in zip(clusters, centroids)
        ]
        if np.allclose(centroids,new_centroids):
            break
        centroids = new_centroids
    else:
        # Loop ended without converging (or never ran because
        # max_iteration < 1): assign once more so the returned clusters
        # match the returned centroids and `clusters` is always bound.
        clusters = assign_clusters(X,centroids)
    return clusters,centroids
# Read the CSV at 'Downloads/iris.csv' (resolved against the working directory).
data = pd.read_csv('Downloads/iris.csv')

# Only the column at position 1 is clustered, so X is one-dimensional.
# NOTE(review): presumably a single iris feature; confirm against the CSV layout.
X = data.iloc[:, 1].values

# Standardise to zero mean and unit standard deviation.
X = (X - X.mean(axis=0)) / X.std(axis=0)

K = 2
clusters, centroids = Kmeans(X, K)

# Print each cluster with a 1-based label.
for i, cluster in enumerate(clusters):
    print(f"cluster{i+1}", cluster)

cluster1 [-0.9006811702978088, -1.1430169111851105, -1.3853526520724133, -1.5065205225160652, -1.0218490407414595, -0.537177558966854, -1.5065205225160652, -1.0218490407414595, -1.7488562634033669, -1.1430169111851105, -0.537177558966854, -1.2641847816287624, -1.2641847816287624, -1.870024133847019, -0.05250607719224957, -0.1736739476359004, -0.537177558966854, -0.9006811702978088, -0.1736739476359004, -0.9006811702978088, -0.537177558966854, -0.9006811702978088, -1.5065205225160652, -0.9006811702978088, -1.2641847816287624, -1.0218490407414595, -1.0218490407414595, -0.7795132998541568, -0.7795132998541568, -1.3853526520724133, -1.2641847816287624, -0.537177558966854, -0.7795132998541568, -0.4160096885232032, -1.1430169111851105, -1.0218490407414595, -0.4160096885232032, -1.1430169111851105, -1.7488562634033669, -0.9006811702978088, -1.0218490407414595, -1.6276883929597161, -1.7488562634033669, -1.0218490407414595, -0.9006811702978088, -1.2641847816287624, -0.9006811702978088, -1.50652

In [2]:
import numpy as np
import operator
import matplotlib.pyplot as plt
import matplotlib.cm as cm

def r():
    """Return np.random.randint(1, 100): a random integer in [1, 100)."""
    return np.random.randint(1, 100)

class Centroid:
    """
    One cluster centre plus the bookkeeping kept alongside it.

    pos             = [x, y] coordinate array
    points          = points currently assigned to this centroid
    previous_points = starts empty; holds an earlier assignment for comparison
    color           = starts as None; set later by whoever plots the centroid
    """
    def __init__(self, pos):
        self.color = None
        # Two separate lists, so appending to one never touches the other.
        self.points, self.previous_points = [], []
        self.pos = pos

class KMeans:
    """
    Unsupervised clustering algorithm (K-means).

    Holds `n_centroids` Centroid objects with random starting positions and
    repeatedly assigns the points in `self.X` to the nearest centroid, then
    moves each centroid to the mean of its points, until no assignment
    changes between two consecutive passes.
    """
    def __init__(self, n_centroids=5):
        self.n_centroids = n_centroids
        self.centroids = []

        # generate initial centroids; float dtype so later means are not
        # truncated to integers
        for _ in range(n_centroids):
            self.centroids.append(Centroid(np.array([r(), r()], dtype=float)))

        # assign a color to each centroid
        colors = cm.rainbow(np.linspace(0, 1, len(self.centroids)))
        for centroid, color in zip(self.centroids, colors):
            centroid.color = color

    def sample_data(self, samples=50):
        """
        Generates `samples` random [x, y] points and assigns them to self.X
        """
        self.X = [[r(), r()] for _ in range(samples)]

    def fit(self):
        """
        Fits points in self.X
        Assigns points to centroids.
        Calls to update centroid mean to reflect mean of assigned points.
        Stops once every centroid received exactly the same points as on the
        previous pass; the number of passes is stored in self.n_iters.
        """
        self.n_iters = 0
        converged = False
        while not converged:
            # Start every pass from empty assignments. This also makes a
            # repeated fit() safe: leftover points from an earlier run are
            # discarded instead of being appended to.
            for centroid in self.centroids:
                centroid.points = []

            for point in self.X:
                self.assign_centroid(point).points.append(point)

            # converged when no centroid's assignment changed since last pass
            converged = all(c.points == c.previous_points for c in self.centroids)

            # reset=False keeps the final assignment available for show();
            # the lists are cleared at the top of the next pass instead.
            self._update_centroids(reset=False)
            self.n_iters += 1


    def assign_centroid(self, x):
        """
        Returns centroid closest to point (Euclidean distance).
        Ties resolve to the earliest centroid in self.centroids.
        """
        return min(self.centroids, key=lambda centroid: np.linalg.norm(centroid.pos - x))


    def _update_centroids(self, reset=True):
        """
        Updates centroid position based on mean of assigned points.

        A centroid with no assigned points keeps its current position.
        When `reset` is true, each centroid's point list is emptied after
        being remembered in `previous_points`.
        """
        for centroid in self.centroids:
            centroid.previous_points = centroid.points

            if centroid.points:
                # Rebind pos to a new float array rather than writing into
                # the existing one: an integer-typed pos would silently
                # truncate the mean.
                centroid.pos = np.mean(np.asarray(centroid.points, dtype=float), axis=0)

            if reset:
                centroid.points = []

    def show(self):
        """
        Displays clustering, saves plot to {title}.png.
        """

        for c in self.centroids:
            # large marker for the centre, small markers for its points
            plt.scatter(c.pos[0], c.pos[1], marker='o', color=c.color, s=75)
            x_cors = [p[0] for p in c.points]
            y_cors = [p[1] for p in c.points]
            plt.scatter(x_cors, y_cors, marker='.', color=c.color)

        title = 'K-Means'
        plt.xlabel('X')
        plt.ylabel('Y')
        plt.title(title)
        # save before show(); showing first can leave an empty figure to save
        plt.savefig('{}.png'.format(title))
        plt.show()



# Demo: cluster randomly generated points and plot the result.
if __name__ == '__main__':
    # 50 centroids; sample_data() below uses its default of 50 samples.
    kmeans = KMeans(n_centroids=50)
    kmeans.sample_data()
    kmeans.fit()
    # n_iters is set by fit() and counts the assignment passes it ran.
    print('Iterations: {0}'.format(kmeans.n_iters))
    # Draws the scatter plot and writes it to 'K-Means.png'.
    kmeans.show()

Iterations: 4


<Figure size 640x480 with 1 Axes>