In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [None]:
# Feature matrix: the first four columns of the header-less CSV, one row per
# sample. The labels cell reads column 4 from 'iris.csv', so the features are
# loaded from that same file (the previous path '.csv' had no file stem).
X = pd.read_csv('iris.csv', header=None, usecols=[0, 1, 2, 3]).values

In [None]:
# Column 4 of the header-less CSV, kept as a 2-D array with a single column.
# Presumably the class label of each sample -- confirm against the data file.
y = pd.read_csv(
    'iris.csv',
    header=None,
    usecols=[4],
).values

In [None]:
def initialize_centroids(data, k_centroids):
    '''Randomly pick k distinct rows of data to use as initial centroids.

    Rows are drawn without replacement: drawing the same row twice would give
    two identical centroids, one of which can never win a point, leaving an
    empty cluster behind.

    Parameters:
        data: 2-D array with one sample per row.
        k_centroids: number of centroids to draw.

    Returns:
        Array of shape (k_centroids, data.shape[1]) holding the chosen rows.

    Raises:
        ValueError: if k_centroids is larger than the number of rows in data.
    '''

    index = np.random.choice(data.shape[0], k_centroids, replace=False)
    return data[index]

def find_closest_centroid(data, centroids):
    '''Label every row of data with the index of its nearest centroid.

    Nearness is measured by squared Euclidean distance. The labels come back
    as a float array with one entry per row of data, so callers convert them
    to int before using them as indices.
    '''

    assignments = np.zeros(data.shape[0])

    for row_number, row in enumerate(data):
        # Broadcasting subtracts the row from every centroid at once.
        offsets = row - centroids
        squared_distances = np.sum(np.square(offsets), axis=1)
        assignments[row_number] = np.argmin(squared_distances)

    return assignments

def compute_centroids(data, closest_centroid_index, n_centroids):
    '''Recompute each centroid as the mean of the rows assigned to it.

    A cluster that received no rows has no mean (0 / 0 would yield a NaN
    centroid that corrupts every later assignment). Such clusters are instead
    re-seeded at the rows lying farthest from their own centroid, so the
    result never contains NaN caused by an empty cluster.

    Parameters:
        data: 2-D array with one sample per row.
        closest_centroid_index: per-row cluster labels (int or float valued);
            only the first data.shape[0] entries are used.
        n_centroids: number of clusters.

    Returns:
        Float array of shape (n_centroids, data.shape[1]).
    '''
    n_rows = data.shape[0]
    labels = np.asarray(closest_centroid_index)[:n_rows].astype(int)

    new_centroids = np.zeros((n_centroids, data.shape[1]))
    n_neighbours = np.zeros(n_centroids)

    for ind in range(n_rows):
        new_centroids[labels[ind]] += data[ind]
        n_neighbours[labels[ind]] += 1

    # Only divide where there is something to average.
    occupied = n_neighbours > 0
    new_centroids[occupied] /= n_neighbours[occupied][:, np.newaxis]

    empty = np.flatnonzero(~occupied)
    if empty.size > 0 and n_rows > 0:
        # Squared distance of every row to the centroid it belongs to.
        spread = np.sum(np.square(data - new_centroids[labels]), axis=1)
        # Worst-fitting rows first; take one per empty cluster.
        farthest = np.argsort(spread)[::-1][:empty.size]
        new_centroids[empty[:farthest.size]] = data[farthest]

    return new_centroids

In [None]:
def k_means(data, k_centroids, iterations):
    '''Cluster data into k_centroids groups by alternating assign/update steps.

    Starting from randomly initialised centroids, each iteration assigns every
    row to its nearest centroid and then recomputes the centroids. The loop
    stops early once an update leaves the centroids exactly unchanged, since
    further iterations would repeat the same result.

    Parameters:
        data: 2-D array with one sample per row.
        k_centroids: number of clusters.
        iterations: maximum number of assign/update rounds (0 is allowed).

    Returns:
        (centroids, closest_centroid): the final centroids and, for every row
        of data, the index of the nearest of those final centroids.
    '''
    centroids = initialize_centroids(data, k_centroids)

    for _ in range(iterations):
        closest_centroid = find_closest_centroid(data, centroids)
        updated_centroids = compute_centroids(data, closest_centroid, k_centroids)
        converged = np.array_equal(updated_centroids, centroids)
        centroids = updated_centroids
        if converged:
            break

    # The assignment made inside the loop refers to the centroids *before*
    # the last update; redo it so the labels match the centroids returned
    # (this also defines the labels when iterations == 0).
    closest_centroid = find_closest_centroid(data, centroids)

    return centroids, closest_centroid

In [None]:
def compute_cost(data, centroids, closest_centroids):
    '''Total squared distance between each row and its assigned centroid.

    closest_centroids holds one cluster label per row of data; each label is
    converted to int before indexing centroids. Returns 0 for empty data.
    '''
    return sum(
        np.sum(np.square(data[row] - centroids[int(closest_centroids[row])]))
        for row in range(data.shape[0])
    )

In [None]:
# Run k-means once for every k in 1..max_cluster and record the resulting
# centroids and cost, so the cost can be plotted against k afterwards.
SEED = 42          # fixes the random centroid initialisation across re-runs
N_ITERATIONS = 20  # assign/update rounds given to every k-means run
max_cluster = 5

np.random.seed(SEED)
cost_history = list()
centroid_history = list()
for k in range(1, max_cluster + 1):
    centroids, closest_cent = k_means(X, k, N_ITERATIONS)
    centroid_history.append(centroids)
    cost_history.append(compute_cost(X, centroids, closest_cent))

In [None]:
# Cost versus number of clusters.
# Axis labels are set through the Axes methods: assigning a string to
# plt.xlabel / plt.ylabel replaces the pyplot functions instead of calling
# them, so no label is drawn and later calls to them fail.
fig, ax = plt.subplots()
ax.plot(list(range(1, max_cluster + 1)), cost_history)
ax.set_xlabel('number of clusters')
ax.set_ylabel('cost')
# fig.savefig('k_means.jpg')