In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.cluster import KMeans

In [27]:
class K_Means:
  def __init__(self,n_clusters = 3,max_iter=100):
    self.n_clusters = n_clusters
    self.max_iter = max_iter

  def euclidean_distance(self,x1, x2):
    return np.sqrt(np.sum((x2-x1)**2))

  def list_equal(self, l1,l2):
    for i in l1:
      if i not in l2:
        return False
    return True

  def fit(self, x):
    cluster = [[] for _ in range(self.n_clusters)]
    #cluster = [[]]* self.n_clusters. Note : cannot use this as it creates the list with same refereces and hence updating one updates all using the append function
    print("inital points in the clusters",cluster)
    # select random k points from the x_train and assign them as my inital centroids for the  the k clusters

    initial_centroid_ids =np.random.choice([i for i in range(x.shape[0])],size=self.n_clusters, replace=False)
    initial_centroids = [x[id] for id in initial_centroid_ids]

    for i in range(self.max_iter):
      for idx, row in enumerate(x):
        # distance of each point to all the cluster centroid inorder to decide to which cluster it belongs to
        distance = [self.euclidean_distance(row,each_centroid) for each_centroid in initial_centroids]
        cluster_id = np.argmin(distance)
        # Assign the point to that cluster

        cluster[cluster_id].append(idx)
      # update the centroid

      centroid_ids = [np.mean(x[cluster[i]],axis = 0) for i in range(self.n_clusters)]

      converge_cond = [self.euclidean_distance(centroid_ids[i], initial_centroids[i]) for i in range(self.n_clusters)]
      if sum(converge_cond)==0:
        self.final_centroids = centroid_ids
        break
      else:
        initial_centroids = centroid_ids


    self.final_centroids = centroid_ids


  def predict(self, x):
    predictions = []
    for row in x:
      distances = [self.euclidean_distance(row,each_cluster) for each_cluster in self.final_centroids]
      predictions.append(np.argsort(distances)[0])
    return predictions



In [34]:
iris = datasets.load_iris()
x,y = iris.data,iris.target
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3,random_state = 43)
model = K_Means()
model.fit(x_train)
predictions = model.predict(x_test)
print("predictions",predictions)
print("accuracy score", accuracy_score(predictions,y_test))

inital points in the clusters [[], [], []]
predictions [0, 0, 1, 1, 2, 0, 1, 1, 1, 1, 0, 1, 2, 0, 1, 1, 0, 0, 1, 2, 0, 0, 0, 1, 1, 2, 0, 1, 0, 0, 1, 0, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 2, 0]
accuracy score 0.8444444444444444


# Using Sklearn KMeans for classification

In [37]:
sklearn_model = KMeans(n_clusters = 3,max_iter = 100)
sklearn_model.fit(x_train)
sk_predictions = sklearn_model.predict(x_test)
print("predictions",sk_predictions)
print("accuracy score", accuracy_score(sk_predictions,y_test))

predictions [0 0 1 1 2 0 1 1 1 1 0 1 2 0 1 1 0 0 1 2 0 0 0 2 1 2 0 1 0 0 1 0 1 1 2 2 1
 2 1 1 1 1 1 2 0]
accuracy score 0.8666666666666667


  super()._check_params_vs_input(X, default_n_init=10)
