<a href="https://colab.research.google.com/github/B34R-e/Basic-Machine-Learning/blob/main/Python_implementation_of_k_Means_Clustering_(Updated).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import numpy as np
from numpy.linalg import norm
import math

In [31]:
def dot_product(a, b):
  """Return the dot product of two equal-length sequences.

  The element-wise products are accumulated left to right, starting from 0.
  When the two sequences differ in length, an error message is printed and
  None is returned instead of a number.
  """
  if len(a) == len(b):
    # pair up matching positions and add up their products
    return sum(x * y for x, y in zip(a, b))

  print(f'Error: The vector {a} and {b} are not of the same length')
  return None

In [32]:
def vectorized_dot_product(a, b):
  """Return the dot product of a and b computed with NumPy.

  Both arguments are converted to NumPy arrays before calling ``dot``.
  If NumPy rejects the operands because their shapes are not aligned, an
  error message is printed and None is returned. Any other exception is
  left to propagate so that unrelated failures are not misreported as a
  length mismatch.
  """
  a = np.asarray(a)
  b = np.asarray(b)
  try:
    return a.dot(b)
  except ValueError:
    # NumPy raises ValueError when the shapes are not aligned
    print(f'Error: The vector {a} and {b} are not of the same length')
    return None

In [34]:
def initialized_centroids(X, k):
  """Pick k distinct data points of X to serve as the initial centroids.

  Indices are sampled without replacement, so the same row of X is never
  chosen twice (sampling with replacement could start two clusters at the
  same position).

  Returns a list with the k selected data points.
  Raises ValueError (from NumPy) when k is larger than len(X).
  """
  # randomly pick k different indices from range(len(X))
  indices = np.random.choice(len(X), k, replace=False)

  # return the data points at the chosen indices as centroids
  return [X[i] for i in indices]

In [None]:
# def assign_points_to_clusters(X, centroids):
#   # calculate distances between data points and centroids
#   distances = np.sqrt(((X[:, np.newaxis] - centroids) ** 2).sum(axis=2))

#   # get the index of the closest centroid for each data point
#   assignment = np.argmin(distances, axis=1)
#   return assignment

In [38]:
def assign_points_to_clusters(X, centroids):
  """Return, for each data point in X, the index of its nearest centroid.

  Distance is the norm of the difference between a point and a centroid.
  When several centroids are equally close, the lowest index is used.
  The result is a list with one integer per data point.
  """
  assignment = []
  for point in X:
    # distance from this point to every centroid, in centroid order
    to_centroids = [norm(point - centroid) for centroid in centroids]

    # position of the first smallest distance
    assignment.append(to_centroids.index(min(to_centroids)))
  return assignment

In [42]:
def update_centroids(X, clusters, k):
  """Compute new centroids as the per-feature mean of each cluster.

  X        -- 2-D array of data points, one row per point
  clusters -- cluster index assigned to each row of X (same length as X)
  k        -- number of clusters

  Returns an array of shape (k, X.shape[1]). A cluster that received no
  points cannot be averaged; it is re-seeded with a randomly chosen data
  point so that no NaN values leak into later iterations.
  """
  X = np.asarray(X)
  labels = np.asarray(clusters)

  # initialize the centroids array to store new centroids
  centroids = np.zeros((k, X.shape[1]))

  # loop through the clusters
  for cluster_idx in range(k):

    # get all data points assigned to the current cluster
    cluster_points = X[labels == cluster_idx]

    if len(cluster_points) > 0:
      # axis=0 averages each feature separately instead of collapsing
      # the whole cluster into a single scalar
      centroids[cluster_idx] = cluster_points.mean(axis=0)
    elif len(X) > 0:
      # empty cluster: restart it from a random data point
      centroids[cluster_idx] = X[np.random.randint(len(X))]

  # return new centroids
  return centroids

In [None]:
def kmeans(X, k, max_iters = 100):
  """Cluster the data points in X into k groups with the k-means algorithm.

  X         -- data points, one row per point
  k         -- number of clusters
  max_iters -- upper bound on the number of centroid updates

  Returns (clusters, centroids): the cluster index of every data point and
  the centroid positions as an array. The returned assignment is always the
  one computed from the returned centroids, including when max_iters is 0
  or when the iteration budget runs out before convergence.
  """
  # initialize centroids (as an array so the return type is always the same)
  centroids = np.asarray(initialized_centroids(X, k))

  # assign data points to the initial clusters
  clusters = assign_points_to_clusters(X, centroids)

  for _ in range(max_iters):
    # update centroids
    new_centroids = update_centroids(X, clusters, k)

    # check convergence
    if np.allclose(new_centroids, centroids):
      break

    # adopt the new centroids and refresh the assignment to match them
    centroids = new_centroids
    clusters = assign_points_to_clusters(X, centroids)

  return clusters, centroids