In [48]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_blobs
from scipy.spatial import distance

# Data

## make_blobs

In [49]:
centers = [ [1,3], [6,7] ]

data, labels = make_blobs(centers=centers, n_samples=10)

In [50]:
data

array([[0.10164501, 3.2813839 ],
       [5.93977772, 6.41630702],
       [5.94962598, 6.19725221],
       [6.31633406, 6.88007122],
       [0.50378722, 3.48417485],
       [1.84650161, 4.17489407],
       [2.05782671, 2.79378114],
       [5.34451912, 7.29174357],
       [0.59946845, 1.7537998 ],
       [5.45989671, 9.36631648]])

In [51]:
labels

array([0, 1, 1, 1, 0, 0, 0, 1, 0, 1])

# Cluster

In [52]:
def cluster(data,labels):
    cluster=[[] for _ in range(2)]
    for i in range(data.shape[0]):
        cluster[labels[i]].append(data[i])
    return cluster

In [53]:
cluster(data,labels)

[[array([0.10164501, 3.2813839 ]),
  array([0.50378722, 3.48417485]),
  array([1.84650161, 4.17489407]),
  array([2.05782671, 2.79378114]),
  array([0.59946845, 1.7537998 ])],
 [array([5.93977772, 6.41630702]),
  array([5.94962598, 6.19725221]),
  array([6.31633406, 6.88007122]),
  array([5.34451912, 7.29174357]),
  array([5.45989671, 9.36631648])]]

# Intra

In [54]:
def inertie_intra_class1(cluster):
    n=0
    inertie=0
    for cl in cluster:
        s=0
        for x in cl:
            for y in cl:
                s+=distance.euclidean(x,y)**2
#         n+=len(cl)
        inertie+=s/(2*len(cl))
    return inertie/len(data) # /n

In [55]:
inertie_intra_class1(cluster(data,labels))

1.333916775483064

In [56]:
def inertie_intra_class2(cluster):
    i=0
    for cl in cluster:
        ic=0
        for x in cl:
            ic+=distance.euclidean(x,np.mean(cl,axis=0))**2
        i+=ic/len(cl)
    return i/len(cluster)

In [57]:
inertie_intra_class2(cluster(data,labels))

1.3339167754830643

# Inter

In [58]:
def inertie_inter_class(cluster, g):
    n=0
    inertie=0
    for cl in cluster:
        c=np.mean(cl,axis=0)
        inertie+=len(cl)*distance.euclidean(c,g)**2
#         n+=len(cl)
    return inertie/len(data) # n

In [59]:
inertie_inter_class(cluster(data,labels),np.mean(data,axis=0))

9.982409059416451

# Total

In [60]:
def inertie_totale(data):
    g=np.mean(data,axis=0)
    s=0
    for x in data:
        s+=distance.euclidean(x,g)**2
    return s/len(data) # data.shape[0]

In [61]:
inertie_totale(data)

11.316325834899514

# comparaison

In [62]:
abs( inertie_totale(data) - (inertie_inter_class(cluster(data,labels),np.mean(data,axis=0))+inertie_intra_class1(cluster(data,labels))) )

1.7763568394002505e-15

# Expectation

In [63]:
def Expectation(data, W):
    Exp=[]
#     Gr=[[] for _ in range(W.shape[0])]
    for x in data:
        liste=[]
        for k in range(len(W)):
            liste.append(distance.euclidean(x,W[k]))
        Exp.append(np.argmin(liste))
#         Gr[np.argmin(liste)].append(x)
    return Exp

# Centroid

In [64]:
def centroid(cluster):
    i=0
    list=[]
    for cl in cluster:
        for x in cl:
            list.append(np.mean(cl,axis=0))
    return list

In [65]:
W = centroid(cluster(data,labels))
W

[array([1.0218458 , 3.09760675]),
 array([1.0218458 , 3.09760675]),
 array([1.0218458 , 3.09760675]),
 array([1.0218458 , 3.09760675]),
 array([1.0218458 , 3.09760675]),
 array([5.80203072, 7.2303381 ]),
 array([5.80203072, 7.2303381 ]),
 array([5.80203072, 7.2303381 ]),
 array([5.80203072, 7.2303381 ]),
 array([5.80203072, 7.2303381 ])]

In [81]:
Exp = Expectation(data, W)
Exp

[0, 5, 5, 5, 0, 0, 0, 5, 0, 5]

In [89]:
def initialize_centroids(data, k):
    indices = np.random.choice(len(data), k, replace=False)
    return data[indices]

In [96]:
def k_means(data, k=2, max_iterations=100):
    # Initialize centroids
    centroids = initialize_centroids(data, k)
    
    for _ in range(max_iterations):
        # Assign data points to clusters
        clusters = Expectation(data, W)
        
        # Update centroids
        centroid(cluster(data,labels))
    
    return clusters, centroids

In [97]:
# Run K-Means algorithm
clusters, final_centroids = k_means(data[:, :2], k=2)

In [98]:
clusters

[0, 5, 5, 5, 0, 0, 0, 5, 0, 5]

In [99]:
final_centroids

array([[0.50378722, 3.48417485],
       [0.10164501, 3.2813839 ]])