In [4]:
import numpy as np
import pandas as pd
from collections import Counter
from math import sqrt
from matplotlib import pyplot as plt

from sklearn.datasets import load_iris

iris = load_iris()

X = iris.data
y = iris.target
labels = iris.target_names

def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

In [5]:
def k_means(X,K,max_iters=100,tol=1e-4,random_state = None):
    np.random.seed(random_state)
    n_samples, n_features = X.shape

    # 随机选k个点作为初始中心点
    indices = np.random.choice(n_samples, K, replace=False)
    centroids = X[indices,:].copy()

    cluster_assignments = np.zeros(n_samples,dtype=np.int32)
    for _ in range(max_iters):
        for i in range(n_samples):
            distances = [euclidean_distance(X[i],c) for c in centroids]
            # 离第i个样本最近的中心点的索引
            cluster_assignments[i] = np.argmin(distances)

        new_centroids = np.zeros((K,n_features))
        for k in range(K):
            points_in_cluster = X[cluster_assignments == k]
            if len(points_in_cluster) > 0:
                    new_centroids[k] = np.mean(points_in_cluster,axis=0)
            else:
                new_centroids[k] = X[np.random.choice(n_samples)]

        shift = np.sum(np.sqrt(np.sum((new_centroids - centroids) ** 2,axis=1)))
        if shift < tol:
            break
    return cluster_assignments,centroids


In [7]:
def main():
    k=3
    assignments,centroids = k_means(X,k,random_state=42)
    print(f"K-means cluster assignments: {Counter(assignments)}")

if __name__ == "__main__":
    main()

K-means cluster assignments: Counter({0: 84, 1: 50, 2: 16})
