In [1]:
import numpy as np
import random

# Step 1: Initialize centroids randomly
def initialize_centroids(data, k):
    centroids = random.sample(data, k)
    return centroids

# Step 2: Assign each data point to the nearest centroid
def assign_to_clusters(data, centroids):
    clusters = [[] for _ in range(len(centroids))]

    for point in data:
        distances = [np.linalg.norm(np.array(point) - np.array(centroid)) for centroid in centroids]
        cluster_index = np.argmin(distances)
        clusters[cluster_index].append(point)

    return clusters

# Step 3: Update centroids based on the mean of the data points in each cluster
def update_centroids(clusters):
    centroids = [np.mean(cluster, axis=0) if cluster else [] for cluster in clusters]
    return centroids

# Step 4: Check for convergence
def has_converged(prev_centroids, centroids, tol=1e-4):
    return all(np.linalg.norm(np.array(prev) - np.array(current)) < tol for prev, current in zip(prev_centroids, centroids))

# K-Means algorithm
def k_means(data, k, max_iterations=100):
    centroids = initialize_centroids(data, k)
    
    for _ in range(max_iterations):
        clusters = assign_to_clusters(data, centroids)
        prev_centroids = centroids
        centroids = update_centroids(clusters)
        
        if has_converged(prev_centroids, centroids):
            break
    
    return centroids, clusters

# Example usage
if __name__ == "__main__":
    # Sample data points
    data = [
        [2.000000, 4.000000],
        [3.000000, 3.000000],
        [3.000000, 4.000000],
        [3.000000, 5.000000],
        [4.000000, 3.000000],
        [4.000000, 5.000000],
        [9.000000, 4.000000],
        [9.000000, 5.000000],
        [9.000000, 9.000000],
        [9.000000, 10.000000],
        [10.000000, 4.000000],
        [10.000000, 5.000000],
        [10.000000, 9.000000],
        [10.000000, 10.000000],
        [11.000000, 10.000000],
        [15.000000, 4.000000],
        [15.000000, 5.000000],
        [15.000000, 6.000000],
        [16.000000, 4.000000],
        [16.000000, 5.000000],
        [16.000000, 6.000000]
    ]
    
    # Set the number of clusters
    k = 3
    
    # Run K-Means
    centroids, clusters = k_means(data, k)
    
    # Print the final centroids and clusters
    print("Final centroids:")
    print(centroids)
    
    print("\nClusters:")
    for i, cluster in enumerate(clusters):
        print(f"Cluster {i + 1}:")
        print(cluster)


Final centroids:
[array([4., 4.]), array([12. ,  6.4]), array([2.75, 4.  ])]

Clusters:
Cluster 1:
[[4.0, 3.0], [4.0, 5.0]]
Cluster 2:
[[9.0, 4.0], [9.0, 5.0], [9.0, 9.0], [9.0, 10.0], [10.0, 4.0], [10.0, 5.0], [10.0, 9.0], [10.0, 10.0], [11.0, 10.0], [15.0, 4.0], [15.0, 5.0], [15.0, 6.0], [16.0, 4.0], [16.0, 5.0], [16.0, 6.0]]
Cluster 3:
[[2.0, 4.0], [3.0, 3.0], [3.0, 4.0], [3.0, 5.0]]


In [2]:
import numpy as np
import pandas as pd

In [None]:
data = pd.read_csv("C:\\kmtest.csv")

In [4]:
data


Unnamed: 0,2.000000 4.000000
0,3.000000 3.000000
1,3.000000 4.000000
2,3.000000 5.000000
3,4.000000 3.000000
4,4.000000 5.000000
5,9.000000 4.000000
6,9.000000 5.000000
7,9.000000 9.000000
8,9.000000 10.000000
9,10.000000 4.000000
