In [19]:
import numpy as np
import sklearn as sk

In [20]:
# Data: eight two-dimensional points, one per row.
# A row's position in the array is the point's index used throughout the notebook.
X_cluster_data = np.array(
    [
        (2, 10),  # Point 0
        (2, 5),   # Point 1
        (8, 4),   # Point 2
        (5, 8),   # Point 3
        (7, 5),   # Point 4
        (6, 4),   # Point 5
        (1, 2),   # Point 6
        (4, 9),   # Point 7
    ]
)

In [21]:
# 4.1 Manual K-Means Iterations

In [22]:
# Seed the two centroids with the first two rows of the dataset (points 0 and 1).
C1, C2 = X_cluster_data[0], X_cluster_data[1]  # [2, 10] and [2, 5]

In [23]:
# Assignment step: label every point with the nearer of the two centroids,
# using Euclidean distance, and print the distances next to the decision.
assignments = []
for i, point in enumerate(X_cluster_data):
    d_C1, d_C2 = (np.linalg.norm(point - centroid) for centroid in (C1, C2))

    # Only a strictly smaller distance selects cluster 1, so a tie goes to cluster 2.
    assignments.append(1 if d_C1 < d_C2 else 2)

    print(f"Point {i} = {point}, Distance to C1: {d_C1:.2f}, Distance to C2: {d_C2:.2f} => Assigned to Cluster {assignments[-1]}")

Point 0 = [ 2 10], Distance to C1: 0.00, Distance to C2: 5.00 => Assigned to Cluster 1
Point 1 = [2 5], Distance to C1: 5.00, Distance to C2: 0.00 => Assigned to Cluster 2
Point 2 = [8 4], Distance to C1: 8.49, Distance to C2: 6.08 => Assigned to Cluster 2
Point 3 = [5 8], Distance to C1: 3.61, Distance to C2: 4.24 => Assigned to Cluster 1
Point 4 = [7 5], Distance to C1: 7.07, Distance to C2: 5.00 => Assigned to Cluster 2
Point 5 = [6 4], Distance to C1: 7.21, Distance to C2: 4.12 => Assigned to Cluster 2
Point 6 = [1 2], Distance to C1: 8.06, Distance to C2: 3.16 => Assigned to Cluster 2
Point 7 = [4 9], Distance to C1: 2.24, Distance to C2: 4.47 => Assigned to Cluster 1


In [24]:
# Update step: each new centroid is the column-wise mean of the points
# currently assigned to that cluster.
assignment_array = np.asarray(assignments)  # convert once so both masks share it
cluster1_points = X_cluster_data[assignment_array == 1]
cluster2_points = X_cluster_data[assignment_array == 2]

new_C1 = cluster1_points.mean(axis=0)
new_C2 = cluster2_points.mean(axis=0)

print("Updated Centroids:")
print("New Centroid 1 (C1):", new_C1)
print("New Centroid 2 (C2):", new_C2)

Updated Centroids:
New Centroid 1 (C1): [3.66666667 9.        ]
New Centroid 2 (C2): [4.8 4. ]


In [25]:
# 4.2 Using Scikit-learn

In [26]:
from sklearn.cluster import KMeans

# Re-declares the same eight points used in the manual section, in the same order,
# so row indices still correspond to point numbers 0-7.
X_cluster_data = np.array(
    [(2, 10), (2, 5), (8, 4), (5, 8), (7, 5), (6, 4), (1, 2), (4, 9)]
)

In [29]:
# Fit a 2-cluster K-Means model on the dataset. random_state is fixed so the
# result is repeatable; fit() returns the estimator, so it can be bound directly.
kmeans = KMeans(n_clusters=2, random_state=42, n_init='auto').fit(X_cluster_data)

# One row per cluster; the row index is the cluster label.
print("Final Centroids:")
print(kmeans.cluster_centers_)


Final Centroids:
[[5.5  3.75]
 [3.25 8.  ]]


In [28]:
# Report which cluster the fitted model assigned to each point, and keep
# the fitted results under shorter names.
print("\nCluster Labels Assigned to Each Point:")
cluster_labels = kmeans.labels_
for i, label in enumerate(cluster_labels):
    print(f"Point {i} = {X_cluster_data[i]} → Cluster {label}")

final_centroids = kmeans.cluster_centers_


Cluster Labels Assigned to Each Point:
Point 0 = [ 2 10] → Cluster 1
Point 1 = [2 5] → Cluster 1
Point 2 = [8 4] → Cluster 0
Point 3 = [5 8] → Cluster 1
Point 4 = [7 5] → Cluster 0
Point 5 = [6 4] → Cluster 0
Point 6 = [1 2] → Cluster 0
Point 7 = [4 9] → Cluster 1


In [None]:
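# Interpretation:
# The centroids calculated from the manual iteration and by scikit-learn are different.
# Manual (after one iteration) - C1 ≈ [3.67, 9.0], C2 ≈ [4.8, 4.0] | Scikit (final) - C1 ≈ [3.25, 8.0], C2 ≈ [5.5, 3.75]
# Note: scikit-learn numbers its clusters arbitrarily; in the output above, [3.25, 8.0] is cluster 1 and [5.5, 3.75] is cluster 0.
# The manual calculation performs only a single assignment/update step, starting from points 0 and 1 as the initial centroids,
# so its centroids are intermediate values, not final ones.
# Scikit-learn's KMeans chooses its own initial centroids (k-means++ by default) and runs multiple iterations (until convergence).
# It keeps recalculating centroids and reassigning points each time, minimizing the within-cluster variance (inertia).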