In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Read the two header-less CSV files: the sample matrix (used below as one
# row per sample) and the label column, which is flattened to a 1-D array.
_samples_frame = pd.read_csv('data.csv', header=None)
_labels_frame = pd.read_csv('label.csv', header=None)
data = _samples_frame.values
labels = _labels_frame.values.flatten()

def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two vectors.

    Args:
        x1: array-like of numbers.
        x2: array-like of numbers, broadcast-compatible with ``x1``.

    Returns:
        The square root of the summed squared element-wise differences,
        as a NumPy floating-point scalar.
    """
    # Do the arithmetic in floating point: with unsigned or narrow integer
    # dtypes the subtraction and the squaring wrap around silently and
    # yield a wrong distance (e.g. uint8 0 - 200 == 56).
    diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return np.sqrt(np.sum(diff ** 2))

def kmeans(X, K, distance_fn, max_iters=100):
    """Cluster the rows of ``X`` into ``K`` groups with k-means.

    Centroids start as ``K`` distinct rows of ``X`` picked with
    ``np.random.choice``. Each iteration assigns every sample to the
    centroid with the smallest ``distance_fn`` value and then replaces each
    centroid with the mean of its members. Iteration stops when the
    centroids no longer move (``np.allclose``) or after ``max_iters`` rounds.

    Args:
        X: 2-D array-like of shape (n_samples, n_features).
        K: number of clusters; ``np.random.choice`` raises if it exceeds
            the number of samples.
        distance_fn: callable ``(sample, centroid) -> scalar``; smaller
            means closer.
        max_iters: maximum number of assignment/update rounds (at least 1).

    Returns:
        A tuple ``(clusters, centroids)``: ``clusters`` is a list of ``K``
        lists of row indices into ``X``; ``centroids`` is a float array of
        shape (K, n_features).

    Raises:
        ValueError: if ``max_iters`` is smaller than 1.
    """
    if max_iters < 1:
        # Without at least one round there is no assignment to return.
        raise ValueError("max_iters must be at least 1")

    # Work in floating point. Otherwise integer input gives integer
    # centroids (zeros_like inherits the dtype) and every cluster mean is
    # silently truncated when stored.
    X = np.asarray(X, dtype=float)
    n_samples, _ = X.shape  # unpacking also enforces a 2-D input
    centroids = X[np.random.choice(n_samples, K, replace=False)]

    for _ in range(max_iters):
        # Assignment step: each sample joins its nearest centroid.
        clusters = [[] for _ in range(K)]
        for i, x in enumerate(X):
            distances = [distance_fn(x, centroid) for centroid in centroids]
            clusters[np.argmin(distances)].append(i)

        # Update step: move each centroid to the mean of its members.
        new_centroids = np.zeros_like(centroids)
        for idx, cluster in enumerate(clusters):
            if cluster:
                new_centroids[idx] = np.mean(X[cluster], axis=0)
            else:
                # An empty cluster is re-seeded from a random sample.
                new_centroids[idx] = X[np.random.choice(n_samples)]

        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids
    return clusters, centroids

def compute_sse(X, clusters, centroids, distance_fn):
    sse = 0
    for cluster_idx, cluster in enumerate(clusters):
        for sample_idx in cluster:
            sse += distance_fn(X[sample_idx], centroids[cluster_idx])**2
    return sse

# Use one cluster per distinct label value.
K = np.unique(labels).size

# Distance functions to evaluate, keyed by the name shown in the report.
metrics = {"Euclidean": euclidean_distance}

# Cluster the data once per metric and record the resulting SSE.
sse_results = {}
for name in metrics:
    metric = metrics[name]
    clusters, centroids = kmeans(data, K, metric)
    sse = compute_sse(data, clusters, centroids, metric)
    sse_results[name] = sse
    print(f"{name} SSE: {sse}")




Euclidean SSE: 25532057041.0
