# In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter


# Feature matrix: the CSV has no header row; each row is treated as one
# sample by the clustering code.
feature_table = pd.read_csv('data.csv', header=None)
data = feature_table.values

# Ground-truth labels, flattened from the CSV's 2-D layout into a 1-D array.
label_table = pd.read_csv('label.csv', header=None)
labels = label_table.values.flatten()


def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    The inputs must support element-wise subtraction (e.g. NumPy arrays of
    matching shape).
    """
    delta = x1 - x2
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)

def cosine_distance(x1, x2):
    """Return the cosine distance ``1 - cosine_similarity`` of two vectors.

    Each vector is wrapped in a list so that ``cosine_similarity`` receives
    two single-row inputs; the only entry of the resulting matrix is the
    similarity of ``x1`` and ``x2``.
    """
    similarity_matrix = cosine_similarity([x1], [x2])
    similarity = similarity_matrix[0, 0]
    return 1 - similarity

def jaccard_distance(x1, x2):
    """Return the generalized (weighted) Jaccard distance between two vectors.

    The distance is ``1 - sum(min(x1, x2)) / sum(max(x1, x2))`` with the
    minimum and maximum taken element-wise.

    Args:
        x1: First vector (array-like).
        x2: Second vector, broadcast-compatible with ``x1``.

    Returns:
        The distance as a float. When the sum of element-wise maxima is
        zero (e.g. both vectors are all zeros) the vectors are treated as
        identical and ``0.0`` is returned.

    Note:
        NOTE(review): the formula is only meaningful for non-negative
        inputs; this function does not check for that.
    """
    intersection = np.sum(np.minimum(x1, x2))
    union = np.sum(np.maximum(x1, x2))
    # Guard against 0/0: it would yield NaN (plus a RuntimeWarning), and a
    # NaN distance corrupts any later argmin over distances.
    if union == 0:
        return 0.0
    return 1 - (intersection / union)


def kmeans(X, K, distance_fn, max_iters=100):
    """Cluster the rows of ``X`` into ``K`` groups.

    Each iteration assigns every row to the centroid with the smallest
    ``distance_fn`` value, then replaces each centroid with the arithmetic
    mean of its assigned rows. The mean update is used regardless of which
    distance function is supplied.

    Args:
        X: 2-D array-like of shape ``(n_samples, n_features)``.
        K: Number of clusters; must not exceed ``n_samples`` because the
            initial centroids are ``K`` distinct rows drawn without
            replacement.
        distance_fn: Callable ``distance_fn(row, centroid)`` returning a
            scalar distance.
        max_iters: Maximum number of assign/update iterations.

    Returns:
        ``(clusters, centroids)`` where ``clusters`` is a list of ``K``
        lists of row indices and ``centroids`` is a ``(K, n_features)``
        floating-point array. A cluster may be empty. If ``max_iters`` is
        reached without convergence, ``clusters`` reflects the assignment
        made before the final centroid update.

    Note:
        Initialization and empty-cluster re-seeding draw from NumPy's
        global random state, so results vary between runs unless the
        caller seeds it.
    """
    X = np.asarray(X)
    n_samples, _ = X.shape

    # Centroids are means, so they need a floating dtype. With integer X,
    # an integer centroid array would silently truncate every mean written
    # into it.
    if np.issubdtype(X.dtype, np.inexact):
        centroid_dtype = X.dtype
    else:
        centroid_dtype = np.float64

    seed_rows = np.random.choice(n_samples, K, replace=False)
    centroids = X[seed_rows].astype(centroid_dtype)

    # Defined up front so the return value is valid even if max_iters <= 0.
    clusters = [[] for _ in range(K)]
    for _ in range(max_iters):
        # Assignment step: each row goes to its nearest centroid.
        clusters = [[] for _ in range(K)]
        for i, x in enumerate(X):
            distances = [distance_fn(x, centroid) for centroid in centroids]
            clusters[np.argmin(distances)].append(i)

        # Update step: mean of members, or a random row for an empty cluster.
        new_centroids = np.zeros_like(centroids)
        for idx, cluster in enumerate(clusters):
            if cluster:
                new_centroids[idx] = np.mean(X[cluster], axis=0)
            else:
                new_centroids[idx] = X[np.random.choice(n_samples)]

        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids
    return clusters, centroids

def assign_cluster_labels(clusters, labels):
    """Map each cluster index to the most common true label among its members.

    Args:
        clusters: Sequence of clusters, each a list of sample indices.
        labels: 1-D array-like of ground-truth labels indexed by sample.

    Returns:
        Dict ``{cluster_index: label}`` with one entry per cluster. A
        cluster with no members maps to ``None`` (there is nothing to vote
        on). When several labels tie, the one encountered first among the
        cluster's members wins.
    """
    labels = np.asarray(labels)
    cluster_labels = {}
    for cluster_idx, cluster in enumerate(clusters):
        if len(cluster) == 0:
            # An empty cluster has no majority; most_common(1) would return
            # [] and indexing it would raise IndexError.
            cluster_labels[cluster_idx] = None
            continue
        cluster_labels[cluster_idx] = Counter(labels[cluster]).most_common(1)[0][0]
    return cluster_labels


def compute_accuracy(clusters, cluster_labels, labels):
    """Return the fraction of samples whose true label matches their cluster's label.

    Args:
        clusters: Sequence of clusters, each a list of sample indices.
        cluster_labels: Mapping from cluster index to the label predicted
            for every member of that cluster.
        labels: Ground-truth labels indexed by sample.

    Returns:
        ``correct / total`` over all samples that appear in ``clusters``,
        or ``0.0`` when no samples are assigned at all.
    """
    correct = 0
    total = 0
    for cluster_idx, cluster in enumerate(clusters):
        predicted_label = cluster_labels[cluster_idx]
        correct += sum(
            1 for sample_idx in cluster if labels[sample_idx] == predicted_label
        )
        total += len(cluster)
    # Avoid ZeroDivisionError when every cluster is empty.
    if total == 0:
        return 0.0
    return correct / total

# Use one cluster per distinct ground-truth label.
K = len(np.unique(labels))

# Display name -> distance function handed to kmeans.
metrics = {
    "Euclidean": euclidean_distance,
    "Cosine": cosine_distance,
    "Jaccard": jaccard_distance
}

# Run K-means once per metric and score it by majority-vote label accuracy.
accuracy_results = {}
for name in metrics:
    metric = metrics[name]
    clusters, centroids = kmeans(data, K, metric)
    cluster_labels = assign_cluster_labels(clusters, labels)
    accuracy = compute_accuracy(clusters, cluster_labels, labels)
    accuracy_results[name] = accuracy
    print(f"{name} Accuracy: {accuracy:.4f}")

# Report the metric with the highest accuracy (first one wins on a tie).
best_metric = max(
    accuracy_results,
    key=lambda metric_name: accuracy_results[metric_name],
)
print(f"The best metric is {best_metric} with accuracy = {accuracy_results[best_metric]:.4f}")


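# Output of one run (initial centroids are chosen at random, so the values
# vary between runs):
# Euclidean Accuracy: 0.6037
# Cosine Accuracy: 0.6149
# Jaccard Accuracy: 0.6281
# The best metric is Jaccard with accuracy = 0.6281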
