# In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import time

# Load the feature matrix from a header-less CSV as a NumPy array; its rows
# are treated as individual samples by kmeans_with_criteria below.
data = pd.read_csv('data.csv', header=None).values
# Load the labels from a header-less CSV and flatten them to a 1-D array.
# NOTE(review): presumably one label per row of `data` — confirm against the files.
labels = pd.read_csv('label.csv', header=None).values.flatten()

def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two vectors.

    Args:
        x1: First vector, as a NumPy array.
        x2: Second vector, broadcast-compatible with ``x1``.

    Returns:
        The square root of the sum of squared element-wise differences.
    """
    delta = x1 - x2
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)

def cosine_distance(x1, x2):
    """Return the cosine distance ``1 - cos(x1, x2)`` between two 1-D vectors.

    The similarity is computed directly with NumPy. This function is called
    once per (sample, centroid) pair inside the clustering loop, so routing
    every call through a general pairwise-matrix routine (with its input
    wrapping and validation) adds overhead that dwarfs the arithmetic itself.

    Args:
        x1: First vector (1-D array-like of numbers).
        x2: Second vector, same length as ``x1``.

    Returns:
        A value in roughly ``[0, 2]``: 0 for vectors pointing the same way,
        1 for orthogonal vectors, 2 for opposite vectors. If either vector
        has zero norm the similarity is taken to be 0, so the distance is 1.0.
    """
    x1 = np.asarray(x1, dtype=float)
    x2 = np.asarray(x2, dtype=float)
    norm_product = np.linalg.norm(x1) * np.linalg.norm(x2)
    if norm_product == 0:
        # The angle to a zero vector is undefined; treat the similarity as 0
        # instead of dividing by zero and propagating NaN into argmin.
        return 1.0
    return 1.0 - np.dot(x1, x2) / norm_product

def jaccard_distance(x1, x2):
    """Return the generalized (weighted) Jaccard distance between two vectors.

    The distance is ``1 - sum(min(x1, x2)) / sum(max(x1, x2))``, taken
    element-wise. The measure is meaningful for non-negative entries.

    Args:
        x1: First vector (array-like of numbers).
        x2: Second vector, broadcast-compatible with ``x1``.

    Returns:
        0 for identical non-negative vectors, 1 for vectors with disjoint
        support. When the union term is zero (for non-negative input this
        means both vectors are all zeros) the vectors are treated as
        identical and 0.0 is returned.
    """
    intersection = np.sum(np.minimum(x1, x2))
    union = np.sum(np.maximum(x1, x2))
    if union == 0:
        # 0/0 would yield NaN (plus a RuntimeWarning), and NaN distances
        # corrupt the nearest-centroid argmin downstream.
        return 0.0
    return 1 - intersection / union

def kmeans_with_criteria(X, K, distance_fn, max_iters=500):
    """Cluster the rows of ``X`` into ``K`` groups using a pluggable distance.

    Each iteration assigns every sample to its nearest centroid under
    ``distance_fn`` and then recomputes each centroid as the mean of its
    members. An empty cluster is re-seeded with a randomly chosen sample.
    Iteration stops as soon as one of the following holds:

    * the recomputed centroids are numerically equal to the current ones,
    * the SSE of the current assignment is larger than the previous one,
    * ``max_iters`` iterations have been performed.

    Args:
        X: 2-D array-like of shape (n_samples, n_features). It is converted
            to floating point so that centroid means are not truncated when
            the input holds integers.
        K: Number of clusters; initial centroids are ``K`` distinct rows of
            ``X`` drawn with ``np.random.choice``.
        distance_fn: Callable ``(sample, centroid) -> number``.
        max_iters: Upper bound on the number of iterations.

    Returns:
        Tuple ``(clusters, centroids, iterations, total_time)``:
        ``clusters`` is a list of ``K`` lists of sample indices from the last
        assignment step; ``centroids`` is a ``(K, n_features)`` float array;
        ``iterations`` is the number of iterations executed (0 when
        ``max_iters <= 0``); ``total_time`` is the elapsed time in seconds.
        When a stopping criterion fires, ``centroids`` are the ones the
        returned ``clusters`` were assigned against; when ``max_iters`` is
        exhausted they are the means computed from that last assignment.
    """
    # Work in floating point: with integer input the centroid buffers would
    # inherit the integer dtype and every cluster mean would be truncated.
    X = np.asarray(X, dtype=float)
    n_samples, _ = X.shape
    centroids = X[np.random.choice(n_samples, K, replace=False)]
    prev_sse = float('inf')
    # Defined before the loop so max_iters <= 0 still returns a valid result.
    clusters = [[] for _ in range(K)]
    n_iterations = 0
    start_time = time.perf_counter()

    for iteration in range(max_iters):
        n_iterations = iteration + 1
        clusters = [[] for _ in range(K)]
        # SSE of this assignment against the current centroids. It is
        # accumulated here because the distance to the chosen centroid is
        # already known, which avoids a second pass of distance evaluations.
        sse = 0.0
        for i, x in enumerate(X):
            distances = [distance_fn(x, centroid) for centroid in centroids]
            cluster_idx = int(np.argmin(distances))
            clusters[cluster_idx].append(i)
            sse += distances[cluster_idx] ** 2

        new_centroids = np.zeros_like(centroids)
        for idx, cluster in enumerate(clusters):
            if cluster:
                new_centroids[idx] = np.mean(X[cluster], axis=0)
            else:
                # Re-seed an empty cluster with a random sample.
                new_centroids[idx] = X[np.random.choice(n_samples)]

        if np.allclose(centroids, new_centroids) or sse > prev_sse:
            break

        centroids = new_centroids
        prev_sse = sse

    total_time = time.perf_counter() - start_time
    return clusters, centroids, n_iterations, total_time

def compute_sse(X, clusters, centroids, distance_fn):
    """Return the sum of squared distances from samples to their centroids.

    Args:
        X: Indexable collection of samples.
        clusters: Sequence of index lists; ``clusters[k]`` holds the indices
            into ``X`` of the samples assigned to centroid ``k``.
        centroids: Indexable collection of centroids, aligned with ``clusters``.
        distance_fn: Callable ``(sample, centroid) -> number``.

    Returns:
        The accumulated squared distance; ``0`` when every cluster is empty.
    """
    return sum(
        distance_fn(X[member], centroids[label]) ** 2
        for label, members in enumerate(clusters)
        for member in members
    )

# Distance functions under comparison, keyed by the name used in the report.
metrics = dict(
    Euclidean=euclidean_distance,
    Cosine=cosine_distance,
    Jaccard=jaccard_distance,
)

# One cluster per distinct label value; every run shares the same iteration cap.
K = np.unique(labels).size
max_iters = 500
results = {}

# Run K-means once per metric, recording iteration count and elapsed time.
for name, metric in metrics.items():
    print(f"Running {name} K-means...")
    clusters, centroids, iterations, total_time = kmeans_with_criteria(
        data, K, metric, max_iters=max_iters
    )
    results[name] = dict(iterations=iterations, time=total_time)
    print(f"{name} K-means: Iterations = {iterations}, Time = {total_time:.2f} seconds\n")

# Pick the metric with the largest value of each recorded statistic
# (on ties, the first one in insertion order wins).
most_iterations = max(results.items(), key=lambda entry: entry[1]['iterations'])[0]
most_time = max(results.items(), key=lambda entry: entry[1]['time'])[0]

print(f"The method requiring the most iterations: {most_iterations} with {results[most_iterations]['iterations']} iterations.")
print(f"The method requiring the most time: {most_time} with {results[most_time]['time']:.2f} seconds.")
