In [27]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.preprocessing import MinMaxScaler

# --------------- Load your dataset ---------------
X = pd.read_csv("data.csv").to_numpy()
y = pd.read_csv("label.csv").to_numpy().ravel()
K = len(np.unique(y))

# Scale features to [0,1] for Jaccard
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# --------------- Distance functions ---------------
def euclidean_distance(a, b):
    return np.linalg.norm(a - b)

def cosine_distance(a, b):
    sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    return 1 - sim  # 1 - cosine similarity

def jaccard_distance(a, b):
    mins = np.minimum(a, b)
    maxs = np.maximum(a, b)
    denom = np.sum(maxs)
    if denom == 0:
        return 0
    sim = np.sum(mins) / denom
    return 1 - sim  # 1 - generalized Jaccard similarity

# --------------- Helper functions ---------------
def assign_clusters(X, centroids, dist_func):
    labels = []
    for x in X:
        dists = [dist_func(x, c) for c in centroids]
        labels.append(np.argmin(dists))
    return np.array(labels)

def update_centroids(X, labels, K):
    centroids = []
    for k in range(K):
        cluster_points = X[labels == k]
        if len(cluster_points) > 0:
            centroids.append(np.mean(cluster_points, axis=0))
        else:
            centroids.append(X[np.random.randint(0, len(X))])
    return np.array(centroids)

def compute_sse(X, labels, centroids, dist_func):
    sse = 0
    for i in range(len(X)):
        sse += dist_func(X[i], centroids[labels[i]]) ** 2
    return sse

def majority_vote(y_true, y_pred, K):
    mapping = {}
    for k in range(K):
        idx = np.where(y_pred == k)[0]
        if len(idx) == 0:
            mapping[k] = np.random.choice(y_true)
        else:
            mapping[k] = Counter(y_true[idx]).most_common(1)[0][0]
    return np.array([mapping[label] for label in y_pred])

# --------------- K-Means algorithm ---------------
def kmeans(X, y, K, dist_func, max_iter=500):
    np.random.seed(42)
    centroids = X[np.random.choice(len(X), K, replace=False)]
    prev_sse = np.inf
    for it in range(max_iter):
        labels = assign_clusters(X, centroids, dist_func)
        new_centroids = update_centroids(X, labels, K)
        sse = compute_sse(X, labels, new_centroids, dist_func)
        if np.allclose(new_centroids, centroids) or sse > prev_sse:
            break
        centroids = new_centroids
        prev_sse = sse
    y_pred = majority_vote(y, labels, K)
    accuracy = np.mean(y_pred == y)
    return sse, accuracy, it + 1

# --------------- Run K-Means for all three metrics ---------------
results = {}
for name, dist_func in {
    "Euclidean": euclidean_distance,
    "Cosine": cosine_distance,
    "Jaccard": jaccard_distance
}.items():
    sse, acc, iters = kmeans(X, y, K, dist_func)
    results[name] = {"SSE": sse, "Accuracy": acc, "Iterations": iters}

# Display results
print("Results of K-Means from Scratch:")
for method, vals in results.items():
    print(f"{method:10s} -> SSE: {vals['SSE']:.4f}, Accuracy: {vals['Accuracy']:.4f}, Iterations: {vals['Iterations']}")

Results of K-Means from Scratch:
Euclidean  -> SSE: 389497.8949, Accuracy: 0.6005, Iterations: 66
Cosine     -> SSE: 697.9888, Accuracy: 0.6387, Iterations: 27
Jaccard    -> SSE: 3676.1825, Accuracy: 0.6278, Iterations: 23


In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load your uploaded dataset
X = pd.read_csv("data.csv").to_numpy()
y = pd.read_csv("label.csv").to_numpy().ravel()

# Min–Max scaling to [0,1] for Jaccard
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

K = len(np.unique(y))
def kmeans_vectorized_modes(X_arr, K, mode="euclidean", stop_mode="no_centroid_change", 
                            max_iters_default=500, init_indices=None):
    import numpy as np
    def pairwise_euclidean_sq(X_arr, C):
        X2 = np.sum(X_arr * X_arr, axis=1, keepdims=True)
        C2 = np.sum(C * C, axis=1, keepdims=True).T
        XC = X_arr @ C.T
        D2 = X2 + C2 - 2.0 * XC
        D2[D2 < 0] = 0.0
        return D2

    def pairwise_cosine_distance(X_arr, C):
        X_norm = np.linalg.norm(X_arr, axis=1, keepdims=True)
        C_norm = np.linalg.norm(C, axis=1, keepdims=True).T
        Xn = np.where(X_norm == 0, 1.0, X_norm)
        Cn = np.where(C_norm == 0, 1.0, C_norm)
        sims = (X_arr @ C.T) / (Xn * Cn)
        sims = np.clip(sims, -1.0, 1.0)
        return 1.0 - sims

    def pairwise_gen_jaccard_distance(X_arr, C):
        n, d = X_arr.shape
        K = C.shape[0]
        D = np.empty((n, K))
        for k in range(K):
            mins = np.minimum(X_arr, C[k])
            maxs = np.maximum(X_arr, C[k])
            denom = np.sum(maxs, axis=1)
            denom = np.where(denom == 0.0, 1.0, denom)
            sim = np.sum(mins, axis=1) / denom
            D[:, k] = 1.0 - sim
        return D

    def distance_matrix(X_arr, C, mode):
        if mode == "euclidean":
            return np.sqrt(pairwise_euclidean_sq(X_arr, C))
        elif mode == "cosine":
            return pairwise_cosine_distance(X_arr, C)
        elif mode == "jaccard":
            return pairwise_gen_jaccard_distance(X_arr, C)

    n, d = X_arr.shape
    if init_indices is None:
        rng = np.random.default_rng(42)
        init_indices = rng.choice(n, size=K, replace=False)
    C = X_arr[init_indices].astype(float).copy()

    prev_sse = np.inf
    n_iter = 0
    max_iters = 100 if stop_mode == "max_iter_100" else max_iters_default

    for it in range(1, max_iters + 1):
        D = distance_matrix(X_arr, C, mode)
        labels = np.argmin(D, axis=1)
        newC = np.zeros_like(C)
        for k in range(K):
            idx = np.where(labels == k)[0]
            if len(idx) == 0:
                ridx = np.random.randint(0, n)
                newC[k] = X_arr[ridx]
            else:
                newC[k] = X_arr[idx].mean(axis=0)
        move = np.linalg.norm(newC - C)
        C = newC
        chosen = D[np.arange(n), labels]
        sse = np.sum(chosen ** 2)
        n_iter = it
        if stop_mode == "no_centroid_change" and move == 0.0:
            break
        elif stop_mode == "sse_increase" and sse > prev_sse:
            break
        prev_sse = sse

    return {"SSE": sse, "iterations": n_iter}


# assuming: X_scaled (min–max), y, K, and a kmeans function that accepts stop modes
modes = ["euclidean", "cosine", "jaccard"]
stops = ["no_centroid_change", "sse_increase", "max_iter_100"]

rows = []
for m in modes:
    for s in stops:
        out = kmeans_vectorized_modes(X_scaled, K, mode=m, stop_mode=s, max_iters_default=500)
        rows.append({"metric": m, "stop_condition": s, "SSE": out["SSE"], "iterations": out["iterations"]})

import pandas as pd
df_q4 = pd.DataFrame(rows).pivot(index="metric", columns="stop_condition", values="SSE")
print(df_q4.round(4))
out = kmeans_vectorized_modes(X_scaled, K, mode=m, stop_mode=s, max_iters_default=500)


stop_condition  max_iter_100  no_centroid_change  sse_increase
metric                                                        
cosine              697.3234            697.3234      696.3731
euclidean        390193.0999         390193.0999   390193.0999
jaccard            3660.8293           3660.8293     4145.4241
