In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize

# Read the CSV (pandas defaults, so the first row is taken as the header)
# and keep only the underlying array of values for the clustering code.
data = pd.read_csv("data.csv").values

def euclidean_distance(a, b):
    """Return the norm of ``a - b`` as computed by ``np.linalg.norm``.

    For 1-D vectors this is the ordinary Euclidean distance. Both operands
    must support NumPy-style element-wise subtraction.
    """
    difference = a - b
    return np.linalg.norm(difference)

def cosine_similarity(a, b):
    """Return the cosine *distance* ``1 - cos(a, b)`` between two vectors.

    Despite the name, the result is a dissimilarity: 0 for vectors pointing
    the same way, 1 for orthogonal vectors and 2 for opposite vectors. That
    makes it suitable wherever a distance to be minimised is expected.

    Args:
        a: First vector (1-D NumPy array).
        b: Second vector, same length as ``a``.

    Returns:
        ``1 - dot(a, b) / (||a|| * ||b||)``, or ``1.0`` when either vector
        has zero norm.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # The angle to a zero vector is undefined (0/0 -> NaN). Treat the
        # similarity as 0 so callers comparing distances never see NaN.
        return 1.0
    return 1 - (np.dot(a, b) / norm_product)

def generalized_jaccard_similarity(a, b):
    """Return the generalized Jaccard *distance* between two vectors.

    Computed as ``1 - sum(min(a, b)) / sum(max(a, b))`` with element-wise
    minimum and maximum. Despite the name, the result is a dissimilarity:
    0 for identical vectors and larger for less overlapping ones.

    Args:
        a: First vector (NumPy array).
        b: Second vector, broadcast-compatible with ``a``.

    Returns:
        The distance described above, or ``0.0`` when both sums are zero
        (for example two all-zero vectors).
    """
    numerator = np.sum(np.minimum(a, b))
    denominator = np.sum(np.maximum(a, b))
    if numerator == 0 and denominator == 0:
        # sum(max) - sum(min) equals sum(|a - b|), so both sums being zero
        # means the vectors are identical; return 0 instead of NaN from 0/0.
        return 0.0
    return 1 - (numerator / denominator)

class KMeans:
    """K-means clustering with a pluggable distance function.

    Fitting alternates between assigning every sample to its nearest
    centroid (according to ``distance_func``) and recomputing each centroid
    as the arithmetic mean of the samples assigned to it.

    NOTE(review): centroids are always arithmetic means, whatever
    ``distance_func`` is supplied; confirm that is intended for
    non-Euclidean distances.

    Attributes set by ``fit``:
        centroids: Array with one row per cluster.
        labels: Array with the cluster index of every training sample.
    """

    def __init__(self, n_clusters, max_iter=100, distance_func=euclidean_distance,
                 *, random_state=42):
        """Store the clustering configuration.

        Args:
            n_clusters: Number of clusters (and centroids) to produce.
            max_iter: Upper bound on assignment/update rounds.
            distance_func: Callable ``f(x, centroid)`` returning a value
                that is smaller for closer pairs.
            random_state: Seed for the initial centroid draw. The default,
                42, reproduces the previously hard-coded seed; ``None``
                lets NumPy seed the generator itself.
        """
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.distance_func = distance_func
        self.random_state = random_state

    def fit(self, X):
        """Cluster the rows of ``X``, setting ``centroids`` and ``labels``.

        Args:
            X: Array-like of samples, one per row.

        Raises:
            ValueError: Raised by NumPy's ``choice`` when ``n_clusters``
                exceeds the number of rows in ``X``.
        """
        X = np.asarray(X)

        # A private RandomState yields the same draw as seeding the global
        # generator, without resetting np.random's state for the caller.
        rng = np.random.RandomState(self.random_state)
        initial_rows = rng.choice(X.shape[0], self.n_clusters, replace=False)
        self.centroids = X[initial_rows]

        for _ in range(self.max_iter):
            self.labels = np.array([self._assign_cluster(x) for x in X])
            new_centroids = self._updated_centroids(X)

            # Once the centroids stop moving, every further round would
            # reproduce the same labels and centroids, so stop early.
            converged = np.array_equal(new_centroids, self.centroids)
            self.centroids = new_centroids
            if converged:
                break

    def _updated_centroids(self, X):
        """Return the per-cluster means of ``X`` under the current labels."""
        updated = []
        for k in range(self.n_clusters):
            members = X[self.labels == k]
            if members.shape[0] == 0:
                # The mean of an empty cluster is NaN and would poison all
                # later distance comparisons; keep the previous centroid.
                updated.append(self.centroids[k])
            else:
                updated.append(members.mean(axis=0))
        return np.array(updated)

    def _assign_cluster(self, x):
        """Return the index of the centroid closest to sample ``x``."""
        distances = [self.distance_func(x, centroid) for centroid in self.centroids]
        return np.argmin(distances)

    def sse(self, X):
        """Return the sum of squared distances from samples to their centroid.

        Uses the labels stored by the last ``fit`` call, so ``X`` is expected
        to be the data that was fitted (rows are paired with labels in order).
        """
        return np.sum([
            self.distance_func(x, self.centroids[label]) ** 2
            for x, label in zip(X, self.labels)
        ])

# Rescale the samples with normalize() using its default arguments.
data = normalize(data)

n_clusters = 3


def _fit_and_score(distance_func):
    """Fit a KMeans model on ``data`` and return ``(model, sse)``."""
    model = KMeans(n_clusters=n_clusters, max_iter=100, distance_func=distance_func)
    model.fit(data)
    return model, model.sse(data)


# One model per distance measure; the fitted models and their SSE values
# stay available under the same module-level names as before.
kmeans_euclidean, sse_euclidean = _fit_and_score(euclidean_distance)
kmeans_cosine, sse_cosine = _fit_and_score(cosine_similarity)
kmeans_jaccard, sse_jaccard = _fit_and_score(generalized_jaccard_similarity)

print(f"SSE with Euclidean distance: {sse_euclidean}")
print(f"SSE with Cosine similarity: {sse_cosine}")
print(f"SSE with Generalized Jaccard similarity: {sse_jaccard}")



SSE with Euclidean distance: 5219.103325530017
SSE with Cosine similarity: 1030.683247360933
SSE with Generalized Jaccard similarity: 4543.85763272726
