In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity

In [2]:


class KMeans:
    """K-means clustering with a selectable point-to-centroid metric.

    Points are assigned to their nearest centroid under ``similarity``
    ('euclidean', 'cosine' or 'jaccard').  Whatever metric is chosen,
    centroids are the arithmetic mean of their members and the reported
    SSE is the sum of squared Euclidean deviations.

    Args:
        num_clusters: Number of clusters to form.
        similarity: Metric used for the assignment step.
        max_iters: Upper bound on the number of assign/update rounds.
        tol: Centroid-shift threshold (Frobenius norm) below which the
            run is considered converged.
    """

    def __init__(self, num_clusters, similarity='euclidean', max_iters=100, tol=1e-4):
        self.num_clusters = num_clusters
        self.similarity = similarity
        self.max_iters = max_iters
        self.tol = tol

    @staticmethod
    def jaccard_similarity(a, b):
        """Return sum(min(a, b)) / sum(max(a, b)), or 0 when the denominator is 0.

        NOTE: this weighted form is only bounded to [0, 1] when both
        vectors are non-negative.
        """
        intersection = np.sum(np.minimum(a, b))
        union = np.sum(np.maximum(a, b))
        return intersection / union if union != 0 else 0

    def _compute_distances(self, X, centroids):
        """Return the (n_samples, n_centroids) distance matrix for ``self.similarity``.

        Raises:
            ValueError: If ``self.similarity`` is not a supported metric.
        """
        if self.similarity == 'euclidean':
            return pairwise_distances(X, centroids, metric='euclidean')
        if self.similarity == 'cosine':
            return 1 - cosine_similarity(X, centroids)
        if self.similarity == 'jaccard':
            # One vectorised pass per centroid instead of a Python-level loop
            # over every (sample, centroid) pair.
            columns = []
            for centroid in centroids:
                intersection = np.minimum(X, centroid).sum(axis=1)
                union = np.maximum(X, centroid).sum(axis=1)
                # Same convention as jaccard_similarity: similarity is 0
                # (distance 1) wherever the denominator is 0.
                similarity = np.divide(
                    intersection,
                    union,
                    out=np.zeros(union.shape, dtype=float),
                    where=union != 0,
                )
                columns.append(1.0 - similarity)
            return np.column_stack(columns)
        # Previously an unknown metric fell through and returned None, which
        # only surfaced later as an obscure failure in the caller.
        raise ValueError(
            f"Unsupported similarity {self.similarity!r}; "
            "expected 'euclidean', 'cosine' or 'jaccard'"
        )

    def _update_centroids(self, X, labels):
        """Return the mean of each cluster's members as the new centroids.

        A cluster with no members is re-seeded with a randomly chosen row
        of ``X``.
        """
        new_centroids = []
        for i in range(self.num_clusters):
            cluster_points = X[labels == i]
            if len(cluster_points) > 0:
                new_centroid = cluster_points.mean(axis=0)
            else:
                new_centroid = X[np.random.choice(len(X))]
            new_centroids.append(new_centroid)
        return np.array(new_centroids)

    def _calculate_sse(self, X, centroids, labels):
        """Return the sum of squared Euclidean deviations of points from their centroid."""
        sse = 0
        for i in range(self.num_clusters):
            cluster_points = X[labels == i]
            sse += np.sum((cluster_points - centroids[i])**2)
        return sse

    def _check_convergence(self, new_centroids, centroids, sse, prev_sse, iteration):
        """Return True when any stopping rule fires.

        The rules are: centroid shift below ``tol``, SSE larger than in the
        previous round, or ``iteration`` being the last permitted one.
        """
        norm_diff = np.linalg.norm(new_centroids - centroids)
        return (norm_diff < self.tol) or (sse > prev_sse) or (iteration == self.max_iters - 1)

    def fit(self, X):
        """Cluster the rows of ``X``.

        Initial centroids are ``num_clusters`` distinct rows drawn with
        ``np.random.choice``; seed NumPy's global RNG for reproducible runs.

        Args:
            X: Array-like of shape (n_samples, n_features).

        Returns:
            Tuple ``(centroids, cluster_assignments, sse, n_iterations)``.
            ``sse`` is measured against the returned ``centroids`` and
            ``cluster_assignments``.

        Raises:
            ValueError: If ``max_iters`` is smaller than 1, or (from NumPy)
                if ``num_clusters`` exceeds the number of rows in ``X``.
        """
        if self.max_iters < 1:
            # The loop below would never run and the return statement would
            # fail on unbound locals.
            raise ValueError("max_iters must be at least 1")

        X = np.asarray(X)
        seed_rows = np.random.choice(X.shape[0], self.num_clusters, replace=False)
        centroids = X[seed_rows]
        prev_sse = np.inf

        for current_iteration in range(self.max_iters):
            distances = self._compute_distances(X, centroids)
            cluster_assignments = np.argmin(distances, axis=1)
            new_centroids = self._update_centroids(X, cluster_assignments)
            sse = self._calculate_sse(X, new_centroids, cluster_assignments)

            stop = self._check_convergence(
                new_centroids, centroids, sse, prev_sse, current_iteration
            )

            # Adopt the updated centroids before leaving the loop so the
            # returned centroids are the ones ``sse`` was computed against
            # (the stale pre-update centroids used to be returned).
            prev_sse = sse
            centroids = new_centroids
            if stop:
                break

        return centroids, cluster_assignments, sse, current_iteration + 1

# Load the feature matrix and the reference labels from disk.
# NOTE(review): the label column is addressed as '7' further down, which looks
# like a data value that was promoted to a header. If these CSVs have no
# header row, read_csv is consuming the first sample as column names --
# confirm against the files and pass header=None if so.
data_file_path = '/Users/asish/Documents/F drive/Future/CSE572_DM/Homework/HW3/kmeans_data/data.csv'
label_file_path = '/Users/asish/Documents/F drive/Future/CSE572_DM/Homework/HW3/kmeans_data/label.csv'

data_df = pd.read_csv(data_file_path)
label_df = pd.read_csv(label_file_path)

data_features = data_df.values
ground_truth_labels = label_df.values

# Standardise every feature column before clustering.
scaler = StandardScaler()
standardized_features = scaler.fit(data_features).transform(data_features)

# One cluster per distinct reference label.
num_clusters = label_df['7'].unique().size
max_iters = 100

# One model per assignment metric, all sharing the same cluster count and
# iteration cap.
kmeans_euclidean = KMeans(num_clusters=num_clusters, similarity='euclidean', max_iters=max_iters)
kmeans_cosine = KMeans(num_clusters=num_clusters, similarity='cosine', max_iters=max_iters)
kmeans_jaccard = KMeans(num_clusters=num_clusters, similarity='jaccard', max_iters=max_iters)

# Fit in the order Euclidean -> cosine -> Jaccard; the order matters because
# every fit draws its initial centroids from NumPy's global random state.
(centroids_euclidean,
 cluster_assignments_euclidean,
 sse_euclidean,
 iterations_euclidean) = kmeans_euclidean.fit(standardized_features)

(centroids_cosine,
 cluster_assignments_cosine,
 sse_cosine,
 iterations_cosine) = kmeans_cosine.fit(standardized_features)

(centroids_jaccard,
 cluster_assignments_jaccard,
 sse_jaccard,
 iterations_jaccard) = kmeans_jaccard.fit(standardized_features)

# Report the final SSE and the number of iterations for each metric.
for caption, figure in (
    ("SSE (Euclidean):", sse_euclidean),
    ("Iterations (Euclidean):", iterations_euclidean),
    ("SSE (Cosine):", sse_cosine),
    ("Iterations (Cosine):", iterations_cosine),
    ("SSE (Jaccard):", sse_jaccard),
    ("Iterations (Jaccard):", iterations_jaccard),
):
    print(caption, figure)


SSE (Euclidean): 5554660.746840994
Iterations (Euclidean): 53
SSE (Cosine): 5601495.973329594
Iterations (Cosine): 40
SSE (Jaccard): 6410324.038038823
Iterations (Jaccard): 3
