In [7]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity
from scipy.stats import mode

class DataLoader:
    """Load a feature table and its label column from two CSV files.

    Parameters
    ----------
    data_path : str or path-like
        Location of the CSV file holding the feature matrix.
    label_path : str or path-like
        Location of the CSV file holding the labels.
    header : int, list of int, None or 'infer', optional
        Forwarded to ``pandas.read_csv`` for both files. The default
        (``'infer'``) keeps the previous behaviour, in which the first row of
        each file is consumed as column names. Pass ``header=None`` for files
        that contain data only, otherwise their first sample is dropped.
    """

    def __init__(self, data_path, label_path, header='infer'):
        self.data_path = data_path
        self.label_path = label_path
        self.header = header

    def load_data(self):
        """Read both CSV files.

        Returns
        -------
        tuple
            ``(data, labels)`` where ``data`` is the feature ``DataFrame`` and
            ``labels`` is the label file's ``.values`` array (2-D, one row per
            CSV row).
        """
        data = pd.read_csv(self.data_path, header=self.header)
        label_data = pd.read_csv(self.label_path, header=self.header)
        return data, label_data.values

class DataProcessor:
    """Wrap a feature matrix and standardize it on request."""

    def __init__(self, data):
        # Raw feature matrix; handed to StandardScaler unchanged.
        self.data = data

    def preprocess_data(self):
        """Return ``self.data`` transformed by a freshly fitted StandardScaler."""
        return StandardScaler().fit_transform(self.data)

class KMeansClustering:
    """K-means clustering with a selectable point-to-centroid distance.

    Parameters
    ----------
    num_clusters : int
        Number of clusters (and centroids) to fit.
    max_iters : int, optional
        Maximum number of centroid updates performed by ``fit``.
    """

    def __init__(self, num_clusters, max_iters=100):
        self.num_clusters = num_clusters
        self.max_iters = max_iters

    @staticmethod
    def jaccard_similarity(a, b):
        """Return ``sum(min(a, b)) / sum(max(a, b))`` for two vectors.

        Returns 0 when the denominator is zero. This min/max form of the
        Jaccard index is only a proper similarity for non-negative inputs.
        """
        intersection = np.sum(np.minimum(a, b))
        union = np.sum(np.maximum(a, b))
        return intersection / union if union != 0 else 0

    def _compute_distances(self, X_data, cluster_centroids, similarity):
        """Return the (n_samples, n_centroids) matrix of distances.

        ``similarity`` selects the metric: ``'euclidean'``, ``'cosine'``
        (1 - cosine similarity) or ``'jaccard'`` (1 - min/max Jaccard
        similarity).

        Raises
        ------
        ValueError
            If ``similarity`` is not one of the three supported names.
        """
        if similarity == 'euclidean':
            return pairwise_distances(X_data, cluster_centroids, metric='euclidean')
        if similarity == 'cosine':
            return 1 - cosine_similarity(X_data, cluster_centroids)
        if similarity == 'jaccard':
            X_data = np.asarray(X_data)
            # Start from distance 1 (similarity 0), which is the value used
            # whenever the min/max denominator is zero.
            distances = np.ones((len(X_data), len(cluster_centroids)))
            # One vectorised pass per centroid instead of one Python call per
            # (sample, centroid) pair.
            for j, centroid in enumerate(cluster_centroids):
                intersection = np.minimum(X_data, centroid).sum(axis=1)
                union = np.maximum(X_data, centroid).sum(axis=1)
                valid = union != 0
                distances[valid, j] = 1 - intersection[valid] / union[valid]
            return distances
        raise ValueError("Invalid similarity metric. Use 'euclidean', 'cosine', or 'jaccard'.")

    def _assign_clusters(self, X_data, cluster_centroids, similarity):
        """Return, for every sample, the index of its nearest centroid."""
        distances = self._compute_distances(X_data, cluster_centroids, similarity)
        return np.argmin(distances, axis=1)

    def _update_centroids(self, X_data, cluster_labels):
        """Return the mean of each cluster; empty clusters get a random sample."""
        centroids = []
        for i in range(self.num_clusters):
            members = X_data[cluster_labels == i]
            if len(members) > 0:
                centroids.append(members.mean(axis=0))
            else:
                # Re-seed an empty cluster so the number of centroids stays fixed.
                centroids.append(X_data[np.random.choice(len(X_data))])
        return np.array(centroids)

    def fit(self, X_data, similarity='euclidean'):
        """Cluster ``X_data`` and return ``(centroids, labels, sse)``.

        Centroids are initialised from ``num_clusters`` distinct random rows
        of ``X_data`` (via ``np.random``). Iteration stops when the centroids
        no longer change or after ``max_iters`` updates.

        Parameters
        ----------
        X_data : array-like of shape (n_samples, n_features)
        similarity : {'euclidean', 'cosine', 'jaccard'}

        Returns
        -------
        cluster_centroids : ndarray of shape (num_clusters, n_features)
        cluster_labels : ndarray of shape (n_samples,)
            Index of the nearest returned centroid for every sample.
        sse : float
            Sum of squared Euclidean errors between samples and their assigned
            centroid, regardless of the metric used for assignment.

        Raises
        ------
        ValueError
            If ``similarity`` is not a supported metric name.
        """
        X_data = np.asarray(X_data)
        initial_rows = np.random.choice(len(X_data), self.num_clusters, replace=False)
        cluster_centroids = X_data[initial_rows]
        # Assign once up front so labels exist even when max_iters is 0.
        cluster_labels = self._assign_clusters(X_data, cluster_centroids, similarity)

        for _ in range(self.max_iters):
            new_centroids = self._update_centroids(X_data, cluster_labels)
            if np.array_equal(new_centroids, cluster_centroids):
                break
            cluster_centroids = new_centroids
            # Re-assign against the updated centroids so the returned labels
            # and SSE always correspond to the returned centroids.
            cluster_labels = self._assign_clusters(X_data, cluster_centroids, similarity)

        sse = np.sum((X_data - cluster_centroids[cluster_labels]) ** 2)

        return cluster_centroids, cluster_labels, sse

    @staticmethod
    def assign_majority_labels(cluster_labels, true_labels_data, num_clusters):
        """Map every sample to the most frequent true label of its cluster.

        Parameters
        ----------
        cluster_labels : array-like of shape (n_samples,)
            Cluster index of every sample.
        true_labels_data : array-like of shape (n_samples,) or (n_samples, 1)
            Ground-truth label of every sample.
        num_clusters : int
            Cluster indices ``0 .. num_clusters - 1`` are processed.

        Returns
        -------
        ndarray of shape (n_samples,)
            Majority label per sample, in the dtype of the true labels. Ties
            resolve to the smallest label. Clusters without members are
            skipped.

        Raises
        ------
        ValueError
            If the two inputs do not describe the same number of samples.
        """
        cluster_labels = np.asarray(cluster_labels)
        true_flat = np.asarray(true_labels_data).ravel()
        if true_flat.shape[0] != cluster_labels.shape[0]:
            raise ValueError("cluster_labels and true_labels_data must have the same number of samples.")

        assigned_labels = np.zeros(cluster_labels.shape, dtype=true_flat.dtype)
        for i in range(num_clusters):
            members = cluster_labels == i
            if not members.any():
                continue
            # np.unique returns sorted values, so argmax picks the smallest
            # label among ties. Counting here avoids depending on the return
            # shape of scipy.stats.mode, which differs across SciPy versions.
            values, counts = np.unique(true_flat[members], return_counts=True)
            assigned_labels[members] = values[np.argmax(counts)]
        return assigned_labels

# Input locations: feature matrix and ground-truth labels.
data_path = '/Users/asish/Documents/F drive/Future/CSE572_DM/Homework/HW3/kmeans_data/data.csv'
label_path = '/Users/asish/Documents/F drive/Future/CSE572_DM/Homework/HW3/kmeans_data/label.csv'

# Read the two CSV files.
data_loader = DataLoader(data_path, label_path)
data, true_labels = data_loader.load_data()

# Standardize the raw feature values.
data_processor = DataProcessor(data.values)
features_standardized = data_processor.preprocess_data()

# One cluster per distinct ground-truth label.
num_classes = len(np.unique(true_labels))
kmeans_clustering = KMeansClustering(num_clusters=num_classes)

# Run k-means once per metric, in the order euclidean -> cosine -> jaccard.
metrics = ('euclidean', 'cosine', 'jaccard')
fit_results = {}
for metric in metrics:
    fit_results[metric] = kmeans_clustering.fit(features_standardized, similarity=metric)

centroids_euclidean, labels_euclidean, sse_euclidean = fit_results['euclidean']
centroids_cosine, labels_cosine, sse_cosine = fit_results['cosine']
centroids_jaccard, labels_jaccard, sse_jaccard = fit_results['jaccard']

# Relabel every cluster with the majority ground-truth label of its members.
majority = {
    metric: kmeans_clustering.assign_majority_labels(fit_results[metric][1], true_labels, num_classes)
    for metric in metrics
}
assigned_labels_euclidean = majority['euclidean']
assigned_labels_cosine = majority['cosine']
assigned_labels_jaccard = majority['jaccard']

# Score the relabelled clusterings against the ground truth.
accuracy_euclidean = accuracy_score(true_labels, assigned_labels_euclidean)
accuracy_cosine = accuracy_score(true_labels, assigned_labels_cosine)
accuracy_jaccard = accuracy_score(true_labels, assigned_labels_jaccard)

# Report SSE first, then accuracy, for each metric.
print("Euclidean similarity SSE:", sse_euclidean)
print("Cosine similarity SSE:", sse_cosine)
print("Jaccard similarity SSE:", sse_jaccard)

print("Accuracy (Euclidean):", accuracy_euclidean)
print("Accuracy (Cosine):", accuracy_cosine)
print("Accuracy (Jaccard):", accuracy_jaccard)


Euclidean similarity SSE: 5569451.3558367565
Cosine similarity SSE: 5604872.572570423
Jaccard similarity SSE: 6448978.453950518
Accuracy (Euclidean): 0.5056505650565056
Accuracy (Cosine): 0.5533553355335533
Accuracy (Jaccard): 0.2222222222222222
