# Question 1

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from scipy.stats import mode

In [2]:
# File paths for data and labels
file1 = '/Users/asish/Documents/F drive/Future/CSE572_DM/Homework/HW3/kmeans_data/data.csv'
file2 = '/Users/asish/Documents/F drive/Future/CSE572_DM/Homework/HW3/kmeans_data/label.csv'

# read data from CSV files
data_read = pd.read_csv(file1) 
label_read = pd.read_csv(file2)

# convert dataframes to numpy arrays
label_features = label_read.values
data_features = data_read.values

# Standardize features using StandardScaler
scaled_values = StandardScaler()
features_standardized = scaled_values.fit_transform(data_features)


In [3]:
# Define Jaccard similarity calculation function
def calculate_jaccard_similarity(a, b):
    # Calculate the intersection of binary vectors
    binary_inter = np.sum(np.logical_and(a, b))
    
    # Calculate the union of binary vectors
    binary_union = np.sum(np.logical_or(a, b))
    
    # Calculate Jaccard similarity coefficient
    jaccard_similarity = binary_inter / binary_union if binary_union != 0 else 0
    
    return jaccard_similarity

# Define k-means function
def kmeans(X, k, similarity='euclidean', max_iters=100):
    # Step 1: Initialize centroids randomly
    centroids = X[np.random.choice(len(X), k, replace=False)]
    
    # Iteration loop
    for _ in range(max_iters):
        # Step 2: Calculate distances based on chosen similarity metric
        if similarity == 'euclidean':
            dist = pairwise_distances(X, centroids, metric='euclidean')
        elif similarity == 'cosine':
            dist = 1 - cosine_similarity(X, centroids)
        elif similarity == 'jaccard':
            # Calculate Jaccard distances
            dist = []
            for i in range(len(X)):
                for c in centroids:
                    d = 1 - calculate_jaccard_similarity(X[i], c)
                    dist.append(d)
            dist = np.array(dist)
            dist = dist.reshape(len(X), k)
        else:
            raise ValueError("The chosen metric is not valid. Use either 'euclidean', 'cosine', or 'jaccard'.")
        
        # Step 3: Assign labels based on minimum distances
        labels_min = np.argmin(dist, axis=1)
        
        # Step 4: Calculate new centroids
        centroids_new = []
        for i in range(k):
            if np.sum(labels_min == i) > 0:
                centroid_mean = X[labels_min == i].mean(axis=0)
                centroids_new.append(centroid_mean)
            else:
                centroid_random = X[np.random.choice(len(X))]
                centroids_new.append(centroid_random)
        centroids_new = np.array(centroids_new)

        # Step 5: Check convergence
        if np.all(centroids_new == centroids):
            break
        
        centroids = centroids_new
    
    # Step 6: Calculate sum of squared errors (SSE)
    SSE = np.sum((X - centroids[labels_min]) ** 2)
    
    # Step 7: Return centroids, labels, and SSE
    return centroids, labels_min, SSE


In [4]:
# Apply k-means with Euclidean similarity
k_euclidean = len(label_read['7'].unique())
centroids_euclidean, labels_euclidean, sse_euclidean = kmeans(features_standardized, k_euclidean, similarity='euclidean')
print("SSE (Euclidean):", sse_euclidean)

# Apply k-means with Cosine similarity
k_cosine = len(label_read['7'].unique())
centroids_cosine, labels_cosine, sse_cosine = kmeans(features_standardized, k_cosine, similarity='cosine')
print("SSE (Cosine):", sse_cosine)

# Apply k-means with Jaccard similarity
k_jaccard = len(label_read['7'].unique())
centroids_jaccard, labels_jaccard, sse_jaccard = kmeans(features_standardized, k_jaccard, similarity='jaccard')
print("SSE (Jaccard):", sse_jaccard)


SSE (Euclidean): 5579767.629608584
SSE (Cosine): 5590735.097939788
SSE (Jaccard): 6679332.000000004
