# All 3 stopping conditions
    “when there is no change in centroid position OR when the
    SSE value increases in the next iteration OR when the maximum preset value (e.g., 500, you
    can set the preset value by yourself) of iteration is complete”

In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean, cosine
from sklearn.metrics import pairwise_distances_argmin_min

# Seed NumPy's global RNG so the random initial-centroid draw made by
# perform_kmeans (np.random.choice) is repeatable from this point on.
np.random.seed(42)

# Custom distance computation functions
def calc_euclidean_dist(vec1, vec2):
    """Return the Euclidean distance between ``vec1`` and ``vec2``.

    Thin wrapper that delegates to ``scipy.spatial.distance.euclidean`` so it
    shares the two-argument signature of the other distance functions here.
    """
    distance = euclidean(vec1, vec2)
    return distance

def calc_cosine_similarity(vec1, vec2):
    """Return the cosine *distance* (1 - cosine similarity) of two vectors.

    Despite the function name, the value returned is a dissimilarity:
    0 for vectors pointing the same way, 1 for orthogonal vectors.

    A zero-length vector has no direction, so the ratio is undefined; instead
    of propagating NaN (which would corrupt nearest-centroid selection), two
    zero vectors are treated as identical (0.0) and a zero vector against a
    non-zero one as maximally dissimilar in the orthogonal sense (1.0).
    """
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    if norm1 == 0 or norm2 == 0:
        # Guard the division below: 0/0 would produce NaN.
        return 0.0 if norm1 == norm2 else 1.0
    return 1 - np.dot(vec1, vec2) / (norm1 * norm2)

def calc_jaccard_index(vec1, vec2):
    """Return the generalized Jaccard distance between two vectors.

    Computed as ``1 - sum(min(vec1, vec2)) / sum(max(vec1, vec2))`` using
    element-wise minima and maxima. When the sum of maxima is zero the
    ratio is undefined and 0 is returned.
    """
    union_mass = np.maximum(vec1, vec2).sum()
    if union_mass == 0:
        return 0
    overlap_mass = np.minimum(vec1, vec2).sum()
    return 1 - overlap_mass / union_mass
# Implementation of the K-means clustering algorithm
def perform_kmeans(data, num_clusters, dist_func, max_iterations=500, *,
                   stop_on_convergence=True, stop_on_sse_increase=True):
    """Run K-means on ``data`` with ``dist_func`` as the point-to-centroid distance.

    Initial centroids are ``num_clusters`` distinct rows picked with
    ``np.random.choice`` (NumPy's global RNG). Every iteration assigns each
    point to its nearest centroid, recomputes each centroid as the mean of
    its members, and evaluates the SSE: the sum over all points of the
    squared distance to the nearest *updated* centroid.

    The loop ends at the first of these events:
      * the updated centroids equal the previous ones exactly
        (only when ``stop_on_convergence`` is true);
      * the SSE is larger than in the previous iteration
        (only when ``stop_on_sse_increase`` is true);
      * ``max_iterations`` iterations have run.

    Args:
        data: 2-D array-like with one point per row.
        num_clusters: number of centroids.
        dist_func: callable ``(point, center) -> distance``.
        max_iterations: upper bound on the number of iterations.
        stop_on_convergence: enable the "centroids unchanged" stop.
        stop_on_sse_increase: enable the "SSE went up" stop.

    Returns:
        ``(assigned_clusters, cluster_centers, total_sse, iter_num)``.
        ``iter_num`` is the zero-based index of the last iteration started.
        NOTE: unless the loop ended on convergence, ``assigned_clusters`` was
        computed against the centroids in force at the start of that last
        iteration, while ``cluster_centers`` and ``total_sse`` describe the
        centroids produced by it (including the case where the SSE rose).

    Raises:
        ValueError: if ``max_iterations`` is smaller than 1.
    """
    if max_iterations < 1:
        raise ValueError("max_iterations must be at least 1")

    data = np.asarray(data)

    def nearest_center_distances(centers):
        # Distance from every point to whichever centroid is closest to it.
        return np.array([min(dist_func(point, center) for center in centers) for point in data])

    selected_indices = np.random.choice(len(data), num_clusters, replace=False)
    cluster_centers = data[selected_indices]
    total_sse = None
    prev_sse = None

    for iter_num in range(max_iterations):
        assigned_clusters = np.array([
            np.argmin([dist_func(point, center) for center in cluster_centers])
            for point in data
        ])
        # A cluster that attracted no points keeps its previous centroid;
        # the mean of an empty selection would be a NaN centroid.
        updated_centers = np.array([
            data[assigned_clusters == idx].mean(axis=0)
            if np.any(assigned_clusters == idx)
            else cluster_centers[idx]
            for idx in range(num_clusters)
        ])

        if stop_on_convergence and np.array_equal(cluster_centers, updated_centers):
            break

        cluster_centers = updated_centers
        total_sse = np.sum(nearest_center_distances(cluster_centers) ** 2)

        if stop_on_sse_increase and prev_sse is not None and total_sse > prev_sse:
            break

        prev_sse = total_sse

    if total_sse is None:
        # Convergence was detected in the very first iteration, before any
        # SSE had been evaluated; compute it for the (unchanged) centroids.
        total_sse = np.sum(nearest_center_distances(cluster_centers) ** 2)

    return assigned_clusters, cluster_centers, total_sse, iter_num

# Functions for assigning labels to clusters and calculating the accuracy
def assign_labels_to_clusters(cluster_assignments, actual_labels):
    """Map each cluster id to the most frequent true label among its members.

    Both arguments are indexed in parallel (one entry per point). Ties are
    resolved by ``argmax`` over the counts returned by ``np.unique``, i.e. the
    first label in ``np.unique``'s sorted order wins.
    """
    majority_by_cluster = {}
    for cluster_id in np.unique(cluster_assignments):
        member_mask = cluster_assignments == cluster_id
        candidates, votes = np.unique(actual_labels[member_mask], return_counts=True)
        winner = np.argmax(votes)
        majority_by_cluster[cluster_id] = candidates[winner]
    return majority_by_cluster

def compute_accuracy(cluster_assignments, mapped_labels, actual_labels):
    """Return the fraction of points whose cluster's mapped label equals the true label.

    ``mapped_labels`` is looked up by cluster id for every point; the number
    of matches is divided by ``len(actual_labels)``.
    """
    hit_count = 0
    for cluster_id, true_label in zip(cluster_assignments, actual_labels):
        predicted_label = mapped_labels[cluster_id]
        hit_count += predicted_label == true_label
    total = len(actual_labels)
    return hit_count / total

# Driver: cluster the dataset once per distance function and report SSE,
# majority-vote accuracy and the iteration counter for each run.
# Feature rows come from data.csv and the matching true labels from label.csv.
# NOTE(review): read_csv is called with its default header handling, so the
# first row of each file is taken as column names rather than data - confirm
# both CSVs really start with a header row; otherwise pass header=None.
data_points = pd.read_csv('kmeans_data/data.csv').values
labels = pd.read_csv('kmeans_data/label.csv').values.squeeze()
# One cluster per distinct true label.
num_clusters = len(np.unique(labels))

# Run K-means once for each distance function, keyed by a display name.
experiment_results = {}
for dist_func, func_name in [
    (calc_euclidean_dist, 'Euclidean'), 
    (calc_cosine_similarity, 'Cosine'), 
    (calc_jaccard_index, 'Jaccard')
]:
    cluster_assignments, centers, sse, iterations_completed = perform_kmeans(data_points, num_clusters, dist_func)
    # Label each cluster by majority vote, then score against the true labels.
    label_map = assign_labels_to_clusters(cluster_assignments, labels)
    accuracy_measure = compute_accuracy(cluster_assignments, label_map, labels)
    experiment_results[func_name] = {'SSE': sse, 'Accuracy': accuracy_measure, 'Iterations': iterations_completed}


# Print the results. The 'Iterations' value is the loop index returned by
# perform_kmeans, i.e. zero-based (99 means the 100th iteration was reached).
for metric, results in experiment_results.items():
    print(f'Using {metric} distance metric:')
    print(f'  - Sum of Squared Errors: {results["SSE"]}')
    print(f'  - Accuracy: {results["Accuracy"]}')
    print(f'  - Iterations Completed: {results["Iterations"]}')
    print()



Using Euclidean distance metric:
  - Sum of Squared Errors: 25321981136.02127
  - Accuracy: 0.6004600460046005
  - Iterations Completed: 65

Using Cosine distance metric:
  - Sum of Squared Errors: 686.4127802680829
  - Accuracy: 0.6336633663366337
  - Iterations Completed: 28

Using Jaccard distance metric:
  - Sum of Squared Errors: 3733.752493318236
  - Accuracy: 0.6311631163116311
  - Iterations Completed: 31



# Only the maximum-iteration stopping condition (100 iterations)

In [2]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean, cosine
from sklearn.metrics import pairwise_distances_argmin_min

# Custom distance computation functions
def calc_euclidean_dist(vec1, vec2):
    """Return the Euclidean distance between ``vec1`` and ``vec2``.

    Delegates to ``scipy.spatial.distance.euclidean``; exists so that all
    distance functions in this cell share the same two-argument signature.
    """
    result = euclidean(vec1, vec2)
    return result

def calc_cosine_similarity(vec1, vec2):
    """Return the cosine *distance* (1 - cosine similarity) of two vectors.

    Despite the function name, the value returned is a dissimilarity:
    0 for vectors pointing the same way, 1 for orthogonal vectors.

    A zero-length vector has no direction, so the ratio is undefined; instead
    of propagating NaN (which would corrupt nearest-centroid selection), two
    zero vectors are treated as identical (0.0) and a zero vector against a
    non-zero one as maximally dissimilar in the orthogonal sense (1.0).
    """
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    if norm1 == 0 or norm2 == 0:
        # Guard the division below: 0/0 would produce NaN.
        return 0.0 if norm1 == norm2 else 1.0
    return 1 - np.dot(vec1, vec2) / (norm1 * norm2)

def calc_jaccard_index(vec1, vec2):
    """Return the generalized Jaccard distance between two vectors.

    The value is one minus the ratio of the summed element-wise minima to
    the summed element-wise maxima; 0 is returned when the summed maxima
    are zero (ratio undefined).
    """
    denominator = np.maximum(vec1, vec2).sum()
    if denominator == 0:
        return 0
    numerator = np.minimum(vec1, vec2).sum()
    return 1 - numerator / denominator
# Implementation of the K-means clustering algorithm
def perform_kmeans(data, num_clusters, dist_func, max_iterations=100, *,
                   stop_on_convergence=False, stop_on_sse_increase=False):
    """Run K-means on ``data`` with ``dist_func`` as the point-to-centroid distance.

    In this variant the only stopping rule enabled by default is the
    iteration cap; the two early-exit rules are available as opt-in flags.

    Initial centroids are ``num_clusters`` distinct rows picked with
    ``np.random.choice`` (NumPy's global RNG). Every iteration assigns each
    point to its nearest centroid, recomputes each centroid as the mean of
    its members, and evaluates the SSE: the sum over all points of the
    squared distance to the nearest *updated* centroid.

    Args:
        data: 2-D array-like with one point per row.
        num_clusters: number of centroids.
        dist_func: callable ``(point, center) -> distance``.
        max_iterations: number of iterations to run (upper bound when an
            early-exit flag is enabled).
        stop_on_convergence: stop when the updated centroids equal the
            previous ones exactly.
        stop_on_sse_increase: stop when the SSE is larger than in the
            previous iteration.

    Returns:
        ``(assigned_clusters, cluster_centers, total_sse, iter_num)``.
        ``iter_num`` is the zero-based index of the last iteration started.
        NOTE: unless the loop ended on convergence, ``assigned_clusters`` was
        computed against the centroids in force at the start of that last
        iteration, while ``cluster_centers`` and ``total_sse`` describe the
        centroids produced by it.

    Raises:
        ValueError: if ``max_iterations`` is smaller than 1.
    """
    if max_iterations < 1:
        raise ValueError("max_iterations must be at least 1")

    data = np.asarray(data)

    def nearest_center_distances(centers):
        # Distance from every point to whichever centroid is closest to it.
        return np.array([min(dist_func(point, center) for center in centers) for point in data])

    selected_indices = np.random.choice(len(data), num_clusters, replace=False)
    cluster_centers = data[selected_indices]
    total_sse = None
    prev_sse = None

    for iter_num in range(max_iterations):
        assigned_clusters = np.array([
            np.argmin([dist_func(point, center) for center in cluster_centers])
            for point in data
        ])
        # A cluster that attracted no points keeps its previous centroid;
        # the mean of an empty selection would be a NaN centroid.
        updated_centers = np.array([
            data[assigned_clusters == idx].mean(axis=0)
            if np.any(assigned_clusters == idx)
            else cluster_centers[idx]
            for idx in range(num_clusters)
        ])

        if stop_on_convergence and np.array_equal(cluster_centers, updated_centers):
            break

        cluster_centers = updated_centers
        total_sse = np.sum(nearest_center_distances(cluster_centers) ** 2)

        if stop_on_sse_increase and prev_sse is not None and total_sse > prev_sse:
            break

        prev_sse = total_sse

    if total_sse is None:
        # Only reachable when convergence stopping is enabled and fires in
        # the first iteration, before any SSE has been evaluated.
        total_sse = np.sum(nearest_center_distances(cluster_centers) ** 2)

    return assigned_clusters, cluster_centers, total_sse, iter_num

# Functions for assigning labels to clusters and calculating the accuracy
def assign_labels_to_clusters(cluster_assignments, actual_labels):
    """Return a dict from cluster id to the majority true label of that cluster.

    For each distinct cluster id, the true labels of its members are counted
    with ``np.unique`` and the label with the highest count is chosen (the
    first such label in sorted order on ties).
    """
    majority_by_cluster = {}
    for cluster_id in np.unique(cluster_assignments):
        member_mask = cluster_assignments == cluster_id
        candidates, votes = np.unique(actual_labels[member_mask], return_counts=True)
        winner = np.argmax(votes)
        majority_by_cluster[cluster_id] = candidates[winner]
    return majority_by_cluster

def compute_accuracy(cluster_assignments, mapped_labels, actual_labels):
    """Return the share of points whose cluster's mapped label matches the true label.

    Each point's cluster id is translated through ``mapped_labels`` and
    compared with the corresponding entry of ``actual_labels``; the match
    count is divided by ``len(actual_labels)``.
    """
    hit_count = 0
    for cluster_id, true_label in zip(cluster_assignments, actual_labels):
        predicted_label = mapped_labels[cluster_id]
        hit_count += predicted_label == true_label
    total = len(actual_labels)
    return hit_count / total

# Driver: cluster the dataset once per distance function and report SSE,
# majority-vote accuracy and the iteration counter for each run.
# Feature rows come from data.csv and the matching true labels from label.csv.
# NOTE(review): read_csv is called with its default header handling, so the
# first row of each file is taken as column names rather than data - confirm
# both CSVs really start with a header row; otherwise pass header=None.
data_points = pd.read_csv('kmeans_data/data.csv').values
labels = pd.read_csv('kmeans_data/label.csv').values.squeeze()
# One cluster per distinct true label.
num_clusters = len(np.unique(labels))

# Run K-means once for each distance function, keyed by a display name.
experiment_results = {}
for dist_func, func_name in [
    (calc_euclidean_dist, 'Euclidean'), 
    (calc_cosine_similarity, 'Cosine'), 
    (calc_jaccard_index, 'Jaccard')
]:
    cluster_assignments, centers, sse, iterations_completed = perform_kmeans(data_points, num_clusters, dist_func)
    # Label each cluster by majority vote, then score against the true labels.
    label_map = assign_labels_to_clusters(cluster_assignments, labels)
    accuracy_measure = compute_accuracy(cluster_assignments, label_map, labels)
    experiment_results[func_name] = {'SSE': sse, 'Accuracy': accuracy_measure, 'Iterations': iterations_completed}


# Print the results. The 'Iterations' value is the loop index returned by
# perform_kmeans, i.e. zero-based (99 means the 100th iteration was reached).
for metric, results in experiment_results.items():
    print(f'Using {metric} distance metric:')
    print(f'  - Sum of Squared Errors: {results["SSE"]}')
    print(f'  - Accuracy: {results["Accuracy"]}')
    print(f'  - Iterations Completed: {results["Iterations"]}')
    print()



Using Euclidean distance metric:
  - Sum of Squared Errors: 25429809658.388317
  - Accuracy: 0.6026602660266026
  - Iterations Completed: 99

Using Cosine distance metric:
  - Sum of Squared Errors: 701.4056819144934
  - Accuracy: 0.6065606560656066
  - Iterations Completed: 99

Using Jaccard distance metric:
  - Sum of Squared Errors: 3692.1272769028296
  - Accuracy: 0.5444544454445445
  - Iterations Completed: 99



# Only the SSE-increase stopping condition (plus the 100-iteration cap)

In [3]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean, cosine
from sklearn.metrics import pairwise_distances_argmin_min

# Custom distance computation functions
def calc_euclidean_dist(vec1, vec2):
    """Return the Euclidean distance between ``vec1`` and ``vec2``.

    Forwards both arguments unchanged to ``scipy.spatial.distance.euclidean``.
    """
    dist_value = euclidean(vec1, vec2)
    return dist_value

def calc_cosine_similarity(vec1, vec2):
    """Return the cosine *distance* (1 - cosine similarity) of two vectors.

    Despite the function name, the value returned is a dissimilarity:
    0 for vectors pointing the same way, 1 for orthogonal vectors.

    A zero-length vector has no direction, so the ratio is undefined; instead
    of propagating NaN (which would corrupt nearest-centroid selection), two
    zero vectors are treated as identical (0.0) and a zero vector against a
    non-zero one as maximally dissimilar in the orthogonal sense (1.0).
    """
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    if norm1 == 0 or norm2 == 0:
        # Guard the division below: 0/0 would produce NaN.
        return 0.0 if norm1 == norm2 else 1.0
    return 1 - np.dot(vec1, vec2) / (norm1 * norm2)

def calc_jaccard_index(vec1, vec2):
    """Return the generalized Jaccard distance between two vectors.

    ``1 - sum(element-wise min) / sum(element-wise max)``, or 0 when the
    sum of element-wise maxima is zero and the ratio is undefined.
    """
    max_total = np.maximum(vec1, vec2).sum()
    if max_total == 0:
        return 0
    min_total = np.minimum(vec1, vec2).sum()
    return 1 - min_total / max_total
# Implementation of the K-means clustering algorithm
def perform_kmeans(data, num_clusters, dist_func, max_iterations=100, *,
                   stop_on_convergence=False, stop_on_sse_increase=True):
    """Run K-means on ``data`` with ``dist_func`` as the point-to-centroid distance.

    In this variant the default stopping rules are "SSE increased" and the
    iteration cap; the "centroids unchanged" rule is an opt-in flag.

    Initial centroids are ``num_clusters`` distinct rows picked with
    ``np.random.choice`` (NumPy's global RNG). Every iteration assigns each
    point to its nearest centroid, recomputes each centroid as the mean of
    its members, and evaluates the SSE: the sum over all points of the
    squared distance to the nearest *updated* centroid.

    Args:
        data: 2-D array-like with one point per row.
        num_clusters: number of centroids.
        dist_func: callable ``(point, center) -> distance``.
        max_iterations: upper bound on the number of iterations.
        stop_on_convergence: stop when the updated centroids equal the
            previous ones exactly.
        stop_on_sse_increase: stop when the SSE is larger than in the
            previous iteration.

    Returns:
        ``(assigned_clusters, cluster_centers, total_sse, iter_num)``.
        ``iter_num`` is the zero-based index of the last iteration started.
        NOTE: unless the loop ended on convergence, ``assigned_clusters`` was
        computed against the centroids in force at the start of that last
        iteration, while ``cluster_centers`` and ``total_sse`` describe the
        centroids produced by it (including the case where the SSE rose).

    Raises:
        ValueError: if ``max_iterations`` is smaller than 1.
    """
    if max_iterations < 1:
        raise ValueError("max_iterations must be at least 1")

    data = np.asarray(data)

    def nearest_center_distances(centers):
        # Distance from every point to whichever centroid is closest to it.
        return np.array([min(dist_func(point, center) for center in centers) for point in data])

    selected_indices = np.random.choice(len(data), num_clusters, replace=False)
    cluster_centers = data[selected_indices]
    total_sse = None
    prev_sse = None

    for iter_num in range(max_iterations):
        assigned_clusters = np.array([
            np.argmin([dist_func(point, center) for center in cluster_centers])
            for point in data
        ])
        # A cluster that attracted no points keeps its previous centroid;
        # the mean of an empty selection would be a NaN centroid.
        updated_centers = np.array([
            data[assigned_clusters == idx].mean(axis=0)
            if np.any(assigned_clusters == idx)
            else cluster_centers[idx]
            for idx in range(num_clusters)
        ])

        if stop_on_convergence and np.array_equal(cluster_centers, updated_centers):
            break

        cluster_centers = updated_centers
        total_sse = np.sum(nearest_center_distances(cluster_centers) ** 2)

        if stop_on_sse_increase and prev_sse is not None and total_sse > prev_sse:
            break

        prev_sse = total_sse

    if total_sse is None:
        # Only reachable when convergence stopping is enabled and fires in
        # the first iteration, before any SSE has been evaluated.
        total_sse = np.sum(nearest_center_distances(cluster_centers) ** 2)

    return assigned_clusters, cluster_centers, total_sse, iter_num

# Functions for assigning labels to clusters and calculating the accuracy
def assign_labels_to_clusters(cluster_assignments, actual_labels):
    """Return a dict from cluster id to the majority true label of that cluster.

    The true labels of each cluster's members are tallied with ``np.unique``;
    the label with the largest tally wins, the first in sorted order on ties.
    """
    majority_by_cluster = {}
    for cluster_id in np.unique(cluster_assignments):
        member_mask = cluster_assignments == cluster_id
        candidates, votes = np.unique(actual_labels[member_mask], return_counts=True)
        winner = np.argmax(votes)
        majority_by_cluster[cluster_id] = candidates[winner]
    return majority_by_cluster

def compute_accuracy(cluster_assignments, mapped_labels, actual_labels):
    """Return the share of points whose cluster's mapped label matches the true label.

    The number of positions where ``mapped_labels[cluster]`` equals the true
    label is divided by ``len(actual_labels)``.
    """
    hit_count = 0
    for cluster_id, true_label in zip(cluster_assignments, actual_labels):
        predicted_label = mapped_labels[cluster_id]
        hit_count += predicted_label == true_label
    total = len(actual_labels)
    return hit_count / total

# Driver: cluster the dataset once per distance function and report SSE,
# majority-vote accuracy and the iteration counter for each run.
# Feature rows come from data.csv and the matching true labels from label.csv.
# NOTE(review): read_csv is called with its default header handling, so the
# first row of each file is taken as column names rather than data - confirm
# both CSVs really start with a header row; otherwise pass header=None.
data_points = pd.read_csv('kmeans_data/data.csv').values
labels = pd.read_csv('kmeans_data/label.csv').values.squeeze()
# One cluster per distinct true label.
num_clusters = len(np.unique(labels))

# Run K-means once for each distance function, keyed by a display name.
experiment_results = {}
for dist_func, func_name in [
    (calc_euclidean_dist, 'Euclidean'), 
    (calc_cosine_similarity, 'Cosine'), 
    (calc_jaccard_index, 'Jaccard')
]:
    cluster_assignments, centers, sse, iterations_completed = perform_kmeans(data_points, num_clusters, dist_func)
    # Label each cluster by majority vote, then score against the true labels.
    label_map = assign_labels_to_clusters(cluster_assignments, labels)
    accuracy_measure = compute_accuracy(cluster_assignments, label_map, labels)
    experiment_results[func_name] = {'SSE': sse, 'Accuracy': accuracy_measure, 'Iterations': iterations_completed}


# Print the results. The 'Iterations' value is the loop index returned by
# perform_kmeans, i.e. zero-based (99 means the 100th iteration was reached).
for metric, results in experiment_results.items():
    print(f'Using {metric} distance metric:')
    print(f'  - Sum of Squared Errors: {results["SSE"]}')
    print(f'  - Accuracy: {results["Accuracy"]}')
    print(f'  - Iterations Completed: {results["Iterations"]}')
    print()



Using Euclidean distance metric:
  - Sum of Squared Errors: 25318282453.85691
  - Accuracy: 0.5889588958895889
  - Iterations Completed: 99

Using Cosine distance metric:
  - Sum of Squared Errors: 684.7261814001995
  - Accuracy: 0.6171617161716172
  - Iterations Completed: 29

Using Jaccard distance metric:
  - Sum of Squared Errors: 3663.7049875376615
  - Accuracy: 0.6035603560356035
  - Iterations Completed: 19



# Only the no-centroid-change stopping condition (plus the 100-iteration cap)

In [4]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean, cosine
from sklearn.metrics import pairwise_distances_argmin_min

# Custom distance computation functions
def calc_euclidean_dist(vec1, vec2):
    """Return the Euclidean distance between ``vec1`` and ``vec2``.

    A pass-through to ``scipy.spatial.distance.euclidean``.
    """
    gap = euclidean(vec1, vec2)
    return gap

def calc_cosine_similarity(vec1, vec2):
    """Return the cosine *distance* (1 - cosine similarity) of two vectors.

    Despite the function name, the value returned is a dissimilarity:
    0 for vectors pointing the same way, 1 for orthogonal vectors.

    A zero-length vector has no direction, so the ratio is undefined; instead
    of propagating NaN (which would corrupt nearest-centroid selection), two
    zero vectors are treated as identical (0.0) and a zero vector against a
    non-zero one as maximally dissimilar in the orthogonal sense (1.0).
    """
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    if norm1 == 0 or norm2 == 0:
        # Guard the division below: 0/0 would produce NaN.
        return 0.0 if norm1 == norm2 else 1.0
    return 1 - np.dot(vec1, vec2) / (norm1 * norm2)

def calc_jaccard_index(vec1, vec2):
    """Return the generalized Jaccard distance between two vectors.

    One minus (sum of element-wise minima / sum of element-wise maxima);
    0 is returned instead when the sum of maxima is zero.
    """
    sum_of_maxima = np.maximum(vec1, vec2).sum()
    if sum_of_maxima == 0:
        return 0
    sum_of_minima = np.minimum(vec1, vec2).sum()
    return 1 - sum_of_minima / sum_of_maxima
# Implementation of the K-means clustering algorithm
def perform_kmeans(data, num_clusters, dist_func, max_iterations=100, *,
                   stop_on_convergence=True, stop_on_sse_increase=False):
    """Run K-means on ``data`` with ``dist_func`` as the point-to-centroid distance.

    In this variant the default stopping rules are "centroids unchanged" and
    the iteration cap; the "SSE increased" rule is an opt-in flag.

    Initial centroids are ``num_clusters`` distinct rows picked with
    ``np.random.choice`` (NumPy's global RNG). Every iteration assigns each
    point to its nearest centroid, recomputes each centroid as the mean of
    its members, and evaluates the SSE: the sum over all points of the
    squared distance to the nearest *updated* centroid.

    Args:
        data: 2-D array-like with one point per row.
        num_clusters: number of centroids.
        dist_func: callable ``(point, center) -> distance``.
        max_iterations: upper bound on the number of iterations.
        stop_on_convergence: stop when the updated centroids equal the
            previous ones exactly.
        stop_on_sse_increase: stop when the SSE is larger than in the
            previous iteration.

    Returns:
        ``(assigned_clusters, cluster_centers, total_sse, iter_num)``.
        ``iter_num`` is the zero-based index of the last iteration started.
        NOTE: unless the loop ended on convergence, ``assigned_clusters`` was
        computed against the centroids in force at the start of that last
        iteration, while ``cluster_centers`` and ``total_sse`` describe the
        centroids produced by it.

    Raises:
        ValueError: if ``max_iterations`` is smaller than 1.
    """
    if max_iterations < 1:
        raise ValueError("max_iterations must be at least 1")

    data = np.asarray(data)

    def nearest_center_distances(centers):
        # Distance from every point to whichever centroid is closest to it.
        return np.array([min(dist_func(point, center) for center in centers) for point in data])

    selected_indices = np.random.choice(len(data), num_clusters, replace=False)
    cluster_centers = data[selected_indices]
    total_sse = None
    prev_sse = None

    for iter_num in range(max_iterations):
        assigned_clusters = np.array([
            np.argmin([dist_func(point, center) for center in cluster_centers])
            for point in data
        ])
        # A cluster that attracted no points keeps its previous centroid;
        # the mean of an empty selection would be a NaN centroid.
        updated_centers = np.array([
            data[assigned_clusters == idx].mean(axis=0)
            if np.any(assigned_clusters == idx)
            else cluster_centers[idx]
            for idx in range(num_clusters)
        ])

        if stop_on_convergence and np.array_equal(cluster_centers, updated_centers):
            break

        cluster_centers = updated_centers
        total_sse = np.sum(nearest_center_distances(cluster_centers) ** 2)

        if stop_on_sse_increase and prev_sse is not None and total_sse > prev_sse:
            break

        prev_sse = total_sse

    if total_sse is None:
        # Convergence was detected in the very first iteration, before any
        # SSE had been evaluated; compute it for the (unchanged) centroids.
        total_sse = np.sum(nearest_center_distances(cluster_centers) ** 2)

    return assigned_clusters, cluster_centers, total_sse, iter_num

# Functions for assigning labels to clusters and calculating the accuracy
def assign_labels_to_clusters(cluster_assignments, actual_labels):
    """Return a dict from cluster id to the majority true label of that cluster.

    Members of each cluster are selected with a boolean mask, their true
    labels are counted via ``np.unique``, and the most frequent one is kept
    (first in sorted order on ties).
    """
    majority_by_cluster = {}
    for cluster_id in np.unique(cluster_assignments):
        member_mask = cluster_assignments == cluster_id
        candidates, votes = np.unique(actual_labels[member_mask], return_counts=True)
        winner = np.argmax(votes)
        majority_by_cluster[cluster_id] = candidates[winner]
    return majority_by_cluster

def compute_accuracy(cluster_assignments, mapped_labels, actual_labels):
    """Return the share of points whose cluster's mapped label matches the true label.

    Walks the assignments and true labels in parallel, counts the positions
    where ``mapped_labels[cluster]`` equals the true label, and divides by
    ``len(actual_labels)``.
    """
    hit_count = 0
    for cluster_id, true_label in zip(cluster_assignments, actual_labels):
        predicted_label = mapped_labels[cluster_id]
        hit_count += predicted_label == true_label
    total = len(actual_labels)
    return hit_count / total

# Driver: cluster the dataset once per distance function and report SSE,
# majority-vote accuracy and the iteration counter for each run.
# Feature rows come from data.csv and the matching true labels from label.csv.
# NOTE(review): read_csv is called with its default header handling, so the
# first row of each file is taken as column names rather than data - confirm
# both CSVs really start with a header row; otherwise pass header=None.
data_points = pd.read_csv('kmeans_data/data.csv').values
labels = pd.read_csv('kmeans_data/label.csv').values.squeeze()
# One cluster per distinct true label.
num_clusters = len(np.unique(labels))

# Run K-means once for each distance function, keyed by a display name.
experiment_results = {}
for dist_func, func_name in [
    (calc_euclidean_dist, 'Euclidean'), 
    (calc_cosine_similarity, 'Cosine'), 
    (calc_jaccard_index, 'Jaccard')
]:
    cluster_assignments, centers, sse, iterations_completed = perform_kmeans(data_points, num_clusters, dist_func)
    # Label each cluster by majority vote, then score against the true labels.
    label_map = assign_labels_to_clusters(cluster_assignments, labels)
    accuracy_measure = compute_accuracy(cluster_assignments, label_map, labels)
    experiment_results[func_name] = {'SSE': sse, 'Accuracy': accuracy_measure, 'Iterations': iterations_completed}


# Print the results. The 'Iterations' value is the loop index returned by
# perform_kmeans, i.e. zero-based (99 means the 100th iteration was reached).
for metric, results in experiment_results.items():
    print(f'Using {metric} distance metric:')
    print(f'  - Sum of Squared Errors: {results["SSE"]}')
    print(f'  - Accuracy: {results["Accuracy"]}')
    print(f'  - Iterations Completed: {results["Iterations"]}')
    print()


Using Euclidean distance metric:
  - Sum of Squared Errors: 25429879411.272114
  - Accuracy: 0.6043604360436043
  - Iterations Completed: 84

Using Cosine distance metric:
  - Sum of Squared Errors: 693.352972196105
  - Accuracy: 0.6142614261426143
  - Iterations Completed: 53

Using Jaccard distance metric:
  - Sum of Squared Errors: 3693.1585203697045
  - Accuracy: 0.5473547354735474
  - Iterations Completed: 45

