In [1]:
import numpy as np
from numpy import genfromtxt
from math import sqrt
import random

In [2]:
dataset = genfromtxt('kmeans_data/data.csv', delimiter=',')
labels = genfromtxt('kmeans_data/label.csv', delimiter=',')

In [3]:
class Point():
    def __init__(self, features, label):
        self.features = features
        self.label = label

In [4]:
points = np.array([Point(dataset[i], labels[i]) for i in range(len(dataset))])

In [5]:
a = np.array([1, 2, 6, 6])
b = np.array([4, 5, 5, 6])
less = np.sum(a[a <= b]) + np.sum(b[b < a])
more = np.sum(a[a >= b]) + np.sum(b[b > a])
less, more, less / more

(14, 21, 0.6666666666666666)

In [6]:
def euclidean_dist(a, b):
    return sqrt(np.sum((a - b) * (a - b)))
def cosine_dist(a, b):
    num = np.dot(a, b)
    denom = sqrt(np.dot(a, a)) * sqrt(np.dot(b, b))
    return  1 - (num / denom)
def jaccard_dist(a, b):
    less = np.sum(a[a <= b]) + np.sum(b[b < a])
    more = np.sum(a[a >= b]) + np.sum(b[b > a])
    return 1 - (less / more)

In [7]:
def sse_dist(a, b):
    return np.sum((a - b) * (a - b))
def compute_sse(clusters, centroids):
    # Safeguards against the first iteration
    if centroids.shape[0] == 0:
        return float("inf")
    result = 0
    for i in range(len(centroids)):
        centroid = centroids[i]
        cluster = clusters[i]
        for point in cluster:
            result += sse_dist(point.features, centroid)
    return result

In [8]:
def get_accuracy(clusters):
    num_correct = 0
    total = 0
    for cluster in clusters:
        total += len(cluster)
        freq = {}
        for point in cluster:
            if point.label not in freq:
                freq[point.label] = 1
            else:
                freq[point.label] += 1
        max_freq = 0
        for key in freq.keys():
            max_freq = max(max_freq, freq[key])
        num_correct += max_freq
    return num_correct / total

In [9]:
def converged(condition, centroids, prevCentroids, **kwargs):
    print(f'condition = {condition}')
    # skips the first iteration
    if kwargs["iteration"] == 0:
        print(f'early termination')
        return False
    elif condition == "no_change":
        return np.array_equal(prevCentroids, centroids)
    elif condition == "sse_increased":
        clusters = kwargs["clusters"]
        return compute_sse(clusters, centroids) > compute_sse(clusters, prevCentroids)
    elif condition == "preset":
        print(f'iteration = {iteration} limit = {limit}')
        return kwargs["iteration"] >= kwargs["preset"]
    elif condition == "all":
        no_change = np.array_equal(prevCentroids, centroids)
        if no_change:
            return True
        clusters = kwargs["clusters"]
        sse_increased = compute_sse(clusters, centroids) > compute_sse(clusters, prevCentroids)
        if sse_increased:
            return True
        preset = (kwargs["iteration"] >= kwargs["limit"])
        return preset

In [12]:
def kmeans(points, k, condition, distanceFn=None, preset=None):
    if distanceFn is None:
        print("Error: distance function not defined")
        return
    
    prevCentroids = np.empty(k)
    centroids = np.array([obj.features for obj in np.random.choice(points, k)])
    iteration = 0
    clusters = None
    
    converged = has_converged(condition, centroids, prevCentroids, clusters=clusters, iteration=iteration, preset=preset)
    while not converged:
        iteration += 1
        print(iteration)
        
        # Assign each instance to the current centroids
        # and create clusters
        clusters = [[] for _ in range(k)]
        for point in points:
            # find closest centroid to point
            minDistance = distanceFn(point.features, centroids[0])
            minDistanceIndex = 0
            for i in range(1, len(centroids)):
                d = distanceFn(point.features, centroids[i])
                if d < minDistance:
                    minDistance = d
                    minDistanceIndex = i
            clusters[minDistanceIndex].append(point)
        
        # Current centroids become previous centroids
        prevCentroids = centroids
        
        # Recompute centroids in accordance with
        # the newly computed clusters
        NUM_ATTRS = 784
        centroids = np.zeros((k, NUM_ATTRS))
        for i in range(len(clusters)):
            centroid = np.zeros(NUM_ATTRS)
            for point in clusters[i]:
                centroid += point.features
            if len(clusters[i]) > 0:
                centroid /= len(clusters[i])
            centroids[i] = centroid
        converged = has_converged(condition, centroids, prevCentroids, clusters=clusters, iteration=iteration, preset=preset)
    result = {
        "clusters": clusters,
        "centroids": centroids,
        "sse": compute_sse(clusters, centroids),
        "accuracy": get_accuracy(clusters)
    }
    return result

In [13]:
print(f'euclidean kmeans...')
result_euclidean = kmeans(points, 10, distanceFn=euclidean_dist, condition="preset", preset=2)
print(f'euclidean kmeans sse: {result_euclidean["sse"]}')
print(f'euclidean kmeans accuracy: {result_euclidean["accuracy"]}')

euclidean kmeans...
condition = preset
early termination
1
condition = preset


NameError: name 'iteration' is not defined

In [None]:
print(f'cosine kmeans...')
result_cosine = kmeans(points, 10, distanceFn=cosine_dist, condition='all')
print(f'cosine kmeans sse: {result_cosine["sse"]}')
print(f'cosine kmeans accuracy: {result_cosine["accuracy"]}')

In [None]:
print(f'jaccard kmeans...')
result_jaccard = kmeans(points, 10, distanceFn=jaccard_dist, condition='all')
print(f'jaccard kmeans sse: {result_jaccard["sse"]}')
print(f'jaccard kmeans accuracy: {result_jaccard["accuracy"]}')