TODO
====

When a point is assigned to a cluster, also assign all of its must-link (ML) neighbours to that cluster. (This deviates from the algorithm as specified, but it seems to improve the results significantly.)

In [12]:
import random
import math

def cop_kmeans(dataset, k, ml=None, cl=None):
    """Cluster ``dataset`` into at most ``k`` groups under pairwise constraints.

    Each pass assigns every point, in dataset order, to the nearest center
    whose choice does not violate a must-link or cannot-link constraint with
    the points already assigned in that pass.  Centers are then recomputed,
    and the loop stops once two consecutive passes give the same assignment.

    Parameters:
        dataset: sequence of points, each a sequence of numeric coordinates.
        k: number of initial centers to draw from the dataset.
        ml: iterable of ``(i, j)`` index pairs that must share a cluster.
            Defaults to no constraints.
        cl: iterable of ``(i, j)`` index pairs that must not share a cluster.
            Defaults to no constraints.

    Returns:
        ``(clusters, centers)`` where ``clusters[i]`` is the cluster index of
        point ``i`` and ``centers`` is the list of cluster centroids, or
        ``None`` if some point could not be placed in any cluster without
        violating a constraint.

    Raises:
        Exception: propagated from ``transitive_closure`` when the constraints
            are mutually inconsistent.
    """
    # Use None sentinels rather than mutable list defaults, which would be
    # shared between calls.
    ml = [] if ml is None else ml
    cl = [] if cl is None else cl

    # Expand the pair lists into per-point neighbour sets.
    ml, cl = transitive_closure(ml, cl, len(dataset))

    centers = initialize_centers(dataset, k)
    clusters = [-1] * len(dataset)

    while True:
        # -1 marks "not assigned yet in this pass".
        clusters_ = [-1] * len(dataset)
        for i, d in enumerate(dataset):
            # Try the candidate clusters from nearest to farthest.
            for index in closest_clusters(centers, d):
                if not violate_constraints(i, index, clusters_, ml, cl):
                    clusters_[i] = index
                    break
            else:
                # Every cluster violates a constraint for this point.
                return None

        clusters_, centers = compute_centers(clusters_, dataset)

        # Converged when the assignment is unchanged since the last pass.
        if clusters_ == clusters:
            return clusters_, centers
        clusters = clusters_

def euclidean_distance(point1, point2):
    """Return the Euclidean distance between two points.

    Coordinates are converted with ``float`` before subtracting.  Points are
    paired with ``zip``, so extra coordinates of the longer point are ignored.
    """
    squared_diffs = []
    for a, b in zip(point1, point2):
        delta = float(a) - float(b)
        squared_diffs.append(delta ** 2)
    return math.sqrt(sum(squared_diffs))

def closest_clusters(centers, datapoint):
    """Return the indices of ``centers`` ordered from nearest to farthest.

    Distances to ``datapoint`` are measured with ``euclidean_distance``.
    The sort is stable, so equally distant centers keep their index order.
    """
    distances = []
    for center in centers:
        distances.append(euclidean_distance(center, datapoint))

    ranking = list(range(len(distances)))
    ranking.sort(key=distances.__getitem__)
    return ranking

# under-specified in the paper
def initialize_centers(dataset, k):
    """Choose the initial centers by picking data points at random.

    Parameters:
        dataset: sequence of points.
        k: number of centers wanted.

    Returns:
        A list of ``min(k, len(dataset))`` points taken from distinct
        positions of ``dataset`` after a random shuffle of the indices.
    """
    # random.shuffle works in place and needs a mutable sequence.  On
    # Python 3 a bare range object is immutable and raises TypeError, so the
    # indices are materialised into a list first.
    indices = list(range(len(dataset)))
    random.shuffle(indices)
    return [dataset[index] for index in indices[:k]]

# TODO add all ml-ed instances to the assigned cluster
def violate_constraints(data_index, cluster_index, clusters, ml, cl):
    """Tell whether putting a point into a cluster breaks a constraint.

    Parameters:
        data_index: index of the point being placed.
        cluster_index: candidate cluster for that point.
        clusters: current assignment; ``-1`` means "not assigned yet".
        ml: mapping from point index to the indices it must be grouped with.
        cl: mapping from point index to the indices it must be kept apart from.

    Returns:
        ``True`` if an already assigned must-link partner sits in a different
        cluster, or a cannot-link partner already sits in ``cluster_index``;
        ``False`` otherwise.
    """
    # Must-link partners only matter once they have been assigned.
    must_link_broken = any(
        clusters[partner] != -1 and clusters[partner] != cluster_index
        for partner in ml[data_index]
    )
    if must_link_broken:
        return True

    return any(
        clusters[partner] == cluster_index for partner in cl[data_index]
    )

def compute_centers(clusters, dataset):
    """Renumber the cluster labels and compute each cluster's centroid.

    The distinct labels found in ``clusters`` are mapped onto consecutive
    indices starting at 0, and ``clusters`` is rewritten in place with those
    new indices.  The dimensionality is taken from the first data point.

    Returns:
        ``(clusters, centers)``: the same (mutated) label list, and a list
        with the coordinate-wise mean of the points carrying each label.
    """
    # Map every distinct label onto its position among the distinct labels.
    relabel = {}
    for position, label in enumerate(list(set(clusters))):
        relabel[label] = position
    for idx in range(len(clusters)):
        clusters[idx] = relabel[clusters[idx]]

    k = len(relabel)
    dim = len(dataset[0])
    sums = [[0.0] * dim for _ in range(k)]
    sizes = [0] * k

    # Accumulate per-cluster coordinate sums and member counts.
    for idx, label in enumerate(clusters):
        point = dataset[idx]
        row = sums[label]
        for axis in range(dim):
            row[axis] += point[axis]
        sizes[label] += 1

    centers = [
        [total / float(size) for total in row]
        for row, size in zip(sums, sizes)
    ]
    return clusters, centers

def transitive_closure(ml, cl, n):
    """Expand pairwise constraints into per-point neighbour sets.

    Must-link constraints are closed transitively: every point ends up linked
    to all other points of its connected component.  Each cannot-link
    constraint is then extended to every pair formed by the must-link groups
    of its two endpoints.

    Parameters:
        ml: iterable of ``(i, j)`` must-link index pairs.
        cl: iterable of ``(i, j)`` cannot-link index pairs.
        n: number of data points; indices run over ``range(n)``.

    Returns:
        ``(ml_graph, cl_graph)``: two dicts mapping every index in
        ``range(n)`` to the set of indices it must / must not share a cluster
        with.

    Raises:
        Exception: if two points are both must-linked and cannot-linked.
    """
    ml_graph = {i: set() for i in range(n)}
    cl_graph = {i: set() for i in range(n)}

    def add_both(graph, i, j):
        # Constraints are symmetric, so record the edge in both directions.
        graph[i].add(j)
        graph[j].add(i)

    for (i, j) in ml:
        add_both(ml_graph, i, j)

    # Collect the connected components of the must-link graph.  The traversal
    # uses an explicit stack: a recursive DFS overflows the interpreter's
    # recursion limit on long must-link chains.
    visited = [False] * n
    for start in range(n):
        if visited[start]:
            continue
        visited[start] = True
        component = []
        stack = [start]
        while stack:
            node = stack.pop()
            component.append(node)
            for neighbour in ml_graph[node]:
                if not visited[neighbour]:
                    visited[neighbour] = True
                    stack.append(neighbour)

        # Link every member of the component to every other member.
        members = set(component)
        for node in component:
            ml_graph[node].update(members - {node})

    # Propagate each cannot-link pair across both endpoints' must-link groups.
    for (i, j) in cl:
        add_both(cl_graph, i, j)
        for y in ml_graph[j]:
            add_both(cl_graph, i, y)
        for x in ml_graph[i]:
            add_both(cl_graph, x, j)
            for y in ml_graph[j]:
                add_both(cl_graph, x, y)

    # A pair cannot be both must-linked and cannot-linked.
    for i in ml_graph:
        for j in ml_graph[i]:
            if j != i and j in cl_graph[i]:
                raise Exception('inconsistent constraints between %d and %d' %(i,j))

    return ml_graph, cl_graph

In [4]:
def read_data(datafile):
    """Load a whitespace-separated numeric data file.

    Every non-blank line becomes one list of floats; blank lines are skipped.

    Returns:
        A list of rows, in file order.
    """
    with open(datafile, 'r') as handle:
        stripped_rows = (raw.strip() for raw in handle)
        return [
            [float(token) for token in row.split()]
            for row in stripped_rows
            if row != ''
        ]

def read_constraints(consfile):
    """Load pairwise constraints from a whitespace-separated text file.

    Each non-blank line holds two integer indices followed by an integer
    flag: ``1`` marks a must-link pair and ``-1`` a cannot-link pair.  Lines
    with any other flag are ignored.

    Returns:
        ``(ml, cl)``: the must-link and cannot-link pairs as lists of
        ``(int, int)`` tuples, in file order.
    """
    must_link = []
    cannot_link = []
    with open(consfile, 'r') as handle:
        for raw in handle:
            fields = raw.strip()
            if fields == '':
                continue
            fields = fields.split()
            pair = (int(fields[0]), int(fields[1]))
            flag = int(fields[2])
            if flag == 1:
                must_link.append(pair)
            elif flag == -1:
                cannot_link.append(pair)
    return must_link, cannot_link
    
def run(datafile, consfile, k, outfile):
    """Load a data set and its constraints, then run COP-KMeans once.

    Parameters:
        datafile: path of the data file, read with ``read_data``.
        consfile: path of the constraints file, read with ``read_constraints``.
        k: number of clusters passed to ``cop_kmeans``.
        outfile: currently unused; only the disabled sketch below writes to it.

    The clustering result is discarded and nothing is returned.
    """
    points = read_data(datafile)
    must_link, cannot_link = read_constraints(consfile)
    cop_kmeans(points, k, must_link, cannot_link)
    # Disabled sketch: retry until a clustering is found, then write one
    # cluster index per line to ``outfile``.
    #     limit = 1000
    #     result = None
    #     for i in range(limit):
    #         result = cop_kmeans(data, k, ml, cl)
    #         if result != None:
    #             break
    #     if result != None:
    #         with open(outfile, 'w') as f:
    #             for cluster in result[0]:
    #                 f.write('%d\n' %cluster)
    #     else:
    #         print ('no result')
        

In [13]:
# Run the clustering on the iris example data and its constraints file with
# k=3; './test2' is passed as the output path.
run ('../../../temp/examples/iris.data', '../../../temp/examples/iris.constraints', 3, './test2')

(set([]), set([0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 55, 56, 58, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69, 70, 72, 74, 75, 76, 77, 78, 79, 80, 82, 83, 84, 85, 86, 87, 89, 91, 93, 94, 96, 97, 98, 99]))
