In [487]:
# Example input: five sample points with four numeric features each.
dataset = [
    [0.3, 2, 1.0, 5],
    [-2, 3, 1.5, 0.2],
    [1.1, 0.5, 0.1, -1],
    [5, 1, 0.2, 0.5],
    [1, 3, 0.5, 1]
]

# Must-link constraints, as pairs of row indices into `dataset`:
# points 0 and 1 belong together, and so do points 2 and 3.
ml = [(0, 1), (2, 3)]
# Cannot-link constraints: points 1 and 4 may not share a cluster.
cl = [(1, 4)]
# Cluster into k=2 groups.  The initial centers are picked at random, so the
# result can differ from one run to the next.
cop_kmeans(dataset, 2, ml, cl)

([0, 0, 1, 1, 1],
 [[-0.85, 2.5, 1.25, 2.6],
  [2.3666666666666667, 1.5, 0.26666666666666666, 0.16666666666666666]])

In [477]:
import random
import math

def cop_kmeans(dataset, k, ml=None, cl=None):
    """Cluster ``dataset`` into ``k`` groups with constrained k-means.

    Every point is assigned to the nearest center whose cluster does not
    violate a must-link or cannot-link constraint against the points already
    assigned in the current pass.  Centers are then recomputed, and the loop
    repeats until two consecutive passes yield the same assignment.

    Args:
        dataset: sequence of points, each a sequence of numeric coordinates.
        k: number of initial centers to draw from ``dataset``.
        ml: optional list of ``(i, j)`` index pairs that must share a cluster.
        cl: optional list of ``(i, j)`` index pairs that must not share one.

    Returns:
        ``(clusters, centers)`` where ``clusters[i]`` is the cluster index of
        ``dataset[i]``, or ``None`` when some point cannot be placed in any
        cluster without violating a constraint.
    """
    # Use None sentinels instead of mutable list defaults shared across calls.
    ml = [] if ml is None else ml
    cl = [] if cl is None else cl
    ml, cl = transitive_closure(ml, cl, len(dataset))

    centers = initialize_centers(dataset, k)
    clusters = [-1] * len(dataset)

    while True:
        # -1 marks "not yet assigned in this pass".
        clusters_ = [-1] * len(dataset)
        for i, d in enumerate(dataset):
            # Try clusters from nearest to farthest; take the first legal one.
            for index in closest_clusters(centers, d):
                if not violate_constraints(i, index, clusters_, ml, cl):
                    clusters_[i] = index
                    break
            else:
                # No cluster is admissible for this point: give up.
                return None
        centers = compute_centers(clusters_, dataset)

        # Converged once the assignment is unchanged from the previous pass.
        if clusters_ == clusters:
            return clusters_, centers
        clusters = clusters_

def euclidean_distance(point1, point2):
    """Return the Euclidean distance between two points.

    Coordinates are converted with ``float`` before subtraction.  Points are
    paired with ``zip``, so extra coordinates of the longer one are ignored.
    """
    squared_total = 0.0
    for a, b in zip(point1, point2):
        delta = float(a) - float(b)
        squared_total += delta ** 2
    return math.sqrt(squared_total)

def closest_clusters(centers, datapoint):
    """Return the indices of ``centers`` ordered from nearest to farthest.

    Distance is measured from ``datapoint`` with ``euclidean_distance``;
    centers at equal distance keep their original relative order.
    """
    distances = []
    for center in centers:
        distances.append(euclidean_distance(center, datapoint))

    order = list(range(len(distances)))
    order.sort(key=distances.__getitem__)
    return order

# under-specified in the paper
def initialize_centers(dataset, k):
    """Pick up to ``k`` distinct rows of ``dataset`` at random as centers.

    Args:
        dataset: sequence of points.
        k: number of centers wanted.

    Returns:
        A list with ``min(k, len(dataset))`` rows of ``dataset`` (the row
        objects themselves, not copies), in random order.
    """
    # random.shuffle needs a mutable sequence; on Python 3 a bare range() is
    # immutable and would raise TypeError, so materialize it as a list first.
    indices = list(range(len(dataset)))
    random.shuffle(indices)
    return [dataset[index] for index in indices[:k]]

def violate_constraints(data_index, cluster_index, clusters, ml, cl):
    """Tell whether putting a point into a cluster breaks a constraint.

    Args:
        data_index: index of the point being placed.
        cluster_index: candidate cluster for that point.
        clusters: current assignment per point; ``-1`` means unassigned.
        ml: mapping from point index to its must-link neighbours.
        cl: mapping from point index to its cannot-link neighbours.

    Returns:
        ``True`` if an already-assigned must-link neighbour sits in another
        cluster, or a cannot-link neighbour sits in ``cluster_index``;
        ``False`` otherwise.
    """
    # Must-link neighbours are fine while unassigned or in the same cluster.
    must_link_broken = any(
        clusters[other] not in (-1, cluster_index)
        for other in ml[data_index]
    )
    if must_link_broken:
        return True

    # Cannot-link neighbours must not already occupy the candidate cluster.
    return any(clusters[other] == cluster_index for other in cl[data_index])

def compute_centers(clusters, dataset):
    """Return the mean point of every cluster present in ``clusters``.

    Args:
        clusters: cluster label of each point; ``clusters[j]`` labels
            ``dataset[j]``.
        dataset: sequence of points, all with the dimension of the first one.

    Returns:
        A list of centers (lists of floats), one per distinct label, ordered
        by ascending label.  Labels that no point carries get no center.
        An empty ``clusters`` yields an empty list.
    """
    if not clusters:
        return []

    # Sort the labels: plain set iteration order is not guaranteed to follow
    # label order, which could misalign centers with the labels they serve.
    labels = sorted(set(clusters))
    slot_of = {label: slot for slot, label in enumerate(labels)}

    dim = len(dataset[0])
    sums = [[0.0] * dim for _ in labels]
    counts = [0] * len(labels)
    for j, label in enumerate(clusters):
        slot = slot_of[label]
        row = dataset[j]
        for i in range(dim):
            sums[slot][i] += row[i]
        counts[slot] += 1

    # Every label occurs at least once, so no count is ever zero here.
    return [
        [total / float(count) for total in coord_sums]
        for coord_sums, count in zip(sums, counts)
    ]

def transitive_closure(ml, cl, n):
    """Expand pairwise constraints into neighbour sets for ``n`` points.

    Must-link pairs are closed transitively: each point ends up linked to
    every other point of its connected component.  Each cannot-link pair is
    then propagated to the must-link neighbours of both of its endpoints.

    Args:
        ml: iterable of ``(i, j)`` index pairs that must share a cluster.
        cl: iterable of ``(i, j)`` index pairs that must not share a cluster.
        n: number of points; indices are expected to lie in ``range(n)``.

    Returns:
        ``(ml_graph, cl_graph)``: two dicts mapping every index in
        ``range(n)`` to the set of its must-link / cannot-link neighbours.

    Raises:
        Exception: if two points are both must-linked and cannot-linked.
    """
    ml_graph = {i: set() for i in range(n)}
    cl_graph = {i: set() for i in range(n)}

    def add_both(graph, i, j):
        # Constraints are symmetric, so record the edge in both directions.
        graph[i].add(j)
        graph[j].add(i)

    for i, j in ml:
        add_both(ml_graph, i, j)

    # Find the connected components of the must-link graph.  The traversal is
    # iterative so long must-link chains cannot exhaust the recursion limit.
    visited = [False] * n
    for start in range(n):
        if visited[start]:
            continue
        visited[start] = True
        component = []
        stack = [start]
        while stack:
            node = stack.pop()
            component.append(node)
            for neighbour in ml_graph[node]:
                if not visited[neighbour]:
                    visited[neighbour] = True
                    stack.append(neighbour)
        # Link every member of the component to every other member.
        members = set(component)
        for node in component:
            ml_graph[node].update(members - {node})

    # A cannot-link between i and j also separates each must-link neighbour
    # of i from j and from each must-link neighbour of j.
    for i, j in cl:
        add_both(cl_graph, i, j)
        for y in ml_graph[j]:
            add_both(cl_graph, i, y)
        for x in ml_graph[i]:
            add_both(cl_graph, x, j)
            for y in ml_graph[j]:
                add_both(cl_graph, x, y)

    # A pair that is both must-linked and cannot-linked is unsatisfiable.
    for i in ml_graph:
        for j in ml_graph[i]:
            if j in cl_graph[i]:
                raise Exception('inconsistent constraints')

    return ml_graph, cl_graph