In [1]:
import sys
# Add the parent directory (relative to the current working directory) to the
# module search path. Presumably this is where the cop_kmeans module imported
# further down lives -- confirm against the repository layout.
sys.path.append('..')

In [13]:
from cop_kmeans import cop_kmeans, euclidean_distance

def read_data(datafile):
    """Load a whitespace-delimited numeric file into a list of rows.

    Each non-blank line of ``datafile`` is split on whitespace and every
    field is converted with ``float``. Lines that are empty after stripping
    are skipped.

    Returns:
        A list of rows in file order, each row being a list of floats.

    Raises:
        ValueError: if a field cannot be converted by ``float``.
    """
    with open(datafile, 'r') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [
            [float(field) for field in text.split()]
            for text in stripped_lines
            if text != ''
        ]

def read_constraints(consfile):
    """Parse a constraints file into two lists of index pairs.

    Every non-blank line must hold at least three whitespace-separated
    integers: two indices followed by a constraint code. A code of ``1``
    puts the pair in the first returned list, a code of ``-1`` puts it in
    the second; pairs with any other code are ignored.

    Returns:
        A tuple ``(ml, cl)`` of lists of ``(int, int)`` tuples, in file order.
    """
    must_link = []
    cannot_link = []
    with open(consfile, 'r') as handle:
        for raw in handle:
            fields = raw.strip().split()
            if not fields:
                # Blank (or whitespace-only) line.
                continue
            pair = (int(fields[0]), int(fields[1]))
            code = int(fields[2])
            if code == 1:
                must_link.append(pair)
            elif code == -1:
                cannot_link.append(pair)
    return must_link, cannot_link

def cluster_quality(cluster):
    """Return the summed pairwise distance of a cluster, divided by its size.

    ``euclidean_distance`` is evaluated for every pair ``(cluster[i],
    cluster[j])`` with ``i <= j`` -- so each unordered pair is counted once
    and each point is also paired with itself -- and the total is divided by
    the number of points. An empty cluster yields ``0.0``.
    """
    size = len(cluster)
    if size == 0:
        return 0.0

    total = 0.0
    for start, point in enumerate(cluster):
        # The slice begins at ``start``, so ``point`` is compared with itself
        # first and then with every later member.
        for other in cluster[start:]:
            total += euclidean_distance(point, other)
    return total / size
    
def compute_quality(data, cluster_indices):
    """Return the sum of ``cluster_quality`` over all clusters of an assignment.

    ``cluster_indices[i]`` is the cluster label of ``data[i]``. Points are
    grouped by label, ``cluster_quality`` is computed for each group, and the
    per-group values are added together.
    """
    members_by_label = {}
    for position, label in enumerate(cluster_indices):
        members_by_label.setdefault(label, []).append(data[position])

    total = sum(cluster_quality(members) for members in members_by_label.values())
    return total
    

def run_iterate(datafile, consfile, k, n_runs=500):
    """Run ``cop_kmeans`` repeatedly and summarise the outcomes.

    The data and constraints are read once, then ``cop_kmeans(data, k, ml,
    cl)`` is called ``n_runs`` times. A run counts as satisfied when the call
    returns something other than ``None``; for those runs the clustering is
    scored with ``compute_quality``.

    Args:
        datafile: path passed to ``read_data``.
        consfile: path passed to ``read_constraints``.
        k: number of clusters passed to ``cop_kmeans``.
        n_runs: number of repetitions; must be non-zero because the
            satisfied percentage divides by it.

    Returns:
        A tuple ``(percent_satisfied, best_quality)`` where
        ``percent_satisfied`` is a float in ``[0, 100]`` and ``best_quality``
        is the smallest quality over the satisfied runs, or ``None`` when no
        run was satisfied.
    """
    data = read_data(datafile)
    ml, cl = read_constraints(consfile)

    num_sat = 0
    best_quality = None
    for _ in range(n_runs):
        result = cop_kmeans(data, k, ml, cl)
        if result is None:
            continue
        num_sat += 1
        clusters, _centers = result
        quality = compute_quality(data, clusters)
        # Test for None explicitly: a best quality of 0.0 is falsy, and a
        # truthiness check would let a later, worse run overwrite it.
        if best_quality is None or quality < best_quality:
            best_quality = quality
    # Multiply before dividing, with a float literal, so the percentage is
    # not truncated by integer division and exact ratios stay exact.
    return 100.0 * num_sat / n_runs, best_quality


In [24]:
# Each number fills the %d slot of the constraints file name below; every
# file is evaluated with k=5 over 10 runs and the outcome is collected in
# `results` as one dict per constraints file.
cons_nums = [2, 60]
results = []
for c_num in cons_nums:
    cons_path = '../../data/iris/constraints/iris.%d.cons'%c_num
    sat, qual = run_iterate('../../data/iris/iris.data', cons_path, 5, n_runs=10)
    results.append(dict(c=c_num, sat=sat, quality=qual))

In [26]:
import csv

# Write one CSV row per entry of `results`, using the keys of the first
# entry as the header.
# NOTE(review): the output name 'iris.resluts' looks like a typo for
# 'iris.results'; it is left unchanged in case something reads this exact path.
# newline='' hands line-ending control to the csv module, which otherwise
# produces an extra blank line after every row on Windows (Python 3).
with open('./iris.resluts', 'w', newline='') as f:
    w = csv.DictWriter(f, fieldnames=list(results[0]))
    w.writeheader()
    w.writerows(results)