In [7]:
import pickle
import subprocess
from time import perf_counter

try:
    from time import clock
except ImportError:
    # time.clock was removed in Python 3.8; keep the name available for the
    # legacy snippets that still mention it.
    clock = perf_counter

import numpy as np
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans

from dummy import dist
from classic import classic_k_means
from yinyang import yinyang_k_means
from my_k_means import my_k_means
from my_k_means_turbo import turbo
from get_data import generate, mnist, kegg_net

Preparing data

In [8]:
# Problem size (n points, d features) and the RNG seed passed to every loader.
n = 16384
d = 64
seed = 42

# Two synthetic sets: one with the generator's defaults, one with explicit
# true_d / true_k / noise settings.
data1 = generate(n, d, seed=seed)
data2 = generate(n, d, seed=seed, true_d=6, true_k=200, noise=0.025)

# Two sets produced by the mnist / kegg_net loaders, both with noise = 0.01.
data3 = mnist(n, d, seed=seed, noise=0.01)
data4 = kegg_net(n, seed=seed, noise=0.01)

Settings

In [9]:
# When True, the benchmark cell asks whether to power the machine off afterwards.
shutdown_after = True

# Datasets and their display names; the two lists are index-aligned because
# the benchmark loop walks them with zip().
_named_datasets = [
    ("full_random", data1),
    ("clustered_random", data2),
    ("mnist", data3),
    ("kegg_net", data4),
]
datasets = [dataset for _, dataset in _named_datasets]
datanames = [label for label, _ in _named_datasets]

# Algorithms under test and their display names, index-aligned in the same way.
_named_algs = [
    ("classic", classic_k_means),
    ("yinyang", yinyang_k_means),
    ("my_k_means", my_k_means),
    ("turbo", turbo),
]
algs = [impl for _, impl in _named_algs]
algnames = [label for label, _ in _named_algs]

# Cluster counts to benchmark: powers of two from 4 to 256.
k_range = [2 ** power for power in range(2, 9)]


In [10]:
# Ask before the long benchmark starts whether the machine should be powered
# off afterwards; the answer is read by the shutdown cell further down.
if shutdown_after:
    # Normalise the answer so that "Yes" or " yes " are not silently treated
    # as a refusal by the exact `== "yes"` comparison made later.
    confirmation = input("Shutdown after execution?\n").strip().lower()
else:
    confirmation = "no"

# log[(dataname, k)] -> per-algorithm statistics plus cross-algorithm checks.
log = {}

for data, dataname in zip(datasets, datanames):
    print("n,d = ", data.shape)
    for k in k_range:
        results = {}
        answers = {}
        for alg, algname in zip(algs, algnames):
            # perf_counter replaces time.clock, which was removed in Python 3.8.
            start = perf_counter()
            if algname == "classic":
                cl = alg(data, k, empty_strat='farthest point')
            else:
                cl = alg(data, k, empty_strat='farthest point', groups_strat='clustered')
            cl.fit()
            cl_t = perf_counter() - start  # covers construction and fit()

            # "classic" and "yinyang" use `clusters` as the final centers;
            # for the other algorithms the last entry of `clusters` is taken.
            if algname in ["classic", "yinyang"]:
                centers = cl.clusters
            else:
                centers = cl.clusters[-1]
            answers[algname] = {"labels": cl.best,
                                "centers": centers}
            results[algname] = {"dist_calcs": np.array(cl.dist_calcs),
                                "assign_times": np.array(cl.assign_times),
                                "migrations": np.array(cl.migrations),
                                "total_time": cl_t}

        # Checking consistency: entry [i][j] compares algnames[j] with algnames[i].
        # Labels: number of points on which both algorithms agree.
        results["label_consistency"] = np.array(
            [[np.sum(answers[name1]["labels"] == answers[name2]["labels"])
              for name1 in algnames] for name2 in algnames])
        # Centers: summed squared difference between the two center sets.
        results["distance_consistency"] = np.array(
            [[np.sum((answers[name1]["centers"] - answers[name2]["centers"]) ** 2)
              for name1 in algnames] for name2 in algnames])

        # Dynamics are taken from the LAST algorithm in `algs` only. Its
        # `clusters` attribute is indexed by step below, so it is presumably a
        # per-iteration history of 2-D center arrays (TODO confirm).
        last_model = cl
        history = last_model.clusters
        results["migrations"] = np.array(last_model.migrations)
        # For each consecutive pair of steps, count the centers (rows) that did
        # not move at all. count_nonzero always yields an integer, including
        # when no center is stable.
        results["stable_clusters_num"] = np.array(
            [np.count_nonzero(np.sum((history[t] - history[t + 1]) ** 2, axis=1) == 0.)
             for t in range(len(history) - 1)])
        results["shape"] = data.shape
        log[(dataname, k)] = results

# Persist everything once all runs are complete, then drop the in-memory copy.
with open('log.pkl', 'wb') as f:
    pickle.dump(log, f)
del log

Shutdown after execution?
yes
n,d =  (16384, 64)
n,d =  (16384, 64)
n,d =  (16384, 64)
n,d =  (16384, 28)


In [11]:
# Power the machine off only when the user answered "yes" before the benchmark.
if confirmation == "yes":
    # Arguments are passed as a list so the executable is resolved without a
    # shell on any platform. The flags follow the Windows `shutdown` syntax
    # (-s: shut down, -t 120: delay of 120 seconds).
    p = subprocess.call(['shutdown', '-s', '-t', '120'])
    if p != 0:
        # Report a failed shutdown request instead of dropping the exit code.
        print("shutdown command failed with exit code", p)

In [6]:

        
# Disabled code: the whole cell is one bare string literal, so none of the
# statements inside it are executed; the cell merely evaluates to (and echoes)
# the string. The text sketches a timing comparison against scikit-learn's
# KMeans ('full' and 'elkan' variants) and a per-center / per-label comparison
# against the classic implementation.
# NOTE(review): it refers to names (clas, yin, my, tur) that are not defined in
# any cell visible here, so it cannot be re-enabled as-is.
'''
start = clock()
standard = KMeans(n_clusters = k, init = data[:k].copy(), 
                  algorithm = 'full', n_init = 1, tol = 0.,
                  precompute_distances = False).fit(data)
std_t = clock() - start
print("{:.4}".format(std_t).ljust(8), end = " | ")

start = clock()
Elkan = KMeans(n_clusters = k, init = data[:k].copy(), 
               algorithm = 'elkan', n_init = 1, tol = 0., 
               precompute_distances = False).fit(data) 
Elkan_t = clock() - start
print("{:.4}".format(std_t/Elkan_t).ljust(8))



print(max([dist(clas.clusters[i],yin.clusters[i]) for i in range(k)]), 
       sum(clas.best == yin.best), end = " | ")

print(max([dist(clas.clusters[i],my.clusters[-1][i]) for i in range(k)]), 
       sum(clas.best == my.best), end = " | ")

print(max([dist(clas.clusters[i],tur.clusters[-1][i]) for i in range(k)]), 
       sum(clas.best == tur.best), end = " | ")

print(max([dist(clas.clusters[i],standard.cluster_centers_[i]) for i in range(k)]), 
       sum(clas.best == standard.labels_), end = " | ")

print(max([dist(clas.clusters[i],Elkan.cluster_centers_[i]) for i in range(k)]), 
       sum(clas.best == Elkan.labels_), end = " | ")
'''
        


'\nstart = clock()\nstandard = KMeans(n_clusters = k, init = data[:k].copy(), \n           algorithm = \'full\', n_init = 1, tol = 0.,\n           precompute_distances = False).fit(data)\nstd_t = clock() - start\nprint("{:.4}".format(std_t).ljust(8), end = " | ")\n\nstart = clock()\nElkan = KMeans(n_clusters = k, init = data[:k].copy(), \n        algorithm = \'elkan\', n_init = 1, tol = 0., \n        precompute_distances = False).fit(data) \nElkan_t = clock() - start\nprint("{:.4}".format(std_t/Elkan_t).ljust(8))\n\n\n\nprint(max([dist(clas.clusters[i],yin.clusters[i]) for i in range(k)]), \nsum(clas.best == yin.best), end = " | ")\n\nprint(max([dist(clas.clusters[i],my.clusters[-1][i]) for i in range(k)]), \nsum(clas.best == my.best), end = " | ")\n\nprint(max([dist(clas.clusters[i],tur.clusters[-1][i]) for i in range(k)]), \nsum(clas.best == tur.best), end = " | ")\n\nprint(max([dist(clas.clusters[i],standard.cluster_centers_[i]) for i in range(k)]), \nsum(clas.best == standard.label