In [15]:
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.cluster import AgglomerativeClustering, KMeans, SpectralClustering
from kemlglearn.cluster import Leader
from sklearn.mixture import GaussianMixture
import numpy as np
import datetime
import pandas as pd
import os
import time

if "notebooks" in os.getcwd():
    os.chdir("..")

from src.Dataset import read_dataset, get_binary_dataset
from src.ConsensusKMeans import ConsensusKMeans
import config



In [16]:
ls_datasets = ['iris'] # 'pendigits', takes too long
d_results = {}
d_time = {}
# for dataset in ls_datasets:
# Load data
dataset = 'iris'
X, y = read_dataset(dataset)
k = len(np.unique(y))
    
# Get the binary dataset
# tic = time.time()
X_b, ls_partitions, cluster_sizes = get_binary_dataset(X, k, r=100)
# toc = time.time()
# d_binary_generation_time[dataset] = toc - tic


Clustering Progress: 100%|██████████| 100/100 [00:05<00:00, 19.25it/s]

X_b shape: (150, 725)





In [17]:

# Instantiate ConsensusKMeans for every distance
KCC = ConsensusKMeans(n_clusters=k, type='Ucos', normalize=False, cluster_sizes=cluster_sizes)

# Hierarchical sklearn clustering

HC_ward = AgglomerativeClustering(n_clusters=k, linkage='ward')
HC_avg = AgglomerativeClustering(n_clusters=k, linkage='average')
HC_single = AgglomerativeClustering(n_clusters=k, linkage='single')
HC_comp = AgglomerativeClustering(n_clusters=k, linkage='complete')

# Prototype based clustering
KM = KMeans(n_clusters=k, n_init=10)

# Gaussian Mixture Model
GMM_f = GaussianMixture(n_components=k, covariance_type='full', n_init=10)
GMM_t = GaussianMixture(n_components=k, covariance_type='tied', n_init=10)
GMM_d = GaussianMixture(n_components=k, covariance_type='diag', n_init=10)
GMM_s = GaussianMixture(n_components=k, covariance_type='spherical', n_init=10)

# Spectral Clustering
SC = SpectralClustering(n_clusters=k, n_init=10)




d_configs = {
    HC_ward: {'linkage': 'ward'},
    HC_avg: {'linkage': 'average'},
    HC_single: {'linkage': 'single'},
    HC_comp: {'linkage': 'complete'},
    KM: {'n_init': 10},
    GMM_f: {'covariance_type': 'full'},
    GMM_t: {'covariance_type': 'tied'},
    GMM_d: {'covariance_type': 'diag'},
    GMM_s: {'covariance_type': 'spherical'},
    SC: {'n_init': 10},
    KCC: {'n_init': 10,'type': 'Ucos'},
    
}

In [18]:
def evaluate_clustering(clustering, X, y):

    tic = time.time()
    try:
        y_pred = clustering.fit_predict(X)
    except:
        y_pred = clustering.fit(X).labels_
        
    toc = time.time()

    ARI = adjusted_rand_score(y, y_pred)

    print(clustering.__class__.__name__, "{}".format(d_configs[clustering]))
    print(f" --ARI: {ARI} ------ Time: {toc - tic}")

    return ARI, toc - tic

In [19]:
ari_hc_w, time_hc = evaluate_clustering(HC_ward, X, y)
ari_hc_a, time_hc = evaluate_clustering(HC_avg, X, y)
ari_hc_s, time_hc = evaluate_clustering(HC_single, X, y)
ari_hc_c, time_hc = evaluate_clustering(HC_comp, X, y)

ari_km, time_km = evaluate_clustering(KM, X, y)
ari_gmm_f, time_gmm_t = evaluate_clustering(GMM_f, X, y)
ari_gmm_t, time_gmm_t = evaluate_clustering(GMM_t, X, y)
ari_gmm_d, time_gmm_d = evaluate_clustering(GMM_d, X, y)
ari_gmm_s, time_gmm = evaluate_clustering(GMM_s, X, y)

ari_sc, time_sc = evaluate_clustering(SC, X, y)

ari_kcc, time_kcc = evaluate_clustering(KCC, X_b, y)

AgglomerativeClustering {'linkage': 'ward'}
 --ARI: 0.7311985567707746 ------ Time: 0.0014128684997558594
AgglomerativeClustering {'linkage': 'average'}
 --ARI: 0.7591987071071522 ------ Time: 0.0009963512420654297
AgglomerativeClustering {'linkage': 'single'}
 --ARI: 0.5637510205230709 ------ Time: 0.0010170936584472656
AgglomerativeClustering {'linkage': 'complete'}
 --ARI: 0.6422512518362898 ------ Time: 0.0
KMeans {'n_init': 10}
 --ARI: 0.7302382722834697 ------ Time: 0.04461932182312012
GaussianMixture {'covariance_type': 'full'}
 --ARI: 0.9038742317748124 ------ Time: 0.12523126602172852
GaussianMixture {'covariance_type': 'tied'}
 --ARI: 0.8856970310281228 ------ Time: 0.059294700622558594
GaussianMixture {'covariance_type': 'diag'}
 --ARI: 0.7591987071071522 ------ Time: 0.04039597511291504
GaussianMixture {'covariance_type': 'spherical'}
 --ARI: 0.7302382722834697 ------ Time: 0.03468608856201172
SpectralClustering {'n_init': 10}
 --ARI: 0.7455038681804481 ------ Time: 0.07962

100%|██████████| 10/10 [00:00<00:00, 16.71it/s]

ConsensusKMeans {'n_init': 10, 'type': 'Ucos'}
 --ARI: 0.7455038681804481 ------ Time: 0.600297212600708



