In [None]:
from sklearn.cluster import MiniBatchDPMeans, DPMeans,KMeans, MiniBatchKMeans, DBSCAN, MeanShift, AgglomerativeClustering, OPTICS
from evaluations import *
# from dpmmpython.dpmmwrapper import DPMMPython
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import black_box as bb
from sklearn.metrics import normalized_mutual_info_score as nmi
from time import time
from tqdm import tqdm
from torchvision.datasets import MNIST
from sklearn.decomposition import PCA

In [None]:
# Generate the synthetic Gaussian dataset, standardise it and split it.
# NOTE(review): the `DPMMPython` import in the first cell is commented out, so on a
# fresh kernel this cell only runs if `from evaluations import *` exposes that name — confirm.
EXP_NAME = "2DGaussian"
N=50000
D=2
K_count=20
# The final argument (100.0) is handed to the generator as-is; its meaning is not
# visible in this notebook — presumably a spread/variance setting, TODO confirm.
data,gt = DPMMPython.generate_gaussian_data(N, D, K_count, 100.0)
# Transpose the generated array (presumably from (D, N) to (N, D) — TODO confirm).
data= data.T
# In-place standardisation with ONE global mean and std taken over all entries
# (no axis argument), i.e. not per feature.
data -= np.mean(data)
data /= np.std(data)

# `train_val_test_split` is presumably provided by the `evaluations` star import;
# 0.01 and 0.1 look like split fractions — verify their order/meaning there.
(X_train,y_train),(X_val,y_val),(X_test,y_test) = train_val_test_split(data,gt,0.01,0.1)

In [None]:
# Visual sanity check: first two columns of the test split, coloured by y_test.
plt.scatter(x=X_test[:, 0], y=X_test[:, 1], c=y_test, s=1)

In [None]:
# Write the train/test arrays to disk so they can be reloaded without regenerating the data.
for out_path, array in (
    ('opt_res/x_train_gau.npy', X_train),
    ('opt_res/y_train_gau.npy', y_train),
    ('opt_res/x_test_gau.npy', X_test),
    ('opt_res/y_test_gau.npy', y_test),
):
    np.save(out_path, array)

In [None]:
# Reload the train/test arrays written by the save cell.
X_train, y_train, X_test, y_test = [
    np.load(in_path)
    for in_path in (
        'opt_res/x_train_gau.npy',
        'opt_res/y_train_gau.npy',
        'opt_res/x_test_gau.npy',
        'opt_res/y_test_gau.npy',
    )
]

### DBSCAN

In [None]:
def search_dbscan(pars):
    """Black-box objective for DBSCAN, evaluated on the test split.

    ``pars[0]`` is used as ``eps``; ``pars[1]`` is rounded to the nearest
    integer and used as ``min_samples``.

    Returns the negated NMI between the predicted labels and ``y_test``, so
    minimising this value maximises NMI.
    """
    eps, min_samples = pars[0], int(np.round(pars[1]))
    clusterer = DBSCAN(eps=eps, min_samples=min_samples)
    predicted = clusterer.fit_predict(X_test)
    return -nmi(predicted, y_test)
                                                 

In [None]:
# Black-box minimisation of the DBSCAN objective over (eps, min_samples).
best_params = bb.search_min(
    f=search_dbscan,                   # objective to minimise
    domain=[[0.001, 10.], [1., 10.]],  # [low, high] per parameter: eps, min_samples
    budget=400,                        # total number of objective calls available
    batch=16,                          # batch setting handed to the optimiser
    resfile='opt_res/dbscan_gau.csv',  # results file handed to the optimiser
)

In [None]:
# Benchmark DBSCAN on the training split with hard-coded (eps, min_samples):
# three repeated fits, each recording wall-clock time, NMI against y_train and
# the number of clusters found.
params = np.array([0.09401224, 1.23128527])
timings =[]
nmis = []
ks =[]
for i in tqdm(range(3)):
    tic = time()
    labels = DBSCAN(eps=params[0], min_samples=int(np.round(params[1]))).fit_predict(X_train)
    toc = time()-tic
    timings.append(toc)
    nmis.append(nmi(labels,y_train))
    # DBSCAN assigns the label -1 to noise points; that is not a cluster, so it
    # must not be included in the cluster count.
    ks.append(len(np.unique(labels[labels != -1])))

In [None]:
# Mean and standard deviation of runtime, NMI and cluster count over the DBSCAN runs.
for series in (timings, nmis, ks):
    print(np.mean(series))
    print(np.std(series))

### MeanShift

In [None]:
def search_meanshift(pars):
    """Black-box objective for MeanShift, evaluated on the test split.

    ``pars[0]`` is used as the ``bandwidth``.

    Returns the negated NMI between the predicted labels and ``y_test``, so
    minimising this value maximises NMI.
    """
    labels = MeanShift(bandwidth=pars[0]).fit_predict(X_test)
    return -nmi(labels, y_test)


# Backward-compatible alias: the objective was originally defined under this
# misspelled name, which other cells still reference.
search_meanshit = search_meanshift

In [None]:
# Black-box minimisation of the MeanShift objective over the bandwidth.
best_params = bb.search_min(
    f=search_meanshit,                    # objective to minimise
    domain=[[0.1, 10.]],                  # [low, high] for the single parameter: bandwidth
    budget=20,                            # total number of objective calls available
    batch=10,                             # batch setting handed to the optimiser
    resfile='opt_res/meanshift_gau.csv',  # results file handed to the optimiser
)

In [None]:
best_params = np.array([0.76829273]) # Evaluated separately due to the bb optimizer's multi-process bug with notebooks.

In [None]:
# The MeanShift benchmark loop reads the bandwidth from `params`, so bind it to the chosen optimum.
params = best_params

In [None]:
# Benchmark MeanShift on the training split: three repeated fits, each recording
# wall-clock time, NMI against y_train and the number of distinct labels.
timings, nmis, ks = [], [], []
for trial in tqdm(range(3)):
    start = time()
    labels = MeanShift(bandwidth=params[0]).fit_predict(X_train)
    elapsed = time() - start
    score = nmi(labels, y_train)
    n_labels = len(np.unique(labels))
    timings.append(elapsed)
    nmis.append(score)
    ks.append(n_labels)
    print(elapsed, score, n_labels)

In [None]:
# Mean and standard deviation of runtime, NMI and label count over the MeanShift runs.
for series in (timings, nmis, ks):
    print(np.mean(series))
    print(np.std(series))

### Agglomerative Clustering

In [None]:
def search_agg(pars):
    """Black-box objective for AgglomerativeClustering, evaluated on the test split.

    ``pars[0]`` is used as ``distance_threshold``; ``n_clusters`` is left as
    ``None``, so the threshold alone decides where merging stops.

    Returns the negated NMI between the predicted labels and ``y_test``, so
    minimising this value maximises NMI.
    """
    clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=pars[0])
    predicted = clusterer.fit_predict(X_test)
    return -nmi(predicted, y_test)

In [None]:
# Black-box minimisation of the agglomerative objective over the distance threshold.
best_params = bb.search_min(
    f=search_agg,                   # objective to minimise
    domain=[[0.1, 100.]],           # [low, high] for the single parameter: distance_threshold
    budget=100,                     # total number of objective calls available
    batch=16,                       # batch setting handed to the optimiser
    resfile='opt_res/agg_gau.csv',  # results file handed to the optimiser
)

In [None]:
# Benchmark AgglomerativeClustering on the training split with a hard-coded
# distance threshold: three repeated fits, each recording wall-clock time, NMI
# against y_train and the number of distinct labels.
best_params = np.array([5.04173156])
timings, nmis, ks = [], [], []
for trial in tqdm(range(3)):
    start = time()
    clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=best_params[0])
    labels = clusterer.fit_predict(X_train)
    elapsed = time() - start
    score = nmi(labels, y_train)
    n_labels = len(np.unique(labels))
    timings.append(elapsed)
    nmis.append(score)
    ks.append(n_labels)
    print(elapsed, score, n_labels)

In [None]:
# Mean and standard deviation of runtime, NMI and label count over the agglomerative runs.
for series in (timings, nmis, ks):
    print(np.mean(series))
    print(np.std(series))

### OPTICS

In [None]:
def search_optics(pars):
    """Black-box objective for OPTICS, evaluated on the test split.

    ``pars[0]`` is floored to an integer and used as ``min_samples``, with a
    lower bound of 2.

    Returns the negated NMI between the predicted labels and ``y_test``, so
    minimising this value maximises NMI.
    """
    # scikit-learn documents an integer min_samples for OPTICS as having to be
    # > 1, but the search domain starts at 1, so floor() can produce 1. Clamp
    # to the smallest valid neighbourhood size instead of passing 1 through.
    min_samples = max(2, int(np.floor(pars[0])))
    labels = OPTICS(min_samples=min_samples).fit_predict(X_test)
    return -nmi(labels, y_test)

In [None]:
# Black-box minimisation of the OPTICS objective over min_samples.
best_params = bb.search_min(
    f=search_optics,                   # objective to minimise
    domain=[[1, 1000.]],               # [low, high] for the single parameter: min_samples
    budget=100,                        # total number of objective calls available
    batch=16,                          # batch setting handed to the optimiser
    resfile='opt_res/optics_gau.csv',  # results file handed to the optimiser
)

In [None]:
# Hard-coded min_samples candidate for OPTICS (floored to an int by search_optics).
best_params = np.array([7.55251921])

In [None]:
# Display the negated test-split NMI for the current best_params (lower is better).
search_optics(best_params)

In [None]:
# Benchmark OPTICS on the training split with a hard-coded min_samples: three
# repeated fits, each recording wall-clock time, NMI against y_train and the
# number of clusters found.
# NOTE(review): this value differs from the 7.55 candidate evaluated in an
# earlier cell — confirm which one is the tuned setting.
best_params = np.array([55.05066056])
timings =[]
nmis = []
ks =[]
for i in tqdm(range(3)):
    tic = time()
    labels = OPTICS(min_samples=int(np.floor(best_params[0]))).fit_predict(X_train)
    toc = time()-tic
    timings.append(toc)
    nmis.append(nmi(labels,y_train))
    # OPTICS assigns the label -1 to noise points; that is not a cluster, so it
    # must not be included in the cluster count.
    ks.append(len(np.unique(labels[labels != -1])))
    print(timings[-1],nmis[-1],ks[-1])

In [None]:
# Mean and standard deviation of runtime, NMI and cluster count over the OPTICS runs.
for series in (timings, nmis, ks):
    print(np.mean(series))
    print(np.std(series))

### P-DP-Means

In [None]:
def search_dpmeans(pars):
    """Black-box objective for DPMeans, evaluated on the test split.

    ``pars[0]`` is used as ``delta``; every other DPMeans argument keeps its
    default.

    Returns the negated NMI between the predicted labels and ``y_test``, so
    minimising this value maximises NMI.
    """
    clusterer = DPMeans(delta=pars[0])
    predicted = clusterer.fit_predict(X_test)
    return -nmi(predicted, y_test)

In [None]:
# Black-box minimisation of the DPMeans objective over delta.
best_params = bb.search_min(
    f=search_dpmeans,                # objective to minimise
    domain=[[1, 100.]],              # [low, high] for the single parameter: delta
    budget=100,                      # total number of objective calls available
    batch=16,                        # batch setting handed to the optimiser
    resfile='opt_res/pdpmeans.csv',  # results file handed to the optimiser
)

In [None]:
# Benchmark DPMeans on the training split with a hard-coded delta: four repeated
# fits, each recording wall-clock time, NMI against y_train and the number of
# distinct labels.
# NOTE(review): the other benchmarks in this notebook use three runs, and the
# search objective does not pass n_init=1 — confirm both are intentional.
best_params = np.array([1.14217654])
timings, nmis, ks = [], [], []
for trial in tqdm(range(4)):
    start = time()
    labels = DPMeans(delta=best_params[0], n_init=1).fit_predict(X_train)
    elapsed = time() - start
    score = nmi(labels, y_train)
    n_labels = len(np.unique(labels))
    timings.append(elapsed)
    nmis.append(score)
    ks.append(n_labels)
    print(elapsed, score, n_labels)

In [None]:
# Mean and standard deviation of runtime, NMI and label count over the DPMeans runs.
for series in (timings, nmis, ks):
    print(np.mean(series))
    print(np.std(series))

### MiniBatch PDC-DP-Means

In [None]:
def search_mbdpmeans(pars):
    """Black-box objective for MiniBatchDPMeans, evaluated on the test split.

    ``pars[0]`` is used as ``delta``; ``pars[1]`` is rounded to the nearest
    integer and used as ``batch_size``.

    Returns the negated NMI between the predicted labels and ``y_test``, so
    minimising this value maximises NMI.
    """
    batch_size = int(np.round(pars[1]))
    labels = MiniBatchDPMeans(delta=pars[0], batch_size=batch_size).fit_predict(X_test)
    return -nmi(labels, y_test)


# The objective was originally defined as a second `search_dpmeans`, silently
# replacing the DPMeans objective of the same name. It now has its own name;
# the old binding is kept so cells that call `search_dpmeans` after this point
# still reach the mini-batch objective, exactly as before.
search_dpmeans = search_mbdpmeans

In [None]:
# Black-box minimisation of the mini-batch DP-Means objective over (delta, batch_size).
best_params = bb.search_min(
    f=search_dpmeans,                  # objective to minimise
    domain=[[0.1, 20.], [10, 2000]],   # [low, high] per parameter: delta, batch_size
    budget=200,                        # total number of objective calls available
    batch=16,                          # batch setting handed to the optimiser
    resfile='opt_res/mbpdpmeans.csv',  # results file handed to the optimiser
)

In [None]:
# Hard-coded (delta, batch_size) candidate for MiniBatchDPMeans.
best_params = np.array([10.52243421, 10.00000611])

In [None]:
# Display the negated test-split NMI for the current best_params (lower is better).
search_dpmeans(best_params)

In [None]:
# Benchmark MiniBatchDPMeans on the training split with hard-coded
# (delta, batch_size): three repeated fits, each recording wall-clock time, NMI
# against y_train and the number of distinct labels.
# NOTE(review): these values differ from the candidate evaluated in an earlier
# cell — confirm which pair is the tuned setting.
best_params = np.array([1.53345112, 248.66581611])
timings, nmis, ks = [], [], []
for trial in tqdm(range(3)):
    start = time()
    clusterer = MiniBatchDPMeans(delta=best_params[0], batch_size=int(np.round(best_params[1])))
    labels = clusterer.fit_predict(X_train)
    elapsed = time() - start
    score = nmi(labels, y_train)
    n_labels = len(np.unique(labels))
    timings.append(elapsed)
    nmis.append(score)
    ks.append(n_labels)
    print(elapsed, score, n_labels)

In [None]:
# Mean and standard deviation of runtime, NMI and label count over the mini-batch DP-Means runs.
for series in (timings, nmis, ks):
    print(np.mean(series))
    print(np.std(series))