Importing libraries

In [5]:
import faiss
import numpy as np
import sklearn as sk
import joblib
from sklearn.cluster import Birch
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import DBSCAN
import hdbscan
from sklearn.cluster import AgglomerativeClustering

## Loading Data

The paths below are specific to the datasets used here; adapt them when working with different data.

In [6]:
# Read the FAISS index stored under "Validation/" and bind it to `index`.
index = faiss.read_index("Validation/validation_index.faiss")

In [11]:
# Alternative dataset: read the index stored under "validation_it/".
# NOTE: this rebinds `index`, replacing the one loaded from "Validation/".
index = faiss.read_index("validation_it/validation_index.faiss")

KeyboardInterrupt: 

In [7]:
# Load the key vectors saved under "Validation/" (indexed row-wise by later cells).
keys = joblib.load("Validation/rest_keys")

In [2]:
# Alternative dataset: load the key vectors saved under "validation_it/".
# NOTE: this rebinds `keys`, replacing the ones loaded from "Validation/".
keys = joblib.load("validation_it/rest_keys")

In [8]:
# Load the ids saved under "Validation/"; presumably one id per row of `keys` (verify against the data).
ids = joblib.load("Validation/rest__ids")

In [16]:
# Convert to a NumPy array: later cells compare `ids` elementwise (`ids == id`)
# and index it with index arrays (`ids[i]`), which a plain list does not support.
ids = np.array(ids)

In [3]:
# Alternative dataset: load the ids saved under "validation_it/".
# The result is wrapped in np.asarray because this cell rebinds `ids` after the
# cell that converts it to an array, and later cells need elementwise
# comparison (`ids == id`) and index-array lookup (`ids[i]`).
ids = np.asarray(joblib.load("validation_it/rest__ids"))

Before clustering, the dimensionality of the keys is first reduced with PCA.

In [9]:
# Reduce the keys to 20 principal components. The projection is fitted on the
# first 50,000 rows of `keys` only and then applied to every row.
pca_keys = PCA(n_components = 20).fit(keys[:50_000]).transform(keys)

In [None]:
# Optional: persist the PCA-reduced keys to the file "pca_keys" so they can be reloaded later.
joblib.dump(pca_keys, "pca_keys")

In [7]:
# Alternative to recomputing the PCA: load previously saved reduced keys.
# NOTE: this rebinds `pca_keys`; presumably the file holds the 20-dimensional
# projection of the "validation_it" keys (inferred from the file name - verify).
pca_keys = joblib.load("it_keys_pca_to_20.pkl")

## Clustering with the datastore as a whole

In [None]:
def from_labels_to_keys_and_ids(labels, keys, ids):
    """Collapse every cluster into one averaged key and one representative id.

    Parameters
    ----------
    labels : array-like of int
        Cluster label of each item. Must contain exactly one label per row of
        ``keys``; a column vector of shape (n, 1) is accepted and flattened.
    keys : array-like, shape (n, d)
        Key vector of each item.
    ids : array-like of non-negative int, shape (n,)
        Id of each item (``np.bincount`` requires non-negative integers).

    Returns
    -------
    new_keys : list of ndarray
        Mean key of each cluster, ordered by ascending cluster label.
    new_ids : list of int
        Most frequent id inside each cluster (the smallest id wins ties,
        because ``np.argmax`` returns the first maximum).
    """
    flat_labels = np.asarray(labels).reshape(-1)
    keys = np.asarray(keys)
    ids = np.asarray(ids)

    new_keys = []
    new_ids = []

    if flat_labels.size == 0:
        return new_keys, new_ids

    # Group the item positions by cluster label with a single stable sort.
    # The groups must come from the *labels*: the previous version looped over
    # the distinct values of `ids`, so clusters whose label did not happen to
    # equal some id were silently dropped.
    order = np.argsort(flat_labels, kind="stable")
    _, starts = np.unique(flat_labels[order], return_index=True)

    for members in np.split(order, starts[1:]):
        new_keys.append(np.mean(keys[members], axis=0))
        # Most frequent id among the members of this cluster.
        new_ids.append(np.argmax(np.bincount(ids[members])))

    return new_keys, new_ids

# If a clustering method assigns the label -1 to items that belong to no cluster (as HDBSCAN and DBSCAN do), use the following function instead.

def from_labels_to_keys_and_ids_with_minus_1(labels, keys, ids):
    """Collapse clusters into averaged keys while keeping noise items unchanged.

    Items labelled ``-1`` (the noise label used by HDBSCAN / DBSCAN) are not
    merged; they are appended as-is after the merged clusters.

    Parameters
    ----------
    labels : array-like of int
        Cluster label of each item, ``-1`` meaning "no cluster". Must contain
        exactly one label per row of ``keys``.
    keys : array-like, shape (n, d)
        Key vector of each item.
    ids : array-like of non-negative int, shape (n,)
        Id of each item (``np.bincount`` requires non-negative integers).

    Returns
    -------
    new_keys : list of ndarray
        Mean key of each real cluster (ascending label order), followed by the
        original keys of all noise items.
    new_ids : list of int
        Most frequent id of each real cluster, followed by the ids of all
        noise items, in the same order as ``new_keys``.
    """
    flat_labels = np.asarray(labels).reshape(-1)
    keys = np.asarray(keys)
    ids = np.asarray(ids)

    new_keys = []
    new_ids = []

    # Real clusters are selected by label value. The previous version looped
    # over np.unique(ids)[1:], which mixed up ids with labels and also dropped
    # the first real cluster whenever no -1 label was present.
    clustered = np.flatnonzero(flat_labels != -1)
    if clustered.size:
        order = clustered[np.argsort(flat_labels[clustered], kind="stable")]
        _, starts = np.unique(flat_labels[order], return_index=True)
        for members in np.split(order, starts[1:]):
            new_keys.append(np.mean(keys[members], axis=0))
            # Most frequent id among the members of this cluster.
            new_ids.append(np.argmax(np.bincount(ids[members])))

    # Noise items keep their own key and id.
    noise = np.flatnonzero(flat_labels == -1)
    new_ids.extend(ids[noise])
    new_keys.extend(keys[noise])

    return new_keys, new_ids

### HDBSCAN

In [3]:
def hdbscan_cluster(X, min_cluster_size=5, metric='euclidean'):
    """Cluster the rows of ``X`` with HDBSCAN and return one label per row.

    Parameters
    ----------
    X : array-like, shape (n, d)
        Vectors to cluster.
    min_cluster_size : int, default 5
        Passed through to ``hdbscan.HDBSCAN``.
    metric : str, default 'euclidean'
        Passed through to ``hdbscan.HDBSCAN``.

    Returns
    -------
    The result of ``HDBSCAN.fit_predict(X)``; downstream cells in this
    notebook treat the label -1 as "belongs to no cluster".
    """
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, metric=metric)
    return clusterer.fit_predict(X)

In [None]:
# Cluster the whole datastore at once, using the PCA-reduced keys as input.
hdbscan_labels = hdbscan_cluster(pca_keys)

In [None]:
# Merge every HDBSCAN cluster into one averaged key; noise items (-1) are kept as-is.
# The helper takes (labels, keys, ids): the call previously passed only the
# labels and therefore raised a TypeError. The original (non-PCA) keys are
# averaged, matching what the per-id merge helper in this notebook does.
new_keys, new_ids = from_labels_to_keys_and_ids_with_minus_1(
    hdbscan_labels, np.asarray(keys), np.asarray(ids)
)

6839

In [None]:
# Persist the reduced datastore. The two arrays go to *different* files:
# dumping both to the same path made the ids overwrite the keys.
# The file names are placeholders - replace them with the real output paths.
joblib.dump(new_keys, "hdbscan_global_keys.pkl")
joblib.dump(new_ids, "hdbscan_global_ids.pkl")

['Validation/validation_results/hdbscan_labels_commonvoice.pkl']

In [None]:
# Optional: drop the references to the large results of this section so that
# their memory can be reclaimed before the next experiment.
hdbscan_labels = None
new_keys = None
new_ids = None

### Mini-batch k-means

In [5]:
from sklearn.cluster import MiniBatchKMeans

In [None]:
def generate_large_centroid_pool(X, target_centroids=1_200_000, reps = 50):
    """Build a large pool of centroids from repeated MiniBatchKMeans runs.

    MiniBatchKMeans is fitted ``reps`` times on all of ``X``, each time with
    ``target_centroids // reps`` clusters and a different random seed, and the
    resulting cluster centres are stacked.

    Parameters
    ----------
    X : array-like, shape (n, d)
        Vectors to cluster.
    target_centroids : int, default 1_200_000
        Approximate total number of centroids to produce.
    reps : int, default 50
        Number of independent k-means runs.

    Returns
    -------
    ndarray, shape (reps * (target_centroids // reps), d)
        All cluster centres, run after run.

    Raises
    ------
    ValueError
        If ``target_centroids // reps`` is smaller than 1.
    """
    n_clusters_per_iter = target_centroids // reps
    if n_clusters_per_iter < 1:
        raise ValueError(
            "target_centroids must be at least as large as reps "
            f"(got target_centroids={target_centroids}, reps={reps})"
        )

    all_centroids = []

    for seed in range(reps):
        mbk = MiniBatchKMeans(
            n_clusters=n_clusters_per_iter,
            batch_size=512,
            n_init=1,
            random_state=seed,  # different seed each run, so each run yields different centres
            max_iter=10,
        )
        mbk.fit(X)
        all_centroids.append(mbk.cluster_centers_)

    return np.vstack(all_centroids)


In [None]:
def massive_clustering(X, final_clusters=1_200_000, chunk_size=100_000, local_clusters=1000):
    """Cluster ``X`` into many clusters via a pool of pre-computed centroids.

    A centroid pool is built with ``generate_large_centroid_pool`` (using its
    default settings), a final MiniBatchKMeans is fitted on that pool, and
    every row of ``X`` is then assigned to its nearest final centroid.

    Parameters
    ----------
    X : array-like, shape (n, d)
        Vectors to cluster.
    final_clusters : int, default 1_200_000
        Requested number of final clusters; capped at the size of the pool.
    chunk_size, local_clusters
        Not used by the implementation; kept so existing calls keep working.

    Returns
    -------
    ndarray, shape (n,)
        Cluster label of every row of ``X``.
    """
    all_centroids = generate_large_centroid_pool(X)
    print(len(all_centroids))

    # k-means cannot be fitted with more clusters than it has points.
    n_final = min(final_clusters, len(all_centroids))
    mbk_final = MiniBatchKMeans(n_clusters=n_final, batch_size=1000, random_state=42)
    mbk_final.fit(all_centroids)

    # Label the data itself. Returning the labels of the pooled centroids (as
    # before) yields an array that does not line up with the rows of X, so it
    # could not be used to merge keys and ids.
    return mbk_final.predict(X)

In [None]:
# Cluster the whole datastore at once, using the PCA-reduced keys as input.
massive_mbk_labels = massive_clustering(pca_keys)

In [None]:
# Merge every cluster into one averaged key with its most frequent id.
# The helper takes (labels, keys, ids): the call previously passed only the
# labels and therefore raised a TypeError.
new_keys, new_ids = from_labels_to_keys_and_ids(
    massive_mbk_labels, np.asarray(keys), np.asarray(ids)
)

In [None]:
# Persist the reduced datastore. The two arrays go to *different* files:
# dumping both to the same path made the ids overwrite the keys.
# The file names are placeholders - replace them with the real output paths.
joblib.dump(new_keys, "minibatch_kmeans_global_keys.pkl")
joblib.dump(new_ids, "minibatch_kmeans_global_ids.pkl")

In [None]:
# Optional: drop the references to the large results of this section so that
# their memory can be reclaimed. This section's label array is
# `massive_mbk_labels` (the cell used to reset `hdbscan_labels`, a leftover
# from the HDBSCAN section, and never released the labels created here).
massive_mbk_labels = None
new_keys = None
new_ids = None

### k-Means (with the faiss library)

In [8]:
# k-means with FAISS: train k = n/2 centroids on the PCA-reduced keys, then
# assign every key to its nearest centroid.
# FAISS operates on C-contiguous float32 data; np.ascontiguousarray makes no
# copy when `pca_keys` already has that layout.
pca_keys_f32 = np.ascontiguousarray(pca_keys, dtype=np.float32)
k = len(pca_keys_f32) // 2

# A dedicated name is used for the centroid index: binding it to `index` would
# replace the datastore index loaded at the top of the notebook, which the
# random-removal section still reads (`index.ntotal`).
centroid_index = faiss.IndexFlatL2(pca_keys_f32.shape[1])
kmeans = faiss.Clustering(pca_keys_f32.shape[1], k)
kmeans.train(pca_keys_f32, centroid_index)

# After training, `centroid_index` holds the centroids; the nearest-centroid
# search returns an (n, 1) array whose entries serve as cluster labels.
_, labels = centroid_index.search(pca_keys_f32, 1)

In [None]:
# Merge every cluster into one averaged key with its most frequent id.
# The helper takes (labels, keys, ids): the call previously passed only the
# labels and therefore raised a TypeError. `labels` comes from a 1-nearest
# search and has shape (n, 1), so it is flattened to one label per item.
new_keys, new_ids = from_labels_to_keys_and_ids(
    labels.ravel(), np.asarray(keys), np.asarray(ids)
)

In [None]:
# Persist the reduced datastore. The two arrays go to *different* files:
# dumping both to the same path made the ids overwrite the keys.
# The file names are placeholders - replace them with the real output paths.
joblib.dump(new_keys, "faiss_kmeans_global_keys.pkl")
joblib.dump(new_ids, "faiss_kmeans_global_ids.pkl")

['Validation/Faiss_thing_result_d2.pkl']

In [None]:
# Optional: drop the references to the large results of this section so that
# their memory can be reclaimed. This section's label array is `labels` (the
# cell used to reset `hdbscan_labels`, a leftover from the HDBSCAN section,
# and never released the labels created here).
labels = None
new_keys = None
new_ids = None

## Clustering within each id separately

In [10]:
def save_keys_and_ids_from_labels(indices_of_labels, labels, all_keys=None, all_ids=None):
    """Merge per-id clusters into mean keys and combine them with untouched items.

    Parameters
    ----------
    indices_of_labels : dict
        Maps an id to the positions (rows of the datastore) that were clustered
        for that id.
    labels : dict
        Maps the same ids to the cluster label of each of those positions
        (array or list, same length and order as ``indices_of_labels[id]``).
    all_keys : array-like, shape (n, d), optional
        Full datastore keys. Defaults to the notebook-level ``keys``.
    all_ids : array-like, shape (n,), optional
        Full datastore ids. Defaults to the notebook-level ``ids``.

    Returns
    -------
    all_new_keys : ndarray
        Keys of all positions that appear in no ``indices_of_labels`` entry
        (ascending position order), followed by one mean key per cluster.
    all_new_ids : ndarray
        The matching ids: original ids for the untouched positions, the dict
        key for every merged cluster.
    """
    # Fall back to the notebook globals, which is what existing calls rely on.
    if all_keys is None:
        all_keys = keys
    if all_ids is None:
        all_ids = ids
    all_keys = np.asarray(all_keys)
    all_ids = np.asarray(all_ids)

    merged_keys = []
    merged_ids = []
    # Marks every position that was handed to a clustering run.
    was_clustered = np.zeros(len(all_keys), dtype=bool)

    for group_id, group_labels in labels.items():
        group_positions = np.asarray(indices_of_labels[group_id], dtype=np.intp)
        # Labels may arrive as a plain list; without the conversion the
        # comparison below is a single scalar False and every mean becomes NaN.
        group_labels = np.asarray(group_labels)
        was_clustered[group_positions] = True

        group_keys = all_keys[group_positions]
        for label in np.unique(group_labels):
            merged_keys.append(np.mean(group_keys[group_labels == label], axis=0))
            merged_ids.append(group_id)

    untouched_keys = list(all_keys[~was_clustered])
    untouched_ids = list(all_ids[~was_clustered])

    all_new_keys = np.array(untouched_keys + merged_keys)
    all_new_ids = np.array(untouched_ids + merged_ids)

    return all_new_keys, all_new_ids

### DBSCAN

In [None]:
def dbscan_cluster(X, eps=1.5, min_samples=1):
    """Cluster the rows of ``X`` with DBSCAN and return one label per row.

    Parameters
    ----------
    X : array-like, shape (n, d)
        Vectors to cluster.
    eps : float, default 1.5
        Passed through to ``sklearn.cluster.DBSCAN``.
    min_samples : int, default 1
        Passed through to ``sklearn.cluster.DBSCAN``.

    Returns
    -------
    The result of ``DBSCAN.fit_predict(X)``.
    """
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    return dbscan.fit_predict(X)

In [None]:
# Run DBSCAN separately on the items of every id that has more than 50 items;
# smaller groups are not clustered and stay in the datastore unchanged.
all_different_ids = np.unique(ids)

indices_of_labels = dict()  # id -> positions of its items in the datastore
labels = dict()             # id -> cluster label of each of those positions

# The loop variable is not called `id`, so the builtin of that name is not
# shadowed for the rest of the notebook session.
for group_id in all_different_ids:
    indices = np.where(ids == group_id)[0]
    # Check the size first so that no sub-array is copied for skipped groups.
    if len(indices) > 50:
        sub = np.asarray(pca_keys[indices], dtype=np.float32)
        labels[group_id] = dbscan_cluster(sub)
        # Progress: number of items and number of clusters found for this id.
        print(len(labels[group_id]), len(np.unique(labels[group_id])))
        indices_of_labels[group_id] = indices
    

In [None]:
# Replace every per-id cluster by its mean key and append these to all items
# that were not part of any clustering run.
new_keys, new_ids = save_keys_and_ids_from_labels(indices_of_labels, labels)

In [None]:
# Persist the reduced datastore. The two arrays go to *different* files:
# dumping both to the same path made the ids overwrite the keys.
# The file names are placeholders - replace them with the real output paths.
joblib.dump(new_keys, "dbscan_per_id_keys.pkl")
joblib.dump(new_ids, "dbscan_per_id_ids.pkl")

# Optional: release this section's large objects. The per-id results live in
# `labels` and `indices_of_labels` (not in `hdbscan_labels`, which this cell
# used to reset instead).
labels = None
indices_of_labels = None
new_keys = None
new_ids = None

### Birch

In [11]:
# Distinct id values occurring in `ids` (np.unique returns them sorted).
all_different_ids = np.unique(ids)
# Preview: the first five distinct ids and how many distinct ids there are.
all_different_ids[:5], len(all_different_ids)

(array([0, 1, 6, 7, 8]), 5584)

In [12]:
def birch_cluster(X, threshold=0.3, branching_factor=25, max_clusters=20):
    """Cluster the rows of ``X`` with Birch and return one label per row.

    The number of clusters is a quarter of the number of rows, capped at
    ``max_clusters`` and never smaller than 1 (fewer than four rows used to
    request zero clusters).

    Parameters
    ----------
    X : array-like, shape (n, d)
        Vectors to cluster.
    threshold : float, default 0.3
        Passed through to ``sklearn.cluster.Birch``.
    branching_factor : int, default 25
        Passed through to ``sklearn.cluster.Birch``.
    max_clusters : int, default 20
        Upper bound for the number of clusters.

    Returns
    -------
    The result of ``Birch.fit_predict(X)``.
    """
    n_clusters = max(1, min(max_clusters, len(X) // 4))
    clusterer = Birch(threshold=threshold, branching_factor=branching_factor, n_clusters=n_clusters)
    return clusterer.fit_predict(X)

In [13]:
def birch_cluster_in_steps(X, batch_size=50, threshold=1, branching_factor=25):
    """Fit Birch incrementally on batches of ``X`` and label every row.

    The model is built with ``n_clusters=None`` and fed ``batch_size`` rows at
    a time through ``partial_fit``; afterwards all rows are labelled with
    ``predict``.

    Parameters
    ----------
    X : ndarray, shape (n, d)
        Vectors to cluster; converted to float32 before fitting.
    batch_size : int, default 50
        Number of rows passed to each ``partial_fit`` call.
    threshold : float, default 1
        Passed through to ``sklearn.cluster.Birch``.
    branching_factor : int, default 25
        Passed through to ``sklearn.cluster.Birch``.

    Returns
    -------
    The result of ``Birch.predict(X)`` on the float32 data.
    """
    X = X.astype(np.float32)
    clusterer = Birch(threshold=threshold, branching_factor=branching_factor, n_clusters=None)

    for start in range(0, len(X), batch_size):
        clusterer.partial_fit(X[start:start + batch_size])

    return clusterer.predict(X)

In [14]:
# Run incremental Birch separately on the items of every id that has more than
# 50 items; smaller groups are not clustered and stay in the datastore unchanged.
indices_of_labels = dict()  # id -> positions of its items in the datastore
labels = dict()             # id -> cluster label of each of those positions

# The loop variable is not called `id`, so the builtin of that name is not
# shadowed for the rest of the notebook session.
for group_id in all_different_ids:
    indices = np.where(ids == group_id)[0]
    # Check the size first so that no sub-array is copied for skipped groups.
    if len(indices) > 50:
        sub = np.asarray(pca_keys[indices], dtype=np.float32)
        labels[group_id] = birch_cluster_in_steps(sub)
        # Progress: number of items and number of clusters found for this id.
        print(len(labels[group_id]), len(np.unique(labels[group_id])))
        indices_of_labels[group_id] = indices
    

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



470 344
89 60
4970 4487
423 326
30351 15842
399 394
65 64
1689 1010
322 94
461 157
56 21
2666 756
1189 608
488 157
333 97
522 144
356 85
142 42
972 306
1207 240
325 85
656 218
1077 167
951 216
186 50
901 483
1243 768
496 99
1667 334
1494 350
1769 317
57 32
212 114
1299 345
559 203
2334 705
474 217
537 64
1061 189
106 41
1727 192
8794 5012
64 15
478 60
654 128
241 122
62 16
227 95
303 103
469 225
809 223
1275 636
216 66
67 29
925 221
827 176
323 50
113 29
507 241
587 192
246 116
625 106
3876 3217
506 405
232 19
236 117
1541 629
164 38
506 120
269 93
1178 375
93 3
1672 208
7896 5512
1553 814
241 29
370 189
1452 585
120 49
94 56
519 79
117 26
481 196
2733 2046
1288 574
590 193
63 34
214 21
1203 96
2468 1129
152 24
1882 282
602 205
70 15
294 74
3343 947
685 83
221 63
558 270
58 31
1850 1272
589 59
513 43
65 8
597 250
333 126
76 76
539 174
137 39
78 47
12967 10638
66 26
1421 586
2717 612
323 25
662 144
290 122
408 76
229 27
132 24
464 294
227 63
142 23
474 334
142 28
120 13
590 52
53 10
72 

In [17]:
# Replace every per-id cluster by its mean key and append these to all items
# that were not part of any clustering run.
new_keys, new_ids = save_keys_and_ids_from_labels(indices_of_labels, labels)

In [None]:
# Persist the reduced datastore. The two arrays go to *different* files:
# dumping both to the same path made the ids overwrite the keys.
# The file names are placeholders - replace them with the real output paths.
joblib.dump(new_keys, "birch_per_id_keys.pkl")
joblib.dump(new_ids, "birch_per_id_ids.pkl")

# Optional: release this section's large objects. The per-id results live in
# `labels` and `indices_of_labels` (not in `hdbscan_labels`, which this cell
# used to reset instead).
labels = None
indices_of_labels = None
new_keys = None
new_ids = None

### Mini Batch K-means

In [None]:
def cluster_minibatchkmeans(X, random_state=None):
    """Cluster the rows of ``X`` with MiniBatchKMeans into about n/3 clusters.

    Parameters
    ----------
    X : array-like, shape (n, d)
        Vectors to cluster.
    random_state : int or None, default None
        Passed through to ``MiniBatchKMeans``; set it for reproducible labels.

    Returns
    -------
    The result of ``MiniBatchKMeans.fit_predict(X)``.
    """
    data_size = len(X)
    # History: for the smaller dataset the batch size was int(data_size**0.4),
    # no reassignment ratio was set and max_no_improvement was 10.
    # Both values are kept at 1 or above: very small inputs used to request
    # zero clusters.
    n_clusters = max(1, int(data_size / 3))
    batch_size = max(1, min(1000, int(data_size**0.25)))
    clusterer = MiniBatchKMeans(init='k-means++', n_clusters=n_clusters,
                        batch_size=batch_size, n_init=1,
                        max_no_improvement=5, verbose=0, reassignment_ratio=0.01,
                        random_state=random_state)
    return clusterer.fit_predict(X)

In [None]:
# Run MiniBatchKMeans separately on the items of every id that has more than
# 50 items; smaller groups are not clustered and stay in the datastore unchanged.
indices_of_labels = dict()  # id -> positions of its items in the datastore
labels = dict()             # id -> cluster label of each of those positions

# The loop variable is not called `id`, so the builtin of that name is not
# shadowed for the rest of the notebook session.
for group_id in all_different_ids:
    indices = np.where(ids == group_id)[0]
    # Check the size first so that no sub-array is copied for skipped groups.
    if len(indices) > 50:
        sub = np.asarray(pca_keys[indices])
        print(group_id)  # progress: id currently being clustered
        labels[group_id] = cluster_minibatchkmeans(sub)
        indices_of_labels[group_id] = indices

In [None]:
# Replace every per-id cluster by its mean key and append these to all items
# that were not part of any clustering run.
new_keys, new_ids = save_keys_and_ids_from_labels(indices_of_labels, labels)

In [None]:
# Persist the reduced datastore. The two arrays go to *different* files:
# dumping both to the same path made the ids overwrite the keys.
# The file names are placeholders - replace them with the real output paths.
joblib.dump(new_keys, "minibatch_kmeans_per_id_keys.pkl")
joblib.dump(new_ids, "minibatch_kmeans_per_id_ids.pkl")

# Optional: release this section's large objects. The per-id results live in
# `labels` and `indices_of_labels` (not in `hdbscan_labels`, which this cell
# used to reset instead).
labels = None
indices_of_labels = None
new_keys = None
new_ids = None

### HDBSCAN

In [7]:
def hdbscan_cluster(X, min_cluster_size=5, metric='euclidean'):
    """Cluster the rows of ``X`` with HDBSCAN and return one label per row.

    This repeats the definition from the whole-datastore section so that the
    per-id section can be run on its own.

    Parameters
    ----------
    X : array-like, shape (n, d)
        Vectors to cluster.
    min_cluster_size : int, default 5
        Passed through to ``hdbscan.HDBSCAN``.
    metric : str, default 'euclidean'
        Passed through to ``hdbscan.HDBSCAN``.

    Returns
    -------
    The result of ``HDBSCAN.fit_predict(X)``; the per-id loop in this notebook
    treats the label -1 as "belongs to no cluster".
    """
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, metric=metric)
    return clusterer.fit_predict(X)

In [None]:
# Run HDBSCAN separately on the items of every id that has more than 50 items.
# Items labelled -1 (no cluster) are left out of both dictionaries, so the
# merge step keeps them in the datastore unchanged.
indices_of_labels = dict()  # id -> positions of its clustered items
labels = dict()             # id -> cluster label of each of those positions

# The loop variable is not called `id`, so the builtin of that name is not
# shadowed for the rest of the notebook session.
for group_id in all_different_ids:
    indices = np.where(ids == group_id)[0]
    # Check the size first so that no sub-array is copied for skipped groups.
    if len(indices) > 50:
        sub = np.asarray(pca_keys[indices])
        print(group_id)  # progress: id currently being clustered
        labels_hdbscan = np.asarray(hdbscan_cluster(sub))
        in_cluster = labels_hdbscan != -1
        # Stored as an array, not a list: the merge helper compares the labels
        # elementwise (`labels == label`), which a Python list does not do.
        labels[group_id] = labels_hdbscan[in_cluster]
        indices_of_labels[group_id] = indices[in_cluster]

In [None]:
# Replace every per-id cluster by its mean key and append these to all items
# that were not part of any clustering run (including HDBSCAN noise items).
new_keys, new_ids = save_keys_and_ids_from_labels(indices_of_labels, labels)

['Validation/validation_results/hdbscan_per_id_it_ids.pkl']

In [None]:
# Persist the reduced datastore. The two arrays go to *different* files:
# dumping both to the same path made the ids overwrite the keys.
# The file names are placeholders - replace them with the real output paths.
joblib.dump(new_keys, "hdbscan_per_id_keys.pkl")
joblib.dump(new_ids, "hdbscan_per_id_ids.pkl")

# Optional: release this section's large objects. The per-id results live in
# `labels` and `indices_of_labels`; `hdbscan_labels` belongs to the
# whole-datastore section and is reset as well in case it is still set.
hdbscan_labels = None
labels = None
indices_of_labels = None
new_keys = None
new_ids = None

### AgglomerativeClustering / Hierarchical

In [None]:
def Agglomerative_Clustering(X, linkage='ward'):
    """Cluster the rows of ``X`` hierarchically into about n/3 clusters.

    Parameters
    ----------
    X : array-like, shape (n, d)
        Vectors to cluster.
    linkage : str, default 'ward'
        Passed through to ``sklearn.cluster.AgglomerativeClustering``.

    Returns
    -------
    The result of ``AgglomerativeClustering.fit_predict(X)``.
    """
    # At least one cluster: fewer than three rows used to request zero clusters.
    n_clusters = max(1, int(len(X) / 3))
    agg = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
    return agg.fit_predict(X)

In [None]:
# Run agglomerative clustering separately on the items of every id that has
# more than 50 items; smaller groups stay in the datastore unchanged.
indices_of_labels = dict()  # id -> positions of its items in the datastore
labels = dict()             # id -> cluster label of each of those positions

# The loop variable is not called `id`, so the builtin of that name is not
# shadowed for the rest of the notebook session.
for group_id in all_different_ids:
    indices = np.where(ids == group_id)[0]
    # Check the size first so that no sub-array is copied for skipped groups.
    if len(indices) > 50:
        sub = np.asarray(pca_keys[indices])
        print(group_id)  # progress: id currently being clustered
        # Call the notebook helper `Agglomerative_Clustering`, which returns
        # labels. The sklearn class `AgglomerativeClustering(sub)` was called
        # here before, which only constructs an unfitted estimator.
        labels[group_id] = Agglomerative_Clustering(sub)
        indices_of_labels[group_id] = indices
    

In [None]:
# Replace every per-id cluster by its mean key and append these to all items
# that were not part of any clustering run.
new_keys, new_ids = save_keys_and_ids_from_labels(indices_of_labels, labels)

In [None]:
# Persist the reduced datastore. The two arrays go to *different* files:
# dumping both to the same path made the ids overwrite the keys.
# The file names are placeholders - replace them with the real output paths.
joblib.dump(new_keys, "agglomerative_per_id_keys.pkl")
joblib.dump(new_ids, "agglomerative_per_id_ids.pkl")

# Optional: release this section's large objects. The per-id results live in
# `labels` and `indices_of_labels` (not in `hdbscan_labels`, which this cell
# used to reset instead).
labels = None
indices_of_labels = None
new_keys = None
new_ids = None

### Randomly removing

In [None]:
import random

In [None]:
# Baseline: draw a random 33.3 % of the datastore positions without replacement.
# The sampled positions are the ones that are *kept* by the next cell
# (`keys[indices]`), so the count is named `num_keep`; it used to be called
# `num_delete`, which described the opposite.
# The size is taken from `keys`, the array that is actually indexed, because
# `index` is rebound by the faiss k-means cell earlier in the notebook.
total_in_index = len(keys)
num_keep = int(0.333 * total_in_index)
indices = np.array(random.sample(range(total_in_index), num_keep))

In [None]:
# Keep only the randomly sampled positions. `indices` is a NumPy index array,
# so `keys` and `ids` must be NumPy arrays for this lookup to work.
new_keys = keys[indices]
new_ids = ids[indices]

In [None]:
# Persist the reduced datastore. The two arrays go to *different* files:
# dumping both to the same path made the ids overwrite the keys.
# The file names are placeholders - replace them with the real output paths.
joblib.dump(new_keys, "random_subset_keys.pkl")
joblib.dump(new_ids, "random_subset_ids.pkl")

# Optional: release this section's large objects. This section creates
# `indices`, `new_keys` and `new_ids` (not `hdbscan_labels`, which this cell
# used to reset instead).
indices = None
new_keys = None
new_ids = None