## This is a smarter way to evaluate the crucial steps of the BERTopic model, avoiding to have to create a complete model for each configuration

In [1]:
# Import the sections and create manual embeddings

from bertopic import BERTopic
import statistics
import numpy as np
import json
from bertopic.backend import BaseEmbedder
from sentence_transformers import SentenceTransformer
from umap import UMAP

# First we import the files
our_flyers_path = 'flyers_text.json'

with open(our_flyers_path, 'r') as file:
    json_file = json.load(file)

# The list of sections to analyze is the list of values of each value of the original dictionary (json file)
sections_dict = {} # The dictionary is to avoid repeated elements
saw_sections = []
counter = -1
for document in json_file.values(): 
    counter += 1
    for section in document.values(): 
        if section not in saw_sections:
            sections_dict[section] = counter
            saw_sections.append(section)

sections = list(sections_dict.keys())
document_id_list = list(sections_dict.values())

print("Imported sections:", len(sections))

class CustomEmbedder(BaseEmbedder):
    def __init__(self, embedding_model):
        super().__init__()
        self.embedding_model = embedding_model

    def embed(self, documents, verbose=False):
        embeddings = self.embedding_model.encode(documents, show_progress_bar=verbose)
        return embeddings 
    
embedding_model = SentenceTransformer("distiluse-base-multilingual-cased-v1")
custom_embedder = CustomEmbedder(embedding_model=embedding_model)

print("Creating embeddings manually...")
manual_embeddings = custom_embedder.embed(sections, verbose=True)
print("Manual 384-dimensional embeddings created!")

2024-01-17 03:05:45.609510: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-01-17 03:05:45.731634: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-17 03:05:46.876266: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2024-01-17 03:05:46.876360

Imported sections: 11827
Creating embeddings manually...


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

Manual 384-dimensional embeddings created!


## Evaluation

In [2]:
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score, silhouette_score
from sklearn.metrics.cluster import adjusted_rand_score, adjusted_mutual_info_score, normalized_mutual_info_score
#from sklearn.cluster import HDBSCAN
from hdbscan import HDBSCAN
import time
import matplotlib.pyplot as plt
import numpy as np

def add_noise(embeddings, noise_sd=0.01): 
    
    noisy_embeddings = embeddings.copy()

    for index,embedding in enumerate(noisy_embeddings): 
        noise = np.random.normal(0,noise_sd, len(embedding))
        noisy_embeddings[index] += noise
        
    return noisy_embeddings

def geometric_evaluation(embeddings_, labels_): 

    # Distance metrics
    DB_score = davies_bouldin_score(embeddings_, labels_) # The lower the better
    CH_score = calinski_harabasz_score(embeddings_, labels_) # The higher the better
    S_score = silhouette_score(embeddings_, labels_, metric="cosine") # The higher the better
    
    return DB_score, CH_score, S_score

def robustness_evaluation(labels, noisy_labels): 

    # Robustness metrics
    AR_score = adjusted_rand_score(labels,noisy_labels) # The higher the better
    AMI_score = adjusted_mutual_info_score(labels,noisy_labels) # The higher the better
    NMI_score = normalized_mutual_info_score(labels,noisy_labels) # The higher the better
    
    return AR_score, AMI_score, NMI_score

def semantic_evaluation(labels):
    
    # Create sublists with the labels assigned to sections in the same document (including outliers)
    prev = 0
    doc_labels = []
    docs_labels_list = []
    for index,element in enumerate(document_id_list): 
        if element == prev:
            doc_labels.append(labels[index])
        else: 
            docs_labels_list.append(doc_labels)
            doc_labels = []
            doc_labels.append(labels[index])
        prev = element
    docs_labels_list.append(doc_labels)

    # Take the cluster that is the most frequent and see how frequent it is in the list
    scores_list = []
    for doc in docs_labels_list: # Each doc is a lists with the labels of its sections
        # Find the most repeated label in the document
        most_freq_label = max(set(doc), key=doc.count)
        # Find how many time does that label appear and compare it with the total quantity of labels (sections) of the doc
        label_repetitions = doc.count(most_freq_label)
        score = (label_repetitions/len(doc))*100
        scores_list.append(score)

    # Compute the total score weighting the score of each doc with its number of sections
    sum_of_scores = 0
    for index,doc in enumerate(docs_labels_list): 
        sum_of_scores += scores_list[index]*len(doc)

    weighted_score = sum_of_scores/len(labels)

    '''
    This metric as it is is not very intuitive, because it is upper bounded by 84.68% (because we discard repeated sections) and
    lower bounded by the score that we would get if no repeated section would be in any document (11.16%) (REALLYYYYY??????)

    Then, we should rescale it!
    '''

    return weighted_score

initial_time = time.time()

iters = [1,2,3,4]

for it in iters:

    dimension_of_embeddings = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
    min_cluster_sizes = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
    counter = 0

    list_of_lists_of_number_of_clusters = []
    list_of_lists_of_outliers = []
    list_of_lists_of_DB_scores = []
    list_of_lists_of_CH_scores = []
    list_of_lists_of_S_scores = []
    list_of_lists_of_AR_scores = []
    list_of_lists_of_AMI_scores = []
    list_of_lists_of_NMI_scores = []
    list_of_lists_of_sem_scores = []

    for i in dimension_of_embeddings: 

        list_of_number_of_clusters = []
        list_of_outliers = []
        list_of_DB_scores = []
        list_of_CH_scores = []
        list_of_S_scores = []
        list_of_AR_scores = []
        list_of_AMI_scores = []
        list_of_NMI_scores = []
        list_of_sem_scores = []

        for j in min_cluster_sizes:

            counter += 1
            if counter == 1: 
                print("Starting iteration:", it - iters[0])
            if counter % 5 == 0:
                print(f"Evaluating configation {counter} of {len(dimension_of_embeddings)*len(min_cluster_sizes)}")
            #print("Executing reduction of dimensionality by UMAP model...")
            umap_model = UMAP(n_neighbors=15, n_components=i, min_dist=0.0, metric='cosine')
            embeddings = umap_model.fit_transform(manual_embeddings)
            #print("UMAP embeddings created!")

            #print("Generating noisy embeddings...")
            noisy_embeddings = add_noise(embeddings, noise_sd=0.05)

            #print("Executing HDBSCAN...")
            hdbscan_model = HDBSCAN(min_cluster_size=j, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
            #hdbscan_model = HDBSCAN(min_cluster_size=j, metric='cosine', cluster_selection_method='eom')
            labels = hdbscan_model.fit_predict(embeddings)
            number_of_clusters = len(set(labels))

            noisy_labels = hdbscan_model.fit_predict(noisy_embeddings)
            #print("Clusters computed by HDBSCAN!")

            #print("Remove outliers before applying geometrical evaluation")
            labels_ = []
            embeddings_ = []
            noisy_labels_ = []
            outliers_counter = 0
            for index,element in enumerate(labels): 
                if element != -1: 
                    labels_.append(element)
                    embeddings_.append(embeddings[index])
                    noisy_labels_.append(noisy_labels[index]) # ATTENTION: I DON'T REMEMBER THE OUTLIERS IN THE NOISY_LABELS, 
                    # BUT THOSE POSITIONS WHERE OUTLIERS WHERE SPOTTED IN THE HEALTHY LABELS! IF CLUSTERS ARE ROBUST, THEY SHOULD BE THE SAME!!!!!
                else: 
                    outliers_counter += 1


            #print("Computing geometric evaluation...")
            DB_score, CH_score, S_score = geometric_evaluation(embeddings_, labels_)

            #print("Computing robustness evaluation...")
            AR_score, AMI_score, NMI_score = robustness_evaluation(labels_, noisy_labels_)

            #print("Computing semantic evaluation...")
            semantic_score = semantic_evaluation(labels)

            list_of_number_of_clusters.append(number_of_clusters)
            list_of_outliers.append(outliers_counter)
            list_of_DB_scores.append(DB_score)
            list_of_CH_scores.append(CH_score)
            list_of_S_scores.append(S_score)
            list_of_AR_scores.append(AR_score)
            list_of_AMI_scores.append(AMI_score)
            list_of_NMI_scores.append(NMI_score)
            list_of_sem_scores.append(semantic_score)

        list_of_lists_of_number_of_clusters.append(list_of_number_of_clusters)
        list_of_lists_of_outliers.append(list_of_outliers)
        list_of_lists_of_DB_scores.append(list_of_DB_scores)
        list_of_lists_of_CH_scores.append(list_of_CH_scores)
        list_of_lists_of_S_scores.append(list_of_S_scores)
        list_of_lists_of_AR_scores.append(list_of_AR_scores)
        list_of_lists_of_AMI_scores.append(list_of_AMI_scores)
        list_of_lists_of_NMI_scores.append(list_of_NMI_scores)
        list_of_lists_of_sem_scores.append(list_of_sem_scores)
        
    list_of_lists_of_number_of_clusters_np = np.array(list_of_lists_of_number_of_clusters)
    np.savetxt(f'scoring/list_{it}_of_lists_of_number_of_clusters.csv', list_of_lists_of_number_of_clusters_np, delimiter=',')
    list_of_lists_of_outliers_np = np.array(list_of_lists_of_outliers)
    np.savetxt(f'scoring/list_{it}_of_lists_of_outliers.csv', list_of_lists_of_outliers, delimiter=',')
    list_of_lists_of_DB_scores_np = np.array(list_of_lists_of_DB_scores)
    np.savetxt(f'scoring/list_{it}_of_lists_of_DB_scores_np.csv', list_of_lists_of_DB_scores_np, delimiter=',')
    list_of_lists_of_CH_scores_np = np.array(list_of_lists_of_CH_scores)
    np.savetxt(f'scoring/list_{it}_of_lists_of_CH_scores_np.csv', list_of_lists_of_CH_scores_np, delimiter=',')
    list_of_lists_of_S_scores_np = np.array(list_of_lists_of_S_scores)
    np.savetxt(f'scoring/list_{it}_of_lists_of_S_scores_np.csv', list_of_lists_of_S_scores_np, delimiter=',')
    list_of_lists_of_AR_scores_np = np.array(list_of_lists_of_AR_scores)
    np.savetxt(f'scoring/list_{it}_of_lists_of_AR_scores_np.csv', list_of_lists_of_AR_scores_np, delimiter=',')
    list_of_lists_of_AMI_scores_np = np.array(list_of_lists_of_AMI_scores)
    np.savetxt(f'scoring/list_{it}_of_lists_of_AMI_scores_np.csv', list_of_lists_of_AMI_scores_np, delimiter=',')
    list_of_lists_of_NMI_scores_np = np.array(list_of_lists_of_NMI_scores)
    np.savetxt(f'scoring/list_{it}_of_lists_of_NMI_scores_np.csv', list_of_lists_of_NMI_scores_np, delimiter=',')
    list_of_lists_of_sem_scores_np = np.array(list_of_lists_of_sem_scores)
    np.savetxt(f'scoring/list_{it}_of_lists_of_sem_scores_np.csv', list_of_lists_of_sem_scores_np, delimiter=',')
    
    list_of_lists_of_number_of_clusters_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_number_of_clusters.csv', delimiter=',')
    list_of_lists_of_outliers_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_outliers.csv', delimiter=',')
    list_of_lists_of_DB_scores_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_DB_scores_np.csv', delimiter=',')
    list_of_lists_of_CH_scores_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_CH_scores_np.csv', delimiter=',')
    list_of_lists_of_S_scores_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_S_scores_np.csv', delimiter=',')
    list_of_lists_of_AR_scores_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_AR_scores_np.csv', delimiter=',')
    list_of_lists_of_AMI_scores_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_AMI_scores_np.csv', delimiter=',')
    list_of_lists_of_NMI_scores_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_NMI_scores_np.csv', delimiter=',')
    list_of_lists_of_sem_scores_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_sem_scores_np.csv', delimiter=',')
    
    arrays_list = [list_of_lists_of_number_of_clusters_np, list_of_lists_of_outliers_np, list_of_lists_of_DB_scores_np, \
                   list_of_lists_of_CH_scores_np, list_of_lists_of_S_scores_np, \
                  list_of_lists_of_AR_scores_np, list_of_lists_of_AMI_scores_np, list_of_lists_of_NMI_scores_np, list_of_lists_of_sem_scores_np]
    titles_list = [f'Clusters {it}', f'Outliers {it}', f'DB {it} scores (geometry)', f'CH {it} scores (geometry)', f'S {it} score (geometry)', \
                   f'AR {it} score (robustness)', f'AMI {it} scores (robustness)', f'NMI {it} scores (robustness)', f'Semantic {it} scores']

    for index,array in enumerate(arrays_list): 
        plt.imshow(np.flipud(array), cmap='coolwarm') 
        plt.colorbar() 
        plt.title(titles_list[index]) 
        plt.xticks(np.arange(array.shape[1]), [5, 10, 15, 20, 25, 30, 35, 40, 45, 50])
        plt.yticks(np.arange(array.shape[0]), [50, 45, 40, 35, 30, 25, 20, 15, 10, 5])
        plt.xlabel('Minimum cluster size') 
        plt.ylabel('Embeddings dimensionality')
        plt.savefig("images/"+titles_list[index]+'.png')
        plt.close()
    
final_time = time.time()
used_time = final_time - initial_time
print("Used time:", used_time/3600, "hours.")

Starting iteration: 0
Evaluating configation 5 of 100
Evaluating configation 10 of 100
Evaluating configation 15 of 100
Evaluating configation 20 of 100
Evaluating configation 25 of 100
Evaluating configation 30 of 100
Evaluating configation 35 of 100
Evaluating configation 40 of 100
Evaluating configation 45 of 100
Evaluating configation 50 of 100
Evaluating configation 55 of 100
Evaluating configation 60 of 100
Evaluating configation 65 of 100
Evaluating configation 70 of 100
Evaluating configation 75 of 100
Evaluating configation 80 of 100
Evaluating configation 85 of 100
Evaluating configation 90 of 100
Evaluating configation 95 of 100
Evaluating configation 100 of 100
Starting iteration: 1
Evaluating configation 5 of 100
Evaluating configation 10 of 100
Evaluating configation 15 of 100
Evaluating configation 20 of 100
Evaluating configation 25 of 100
Evaluating configation 30 of 100
Evaluating configation 35 of 100
Evaluating configation 40 of 100
Evaluating configation 45 of 100


## RESULTS VISUALIZATION

In [15]:
import matplotlib.pyplot as plt
import numpy as np

it = 10

list_of_lists_of_number_of_clusters_np = np.array(list_of_lists_of_number_of_clusters)
np.savetxt(f'scoring/list_{it}_of_lists_of_number_of_clusters.csv', list_of_lists_of_number_of_clusters_np, delimiter=',')
list_of_lists_of_outliers_np = np.array(list_of_lists_of_outliers)
np.savetxt(f'scoring/list_{it}_of_lists_of_outliers.csv', list_of_lists_of_outliers, delimiter=',')
list_of_lists_of_DB_scores_np = np.array(list_of_lists_of_DB_scores)
np.savetxt(f'scoring/list_{it}_of_lists_of_DB_scores_np.csv', list_of_lists_of_DB_scores_np, delimiter=',')
list_of_lists_of_CH_scores_np = np.array(list_of_lists_of_CH_scores)
np.savetxt(f'scoring/list_{it}_of_lists_of_CH_scores_np.csv', list_of_lists_of_CH_scores_np, delimiter=',')
list_of_lists_of_S_scores_np = np.array(list_of_lists_of_S_scores)
np.savetxt(f'scoring/list_{it}_of_lists_of_S_scores_np.csv', list_of_lists_of_S_scores_np, delimiter=',')
list_of_lists_of_AR_scores_np = np.array(list_of_lists_of_AR_scores)
np.savetxt(f'scoring/list_{it}_of_lists_of_AR_scores_np.csv', list_of_lists_of_AR_scores_np, delimiter=',')
list_of_lists_of_AMI_scores_np = np.array(list_of_lists_of_AMI_scores)
np.savetxt(f'scoring/list_{it}_of_lists_of_AMI_scores_np.csv', list_of_lists_of_AMI_scores_np, delimiter=',')
list_of_lists_of_NMI_scores_np = np.array(list_of_lists_of_NMI_scores)
np.savetxt(f'scoring/list_{it}_of_lists_of_NMI_scores_np.csv', list_of_lists_of_NMI_scores_np, delimiter=',')
list_of_lists_of_sem_scores_np = np.array(list_of_lists_of_sem_scores)
np.savetxt(f'scoring/list_{it}_of_lists_of_sem_scores_np.csv', list_of_lists_of_sem_scores_np, delimiter=',')

In [16]:
list_of_lists_of_number_of_clusters_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_number_of_clusters.csv', delimiter=',')
list_of_lists_of_outliers_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_outliers.csv', delimiter=',')
list_of_lists_of_DB_scores_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_DB_scores_np.csv', delimiter=',')
list_of_lists_of_CH_scores_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_CH_scores_np.csv', delimiter=',')
list_of_lists_of_S_scores_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_S_scores_np.csv', delimiter=',')
list_of_lists_of_AR_scores_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_AR_scores_np.csv', delimiter=',')
list_of_lists_of_AMI_scores_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_AMI_scores_np.csv', delimiter=',')
list_of_lists_of_NMI_scores_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_NMI_scores_np.csv', delimiter=',')
list_of_lists_of_sem_scores_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_sem_scores_np.csv', delimiter=',')
list_of_lists_of_number_of_clusters_np

array([[3., 3., 3., 3., 3., 3., 3., 3., 3., 3.],
       [3., 3., 3., 3., 3., 3., 3., 3., 3., 3.],
       [3., 3., 3., 3., 3., 3., 3., 3., 3., 3.],
       [3., 3., 3., 3., 3., 3., 3., 3., 3., 3.],
       [3., 3., 3., 3., 3., 3., 3., 3., 3., 3.],
       [3., 3., 3., 3., 3., 3., 3., 3., 3., 3.],
       [3., 3., 3., 3., 3., 3., 3., 3., 3., 3.],
       [3., 3., 3., 3., 3., 3., 3., 3., 3., 3.],
       [3., 3., 3., 3., 3., 3., 3., 3., 3., 3.],
       [3., 3., 3., 3., 3., 3., 3., 3., 3., 3.]])

In [13]:
# DB scores
list_of_lists_of_DB_scores_np_ = np.flipud(list_of_lists_of_number_of_clusters_np)
list_of_lists_of_DB_scores_np_
'''plt.imshow(list_of_lists_of_DB_scores_np_, cmap='coolwarm') 
plt.colorbar() 
plt.title('DB scores (geometry)') 
plt.xticks(np.arange(list_of_lists_of_DB_scores_np.shape[1]), [5, 10, 15, 20, 25, 30, 35, 40, 45, 50])
plt.yticks(np.arange(list_of_lists_of_DB_scores_np.shape[0]), [50, 45, 40, 35, 30, 25, 20, 15, 10, 5])
plt.xlabel('Minimum cluster size') 
plt.ylabel('Embeddings dimensionality')
plt.show()'''

"plt.imshow(list_of_lists_of_DB_scores_np_, cmap='coolwarm') \nplt.colorbar() \nplt.title('DB scores (geometry)') \nplt.xticks(np.arange(list_of_lists_of_DB_scores_np.shape[1]), [5, 10, 15, 20, 25, 30, 35, 40, 45, 50])\nplt.yticks(np.arange(list_of_lists_of_DB_scores_np.shape[0]), [50, 45, 40, 35, 30, 25, 20, 15, 10, 5])\nplt.xlabel('Minimum cluster size') \nplt.ylabel('Embeddings dimensionality')\nplt.show()"

In [29]:
arrays_list = [list_of_lists_of_number_of_clusters_np, list_of_lists_of_outliers_np, list_of_lists_of_DB_scores_np, \
               list_of_lists_of_CH_scores_np, list_of_lists_of_S_scores_np, \
              list_of_lists_of_AR_scores_np, list_of_lists_of_AMI_scores_np, list_of_lists_of_NMI_scores_np, list_of_lists_of_sem_scores_np]
titles_list = [f'Clusters {it}', f'Outliers {it}', f'DB {it} scores (geometry)', f'CH {it} scores (geometry)', f'S {it} score (geometry)', \
               f'AR {it} score (robustness)', f'AMI {it} scores (robustness)', f'NMI {it} scores (robustness)', f'Semantic {it} scores']

for index,array in enumerate(arrays_list): 
    plt.imshow(np.flipud(array), cmap='coolwarm') 
    plt.colorbar() 
    plt.title(titles_list[index]) 
    plt.xticks(np.arange(array.shape[1]), [5, 10, 15, 20, 25, 30, 35, 40, 45, 50])
    plt.yticks(np.arange(array.shape[0]), [50, 45, 40, 35, 30, 25, 20, 15, 10, 5])
    plt.xlabel('Minimum cluster size') 
    plt.ylabel('Embeddings dimensionality')
    plt.savefig("images/"+titles_list[index]+'.png')
    plt.close()

## A variant of the previous evaluation strategy to test a concrete model multiple times and get its results

In [17]:
# Import the sections and create manual embeddings




    


Imported sections: 11827
Creating embeddings manually...


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

Manual 384-dimensional embeddings created!


In [4]:
print("Importing libraries...")
from bertopic import BERTopic
import statistics
import numpy as np
import json
from bertopic.backend import BaseEmbedder
from sentence_transformers import SentenceTransformer
from umap import UMAP
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score, silhouette_score
from sklearn.metrics.cluster import adjusted_rand_score, adjusted_mutual_info_score, normalized_mutual_info_score
#from sklearn.cluster import HDBSCAN
from hdbscan import HDBSCAN
import time
import matplotlib.pyplot as plt
import numpy as np

def add_noise(embeddings, noise_sd=0.01): 
    
    noisy_embeddings = embeddings.copy()

    for index,embedding in enumerate(noisy_embeddings): 
        noise = np.random.normal(0,noise_sd, len(embedding))
        noisy_embeddings[index] += noise
        
    return noisy_embeddings

def geometric_evaluation(embeddings_, labels_): 

    # Distance metrics
    DB_score = davies_bouldin_score(embeddings_, labels_) # The lower the better
    CH_score = calinski_harabasz_score(embeddings_, labels_) # The higher the better
    S_score = silhouette_score(embeddings_, labels_, metric="cosine") # The higher the better
    
    return DB_score, CH_score, S_score

def robustness_evaluation(labels, noisy_labels): 

    # Robustness metrics
    AR_score = adjusted_rand_score(labels,noisy_labels) # The higher the better
    AMI_score = adjusted_mutual_info_score(labels,noisy_labels) # The higher the better
    NMI_score = normalized_mutual_info_score(labels,noisy_labels) # The higher the better
    
    return AR_score, AMI_score, NMI_score

def semantic_evaluation(labels):
    
    # Create sublists with the labels assigned to sections in the same document (including outliers)
    prev = 0
    doc_labels = []
    docs_labels_list = []
    for index,element in enumerate(document_id_list): 
        if element == prev:
            doc_labels.append(labels[index])
        else: 
            docs_labels_list.append(doc_labels)
            doc_labels = []
            doc_labels.append(labels[index])
        prev = element
    docs_labels_list.append(doc_labels)

    # Take the cluster that is the most frequent and see how frequent it is in the list
    scores_list = []
    for doc in docs_labels_list: # Each doc is a lists with the labels of its sections
        # Find the most repeated label in the document
        most_freq_label = max(set(doc), key=doc.count)
        # Find how many time does that label appear and compare it with the total quantity of labels (sections) of the doc
        label_repetitions = doc.count(most_freq_label)
        score = (label_repetitions/len(doc))*100
        scores_list.append(score)

    # Compute the total score weighting the score of each doc with its number of sections
    sum_of_scores = 0
    for index,doc in enumerate(docs_labels_list): 
        sum_of_scores += scores_list[index]*len(doc)

    weighted_score = sum_of_scores/len(labels)

    '''
    This metric as it is is not very intuitive, because it is upper bounded by 84.68% (because we discard repeated sections) and
    lower bounded by the score that we would get if no repeated section would be in any document (11.16%) (REALLYYYYY??????)

    Then, we should rescale it!
    '''

    return weighted_score

initial_time = time.time()

# First we import the files
our_flyers_path = 'flyers_text.json'

print("Importing sections...")
with open(our_flyers_path, 'r') as file:
    json_file = json.load(file)

# The list of sections to analyze is the list of values of each value of the original dictionary (json file)
sections_dict = {} # The dictionary is to avoid repeated elements
saw_sections = []
counter = -1
for document in json_file.values(): 
    counter += 1
    for section in document.values(): 
        if section not in saw_sections:
            sections_dict[section] = counter
            saw_sections.append(section)

sections = list(sections_dict.keys())
document_id_list = list(sections_dict.values())

print("Imported sections:", len(sections))

class CustomEmbedder(BaseEmbedder):
    def __init__(self, embedding_model):
        super().__init__()
        self.embedding_model = embedding_model

    def embed(self, documents, verbose=False):
        embeddings = self.embedding_model.encode(documents, show_progress_bar=verbose)
        return embeddings 

iters = 5

for it in range(iters):
    
    embedding_model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
    custom_embedder = CustomEmbedder(embedding_model=embedding_model)

    print(f"Creating embeddings for iteration {it+1}")
    manual_embeddings = custom_embedder.embed(sections, verbose=True)
    print("Starting evaluation...")

    dimension_of_embeddings = [10]
    min_cluster_sizes = [15]
    counter = 0

    list_of_lists_of_number_of_clusters = []
    list_of_lists_of_outliers = []
    list_of_lists_of_DB_scores = []
    list_of_lists_of_CH_scores = []
    list_of_lists_of_S_scores = []
    list_of_lists_of_AR_scores = []
    list_of_lists_of_AMI_scores = []
    list_of_lists_of_NMI_scores = []
    list_of_lists_of_sem_scores = []

    for i in dimension_of_embeddings: 

        list_of_number_of_clusters = []
        list_of_outliers = []
        list_of_DB_scores = []
        list_of_CH_scores = []
        list_of_S_scores = []
        list_of_AR_scores = []
        list_of_AMI_scores = []
        list_of_NMI_scores = []
        list_of_sem_scores = []

        for j in min_cluster_sizes:

            counter += 1
            if counter == 1: 
                #print("Starting iteration:", it - iters[0])
                pass
            if counter % 5 == 0:
                #print(f"Evaluating configation {counter} of {len(dimension_of_embeddings)*len(min_cluster_sizes)}")
                pass
            #print("Executing reduction of dimensionality by UMAP model...")
            umap_model = UMAP(n_neighbors=15, n_components=i, min_dist=0.0, metric='cosine')
            embeddings = umap_model.fit_transform(manual_embeddings)
            #print("UMAP embeddings created!")

            #print("Generating noisy embeddings...")
            noisy_embeddings = add_noise(embeddings, noise_sd=0.05)

            #print("Executing HDBSCAN...")
            hdbscan_model = HDBSCAN(min_cluster_size=j, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
            #hdbscan_model = HDBSCAN(min_cluster_size=j, metric='cosine', cluster_selection_method='eom')
            labels = hdbscan_model.fit_predict(embeddings)
            number_of_clusters = len(set(labels))

            noisy_labels = hdbscan_model.fit_predict(noisy_embeddings)
            #print("Clusters computed by HDBSCAN!")

            #print("Remove outliers before applying geometrical evaluation")
            labels_ = []
            embeddings_ = []
            noisy_labels_ = []
            outliers_counter = 0
            for index,element in enumerate(labels): 
                if element != -1: 
                    labels_.append(element)
                    embeddings_.append(embeddings[index])
                    noisy_labels_.append(noisy_labels[index]) # ATTENTION: I DON'T REMEMBER THE OUTLIERS IN THE NOISY_LABELS, 
                    # BUT THOSE POSITIONS WHERE OUTLIERS WHERE SPOTTED IN THE HEALTHY LABELS! IF CLUSTERS ARE ROBUST, THEY SHOULD BE THE SAME!!!!!
                else: 
                    outliers_counter += 1


            #print("Computing geometric evaluation...")
            DB_score, CH_score, S_score = geometric_evaluation(embeddings_, labels_)

            #print("Computing robustness evaluation...")
            AR_score, AMI_score, NMI_score = robustness_evaluation(labels_, noisy_labels_)

            #print("Computing semantic evaluation...")
            semantic_score = semantic_evaluation(labels)

            list_of_number_of_clusters.append(number_of_clusters)
            list_of_outliers.append(outliers_counter)
            list_of_DB_scores.append(DB_score)
            list_of_CH_scores.append(CH_score)
            list_of_S_scores.append(S_score)
            list_of_AR_scores.append(AR_score)
            list_of_AMI_scores.append(AMI_score)
            list_of_NMI_scores.append(NMI_score)
            list_of_sem_scores.append(semantic_score)

        list_of_lists_of_number_of_clusters.append(list_of_number_of_clusters[0])
        list_of_lists_of_outliers.append(list_of_outliers[0])
        list_of_lists_of_DB_scores.append(list_of_DB_scores[0])
        list_of_lists_of_CH_scores.append(list_of_CH_scores[0])
        list_of_lists_of_S_scores.append(list_of_S_scores[0])
        list_of_lists_of_AR_scores.append(list_of_AR_scores[0])
        list_of_lists_of_AMI_scores.append(list_of_AMI_scores[0])
        list_of_lists_of_NMI_scores.append(list_of_NMI_scores[0])
        list_of_lists_of_sem_scores.append(list_of_sem_scores[0])
        
list_of_lists_of_number_of_clusters_np = np.array(list_of_lists_of_number_of_clusters)
#np.savetxt(f'scoring/list_{it}_of_lists_of_number_of_clusters.csv', list_of_lists_of_number_of_clusters_np, delimiter=',')
list_of_lists_of_outliers_np = np.array(list_of_lists_of_outliers)
#np.savetxt(f'scoring/list_{it}_of_lists_of_outliers.csv', list_of_lists_of_outliers, delimiter=',')
list_of_lists_of_DB_scores_np = np.array(list_of_lists_of_DB_scores)
#np.savetxt(f'scoring/list_{it}_of_lists_of_DB_scores_np.csv', list_of_lists_of_DB_scores_np, delimiter=',')
list_of_lists_of_CH_scores_np = np.array(list_of_lists_of_CH_scores)
#np.savetxt(f'scoring/list_{it}_of_lists_of_CH_scores_np.csv', list_of_lists_of_CH_scores_np, delimiter=',')
list_of_lists_of_S_scores_np = np.array(list_of_lists_of_S_scores)
#np.savetxt(f'scoring/list_{it}_of_lists_of_S_scores_np.csv', list_of_lists_of_S_scores_np, delimiter=',')
list_of_lists_of_AR_scores_np = np.array(list_of_lists_of_AR_scores)
#np.savetxt(f'scoring/list_{it}_of_lists_of_AR_scores_np.csv', list_of_lists_of_AR_scores_np, delimiter=',')
list_of_lists_of_AMI_scores_np = np.array(list_of_lists_of_AMI_scores)
#np.savetxt(f'scoring/list_{it}_of_lists_of_AMI_scores_np.csv', list_of_lists_of_AMI_scores_np, delimiter=',')
list_of_lists_of_NMI_scores_np = np.array(list_of_lists_of_NMI_scores)
#np.savetxt(f'scoring/list_{it}_of_lists_of_NMI_scores_np.csv', list_of_lists_of_NMI_scores_np, delimiter=',')
list_of_lists_of_sem_scores_np = np.array(list_of_lists_of_sem_scores)
#np.savetxt(f'scoring/list_{it}_of_lists_of_sem_scores_np.csv', list_of_lists_of_sem_scores_np, delimiter=',')
    
'''list_of_lists_of_number_of_clusters_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_number_of_clusters.csv', delimiter=',')
list_of_lists_of_outliers_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_outliers.csv', delimiter=',')
list_of_lists_of_DB_scores_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_DB_scores_np.csv', delimiter=',')
list_of_lists_of_CH_scores_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_CH_scores_np.csv', delimiter=',')
list_of_lists_of_S_scores_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_S_scores_np.csv', delimiter=',')
list_of_lists_of_AR_scores_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_AR_scores_np.csv', delimiter=',')
list_of_lists_of_AMI_scores_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_AMI_scores_np.csv', delimiter=',')
list_of_lists_of_NMI_scores_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_NMI_scores_np.csv', delimiter=',')
list_of_lists_of_sem_scores_np = np.genfromtxt(f'scoring/list_{it}_of_lists_of_sem_scores_np.csv', delimiter=',')'''
    
arrays_list = [list_of_lists_of_number_of_clusters_np, list_of_lists_of_outliers_np, list_of_lists_of_DB_scores_np, \
                list_of_lists_of_CH_scores_np, list_of_lists_of_S_scores_np, \
                list_of_lists_of_AR_scores_np, list_of_lists_of_AMI_scores_np, list_of_lists_of_NMI_scores_np, list_of_lists_of_sem_scores_np]
titles_list = [f'Clusters', f'Outliers', f'DB scores (geometry)', f'CH scores (geometry)', f'S score (geometry)', \
                f'AR score (robustness)', f'AMI scores (robustness)', f'NMI scores (robustness)', f'Semantic scores']

'''for index,array in enumerate(arrays_list): 
    plt.imshow(np.flipud(array), cmap='coolwarm') 
    plt.colorbar() 
    plt.title(titles_list[index]) 
    plt.xticks(np.arange(array.shape[1]), [5, 10, 15, 20, 25, 30, 35, 40, 45, 50])
    plt.yticks(np.arange(array.shape[0]), [50, 45, 40, 35, 30, 25, 20, 15, 10, 5])
    plt.xlabel('Minimum cluster size') 
    plt.ylabel('Embeddings dimensionality')
    plt.savefig("images/"+titles_list[index]+'.png')
    plt.close()'''
    
# Print the average of each one of the measurements: 
    
for index, array in enumerate(arrays_list): 
    print(f"{titles_list[index]} average:", np.mean(array))
    
final_time = time.time()
used_time = final_time - initial_time
print("Used time:", used_time/60, "mins.")

Importing libraries...
Importing sections...
Imported sections: 11827
Creating embeddings for iteration 1


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

Starting evaluation...
Creating embeddings for iteration 2


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

Starting evaluation...
Creating embeddings for iteration 3


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

Starting evaluation...
Creating embeddings for iteration 4


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

Starting evaluation...
Creating embeddings for iteration 5


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

Starting evaluation...
Clusters average: 153.0
Outliers average: 4288.0
DB scores (geometry) average: 0.421093617204376
CH scores (geometry) average: 7176.469281512615
S score (geometry) average: 0.7744507
AR score (robustness) average: 0.6445819077117131
AMI scores (robustness) average: 0.8732230007214775
NMI scores (robustness) average: 0.8936709501788663
Semantic scores average: 48.60911473746512
Used time: 14.390301020940145 mins.
