In [17]:

from embeddings.embedding_utils import get_queries, get_query_key
import pickle
import numpy as np
import pandas as pd
import math


In [18]:
def load_clustering(query_key, embedding_algo, clustering_algo):
    """
    Load the stored clustering result for one query / embedding / clustering combination.

    The result is read from
    ``../cluster_results/<embedding module>__<embedding function>/<clustering function>/<query_key>.pkl``.

    :arg query_key: Key of the query; used as the file stem
    :arg embedding_algo: Embedding callable; the last component of its module name and its
        own name select the result directory
    :arg clustering_algo: Clustering callable; its name selects the sub-directory
    :return: Tuple of the ``embeddings``, ``clusters``, ``numerical_labels_true`` and
        ``categorizer`` entries of the stored dictionary
    """
    module_tail = embedding_algo.__module__.split(".")[-1]
    result_path = (
        f"../cluster_results/{module_tail}__{embedding_algo.__name__}"
        f"/{clustering_algo.__name__}/{query_key}.pkl"
    )
    # NOTE(review): pickle executes arbitrary code on load; only point this at trusted result files.
    with open(result_path, "rb") as result_file:
        stored = pickle.load(result_file)
    wanted_keys = ("embeddings", "clusters", "numerical_labels_true", "categorizer")
    return tuple(stored[key] for key in wanted_keys)

def load_original_document(query_key):
    """
    Load the stored query result for ``query_key``.

    :arg query_key: Key of the query; the result is read from ``../query_results/<query_key>.pkl``
    :return: The ``data`` entry of the stored dictionary
    """
    # NOTE(review): pickle executes arbitrary code on load; only point this at trusted result files.
    with open(f"../query_results/{query_key}.pkl", "rb") as result_file:
        return pickle.load(result_file)["data"]

In [19]:
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

def internal_evaluation(embeddings, assigned_labels):
    """
    Compute the internal validity scores of one clustering.

    :arg embeddings: Embedding matrix of the documents
    :arg assigned_labels: Cluster labels assigned by the clustering algorithm
    :return: Tuple ``(silhouette, davies_bouldin, calinski_harabasz)``. All three are
        ``np.nan`` when the scores are undefined for the labelling, i.e. when there is at
        most one cluster or when no cluster contains more than one sample.
    """
    n_samples = len(assigned_labels)
    n_clusters = len(set(assigned_labels))
    # The scikit-learn scores require 2 <= n_clusters <= n_samples - 1 and raise a
    # ValueError otherwise; report "undefined" instead of aborting the evaluation.
    if n_clusters <= 1 or n_clusters >= n_samples:
        return np.nan, np.nan, np.nan

    # Silhouette score
    silhouette = silhouette_score(embeddings, assigned_labels)

    # Davies-Bouldin score
    davies_bouldin = davies_bouldin_score(embeddings, assigned_labels)

    # Calinski-Harabasz score
    calinski_harabasz = calinski_harabasz_score(embeddings, assigned_labels)
    return silhouette, davies_bouldin, calinski_harabasz

In [20]:
def remove_noise_entries(embeddings, assigned_labels):
    """
    Drop every entry whose cluster label is ``-1``.

    :arg embeddings: Array of embeddings, indexed along its first axis with a boolean mask
    :arg assigned_labels: Cluster labels, one per embedding; ``-1`` marks the entries to drop
    :return: Tuple ``(embeddings, labels)`` restricted to the entries whose label is not ``-1``;
        the labels are returned as a NumPy array
    """
    labels = np.array(assigned_labels)
    keep_mask = labels != -1
    return embeddings[keep_mask], labels[keep_mask]

def make_noise_cluster(assigned_labels):
    """
    Turn all entries labelled ``-1`` into one regular cluster of their own.

    :arg assigned_labels: Cluster labels; ``-1`` marks the entries to relabel
    :return: New NumPy array of labels in which every ``-1`` is replaced by a label that
        no other entry uses; the input is not modified
    """
    new_labels = np.array(assigned_labels)
    distinct_labels = np.unique(new_labels)

    # The number of distinct labels is a free label as long as the labels are contiguous.
    # For non-contiguous labels (e.g. {-1, 2}) it can collide with an existing cluster,
    # which would silently merge the noise into that cluster, so fall back to max + 1.
    free_cluster_label = len(distinct_labels)
    if free_cluster_label in distinct_labels:
        free_cluster_label = int(distinct_labels.max()) + 1

    new_labels[new_labels == -1] = free_cluster_label
    return new_labels

In [21]:
from embeddings.tf_idf import get_embedding_pca as tf_idf_pca
from embeddings.fasttext import get_embedding_combined_polling_pca
from embeddings.openai_api import get_embedding as embedding_openai

from clustering.clusterings import kmeans_with_estimated_k,xmeans_clustering, hdbscan_clustering, agglomerative_clustering_with_estimated_k

data_queries = get_queries()
embedding_algos = [tf_idf_pca,get_embedding_combined_polling_pca, embedding_openai]
clustering_algos = [kmeans_with_estimated_k,xmeans_clustering, agglomerative_clustering_with_estimated_k]
clustering_algos_noise = [hdbscan_clustering]

# Axes of internal_scores:
#   0: category, 1: query within the category, 2: embedding algorithm,
#   3: clustering variant (each noise-producing algorithm contributes two variants),
#   4: score type (0 = silhouette, 1 = Davies-Bouldin, 2 = Calinski-Harabasz).
# The first two extents are derived from the queries instead of being hard-coded, so a
# different number of categories / queries neither overflows nor pads the array.
# NOTE(review): if categories have different numbers of queries, the unused slots of the
# shorter categories stay 0 and enter the per-category mean - confirm the counts are equal.
num_categories = len(data_queries)
max_queries_per_category = max((len(data_queries[category]) for category in data_queries.keys()), default=0)
num_cluster_variants = len(clustering_algos) + 2 * len(clustering_algos_noise)
internal_scores = np.zeros(
    (num_categories, max_queries_per_category, len(embedding_algos), num_cluster_variants, 3)
)

# Two-way lookup between labels and array indices; keys are "<axis>:<label>" and "<axis>:<index>".
string_to_num_translator = {}
num_to_string_translator = {}


def _register_label(axis, index, name):
    """Record the mapping between ``name`` and ``index`` on the given axis of ``internal_scores``."""
    string_to_num_translator[f"{axis}:{name}"] = index
    num_to_string_translator[f"{axis}:{index}"] = name


for i_0, category in enumerate(data_queries.keys()):
    _register_label(0, i_0, category)
    for i_1, query in enumerate(data_queries[category]):
        # Query indices are per category, so this entry is overwritten by later categories.
        _register_label(1, i_1, query)
        query_key = get_query_key(category, query)

        for i_2, embedding_algo in enumerate(embedding_algos):
            embedding_algo_name = embedding_algo.__module__.split(".")[-1] + "__" + embedding_algo.__name__
            _register_label(2, i_2, embedding_algo_name)

            for i_3, cluster_algo in enumerate(clustering_algos):
                cluster_algo_name = cluster_algo.__name__
                _register_label(3, i_3, cluster_algo_name)

                embedding, cluster_labels = load_clustering(query_key, embedding_algo, cluster_algo)[:2]
                # internal_evaluation returns (silhouette, davies_bouldin, calinski_harabasz),
                # which is exactly the layout of the last axis.
                internal_scores[i_0, i_1, i_2, i_3, :] = internal_evaluation(embedding, cluster_labels)

            for noise_algo_idx, cluster_algo in enumerate(clustering_algos_noise):
                embedding, cluster_labels = load_clustering(query_key, embedding_algo, cluster_algo)[:2]

                # Variant 1: evaluate only the entries that were not labelled as noise.
                i_3 = 2 * noise_algo_idx + len(clustering_algos)
                cluster_algo_name = cluster_algo.__name__+" no noise"
                _register_label(3, i_3, cluster_algo_name)
                embedding_no_noise, labels_no_noise = remove_noise_entries(embedding, cluster_labels)
                internal_scores[i_0, i_1, i_2, i_3, :] = internal_evaluation(embedding_no_noise, labels_no_noise)

                # Variant 2: treat all noise entries as one additional cluster.
                i_3 += 1
                cluster_algo_name = cluster_algo.__name__+" noise-cluster"
                _register_label(3, i_3, cluster_algo_name)
                labels_noise_cluster = make_noise_cluster(cluster_labels)
                internal_scores[i_0, i_1, i_2, i_3, :] = internal_evaluation(embedding, labels_noise_cluster)


In [22]:
def data_function(category, score_type, embedding_algo_name, cluster_algo_name):
    """
    Format the mean score of one table cell.

    The mean is taken over the query axis of ``internal_scores`` and ignores NaN entries.

    :arg category: Category label as registered in ``string_to_num_translator``
    :arg score_type: One of ``"S"``, ``"DB"`` or ``"CH"``
    :arg embedding_algo_name: Embedding label as registered in ``string_to_num_translator``
    :arg cluster_algo_name: Clustering label as registered in ``string_to_num_translator``
    :return: ``"-"`` if every considered score is NaN; otherwise the mean formatted with
        2 decimal places below 10, 1 below 100 and 0 from 100 on, followed by one ``*``
        per NaN score that was left out of the mean
    """
    i_0 = string_to_num_translator["0:"+category]
    i_2 = string_to_num_translator["2:"+embedding_algo_name]
    i_3 = string_to_num_translator["3:"+cluster_algo_name]
    i_4 = ["S","DB","CH"].index(score_type)
    considered_scores = internal_scores[i_0, :, i_2, i_3, i_4]

    nan_count = np.isnan(considered_scores).sum()
    if nan_count == len(considered_scores):
        return "-"

    mean = np.nanmean(considered_scores)
    if mean == 0 or not math.isfinite(mean):
        # log10 is undefined for 0 and floor() cannot handle infinities; use the default precision.
        decimal_places = 2
    else:
        # Drop one decimal place per order of magnitude above 1, never going below 0.
        magnitude = math.floor(math.log10(math.fabs(mean)))
        decimal_places = max(0, 2 - max(0, magnitude))
    return f"{mean:.{decimal_places}f}" + "*" * nan_count

In [23]:
# Labels of the result table: embeddings are the main columns and clustering variants the
# sub columns; categories are the main rows and the three score types the sub rows.
column_main_labels = [
    "__".join((algo.__module__.split(".")[-1], algo.__name__))
    for algo in embedding_algos
]
column_sub_labels = (
    [algo.__name__ for algo in clustering_algos]
    + [f"{algo.__name__} no noise" for algo in clustering_algos_noise]
    + [f"{algo.__name__} noise-cluster" for algo in clustering_algos_noise]
)
row_main_labels = list(data_queries.keys())
row_sub_labels = ["S","DB","CH"]

num_col_main, num_col_sub, num_row_main = (
    len(column_main_labels),
    len(column_sub_labels),
    len(row_main_labels),
)

In [24]:
# Build the LaTeX table of the mean internal scores: one main column per embedding, one sub
# column per clustering variant, one main row per category with a sub row per score type,
# and a final "Average" block over all categories.
escaped_=r"\_"
num_data_cols = num_col_main * num_col_sub

# --- header ---------------------------------------------------------------------------
latex_code = r"  \begin{tabular}{|l|c|*{" + str(num_data_cols) + r"}{c|}}" + "\n"
latex_code += r"    \hline" + "\n"
latex_code += r"    \multicolumn{2}{|c|}{\multirow{2}{*}{}} &"
# Underscores must be escaped in LaTeX text mode; the labels are built from Python
# identifiers and therefore usually contain them.
latex_code += " &".join(
    rf" \multicolumn{{{num_col_sub}}}{{c|}}{{{main_label.replace('_',escaped_)}}}"
    for main_label in column_main_labels
)
latex_code += r" \\" + "\n"
latex_code += r"    \cline{3-" + str(2 + num_data_cols) + "}" + "\n"
latex_code += r"    \multicolumn{2}{|c|}{} &"
# The sub labels are repeated once per main column.
latex_code += " &".join(
    rf" \multicolumn{{1}}{{c|}}{{\rotatebox{{90}}{{{sub_label.replace('_',escaped_)}~}}}}"
    for _main_label in column_main_labels
    for sub_label in column_sub_labels
)
latex_code += r" \\" + "\n"
latex_code += r"    \hline" + "\n"

# --- body -----------------------------------------------------------------------------
# Per-column sums of the printed cell values, one array per score type, plus the number of
# cells that actually contributed (cells printed as "-" carry no value).
s_avges=np.zeros(num_data_cols)
db_avges=np.zeros(num_data_cols)
ch_avges=np.zeros(num_data_cols)
avg_sums = (s_avges, db_avges, ch_avges)
avg_counts = np.zeros((len(avg_sums), num_data_cols))

# NOTE(review): row labels are emitted verbatim inside \makecell; confirm the category
# names contain no characters that need escaping.
for i, main_label in enumerate(row_main_labels):
    num_sub_rows = len(row_sub_labels)
    latex_code += rf"    \multirow{{{num_sub_rows}}}{{*}}{{\makecell{{{main_label}}}}}"
    for j, sub_label in enumerate(row_sub_labels):
        latex_code += f" & {sub_label} &"
        # Sub rows 0 and 1 feed the S and DB averages, every later one the CH average.
        score_idx = min(j, len(avg_sums) - 1)
        row_cells = []
        for col_main_idx, col_main in enumerate(column_main_labels):
            for col_sub_idx, col_sub in enumerate(column_sub_labels):
                data = data_function(main_label, sub_label, col_main, col_sub)
                row_cells.append(f" {data}")

                # Strip the "*" markers; "-" means no score exists for this cell, so it
                # must not be parsed as a number nor counted in the average.
                value_text = data.replace("*","")
                if value_text != "-":
                    flat_idx = col_main_idx * len(column_sub_labels) + col_sub_idx
                    avg_sums[score_idx][flat_idx] += float(value_text)
                    avg_counts[score_idx][flat_idx] += 1
        latex_code += " &".join(row_cells)
        latex_code += r" \\" + "\n"

    latex_code += r"    \hline" + "\n"
latex_code += r"    \hline" + "\n"

# --- averages -------------------------------------------------------------------------
# Divide each column by the number of categories that contributed a value; columns without
# any value become NaN and are printed as "-".
for sums, counts in zip(avg_sums, avg_counts):
    has_values = counts > 0
    sums[has_values] /= counts[has_values]
    sums[~has_values] = np.nan

average_rows = [
    f" & {score_label} & "
    + " & ".join("-" if np.isnan(avg_val) else f"{avg_val:.2f}" for avg_val in averages)
    + r" \\"
    for score_label, averages in zip(("S", "DB", "CH"), avg_sums)
]
latex_code += r"\multirow{3}{*}{\makecell{Average}}" + "\n".join(average_rows) + "\n"

latex_code += r"\hline"+ "\n"

latex_code += r"  \end{tabular}" + "\n"
print(latex_code)

  \begin{tabular}{|l|c|*{15}{c|}}
    \hline
    \multicolumn{2}{|c|}{\multirow{2}{*}{}} & \multicolumn{5}{c|}{tf_idf__get_embedding_pca} & \multicolumn{5}{c|}{fasttext__get_embedding_combined_polling_pca} & \multicolumn{5}{c|}{openai_api__get_embedding} \\
    \cline{3-17}
    \multicolumn{2}{|c|}{} & \multicolumn{1}{c|}{\rotatebox{90}{kmeans\_with\_estimated\_k~}} & \multicolumn{1}{c|}{\rotatebox{90}{xmeans\_clustering~}} & \multicolumn{1}{c|}{\rotatebox{90}{agglomerative\_clustering\_with\_estimated\_k~}} & \multicolumn{1}{c|}{\rotatebox{90}{hdbscan\_clustering no noise~}} & \multicolumn{1}{c|}{\rotatebox{90}{hdbscan\_clustering noise-cluster~}} & \multicolumn{1}{c|}{\rotatebox{90}{kmeans\_with\_estimated\_k~}} & \multicolumn{1}{c|}{\rotatebox{90}{xmeans\_clustering~}} & \multicolumn{1}{c|}{\rotatebox{90}{agglomerative\_clustering\_with\_estimated\_k~}} & \multicolumn{1}{c|}{\rotatebox{90}{hdbscan\_clustering no noise~}} & \multicolumn{1}{c|}{\rotatebox{90}{hdbscan\_clustering noise