In [11]:

from tqdm import tqdm

from embeddings.embedding_utils import get_queries, get_query_key
import pickle
import numpy as np
import pandas as pd
import math

In [12]:
def load_clustering(query_key, embedding_algo, clustering_algo):
    """Read the stored clustering result for one query / embedding / clustering combination.

    The pickle is looked up at
    ``../cluster_results/<embedding module tail>__<embedding function name>/<clustering function name>/<query_key>.pkl``
    where the module tail is the last dotted component of ``embedding_algo.__module__``.

    Returns:
        A 4-tuple ``(embeddings, clusters, numerical_labels_true, categorizer)``:
        the first two entries are converted to ``np.ndarray``, the third to a
        ``pd.Series`` and the last one is handed back exactly as stored.
    """
    module_tail = embedding_algo.__module__.split(".")[-1]
    embedding_dir = "__".join((module_tail, embedding_algo.__name__))
    clustering_dir = clustering_algo.__name__
    path = f"../cluster_results/{embedding_dir}/{clustering_dir}/{query_key}.pkl"

    # NOTE: unpickling can execute arbitrary code; only load files written by this project.
    with open(path, "rb") as handle:
        stored = pickle.load(handle)

    embeddings = np.array(stored["embeddings"])
    clusters = np.array(stored["clusters"])
    labels_true = pd.Series(stored["numerical_labels_true"])
    return embeddings, clusters, labels_true, stored["categorizer"]

def load_original_document(query_key):
    """Return the ``"data"`` entry of the pickle stored at ``../query_results/<query_key>.pkl``."""
    # NOTE: unpickling can execute arbitrary code; only load files written by this project.
    with open(f"../query_results/{query_key}.pkl", "rb") as handle:
        return pickle.load(handle)["data"]

In [13]:
def remove_noise_entries(numerical_label_lists, assigned_labels):
    """Drop every entry whose assigned cluster label is ``-1``.

    Args:
        numerical_label_lists: container that supports boolean-mask indexing
            (the notebook passes a ``pd.Series``).
        assigned_labels: cluster label per entry; ``-1`` marks an entry to drop.

    Returns:
        ``(filtered_label_lists, filtered_assigned_labels)`` where the second
        element is a NumPy array.
    """
    labels = np.array(assigned_labels)
    keep_mask = labels != -1
    return numerical_label_lists[keep_mask], labels[keep_mask]

def make_noise_cluster(assigned_labels):
    """Return a copy of the labels in which all ``-1`` entries form one extra cluster.

    The replacement label starts at the number of distinct labels in the input
    (the value this function has always used). If that value is already taken
    by a real cluster - possible when cluster ids are not contiguous, e.g.
    ``{-1, 0, 3}`` - it is increased until an unused id is found, so the noise
    points can never be merged into an existing cluster by accident.

    Args:
        assigned_labels: sequence or array of integer cluster labels.

    Returns:
        A new NumPy array; the input is not modified.
    """
    new_labels = np.array(assigned_labels)  # np.array copies, so the caller's data stays untouched
    noise_indices = new_labels == -1

    used_labels = set(new_labels.ravel().tolist())
    free_cluster_label = len(used_labels)
    while free_cluster_label in used_labels:
        free_cluster_label += 1

    new_labels[noise_indices] = free_cluster_label
    return new_labels

In [14]:
def get_singe_label_gts(numerical_label_lists):
    """Turn multi-label annotations into one binary ground truth per label.

    Args:
        numerical_label_lists: iterable of label collections, one per entry.

    Returns:
        A list with one inner list per distinct label (in the iteration order
        of the set of all labels). Each inner list has one value per entry:
        ``1`` if the entry carries that label, ``0`` otherwise.
    """
    all_labels = []
    for labels in numerical_label_lists:
        all_labels.extend(labels)

    return [
        [1 if label in labels else 0 for labels in numerical_label_lists]
        for label in set(all_labels)
    ]

In [15]:
from sklearn.metrics import adjusted_mutual_info_score


def get_combined_AMI(clustering_labels, singe_label_gts):
    """Weighted mean of the adjusted mutual information against each binary ground truth.

    Args:
        clustering_labels: cluster label per entry.
        singe_label_gts: list of binary (0/1) ground truths, one per label.

    Returns:
        ``np.average`` of the per-label AMI scores, weighted by the sum of each
        ground truth (i.e. the number of entries marked ``1`` in it).
    """
    weights = [sum(ground_truth) for ground_truth in singe_label_gts]
    ami_per_label = [
        adjusted_mutual_info_score(clustering_labels, ground_truth)
        for ground_truth in singe_label_gts
    ]
    return np.average(ami_per_label, weights=weights)

In [16]:
from embeddings.tf_idf import get_embedding_pca as tf_idf_pca
from embeddings.fasttext import get_embedding_combined_polling_pca
from embeddings.openai_api import get_embedding as embedding_openai

from clustering.clusterings import kmeans_with_estimated_k,xmeans_clustering, hdbscan_clustering, agglomerative_clustering_with_estimated_k

# Mapping of category -> queries, as provided by the embeddings helper module.
data_queries = get_queries()

# Embedding functions whose stored clustering results are evaluated.
embedding_algos = [
    tf_idf_pca,
    get_embedding_combined_polling_pca,
    embedding_openai,
]

# Clustering functions evaluated on their labels as stored.
clustering_algos = [
    kmeans_with_estimated_k,
    xmeans_clustering,
    agglomerative_clustering_with_estimated_k,
]

# Clustering functions whose labels may contain -1 (noise); these are evaluated
# twice further down: once without the noise points and once with the noise
# points collected into an extra cluster.
clustering_algos_noise = [
    hdbscan_clustering,
]

In [17]:
# AMI scores indexed as [category, query, embedding algorithm, clustering variant].
# The first two axes are sized from data_queries instead of a hard-coded (4, 5),
# so adding or removing categories/queries no longer breaks or truncates the
# matrix. A category with fewer queries than the largest one simply leaves its
# unused slots at 0.
# Each entry of clustering_algos_noise occupies two variant slots: one scored with
# the noise points removed and one with all noise points merged into an extra cluster.
num_categories = len(data_queries)
max_queries_per_category = max((len(data_queries[category]) for category in data_queries.keys()), default=0)
AMI_matrix = np.zeros((num_categories, max_queries_per_category, len(embedding_algos), len(clustering_algos)+2*len(clustering_algos_noise)))


# Two-way lookup between names and matrix indices. Keys are prefixed with the
# axis number ("0:" category, "1:" query, "2:" embedding, "3:" clustering variant).
# NOTE(review): query keys do not include the category, so the same query text
# appearing at different positions in two categories would overwrite its index.
string_to_num_translator = {}
num_to_string_translator = {}
# tqdm wraps the keys (not the enumerate object) so it knows the total and can
# render a real progress bar.
for i_0, category in enumerate(tqdm(data_queries.keys(), desc="Categories")):
    string_to_num_translator["0:"+category] = i_0
    num_to_string_translator["0:" + str(i_0)] = category
    for i_1, query in enumerate(data_queries[category]):
        string_to_num_translator["1:"+query] = i_1
        num_to_string_translator["1:" + str(i_1)] = query
        query_key = get_query_key(category, query)

        for i_2, embedding_algo in enumerate(embedding_algos):
            embedding_algo_name = embedding_algo.__module__.split(".")[-1] + "__" + embedding_algo.__name__
            string_to_num_translator["2:"+embedding_algo_name] = i_2
            num_to_string_translator["2:"+str(i_2)] = embedding_algo_name

            # Clustering algorithms evaluated on their labels as stored.
            for i_3, cluster_algo in enumerate(clustering_algos):
                cluster_algo_name = cluster_algo.__name__
                string_to_num_translator["3:"+cluster_algo_name] = i_3
                num_to_string_translator["3:"+str(i_3)] = cluster_algo_name
                embedding, cluster_labels, numerical_labels_true, _ = load_clustering(query_key, embedding_algo, cluster_algo)

                #fresh ground truth
                singe_label_gts = get_singe_label_gts(numerical_labels_true)

                combined_AMI= get_combined_AMI(cluster_labels, singe_label_gts)
                AMI_matrix[i_0,i_1,i_2,i_3] = combined_AMI

            # Noise-aware clustering algorithms: two variant slots each, placed
            # after the slots of the plain clustering algorithms.
            for noise_algo_idx, cluster_algo in enumerate(clustering_algos_noise):
                embedding, cluster_labels, numerical_labels_true, _ = load_clustering(query_key, embedding_algo, cluster_algo)

                # Variant 1: entries labelled -1 are removed before scoring.
                i_3 = 2 * noise_algo_idx + len(clustering_algos)
                cluster_algo_name = cluster_algo.__name__+" no noise"
                string_to_num_translator["3:"+cluster_algo_name] = i_3
                num_to_string_translator["3:"+str(i_3)] = cluster_algo_name

                numerical_labels_true_no_noise, labels_no_noise= remove_noise_entries(numerical_labels_true, cluster_labels)
                singe_label_gts_no_noise = get_singe_label_gts(numerical_labels_true_no_noise)

                combined_AMI= get_combined_AMI(labels_no_noise, singe_label_gts_no_noise)
                AMI_matrix[i_0,i_1,i_2,i_3] = combined_AMI

                # Variant 2: entries labelled -1 are kept and grouped into one extra cluster.
                i_3+=1
                cluster_algo_name = cluster_algo.__name__+" noise cluster"
                num_to_string_translator["3:"+str(i_3)] = cluster_algo_name
                string_to_num_translator["3:"+cluster_algo_name] = i_3

                labels_noise_cluster= make_noise_cluster(cluster_labels)

                #fresh ground truth
                singe_label_gts = get_singe_label_gts(numerical_labels_true)

                combined_AMI= get_combined_AMI(labels_noise_cluster, singe_label_gts)
                AMI_matrix[i_0,i_1,i_2,i_3] = combined_AMI

Categories: 4it [00:51, 12.82s/it]


In [18]:
def data_function(category, query, embedding_algo_name, cluster_algo_name):
    """Look up one AMI score by name and format it with three decimals.

    Each name is translated to its matrix index through
    ``string_to_num_translator`` (keys are ``"<axis>:<name>"``), then the value
    is read from ``AMI_matrix``.

    Returns:
        The score as a string formatted with ``:.3f``.
    """
    names_by_axis = (category, query, embedding_algo_name, cluster_algo_name)
    position = tuple(
        string_to_num_translator[str(axis) + ":" + name]
        for axis, name in enumerate(names_by_axis)
    )
    return f"{AMI_matrix[position]:.3f}"


In [19]:
# Labels for the result table.
# Columns: one group per embedding function ("<module tail>__<function name>"),
# and inside each group one column per clustering variant.
column_main_labels = [
    "__".join((algo.__module__.split(".")[-1], algo.__name__))
    for algo in embedding_algos
]
column_sub_labels = (
    [algo.__name__ for algo in clustering_algos]
    + [algo.__name__ + " no noise" for algo in clustering_algos_noise]
    + [algo.__name__ + " noise cluster" for algo in clustering_algos_noise]
)
# Rows: one group per category, one row per query of that category.
row_main_labels = list(data_queries.keys())
row_sub_labels = [data_queries[category] for category in row_main_labels]

In [21]:
# Build the LaTeX result table and, as a by-product, collect the per-column
# averages (`avges`) and the full score grid (`scores`, one row per query).
# Values come from data_function, i.e. they are the strings already rounded to
# three decimals, so the averages are taken over the rounded values.
num_col_main = len(column_main_labels)
num_col_sub = len(column_sub_labels)
num_row_main = len(row_main_labels)
num_value_cols = num_col_main * num_col_sub
# Total number of table rows; computed per category so categories may hold
# different numbers of queries.
num_data_rows = sum(len(sub_labels) for sub_labels in row_sub_labels)
bkslsh="\\"
escaped_ = r"\_"

# --- header: embedding names spanning their clustering columns ---
latex_code = r"  \begin{tabular}{|l|c|*{" + str(num_value_cols) + r"}{c|}}" + "\n"
latex_code += r"    \hline" + "\n"
latex_code += r"    \multicolumn{2}{|c|}{\multirow{2}{*}{}} &"
for i, main_label in enumerate(column_main_labels):
    # Underscores must be escaped in LaTeX text mode (the sub headers below
    # already do this; the embedding names contain underscores as well).
    escaped_main_label = main_label.replace("_", escaped_)
    latex_code += rf" \multicolumn{{{num_col_sub}}}{{c|}}{{{escaped_main_label}}}"
    if i < num_col_main - 1:
        latex_code += " &"
latex_code += r" \\" + "\n"
latex_code += r"    \cline{3-" + str(2 + num_value_cols) + "}" + "\n"

# --- header: rotated clustering names, repeated for every embedding ---
latex_code += r"    \multicolumn{2}{|c|}{} &"
for j in range(num_col_main):
    for i, sub_label in enumerate(column_sub_labels):
        latex_code += rf" \multicolumn{{1}}{{c|}}{{\rotatebox{{90}}{{{sub_label.replace('_', escaped_)}~}}}}"
        if i < num_col_sub - 1 or j < num_col_main - 1:
            latex_code += " &"
latex_code += r" \\" + "\n"
latex_code += r"    \hline" + "\n"

avges = np.zeros(num_value_cols)
scores = np.zeros((num_data_rows, num_value_cols))

# --- body: one block per category, one line per query ---
row_idx = 0  # running row in `scores`; independent of how many queries each category has
for i, main_label in enumerate(row_main_labels):
    num_sub_rows = len(row_sub_labels[i])
    latex_code += rf"    \multirow{{{num_sub_rows}}}{{*}}{{\rotatebox{{90}}{{\makecell{{{main_label}}}}}}}"
    for j, sub_label in enumerate(row_sub_labels[i]):
        latex_code += f" & {sub_label} &"
        for col_main_idx, col_main in enumerate(column_main_labels):
            for col_sub_idx, col_sub in enumerate(column_sub_labels):
                col_idx = col_main_idx * num_col_sub + col_sub_idx
                data = data_function(main_label, sub_label, col_main, col_sub)
                # Drop the leading zero ("0.123" -> ".123") to keep the table narrow.
                latex_code += f" {data.replace('0.','.')}"
                if col_idx < num_value_cols - 1:
                    latex_code += " &"
                avges[col_idx] += float(data)
                scores[row_idx][col_idx] = float(data)
        latex_code += r" \\" + "\n"
        row_idx += 1

    latex_code += r"    \hline" + "\n"

# --- footer: column averages over all query rows ---
if num_data_rows:
    avges /= num_data_rows
average_cells = " & ".join(rf"\textbf{{{avg_val:.3f}}}".replace("0.", ".") for avg_val in avges)
latex_code += r"    \hline" + "\n"
latex_code += "    & Average & " + average_cells + r"\\" + "\n"
latex_code += r"    \hline" + "\n"
latex_code += r"  \end{tabular}" + "\n"
print(latex_code)

  \begin{tabular}{|l|c|*{15}{c|}}
    \hline
    \multicolumn{2}{|c|}{\multirow{2}{*}{}} & \multicolumn{5}{c|}{tf_idf__get_embedding_pca} & \multicolumn{5}{c|}{fasttext__get_embedding_combined_polling_pca} & \multicolumn{5}{c|}{openai_api__get_embedding} \\
    \cline{3-17}
    \multicolumn{2}{|c|}{} & \multicolumn{1}{c|}{\rotatebox{90}{kmeans\_with\_estimated\_k~}} & \multicolumn{1}{c|}{\rotatebox{90}{xmeans\_clustering~}} & \multicolumn{1}{c|}{\rotatebox{90}{agglomerative\_clustering\_with\_estimated\_k~}} & \multicolumn{1}{c|}{\rotatebox{90}{hdbscan\_clustering no noise~}} & \multicolumn{1}{c|}{\rotatebox{90}{hdbscan\_clustering noise cluster~}} & \multicolumn{1}{c|}{\rotatebox{90}{kmeans\_with\_estimated\_k~}} & \multicolumn{1}{c|}{\rotatebox{90}{xmeans\_clustering~}} & \multicolumn{1}{c|}{\rotatebox{90}{agglomerative\_clustering\_with\_estimated\_k~}} & \multicolumn{1}{c|}{\rotatebox{90}{hdbscan\_clustering no noise~}} & \multicolumn{1}{c|}{\rotatebox{90}{hdbscan\_clustering noise

In [25]:
# Extra table row with the per-column standard deviation over all query rows.
# NOTE: despite its name, `variances` holds standard deviations (np.std).
variances = np.std(scores, axis=0)
# Raw strings keep the LaTeX backslashes literal; the previous non-raw f-string
# contained "\h", an invalid escape sequence (SyntaxWarning on Python 3.12+).
std_cells = " & ".join(rf"\textbf{{{std_val:.3f}}}".replace("0.", ".") for std_val in variances)
print("& St.deviation & " + std_cells + r"\\" + "\n" + r"    \hline")

& St.deviation & \textbf{.031} & \textbf{.038} & \textbf{.056} & \textbf{.044} & \textbf{.027} & \textbf{.022} & \textbf{.024} & \textbf{.026} & \textbf{.040} & \textbf{.022} & \textbf{.047} & \textbf{.063} & \textbf{.046} & \textbf{.051} & \textbf{.036}\
    \hline
