In [None]:
import pickle

from embeddings.embedding_utils import get_queries, get_query_key
from clustering.clusterings import kmeans_with_estimated_k,xmeans_clustering, hdbscan_clustering, agglomerative_clustering_with_estimated_k
from matplotlib import pyplot as plt
from embeddings.tf_idf import get_embedding_pca,get_embedding_UMAP
from embeddings.openai_api import get_embedding as embedding_openai
import seaborn as sns

In [None]:
# Compare how many clusters each clustering algorithm produces per query.
# One subplot per clustering algorithm, one curve per embedding method; each
# curve shows the per-query cluster counts sorted in ascending order.
data_queries = get_queries()
embeddings = [get_embedding_pca, get_embedding_UMAP, embedding_openai]
cluster_algos = [
    kmeans_with_estimated_k,
    xmeans_clustering,
    hdbscan_clustering,
    agglomerative_clustering_with_estimated_k,
]
cluster_algo_display_names = [
    "K-Means with estimated k",
    "X-Means",
    "HDBSCAN",
    "Agglomerative Clustering",
]
fig, axs = plt.subplots(2, 2, figsize=(8, 8))

# One colour per embedding method, shared across all subplots so the same
# embedding is drawn in the same colour everywhere.
colors = sns.color_palette("hsv", len(embeddings))

for i, cluster_algo in enumerate(cluster_algos):
    ax = axs[i // 2, i % 2]
    cluster_algo_name = cluster_algo.__name__
    cluster_algo_display_name = cluster_algo_display_names[i]

    for j, embedding_algo in enumerate(embeddings):
        # Result directories are named "<module>__<function>" of the embedding.
        embedding_algo_name = (
            embedding_algo.__module__.split(".")[-1] + "__" + embedding_algo.__name__
        )
        cluster_counts = []
        for category in data_queries.keys():
            for query in data_queries[category]:
                query_key = get_query_key(category, query)
                file = f"{embedding_algo_name}/{cluster_algo_name}/{query_key}.pkl"
                # Open read-only in binary mode: opening with "wb" would
                # truncate the cached result before it could be loaded.
                # NOTE(review): pickle.load can execute arbitrary code; only
                # load result files produced by this project.
                with open(file, "rb") as f:
                    data_dict = pickle.load(f)
                cluster_labels = data_dict["clusters"]

                # The label -1 is excluded from the count; presumably it marks
                # noise points (HDBSCAN convention) — TODO confirm.
                num_of_clusters = len(set(cluster_labels) - {-1})
                cluster_counts.append(num_of_clusters)
        cluster_counts.sort()
        # x is the rank of the query after sorting, so x and y always have the
        # same length (one point per query).
        ax.plot(
            range(len(cluster_counts)),
            cluster_counts,
            color=colors[j],
            label=embedding_algo_name,
        )

    ax.set_title(cluster_algo_display_name)
    ax.set_xlabel("Queries (sorted by cluster count)")
    ax.set_ylabel("Number of clusters")
    ax.legend()

fig.tight_layout()
plt.show()




