In [16]:
import pickle
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import isnull
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
def get_embedding_score(numerical_labels,target_embedding ):
    n = len(numerical_labels)


    label_occurances = {}
    for label_list in numerical_labels:
        for label in label_list:
            label_occurances[label] = label_occurances.get(label, 0) + 1

    label_idfs = {key: np.log2(n / value) for key, value in label_occurances.items()}



    unrelated_distances = []
    related_distances = []
    related_distances_weights = []

    sample_size = 1_000
    indices_i = np.random.randint(0, len(numerical_labels), sample_size)
    indices_j = np.random.randint(0, len(numerical_labels)-1, sample_size)
    indices_j = np.where(indices_j == indices_i, indices_j, len(numerical_labels)-1)

    embeddings_i = target_embedding[indices_i]
    embeddings_j = target_embedding[indices_j]

    for i, doc_embedding_1, j, doc_embedding_2 in zip(indices_i,embeddings_i,indices_j,embeddings_j):
        label_similarity = sum(
            [label_idfs[label_1] for label_1 in numerical_labels[i] if label_1 in numerical_labels[j]])
        doc_distance = cosine_similarity(doc_embedding_1.reshape(1, -1), doc_embedding_2.reshape(1, -1))[0][0]
        if label_similarity == 0:
            unrelated_distances.append(doc_distance)
        else:
            related_distances.append(doc_distance)
            related_distances_weights.append(label_similarity)

    mean_unrelated_distances = np.mean(unrelated_distances, axis=0)
    mean_related_distances = np.average(related_distances, axis=0, weights=related_distances_weights)

    min_distance = min(min(related_distances), min(unrelated_distances))
    max_distance = max(max(related_distances), max(unrelated_distances))
    final_score = (mean_related_distances - mean_unrelated_distances) / (max_distance - min_distance)
    return final_score * 100

In [18]:
from embeddings.embedding_utils import get_queries

data_querries = get_queries()
from embeddings.tf_idf import get_embedding_pca as tf_idf_embedding
from embeddings.tf_idf import get_embedding_UMAP as tf_idf_UMAP_embedding
from embeddings.fasttext import get_embedding_mean_polling as fasttext_mean_embedding
from embeddings.fasttext import get_embedding_max_polling as fasttext_max_embedding
from embeddings.fasttext import get_embedding_combined_polling as fasttext_combined_embedding
from embeddings.indirect_topics_trough_keyword_clusters import get_embedding as keyword_embedding
from embeddings.openai_api import get_embedding as openai_embedding
embedding_algos = [tf_idf_embedding,tf_idf_UMAP_embedding, fasttext_mean_embedding, fasttext_max_embedding, fasttext_combined_embedding, keyword_embedding, openai_embedding]

In [19]:
def get_algo_name(algo):
    return str(algo.__module__.split(".")[-1]+"__"+algo.__name__)

def get_embedding(category, querry, algo):
    querry_key = (category[:5]+"_"+querry[:10]).replace(" ","_")
    dir_name = get_algo_name(algo)
    filename = f"../embedding_results/{dir_name}/{querry_key}.pkl"
    if not os.path.exists(filename):
        return None, None, None
    with open(filename, "rb") as f:
        embedding_dict = pickle.load(f)
    return embedding_dict["embeddings"], embedding_dict["numerical_labels"], embedding_dict["categorizer"]


In [20]:
algo_names = [get_algo_name(algo) for algo in embedding_algos]
querries_flatmap = []
for querries in data_querries.values():
    querries_flatmap += querries

In [21]:
metrics = pd.DataFrame(index= querries_flatmap, columns=algo_names, )

In [39]:

for category, querries in data_querries.items():
    for querry in querries:
        for embedding_algo in embedding_algos:
            algo_name = get_algo_name(embedding_algo)
            embedding, numerical_labels, categorizer = get_embedding(category, querry, embedding_algo)
            if embedding is None:
                metrics.loc[querry, algo_name] = "-"
            elif metrics.loc[querry, algo_name] != "-":
                continue
            else:
                score = get_embedding_score(numerical_labels, embedding)
                metrics.loc[querry, algo_name] = score
                print(f"set {metrics.loc[querry, algo_name]} for {querry}, {algo_name}")


set 14.70031155016566 for Transformer models, tf_idf__get_embedding_UMAP
set 4.587344605697416 for Federated learning, tf_idf__get_embedding_UMAP
set 2.2225937259308925 for Quantum computing, tf_idf__get_embedding_UMAP
set -3.4045358014806433 for Explainable AI, tf_idf__get_embedding_UMAP
set 21.007679498499055 for Graph neural networks, tf_idf__get_embedding_UMAP
set 3.2213919634611177 for Topological insulators, tf_idf__get_embedding_UMAP
set 35.06003515454373 for Optical metamaterials, tf_idf__get_embedding_UMAP
set -20.226208171095 for Fission, tf_idf__get_embedding_UMAP
set 45.18792775646175 for Soft robotics, tf_idf__get_embedding_UMAP
set 31.401894465817076 for Health monitoring, tf_idf__get_embedding_UMAP
set 40.90331490858543 for CRISPR, tf_idf__get_embedding_UMAP
set 28.98492295180674 for Microbiome, tf_idf__get_embedding_UMAP
set 29.76085483767892 for DNA sequencing, tf_idf__get_embedding_UMAP
set 24.774703025589638 for Synthetic biology, tf_idf__get_embedding_UMAP
set 10.63

In [23]:
for category, querries in data_querries.items():
    for querry in querries:
        for embedding_algo in embedding_algos:
            algo_name = get_algo_name(embedding_algo)
            # if is Nan
            if pd.isnull(metrics.loc[querry, algo_name]):
                metrics.loc[querry, algo_name] = "-"

In [32]:
from datetime import datetime

# Get the current timestamp up to minutes
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
metrics_path = f"../embedding_results/metrics_{timestamp}.pkl"

In [33]:
# metrics_path = f"../embedding_results/metrics_20250328_1134.pkl"
metrics[get_algo_name(tf_idf_UMAP_embedding)] = "-"

In [34]:
with open(metrics_path, "wb") as f:
    pickle.dump(metrics, f)

In [35]:
with open(metrics_path, "rb") as f:
    metrics = pickle.load(f)

In [36]:
def to_display_str(value):
    if value == "-":
        return str(value)
    float_val = float(value)
    return f"{float_val:.2f}"

In [40]:
# generate latex table
print(f"& {' & '.join(algo_names)} \\\\")
for category, querries in data_querries.items():
    print("\n\\midrule")
    for querry in querries:
        print(f"{querry} & {' & '.join([to_display_str(value) for value in metrics.loc[querry, :]])} \\\\")


& tf_idf__get_embedding_pca & tf_idf__get_embedding_UMAP & fasttext__get_embedding_mean_polling & fasttext__get_embedding_max_polling & fasttext__get_embedding_combined_polling & indirect_topics_trough_keyword_clusters__get_embedding & openai_api__get_embedding \\

\midrule
Transformer models & 1.13 & 14.70 & 1.10 & 1.22 & 1.84 & 3.00 & 5.94 \\
Federated learning & 3.51 & 4.59 & 0.76 & 3.97 & -0.04 & 1.70 & 9.23 \\
Quantum computing & 3.90 & 2.22 & 10.65 & 5.50 & 3.46 & -0.42 & 10.30 \\
Explainable AI & 5.92 & -3.40 & 4.06 & -1.35 & -1.55 & -1.91 & 10.59 \\
Graph neural networks & 38.14 & 21.01 & 15.19 & 9.01 & 12.77 & 4.38 & 24.23 \\

\midrule
Topological insulators & 3.24 & 3.22 & -0.38 & 1.55 & 0.58 & -0.63 & 4.33 \\
Optical metamaterials & 46.88 & 35.06 & 34.60 & 39.15 & 37.12 & 35.35 & 25.68 \\
Fission & 10.27 & -20.23 & 15.05 & 4.92 & 1.50 & -1.45 & 5.75 \\
Soft robotics & 14.08 & 45.19 & 21.71 & 16.57 & 11.20 & 11.32 & 19.72 \\
Health monitoring & 23.30 & 31.40 & 24.49 & 7.65 & 

In [41]:
' & '.join([f"{avg:.2f}" for avg in  np.mean(metrics,axis = 0).iloc[:]])

'22.33 & 19.98 & 16.19 & 13.01 & 11.52 & 10.73 & 22.84'

In [42]:
metrics_PCA = pd.DataFrame(index= querries_flatmap, columns=algo_names)

In [43]:
from sklearn.decomposition import PCA


def reduce_embedding(embedding):
    target_dimensions = min(100, embedding.shape[1], embedding.shape[0])
    pca = PCA(n_components=target_dimensions)
    return pca.fit_transform(embedding)

In [44]:
for category, querries in data_querries.items():
    for querry in querries:
        for embedding_algo in embedding_algos:
            algo_name = get_algo_name(embedding_algo)
            embedding, numerical_labels, categorizer = get_embedding(category, querry, embedding_algo)
            reduced_embedding = reduce_embedding(embedding)
            if embedding is None:
                metrics_PCA.loc[querry, algo_name] = "-"
            else:
                score = get_embedding_score(numerical_labels, embedding)
                metrics_PCA.loc[querry, algo_name] = score
                print(f"set {metrics_PCA.loc[querry, algo_name]} for {querry}, {algo_name}")

set 0.48081338539903984 for Transformer models, tf_idf__get_embedding_pca
set 15.451769082605015 for Transformer models, tf_idf__get_embedding_UMAP
set -0.9282972050122924 for Transformer models, fasttext__get_embedding_mean_polling
set 2.2895627408991035 for Transformer models, fasttext__get_embedding_max_polling
set 0.645111982807967 for Transformer models, fasttext__get_embedding_combined_polling
set 2.6464799346159933 for Transformer models, indirect_topics_trough_keyword_clusters__get_embedding
set 5.980117966392168 for Transformer models, openai_api__get_embedding
set 1.0460986125937937 for Federated learning, tf_idf__get_embedding_pca
set 7.255743336258358 for Federated learning, tf_idf__get_embedding_UMAP
set 2.2652643269578436 for Federated learning, fasttext__get_embedding_mean_polling
set 2.8785783690832285 for Federated learning, fasttext__get_embedding_max_polling
set 7.394282458904577 for Federated learning, fasttext__get_embedding_combined_polling
set 2.720949585312906 f

In [38]:
# generate latex table
print(f"& {' & '.join(algo_names)} \\\\")
for category, querries in data_querries.items():
    print("\n\\midrule")
    for querry in querries:
        print(f"{querry} & {' & '.join([to_display_str(value) for value in metrics_PCA.loc[querry, :]])} \\\\")

& tf_idf__get_embedding_pca & fasttext__get_embedding_mean_polling & fasttext__get_embedding_max_polling & fasttext__get_embedding_combined_polling & indirect_topics_trough_keyword_clusters__get_embedding & openai_api__get_embedding \\

\midrule
Transformer models & 1.03 & 1.89 & 1.19 & 0.17 & 0.27 & 8.66 \\
Federated learning & 0.53 & 2.43 & 1.99 & 0.19 & 0.26 & 10.58 \\
Quantum computing & 0.75 & 7.78 & 4.00 & 3.57 & 1.84 & 9.30 \\
Explainable AI & 2.01 & 4.98 & 0.62 & 0.84 & 0.65 & 6.54 \\
Graph neural networks & 35.70 & 16.29 & 11.41 & 13.55 & 10.76 & 28.15 \\

\midrule
Topological insulators & 4.22 & -0.44 & 0.16 & 0.47 & 0.90 & 4.41 \\
Optical metamaterials & 45.54 & 36.78 & 39.60 & 32.91 & 34.19 & 19.30 \\
Fission & 5.42 & 10.73 & 4.63 & 10.35 & -6.07 & 14.02 \\
Soft robotics & 15.34 & 21.19 & 15.34 & 10.97 & 10.55 & 20.55 \\
Health monitoring & 18.91 & 10.75 & 12.13 & 24.69 & 19.12 & 23.19 \\

\midrule
CRISPR & 47.32 & 49.92 & 30.49 & 42.81 & 28.84 & 43.26 \\
Microbiome & 12.45

In [45]:
' & '.join([f"{avg:.2f}" for avg in  np.mean(metrics_PCA,axis = 0).iloc[:]])

'20.77 & 20.97 & 15.77 & 13.30 & 11.67 & 10.45 & 22.73'