In [1]:
import pickle
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import isnull
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def get_embedding_score(numerical_labels,target_embedding ):
    n = len(numerical_labels)


    label_occurances = {}
    for label_list in numerical_labels:
        for label in label_list:
            label_occurances[label] = label_occurances.get(label, 0) + 1

    label_idfs = {key: np.log2(n / value) for key, value in label_occurances.items()}



    unrelated_distances = []
    related_distances = []
    related_distances_weights = []

    sample_size = 100_000
    indices_i = np.random.randint(0, len(numerical_labels), sample_size)
    indices_j = np.random.randint(0, len(numerical_labels)-1, sample_size)
    indices_j = np.where(indices_j == indices_i, indices_j, len(numerical_labels)-1)

    embeddings_i = target_embedding[indices_i]
    embeddings_j = target_embedding[indices_j]

    for i, doc_embedding_1, j, doc_embedding_2 in zip(indices_i,embeddings_i,indices_j,embeddings_j):
        label_similarity = sum(
            [label_idfs[label_1] for label_1 in numerical_labels[i] if label_1 in numerical_labels[j]])
        doc_distance = cosine_similarity(doc_embedding_1.reshape(1, -1), doc_embedding_2.reshape(1, -1))[0][0]
        if label_similarity == 0:
            unrelated_distances.append(doc_distance)
        else:
            related_distances.append(doc_distance)
            related_distances_weights.append(label_similarity)

    mean_unrelated_distances = np.mean(unrelated_distances, axis=0)
    mean_related_distances = np.average(related_distances, axis=0, weights=related_distances_weights)

    min_distance = min(min(related_distances), min(unrelated_distances))
    max_distance = max(max(related_distances), max(unrelated_distances))
    final_score = (mean_related_distances - mean_unrelated_distances) / (max_distance - min_distance)
    return final_score * 100

In [3]:
data_querries = {"Computer Science and AI": ["Transformer models", "Federated learning", "Quantum computing", "Explainable AI", "Graph neural networks"],
"Physics and Engineering": ["Topological insulators","Optical metamaterials","Fission","Soft robotics", "Health monitoring"],
"Biology and Medicine": ["CRISPR","Microbiome","DNA sequencing","Synthetic biology","Drug delivery"],
"Earth and Environmental Science": ["Climate model","Remote sensing","Greenhouse gas","Biodiversity","Light pollution"]}
from embeddings.tf_idf import get_embedding_pca as tf_idf_embedding
from embeddings.fasttext import get_embedding_mean_polling as fasttext_mean_embedding
from embeddings.fasttext import get_embedding_max_polling as fasttext_max_embedding
from embeddings.fasttext import get_embedding_combined_polling as fasttext_combined_embedding
from embeddings.indirect_topics_trough_keyword_clusters import get_embedding as keyword_embedding
from embeddings.openai_api import get_embedding as openai_embedding
embedding_algos = [tf_idf_embedding, fasttext_mean_embedding, fasttext_max_embedding, fasttext_combined_embedding, keyword_embedding, openai_embedding]

In [4]:
def get_algo_name(algo):
    return str(algo.__module__.split(".")[-1]+"__"+algo.__name__)

def get_embedding(category, querry, algo):
    querry_key = (category[:5]+"_"+querry[:10]).replace(" ","_")
    dir_name = get_algo_name(algo)
    filename = f"../embedding_results/{dir_name}/{querry_key}.pkl"
    if not os.path.exists(filename):
        return None, None, None
    with open(filename, "rb") as f:
        embedding_dict = pickle.load(f)
    return embedding_dict["embeddings"], embedding_dict["numerical_labels"], embedding_dict["categorizer"]


In [5]:
algo_names = [get_algo_name(algo) for algo in embedding_algos]
querries_flatmap = []
for querries in data_querries.values():
    querries_flatmap += querries
metrics = pd.DataFrame(index= querries_flatmap, columns=algo_names, )

In [6]:

for category, querries in data_querries.items():
    for querry in querries:
        for embedding_algo in embedding_algos:
            algo_name = get_algo_name(embedding_algo)
            embedding, numerical_labels, categorizer = get_embedding(category, querry, embedding_algo)
            if embedding is None:
                metrics.loc[querry, algo_name] = "-"
            else:
                score = get_embedding_score(numerical_labels, embedding)
                metrics.loc[querry, algo_name] = score
                print(f"set {metrics.loc[querry, algo_name]} for {querry}, {algo_name}")


set 1.6420922303198855 for Transformer models, tf_idf__get_embedding_pca
set 1.436958977836901 for Transformer models, fasttext__get_embedding_mean_polling
set 1.4875825007783177 for Transformer models, fasttext__get_embedding_max_polling
set 1.0435524460131442 for Transformer models, fasttext__get_embedding_combined_polling
set -0.24183276043128854 for Transformer models, indirect_topics_trough_keyword_clusters__get_embedding
set 6.263348592071349 for Transformer models, openai_api__get_embedding
set 1.405408858760425 for Federated learning, tf_idf__get_embedding_pca
set 0.5545552633064432 for Federated learning, fasttext__get_embedding_mean_polling
set 2.266005273958432 for Federated learning, fasttext__get_embedding_max_polling
set 0.18376150507554784 for Federated learning, fasttext__get_embedding_combined_polling
set 2.738139189031687 for Federated learning, indirect_topics_trough_keyword_clusters__get_embedding
set 6.9438712965825875 for Federated learning, openai_api__get_embedd

In [7]:
for category, querries in data_querries.items():
    for querry in querries:
        for embedding_algo in embedding_algos:
            algo_name = get_algo_name(embedding_algo)
            # if is Nan
            if pd.isnull(metrics.loc[querry, algo_name]):
                metrics.loc[querry, algo_name] = "-"

In [8]:
from datetime import datetime

# Get the current timestamp up to minutes
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
metrics_path = f"../embedding_results/metrics_{timestamp}.pkl"

In [9]:
with open(metrics_path, "wb") as f:
    pickle.dump(metrics, f)

In [10]:
with open(metrics_path, "rb") as f:
    metrics = pickle.load(f)

In [11]:
def to_display_str(value):
    if value == "-":
        return str(value)
    float_val = float(value)
    return f"{float_val:.2f}"

In [12]:
# generate latex table
print(f"& {' & '.join(algo_names)} \\\\")
for category, querries in data_querries.items():
    print("\n\\midrule")
    for querry in querries:
        print(f"{querry} & {' & '.join([to_display_str(value) for value in metrics.loc[querry, :]])} \\\\")


& tf_idf__get_embedding_pca & fasttext__get_embedding_mean_polling & fasttext__get_embedding_max_polling & fasttext__get_embedding_combined_polling & indirect_topics_trough_keyword_clusters__get_embedding & openai_api__get_embedding \\

\midrule
Transformer models & 1.64 & 1.44 & 1.49 & 1.04 & -0.24 & 6.26 \\
Federated learning & 1.41 & 0.55 & 2.27 & 0.18 & 2.74 & 6.94 \\
Quantum computing & 0.11 & 6.02 & 3.59 & 2.61 & 2.05 & 9.90 \\
Explainable AI & 3.07 & 4.05 & -0.45 & 0.38 & 2.03 & 6.22 \\
Graph neural networks & 39.15 & 19.84 & 10.40 & 13.32 & 0.88 & 24.60 \\

\midrule
Topological insulators & 3.51 & -0.18 & 0.59 & 0.64 & -0.96 & 4.64 \\
Optical metamaterials & 46.76 & 34.85 & 33.20 & 35.12 & 28.16 & 26.34 \\
Fission & 11.92 & 4.29 & 4.23 & 2.95 & -4.52 & 7.55 \\
Soft robotics & 14.75 & 22.50 & 15.85 & 10.91 & 13.14 & 19.77 \\
Health monitoring & 17.74 & 15.75 & 14.92 & 17.47 & 11.54 & 22.21 \\

\midrule
CRISPR & 46.04 & 41.81 & 37.14 & 36.99 & 25.11 & 45.02 \\
Microbiome & 12.54 

In [21]:
np.mean(metrics,axis = 0).iloc[:]

tf_idf__get_embedding_pca                                 21.801068
fasttext__get_embedding_mean_polling                      15.520483
fasttext__get_embedding_max_polling                       12.741932
fasttext__get_embedding_combined_polling                  11.576221
indirect_topics_trough_keyword_clusters__get_embedding     8.984849
openai_api__get_embedding                                 21.973839
dtype: object