In [16]:
import pickle
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import isnull
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
def get_embedding_score(numerical_labels,target_embedding ):
    n = len(numerical_labels)


    label_occurances = {}
    for label_list in numerical_labels:
        for label in label_list:
            label_occurances[label] = label_occurances.get(label, 0) + 1

    label_idfs = {key: np.log2(n / value) for key, value in label_occurances.items()}



    unrelated_distances = []
    related_distances = []
    related_distances_weights = []

    sample_size = 1_000
    indices_i = np.random.randint(0, len(numerical_labels), sample_size)
    indices_j = np.random.randint(0, len(numerical_labels)-1, sample_size)
    indices_j = np.where(indices_j == indices_i, indices_j, len(numerical_labels)-1)

    embeddings_i = target_embedding[indices_i]
    embeddings_j = target_embedding[indices_j]

    for i, doc_embedding_1, j, doc_embedding_2 in zip(indices_i,embeddings_i,indices_j,embeddings_j):
        label_similarity = sum(
            [label_idfs[label_1] for label_1 in numerical_labels[i] if label_1 in numerical_labels[j]])
        doc_distance = cosine_similarity(doc_embedding_1.reshape(1, -1), doc_embedding_2.reshape(1, -1))[0][0]
        if label_similarity == 0:
            unrelated_distances.append(doc_distance)
        else:
            related_distances.append(doc_distance)
            related_distances_weights.append(label_similarity)

    mean_unrelated_distances = np.mean(unrelated_distances, axis=0)
    mean_related_distances = np.average(related_distances, axis=0, weights=related_distances_weights)

    min_distance = min(min(related_distances), min(unrelated_distances))
    max_distance = max(max(related_distances), max(unrelated_distances))
    final_score = (mean_related_distances - mean_unrelated_distances) / (max_distance - min_distance)
    return final_score * 100

In [18]:
def get_embedding_score_2(numerical_labels,target_embedding ):
    n = len(numerical_labels)


    label_occurances = {}
    for label_list in numerical_labels:
        for label in label_list:
            label_occurances[label] = label_occurances.get(label, 0) + 1

    label_idfs = {key: np.log2(n / value) for key, value in label_occurances.items()}



    unrelated_distances = []
    related_distances = []
    related_distances_weights = []

    sample_size = 1_000
    indices_i = np.random.randint(0, len(numerical_labels), sample_size)
    indices_j = np.random.randint(0, len(numerical_labels)-1, sample_size)
    indices_j = np.where(indices_j == indices_i, indices_j, len(numerical_labels)-1)

    embeddings_i = target_embedding[indices_i]
    embeddings_j = target_embedding[indices_j]

    for i, doc_embedding_1, j, doc_embedding_2 in zip(indices_i,embeddings_i,indices_j,embeddings_j):
        label_similarity = sum(
            [label_idfs[label_1] for label_1 in numerical_labels[i] if label_1 in numerical_labels[j]])
        doc_distance = cosine_similarity(doc_embedding_1.reshape(1, -1), doc_embedding_2.reshape(1, -1))[0][0]
        if label_similarity == 0:
            unrelated_distances.append(doc_distance)
        else:
            related_distances.append(doc_distance)
            related_distances_weights.append(label_similarity)

    mean_unrelated_distances = np.mean(unrelated_distances, axis=0)
    mean_related_distances = np.average(related_distances, axis=0, weights=related_distances_weights)

    final_score = (mean_related_distances - mean_unrelated_distances) / max(mean_related_distances,mean_unrelated_distances)
    return final_score * 100

In [19]:
from embeddings.embedding_utils import get_queries,optimal_pca

data_querries = get_queries()
from embeddings.tf_idf import get_embedding_pca as tf_idf_embedding_pca
from embeddings.tf_idf import get_embedding_UMAP as tf_idf_UMAP_embedding
from embeddings.fasttext import get_embedding_mean_polling as fasttext_mean_embedding
from embeddings.fasttext import get_embedding_max_polling as fasttext_max_embedding
from embeddings.fasttext import get_embedding_combined_polling as fasttext_combined_embedding
from embeddings.fasttext import get_embedding_combined_polling_pca as fasttext_combined_embedding_pca
from embeddings.indirect_topics_trough_keyword_clusters import get_embedding as keyword_embedding
from embeddings.openai_api import get_embedding as openai_embedding
embedding_algos = [tf_idf_embedding_pca,tf_idf_UMAP_embedding, fasttext_mean_embedding, fasttext_max_embedding, fasttext_combined_embedding, keyword_embedding, openai_embedding]

In [20]:
def get_algo_name(algo):
    return str(algo.__module__.split(".")[-1]+"__"+algo.__name__)

def get_embedding(category, querry, algo):
    querry_key = (category[:5]+"_"+querry[:10]).replace(" ","_")
    dir_name = get_algo_name(algo)
    filename = f"../embedding_results/{dir_name}/{querry_key}.pkl"
    if not os.path.exists(filename):
        return None, None, None
    with open(filename, "rb") as f:
        embedding_dict = pickle.load(f)
    return embedding_dict["embeddings"], embedding_dict["numerical_labels"], embedding_dict["categorizer"]


In [21]:
algo_names = [get_algo_name(algo) for algo in embedding_algos]
querries_flatmap = []
for querries in data_querries.values():
    querries_flatmap += querries

In [22]:
metrics = pd.DataFrame(index= querries_flatmap, columns=algo_names, )
for algo_name in algo_names:
    metrics[algo_name] = "-"

In [23]:

for category, querries in data_querries.items():
    for querry in querries:
        for embedding_algo in embedding_algos:
            algo_name = get_algo_name(embedding_algo)
            embedding, numerical_labels, categorizer = get_embedding(category, querry, embedding_algo)
            if embedding is None:
                metrics.loc[querry, algo_name] = "-"
            elif metrics.loc[querry, algo_name] != "-":
                continue
            else:
                score = get_embedding_score_2(numerical_labels, embedding)
                metrics.loc[querry, algo_name] = score
                print(f"set {metrics.loc[querry, algo_name]} for {querry}, {algo_name}")


set 149.90214483103784 for Transformer models, tf_idf__get_embedding_pca
set 0.04010453194483277 for Transformer models, tf_idf__get_embedding_UMAP
set 0.26037923772469485 for Transformer models, fasttext__get_embedding_mean_polling
set 0.2188608963220358 for Transformer models, fasttext__get_embedding_max_polling
set 0.08142465074865324 for Transformer models, fasttext__get_embedding_combined_polling
set 16.206921904716612 for Transformer models, indirect_topics_trough_keyword_clusters__get_embedding
set 9.769057591722591 for Transformer models, openai_api__get_embedding
set 89.68928641120938 for Federated learning, tf_idf__get_embedding_pca
set 0.019651018927624303 for Federated learning, tf_idf__get_embedding_UMAP
set 0.3401040659931322 for Federated learning, fasttext__get_embedding_mean_polling
set 0.17454552112066063 for Federated learning, fasttext__get_embedding_max_polling
set 0.17589198405288173 for Federated learning, fasttext__get_embedding_combined_polling
set 5.8937343245

ZeroDivisionError: Weights sum to zero, can't be normalized

In [9]:
from datetime import datetime

# Get the current timestamp up to minutes
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
metrics_path = f"../embedding_results/metrics_{timestamp}.pkl"

In [10]:
with open(metrics_path, "wb") as f:
    pickle.dump(metrics, f)

In [11]:
with open(metrics_path, "rb") as f:
    metrics = pickle.load(f)

In [12]:
def to_display_str(value):
    if value == "-":
        return str(value)
    float_val = float(value)
    return f"{float_val:.2f}"

In [13]:
# generate latex table
print(f"& {' & '.join(algo_names)} \\\\")
for category, querries in data_querries.items():
    print("\n\\midrule")
    for querry in querries:
        print(f"{querry} & {' & '.join([to_display_str(value) for value in metrics.loc[querry, :]])} \\\\")


& tf_idf__get_embedding_pca \\

\midrule
Transformer models & 118.21 \\
Federated learning & 108.23 \\
Quantum computing & -136.48 \\
Explainable AI & 119.16 \\
Graph neural networks & 102.42 \\

\midrule
Topological insulators & 112.75 \\
Optical metamaterials & 104.80 \\
Fission & 101.76 \\
Soft robotics & 125.60 \\
Health monitoring & 101.81 \\

\midrule
CRISPR & 106.01 \\
Microbiome & 117.28 \\
DNA sequencing & 100.81 \\
Synthetic biology & 104.94 \\
Drug delivery & 104.06 \\

\midrule
Climate model & 100.14 \\
Remote sensing & 101.99 \\
Greenhouse gas & 116.70 \\
Biodiversity & 140.83 \\
Light pollution & 111.85 \\


In [14]:
' & '.join([f"{avg:.2f}" for avg in  np.mean(metrics,axis = 0).iloc[:]])

'98.14'

In [15]:
PCA_embedding_algos = [fasttext_mean_embedding, fasttext_max_embedding, keyword_embedding, openai_embedding]
PCA_algo_names = [get_algo_name(embed_algo) for embed_algo in PCA_embedding_algos]
metrics_PCA = pd.DataFrame(index= querries_flatmap, columns=algo_names)
for category, querries in data_querries.items():
    for querry in querries:
        for embedding_algo in PCA_embedding_algos:
            algo_name = get_algo_name(embedding_algo)
            embedding, numerical_labels, categorizer = get_embedding(category, querry, embedding_algo)
            reduced_embedding = optimal_pca(embedding)
            if embedding is None:
                metrics_PCA.loc[querry, algo_name] = "-"
            else:
                score = get_embedding_score_2(numerical_labels, reduced_embedding)
                metrics_PCA.loc[querry, algo_name] = score
                print(f"set {metrics_PCA.loc[querry, algo_name]} for {querry}, {algo_name}")

set 35.689238833553674 for Transformer models, fasttext__get_embedding_mean_polling
set 310.3229233332539 for Transformer models, fasttext__get_embedding_max_polling
set 20.016292450813626 for Transformer models, indirect_topics_trough_keyword_clusters__get_embedding
set 152.75382176653943 for Transformer models, openai_api__get_embedding
set 91.39869768320989 for Federated learning, fasttext__get_embedding_mean_polling
set 445.028710925433 for Federated learning, fasttext__get_embedding_max_polling
set 23.62298905078464 for Federated learning, indirect_topics_trough_keyword_clusters__get_embedding
set 110.20252614720927 for Federated learning, openai_api__get_embedding
set 911.9205536405375 for Quantum computing, fasttext__get_embedding_mean_polling
set 319.72925769912 for Quantum computing, fasttext__get_embedding_max_polling
set 112.76657743417833 for Quantum computing, indirect_topics_trough_keyword_clusters__get_embedding
set 414.60503955513343 for Quantum computing, openai_api__g

KeyboardInterrupt: 

In [25]:
# generate latex table
print(f"& {' & '.join(algo_names)} \\\\")
for category, querries in data_querries.items():
    print("\n\\midrule")
    for querry in querries:
        print(f"{querry} & {' & '.join([to_display_str(value) for value in metrics_PCA.loc[querry, :]])} \\\\")

& tf_idf__get_embedding_pca \\

\midrule
Transformer models & nan & 3.64 & 4.37 & 0.38 & 4.59 \\
Federated learning & nan & 3.27 & -0.48 & 7.35 & 3.24 \\
Quantum computing & nan & 7.62 & 8.99 & 1.09 & 9.81 \\
Explainable AI & nan & 6.20 & -0.28 & -5.98 & 0.88 \\
Graph neural networks & nan & 31.56 & 17.76 & 3.98 & 37.49 \\

\midrule
Topological insulators & nan & -0.15 & 1.91 & 0.11 & 2.81 \\
Optical metamaterials & nan & 31.83 & 48.99 & 27.83 & 37.34 \\
Fission & nan & 7.81 & 7.65 & 1.39 & 21.71 \\
Soft robotics & nan & 29.85 & 23.62 & 6.59 & 23.90 \\
Health monitoring & nan & 20.30 & 11.69 & 10.89 & 9.47 \\

\midrule
CRISPR & nan & 43.39 & 39.49 & 31.95 & 42.44 \\
Microbiome & nan & 18.89 & 12.26 & 0.65 & 19.53 \\
DNA sequencing & nan & 47.87 & 47.43 & 29.32 & 55.58 \\
Synthetic biology & nan & 19.06 & 21.20 & 9.83 & 26.09 \\
Drug delivery & nan & 3.43 & 8.77 & 12.71 & 4.92 \\

\midrule
Climate model & nan & 70.31 & 82.26 & 55.43 & 79.85 \\
Remote sensing & nan & 25.08 & 18.68 & -2.7

In [26]:
' & '.join([f"{avg:.2f}" for avg in  np.mean(metrics_PCA,axis = 0).iloc[:]])

'nan & 20.44 & 20.05 & 10.11 & 23.40'