In [1]:
import sys

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

sys.path.append("/home/jupyter/work/resources/DiplomDimReduction/")
import importlib

import config

importlib.reload(config)
import utils
from config import config_dict

importlib.reload(utils)
from sklearn.metrics import ndcg_score
from utils import (
    Autoencoder,
    LinearAutoencoder,
    cosine_similarity,
    create_path,
    load_mapping,
    load_sk_model,
    load_vectors,
    save_metrics,
)



In [2]:
# --- Corpus / split selection ------------------------------------------------
corpus_prefix = config_dict["marco_prefix"]
split_suffix = config_dict["test_suffix"]

# --- Encoder selection -------------------------------------------------------
# Exactly one (model_name, method_prefix) pair should be active; the others are
# kept commented out so the encoder can be switched by hand.
#
# DPR:
# model_name = config_dict["dpr_model"]
# method_prefix = config_dict["dpr_prefix"]
#
# ANCE:
# model_name = config_dict["ance_model"]
# method_prefix = config_dict["ance_prefix"]
#
# S-BERT:
# model_name = config_dict["s-bert_model"]
# method_prefix = config_dict["s-bert_prefix"]
#
# TAS-B (active):
model_name = config_dict["tas-b_model"]
method_prefix = config_dict["tas-b_prefix"]

# --- Input paths -------------------------------------------------------------
data_path = config_dict["data_template"].format(corpus_prefix, split_suffix)

# The four vector/mapping templates all take the same three format arguments,
# so they are resolved in one pass (same lookup order as writing them out).
(
    corpus_vector_path,
    corpus_mapping_path,
    queries_vector_path,
    queries_mapping_path,
) = (
    config_dict[template_key].format(corpus_prefix, method_prefix, split_suffix)
    for template_key in (
        "corpus_vector_template",
        "corpus_mapping_template",
        "queries_vector_template",
        "queries_mapping_template",
    )
)

# --- Templates that are filled in later, per reduction method / dimension ----
ndcgs_path_template = config_dict["ndcgs_template"]
sk_model_path_template = config_dict["reduction_sk_model_template"]
ae_model_path_template = config_dict["reduction_ae_model_template"]

In [3]:
# Read the labelled query/passage pairs from the parquet file at `data_path`.
# The columns used later by score_ndcg are query_id, query_text, corpus_text
# and label.
data = pd.read_parquet(data_path)
# Bare expression: renders the frame as the cell output.
data

Unnamed: 0,query_id,corpus_id,label,corpus_text,query_text
0,19335,8412683,1,Ecological anthropology is defined as the stud...,anthropological definition of environment
1,19335,1729,1,Graduate Study in Anthropology. The graduate p...,anthropological definition of environment
2,19335,8412684,1,Ecological Anthropology. Ecological anthropolo...,anthropological definition of environment
3,19335,3683653,0,The branches of Earth Science are: 1 Geology ...,anthropological definition of environment
4,19335,342432,0,Five Disciplines of Anthropology. 1 Applied A...,anthropological definition of environment
...,...,...,...,...,...
425,1133167,6467520,0,"Climate data for ball mtn lake, Longitude: -72...",how is the weather in jamaica
426,1133167,4712274,0,"Re: Best Time of Year to Visit Jamaica Mar 17,...",how is the weather in jamaica
427,1133167,7115353,0,Hurricane season has ended over a month ago an...,how is the weather in jamaica
428,1133167,8415745,0,"The weather stations sit near sea level, with ...",how is the weather in jamaica


In [4]:
def reduct_sk(query_embeds, corpus_embeds, model_name, red_dim, model_kwargs=None):
    """Reduce query and corpus embeddings with a previously saved reducer.

    The model file path is built from the notebook-level
    ``sk_model_path_template``, ``corpus_prefix`` and ``method_prefix`` together
    with ``model_name`` and ``red_dim``; the model is loaded with
    ``load_sk_model`` and must expose a ``transform`` method.

    Args:
        query_embeds: 2-D array of query vectors (rows are stacked on top of
            ``corpus_embeds`` before the transform).
        corpus_embeds: 2-D array of corpus vectors with the same width.
        model_name: Name of the reduction method, used only to build the path.
        red_dim: Target dimensionality, used only to build the path.
        model_kwargs: Unused here; accepted so the signature matches the other
            reducer (``reduct_ae``) that ``score_dims`` calls with five
            arguments. Defaults to ``None`` instead of a shared mutable ``{}``.

    Returns:
        A two-element list ``[reduced_queries, reduced_corpus]`` obtained by
        splitting the transformed matrix after ``len(query_embeds)`` rows.
    """
    model_path = sk_model_path_template.format(
        corpus_prefix, method_prefix, model_name, red_dim
    )
    red_model = load_sk_model(model_path)

    # Queries and corpus go through a single transform call, then are split
    # back apart at the original query/corpus boundary.
    n_queries = len(query_embeds)
    stacked = np.vstack([query_embeds, corpus_embeds])
    red_embeddings = red_model.transform(stacked)
    return np.split(red_embeddings, [n_queries])

In [5]:
# Query embedding matrix and its mapping. score_ndcg uses
# queries_mapping["text2i"][query_text] as the row index into query_embeds.
query_embeds = load_vectors(queries_vector_path)
queries_mapping = load_mapping(queries_mapping_path)

In [6]:
# Corpus embedding matrix and its mapping. score_ndcg uses
# corpus_mapping["text2i"][corpus_text] as the row index into corpus_embeds.
corpus_embeds = load_vectors(corpus_vector_path)
corpus_mapping = load_mapping(corpus_mapping_path)

In [7]:
# Target dimensionalities to evaluate; score_dims runs the reducer once per
# value, and each value is also part of the saved-model file path.
red_dims = [512, 256, 128, 64]

In [8]:
def score_ndcg(data, query_embeds, queries_mapping, corpus_embeds, corpus_mapping):
    """Compute nDCG@10 for every query in ``data`` and the mean over queries.

    Rows of ``data`` are grouped by ``query_id``. For each group the query
    vector is found through ``queries_mapping["text2i"]`` using the group's
    first ``query_text``; every ``corpus_text`` in the group is found through
    ``corpus_mapping["text2i"]``. Candidates are scored against the query with
    ``cosine_similarity`` and the scores are evaluated against the ``label``
    column with ``sklearn.metrics.ndcg_score(..., k=10)``.

    Args:
        data: DataFrame with ``query_id``, ``query_text``, ``corpus_text`` and
            ``label`` columns.
        query_embeds: Row-indexable collection of query vectors.
        queries_mapping: Mapping with a ``"text2i"`` entry: query text -> row.
        corpus_embeds: 2-D, row-indexable collection of corpus vectors
            (``.shape[1]`` is read to size the candidate matrix).
        corpus_mapping: Mapping with a ``"text2i"`` entry: corpus text -> row.

    Returns:
        ``(ndcgs, mean_ndcg)``: the list of per-query nDCG@10 values in groupby
        iteration order, and their ``np.mean``. The mean is also printed.

    Raises:
        KeyError: If a query or corpus text is missing from its mapping.
    """
    ndcgs = []
    embed_dim = corpus_embeds.shape[1]

    for _, group in tqdm(data.groupby("query_id")):
        # Every row of a group carries the same query, so the first text suffices.
        q_text = group["query_text"].values[0]
        q_vec = query_embeds[queries_mapping["text2i"][q_text]]

        # Gather this query's candidate passages into one float32 matrix.
        c_texts = group["corpus_text"].values.tolist()
        c_vecs = np.empty((len(c_texts), embed_dim), dtype=np.float32)
        for row, c_text in enumerate(c_texts):
            c_vecs[row] = corpus_embeds[corpus_mapping["text2i"][c_text]]

        # assumes cosine_similarity returns one score per row of c_vecs, in the
        # same order as the labels -- TODO confirm against utils.
        scores = cosine_similarity(q_vec, c_vecs)

        # ndcg_score ranks by score internally, so no explicit argsort is needed.
        labels = group["label"].values
        ndcgs.append(ndcg_score([labels], [scores], k=10))

    mean_ndcg = np.mean(ndcgs)
    print(f"Mean nDCG: {mean_ndcg:.4f}")
    return ndcgs, mean_ndcg

In [9]:
def score_dims(red_fn, red_dims, method, model_kwargs=None):
    """Score the full-size embeddings and each reduced size, then save metrics.

    Uses the notebook-level ``data``, ``query_embeds``, ``queries_mapping``,
    ``corpus_embeds`` and ``corpus_mapping``. The unreduced embeddings are
    scored first and stored under their own width (``query_embeds.shape[1]``);
    then, for every entry of ``red_dims``, ``red_fn`` produces reduced query and
    corpus embeddings which are scored the same way.

    Args:
        red_fn: Callable ``(query_embeds, corpus_embeds, method, red_dim,
            model_kwargs)`` returning ``(reduced_queries, reduced_corpus)``.
        red_dims: Iterable of target dimensionalities.
        method: Reduction method name; forwarded to ``red_fn`` and used to
            build the metrics path from ``ndcgs_path_template``.
        model_kwargs: Optional dict forwarded to ``red_fn``. ``None`` (the
            default) is replaced by a fresh empty dict on every call, so no
            mutable default is shared between calls.

    Returns:
        Dict mapping dimensionality -> ``{"ndcgs": [...], "mean_ndcg": float}``.
        The same dict is passed to ``save_metrics`` before returning.
    """
    if model_kwargs is None:
        model_kwargs = {}

    ndcg_dict = {}

    # Baseline: the original, unreduced embeddings.
    ndcgs, mean_ndcg = score_ndcg(
        data, query_embeds, queries_mapping, corpus_embeds, corpus_mapping
    )
    ndcg_dict[query_embeds.shape[1]] = {"ndcgs": ndcgs, "mean_ndcg": mean_ndcg}

    for red_dim in red_dims:
        red_query_embeds, red_corpus_embeds = red_fn(
            query_embeds, corpus_embeds, method, red_dim, model_kwargs
        )
        # The text -> row mappings are reused, so red_fn must keep row order.
        red_ndcgs, red_mean_ndcg = score_ndcg(
            data, red_query_embeds, queries_mapping, red_corpus_embeds, corpus_mapping
        )
        ndcg_dict[red_dim] = {"ndcgs": red_ndcgs, "mean_ndcg": red_mean_ndcg}

    ndcgs_path = ndcgs_path_template.format(corpus_prefix, method_prefix, method)
    save_metrics(ndcg_dict, ndcgs_path)

    return ndcg_dict

In [10]:
# ndcg_dict = score_dims(reduct_sk, red_dims, "PCA")

In [11]:
# ndcg_dict = score_dims(reduct_sk, red_dims, "UMAP")

In [None]:
# Evaluate the saved "UMAP_5" reducers (loaded through reduct_sk) at every size
# in red_dims; the first "Mean nDCG" printed is the unreduced baseline.
ndcg_dict = score_dims(reduct_sk, red_dims, "UMAP_5")

100%|██████████| 43/43 [00:00<00:00, 1267.19it/s]

Mean nDCG: 0.8567



2025-05-30 13:26:29.930317: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-30 13:26:29.987279: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
100%|██████████| 43/43 [00:00<00:00, 1223.20it/s]

Mean nDCG: 0.7222



100%|██████████| 43/43 [00:00<00:00, 1280.79it/s]

Mean nDCG: 0.7484



100%|██████████| 43/43 [00:00<00:00, 1218.67it/s]


Mean nDCG: 0.7274


100%|██████████| 43/43 [00:00<00:00, 1315.55it/s]

Mean nDCG: 0.7404
/home/jupyter/work/resources/DiplomDimReduction/data/metrics/marco/tas_b exists.
232 -> 4136





In [None]:
# Evaluate the saved "UMAP_25" reducers (loaded through reduct_sk) at every size
# in red_dims; this overwrites the ndcg_dict from the previous run.
ndcg_dict = score_dims(reduct_sk, red_dims, "UMAP_25")

100%|██████████| 43/43 [00:00<00:00, 1289.35it/s]


Mean nDCG: 0.8567


100%|██████████| 43/43 [00:00<00:00, 1283.62it/s]

Mean nDCG: 0.7431



100%|██████████| 43/43 [00:00<00:00, 1268.94it/s]

Mean nDCG: 0.7277



100%|██████████| 43/43 [00:00<00:00, 1169.53it/s]

Mean nDCG: 0.7448



100%|██████████| 43/43 [00:00<00:00, 1294.06it/s]

Mean nDCG: 0.7426
/home/jupyter/work/resources/DiplomDimReduction/data/metrics/marco/tas_b exists.
232 -> 3993





In [None]:
# score_dims(reduct_sk, red_dims, "KernelPCA")

In [None]:
# score_dims(reduct_sk, red_dims, "TruncatedSVD")

In [None]:
# red_ndcg_dict = score_dims(reduct_sk, red_dims, "FastICA")

In [None]:
# score_dims(reduct_sk, red_dims, "LocallyLinearEmbedding")

In [None]:
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
# reduct_ae uses this both as torch.load's map_location and as the model device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Method name -> autoencoder class. reduct_ae looks the class up here by the
# same name that is used to build the checkpoint path.
name2model = {
    "LinearAutoencoder": LinearAutoencoder,
    "AutoEncoder+": Autoencoder,
    # "AutoEncoder256": Autoencoder,
}

In [None]:
def resolve_ae_name(model_name, red_dim, in_dim=768):
    """Expand a trailing ``+`` in an autoencoder name into its hidden size.

    A name ending in ``"+"`` has the ``"+"`` replaced by the hidden dimension
    ``(in_dim + red_dim) // 2``; any other name (including the empty string)
    is returned unchanged.

    Args:
        model_name: Model name, optionally ending in ``"+"``.
        red_dim: Target (reduced) dimensionality.
        in_dim: Input dimensionality; defaults to 768.

    Returns:
        The resolved model name, e.g. ``"AutoEncoder+"`` with ``red_dim=256``
        and ``in_dim=768`` becomes ``"AutoEncoder512"``.
    """
    # endswith() is safe on an empty string, unlike indexing with [-1].
    if not model_name.endswith("+"):
        return model_name

    hid_dim = (in_dim + red_dim) // 2
    return f"{model_name[:-1]}{hid_dim}"

In [None]:
def reduct_ae(query_embeds, corpus_embeds, model_name, red_dim, model_kwargs=None):
    """Reduce query and corpus embeddings with a saved autoencoder's encoder.

    The model class is looked up in ``name2model``, instantiated with
    ``input_dim=query_embeds.shape[1]`` and ``output_dim=red_dim``, and its
    weights are loaded from the path built from ``ae_model_path_template``,
    ``corpus_prefix``, ``method_prefix``, ``model_name`` and ``red_dim``.
    The path is printed before loading.

    Args:
        query_embeds: 2-D array of query vectors.
        corpus_embeds: 2-D array of corpus vectors with the same width.
        model_name: Key into ``name2model``; also used to build the path.
        red_dim: Target dimensionality.
        model_kwargs: Optional extra keyword arguments for the model
            constructor. ``None`` (the default) means no extra arguments.

    Returns:
        A two-element list ``[reduced_queries, reduced_corpus]`` of NumPy
        arrays, split after ``len(query_embeds)`` rows.

    Raises:
        KeyError: If ``model_name`` is not a key of ``name2model``.
    """
    if model_kwargs is None:
        model_kwargs = {}

    model_class = name2model[model_name]
    # model_name = resolve_ae_name(model_name, red_dim, in_dim=768)
    red_model = model_class(
        input_dim=query_embeds.shape[1], output_dim=red_dim, **model_kwargs
    )
    model_path = ae_model_path_template.format(
        corpus_prefix, method_prefix, model_name, red_dim
    )
    print(model_path)
    red_model.load_state_dict(torch.load(model_path, map_location=device))
    red_model.to(device)
    red_model.eval()

    # The input must live on the same device as the model, otherwise the
    # forward pass fails when `device` is CUDA.
    stacked = torch.tensor(
        np.vstack([query_embeds, corpus_embeds]),
        dtype=torch.float32,
        device=device,
    )
    with torch.no_grad():
        # .cpu() is a no-op on CPU and is required before .numpy() on CUDA.
        red_embeddings = red_model.encoder(stacked).cpu().numpy()

    return np.split(red_embeddings, [len(query_embeds)])

In [None]:
# ndcg_dict = score_dims(reduct_ae, red_dims, "LinearAutoencoder")

In [None]:
# ndcg_dict = score_dims(reduct_ae, red_dims, "AutoEncoder+")

In [None]:
# ndcg_dict = score_dims(
#     reduct_ae, red_dims, "AutoEncoder256", model_kwargs={"hidden_dim": 256}
# )