In [15]:
import os
import sys

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

sys.path.append("/home/jupyter/work/resources/DiplomDimReduction/")
import importlib

import config

importlib.reload(config)
import random

from config import config_dict

seed = 42

import utils

importlib.reload(utils)
from utils import save_vectors

In [2]:
# Corpus identifier, read from the "marco_prefix" entry of config_dict.
corpus_prefix = config_dict["marco_prefix"]

In [3]:
# Split identifier, read from the "test_suffix" entry of config_dict.
split_suffix = config_dict["test_suffix"]
# Fill the data-path template with (corpus prefix, split suffix), in that order.
data_path = config_dict["data_template"].format(corpus_prefix, split_suffix)

In [4]:
# Read the parquet file at data_path into a DataFrame.
data = pd.read_parquet(data_path)
# Bare expression: renders the frame as this cell's output.
data

Unnamed: 0,query_id,corpus_id,label,corpus_text,query_text
0,19335,8412683,1,Ecological anthropology is defined as the stud...,anthropological definition of environment
1,19335,1729,1,Graduate Study in Anthropology. The graduate p...,anthropological definition of environment
2,19335,8412684,1,Ecological Anthropology. Ecological anthropolo...,anthropological definition of environment
3,19335,3683653,0,The branches of Earth Science are: 1 Geology ...,anthropological definition of environment
4,19335,342432,0,Five Disciplines of Anthropology. 1 Applied A...,anthropological definition of environment
...,...,...,...,...,...
425,1133167,6467520,0,"Climate data for ball mtn lake, Longitude: -72...",how is the weather in jamaica
426,1133167,4712274,0,"Re: Best Time of Year to Visit Jamaica Mar 17,...",how is the weather in jamaica
427,1133167,7115353,0,Hurricane season has ended over a month ago an...,how is the weather in jamaica
428,1133167,8415745,0,"The weather stations sit near sea level, with ...",how is the weather in jamaica


In [5]:
# Distinct query / corpus texts in sorted order; these lists are what gets embedded.
queries = sorted(data["query_text"].unique())
corpus = sorted(data["corpus_text"].unique())

# NOTE(review): create_mapping is neither imported nor defined in any cell shown
# in this notebook (only save_vectors is imported from utils). Presumably it lives
# in utils — confirm and import it explicitly; as written this cell relies on
# leftover kernel state and would raise NameError after a kernel restart.
queries_mapping = create_mapping(queries)
corpus_mapping = create_mapping(corpus)

In [6]:
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


# Dense

In [7]:
# Active encoder: exactly one (model, prefix) pair from config_dict is selected here.
# To switch encoders, replace the "dpr" stem in BOTH keys below with one of the
# other stems this notebook has been run with: "ance", "tas-b", "s-bert"
# (i.e. config_dict["<stem>_model"] / config_dict["<stem>_prefix"]).
#
# NOTE(review): the load log recorded further down shows the DPR checkpoint is a
# *question* encoder (DPRQuestionEncoder), and the same model object is later used
# to embed the corpus texts as well. DPR ships a separate context encoder —
# confirm that encoding passages with the question encoder is intended.
model_name = config_dict["dpr_model"]
method_prefix = config_dict["dpr_prefix"]

# All four output paths are built from the same (corpus, method, split) triple.
_template_args = (corpus_prefix, method_prefix, split_suffix)

corpus_vector_path = config_dict["corpus_vector_template"].format(*_template_args)
corpus_mapping_path = config_dict["corpus_mapping_template"].format(*_template_args)

queries_vector_path = config_dict["queries_vector_template"].format(*_template_args)
queries_mapping_path = config_dict["queries_mapping_template"].format(*_template_args)

In [8]:
# Load the tokenizer and encoder weights for the selected checkpoint, place the
# encoder on the chosen device, and switch it to evaluation mode.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)
# Last expression of the cell: eval() returns the module, so its repr is displayed.
model.eval()

  warn(
2025-05-21 05:27:32.537404: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-21 05:27:35.085172: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the model checkpoint at facebook/dpr-question_encoder-multiset-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with anot

DPRQuestionEncoder(
  (question_encoder): DPREncoder(
    (bert_model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_feature

In [9]:
# Number of texts handed to the encoder per call in vectorize_all.
batch_size = 16
# Column count of the preallocated embedding matrix, taken from the encoder's hidden size.
# NOTE(review): must be re-run after changing the model, otherwise it keeps the old width.
embed_dim = model.config.hidden_size

In [10]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence over its attended positions.

    Args:
        model_output: Object exposing ``last_hidden_state``, a tensor of shape
            [batch_size, seq_len, hidden_size].
        attention_mask: Tensor of shape [batch_size, seq_len]; positions with a
            zero entry contribute nothing to the average.

    Returns:
        Tensor of shape [batch_size, hidden_size] holding the mask-weighted mean
        of the token embeddings. A row whose mask is entirely zero yields a zero
        vector instead of NaN.
    """
    token_embeddings = model_output.last_hidden_state  # [batch_size, seq_len, hidden_size]
    # Broadcast the mask across the hidden dimension so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp the token count so an all-zero mask cannot cause a division by zero;
    # for any row with at least one attended token the count is >= 1 and unchanged.
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts

In [19]:
def vectorize_batch(batch):
    """Encode a batch of texts into L2-normalised embedding vectors.

    Uses the notebook-level ``tokenizer``, ``model``, ``device``, ``model_name``
    and ``config_dict``. The pooling strategy depends on the selected model:

    * tas-b and s-bert checkpoints: attention-mask-weighted mean of the last
      hidden states (see ``mean_pooling``);
    * anything else: the model's ``pooler_output``.

    Args:
        batch: Texts to encode, in a form accepted by ``tokenizer``.

    Returns:
        numpy array with one L2-normalised row per input text.
    """
    encoded = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
    encoded = encoded.to(device)
    with torch.no_grad():
        output = model(**encoded)
        if model_name == config_dict["tas-b_model"]:
            # The texts are padded to a common length, so a plain mean over the
            # sequence axis would average in padding positions and make a text's
            # embedding depend on what else is in the batch. Weight by the
            # attention mask instead.
            # NOTE(review): the published TAS-B usage pools with the first ([CLS])
            # token rather than a mean — confirm mean pooling is intended here.
            pooled = mean_pooling(output, encoded["attention_mask"])
        elif model_name == config_dict["s-bert_model"]:
            pooled = mean_pooling(output, encoded["attention_mask"])
        else:
            pooled = output.pooler_output
        # Unit-length rows.
        pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
    return pooled.cpu().numpy()

In [20]:
def vectorize_all(unique_data, embed_dim, batch_size, disable=False):
    """Embed every item of ``unique_data`` in consecutive batches.

    Args:
        unique_data: Sliceable sequence of texts; slices are passed to
            ``vectorize_batch``.
        embed_dim: Number of columns of the result matrix; must equal the width
            of the vectors returned by ``vectorize_batch``.
        batch_size: Maximum number of items per ``vectorize_batch`` call.
        disable: Forwarded to ``tqdm`` to switch the progress bar off.

    Returns:
        float32 numpy array of shape (len(unique_data), embed_dim) whose row ``k``
        is the embedding of ``unique_data[k]``.
    """
    total = len(unique_data)
    # Preallocate once; every row is overwritten by the loop below.
    result = np.empty((total, embed_dim), dtype=np.float32)
    batch_starts = range(0, total, batch_size)
    for start in tqdm(batch_starts, disable=disable):
        stop = start + batch_size  # may overshoot on the last batch; slicing clips it
        result[start:stop] = vectorize_batch(unique_data[start:stop])
    return result

In [21]:
# Embed the distinct queries; the trailing expression shows (n_queries, embed_dim).
query_embeds = vectorize_all(queries, embed_dim, batch_size)
query_embeds.shape

100%|██████████| 3/3 [00:00<00:00,  3.79it/s]


(43, 768)

In [23]:
%%time
# Pass the query embeddings and the queries mapping, with their two target paths,
# to save_vectors (imported from utils; its implementation is not shown here).
save_vectors(query_embeds, queries_vector_path, queries_mapping, queries_mapping_path)

/home/jupyter/work/resources/DiplomDimReduction/data/vectors/marco/dpr exists.
/home/jupyter/work/resources/DiplomDimReduction/data/vectors/marco/dpr exists.
132224 -> 122958
CPU times: user 9.28 ms, sys: 322 µs, total: 9.6 ms
Wall time: 25.3 ms


In [44]:
# Embed the distinct corpus texts with the same notebook-level batch_size that the
# query embedding call uses, rather than a second hard-coded copy of the value.
corpus_embeds = vectorize_all(corpus, embed_dim, batch_size=batch_size)
corpus_embeds.shape

100%|██████████| 27/27 [00:04<00:00,  6.09it/s]


(428, 384)

In [45]:
%%time
# Pass the corpus embeddings and the corpus mapping, with their two target paths,
# to save_vectors (imported from utils; its implementation is not shown here).
save_vectors(corpus_embeds, corpus_vector_path, corpus_mapping, corpus_mapping_path)

/home/jupyter/work/resources/DiplomDimReduction//data/vectors/marco/sbert exists.
/home/jupyter/work/resources/DiplomDimReduction//data/vectors/marco/sbert exists.
657536 -> 611391
CPU times: user 35.8 ms, sys: 0 ns, total: 35.8 ms
Wall time: 119 ms
