In [6]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
import numpy as np
import os
import sys

# Make the project's local modules (config, utils) importable from this notebook.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# find config/utils on another machine unless this is adjusted.
import sys
sys.path.append('/home/jupyter/work/resources/DiplomDimReduction/')
import importlib

# Reload after import so that edits to config.py are picked up without a kernel restart.
import config
importlib.reload(config)
from config import config_dict

import random
seed = 42  # passed to random.seed() before the corpus vectors are sampled

# Same reload pattern for the local helper module.
import utils
importlib.reload(utils)
from utils import save_vectors, create_mapping, save_vector_lists_pkl, save_vector_lists_npz, load_vectors

In [12]:
# Dataset identifier (the config's 'marco_prefix' entry) used to fill every path template below.
corpus_prefix = config_dict['marco_prefix']

In [13]:
# Split identifier (the config's 'train_suffix' entry) and the input file path
# obtained by filling the data template with (dataset, split).
split_suffix = config_dict['train_suffix']
data_path = config_dict['data_template'].format(corpus_prefix, split_suffix)

In [14]:
# Load the labelled (query, passage) pairs; the bare `data` expression renders
# pandas' truncated preview of the frame.
data = pd.read_parquet(data_path)
data

Unnamed: 0,query_id,corpus_id,label,corpus_text,query_text
0,93065,5993631,1,47129 is located in the state of Indiana in th...,clarksville indiana zip
1,93065,419610,0,"Timer has separate night and day outlets, whic...",clarksville indiana zip
2,93065,4614226,0,The rose-buying public still encounters a wide...,clarksville indiana zip
3,93065,4108603,0,Map of Wendover (Aut) Airport. A detailed map ...,clarksville indiana zip
4,93065,3744854,0,And as the poems Reapers and Cotton Song indic...,clarksville indiana zip
...,...,...,...,...,...
99995,958706,3033940,0,Melissa was cast as Sookie St. James on Gilmor...,when was mansa musa born
99996,958706,4297114,0,05/09/2001. A routing number is a nine digit c...,when was mansa musa born
99997,958706,6770324,0,Some 12 million Americans visit medical profes...,when was mansa musa born
99998,958706,6484257,0,"So far, weâve taken you inside the life of a...",when was mansa musa born


In [15]:
# Deduplicate the texts and sort them so that row positions in the embedding
# matrices are deterministic, then build the mapping object for each list
# (the mapping format is defined by utils.create_mapping).
queries, corpus = (
    sorted(data[column].unique()) for column in ('query_text', 'corpus_text')
)
queries_mapping, corpus_mapping = create_mapping(queries), create_mapping(corpus)

In [16]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cpu


# Dense bi-encoders (DPR / ANCE / TAS-B)

In [15]:
# Choose the dense encoder here instead of (un)commenting assignment pairs:
# with every pair commented out, `model_name` and `method_prefix` were undefined
# on a fresh kernel and this cell raised NameError.
# The config keys follow the pattern '<method>_model' / '<method>_prefix'.
DENSE_METHOD = 'dpr'  # one of: 'dpr', 'ance', 'tas-b'

model_name = config_dict[f'{DENSE_METHOD}_model']
method_prefix = config_dict[f'{DENSE_METHOD}_prefix']

# Output locations for this (dataset, method, split) combination.
corpus_vector_path = config_dict['corpus_vector_template'].format(corpus_prefix, method_prefix, split_suffix)
corpus_mapping_path = config_dict['corpus_mapping_template'].format(corpus_prefix, method_prefix, split_suffix)

queries_vector_path = config_dict['queries_vector_template'].format(corpus_prefix, method_prefix, split_suffix)
queries_mapping_path = config_dict['queries_mapping_template'].format(corpus_prefix, method_prefix, split_suffix)

In [8]:
# Load the tokenizer/encoder pair named by `model_name`, place the encoder on
# `device`, and switch it to eval mode. The trailing expression displays the
# module structure.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)
model.eval()

  warn(
2025-05-06 13:38:29.526164: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-06 13:38:32.215741: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the model checkpoint at facebook/dpr-question_encoder-multiset-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with anot

DPRQuestionEncoder(
  (question_encoder): DPREncoder(
    (bert_model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_feature

In [9]:
# Default number of texts encoded per forward pass (the corpus call below overrides it).
batch_size = 128
# Width of the output matrix, taken from the loaded model's hidden size.
embed_dim = model.config.hidden_size

In [10]:
def vectorize_batch(batch):
  """Encode a list of texts into one dense vector per text.

  Uses the notebook-level `tokenizer`, `model`, `device`, `model_name` and
  `config_dict`.

  Args:
    batch: list of strings to encode.

  Returns:
    numpy array with one row per input text. For the TAS-B model the row is the
    mean of the token states; for every other model it is `pooler_output`.

  NOTE(review): the recorded run printed "no maximum length is provided ...
  Default to no truncation" for the DPR tokenizer, so `truncation=True` may be a
  no-op there; consider passing an explicit `max_length`.
  NOTE(review): the published TAS-B usage examples appear to use the first
  ([CLS]) token rather than mean pooling — confirm which pooling is intended.
  """
  encoded = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
  with torch.no_grad():
    outputs = model(**encoded)
    if model_name == config_dict['tas-b_model']:
      # Average only over real tokens. A plain .mean(dim=1) also averages the
      # padding positions, so a text's vector would change with the length of
      # the longest text that happens to share its batch.
      hidden = outputs.last_hidden_state
      mask = encoded["attention_mask"].unsqueeze(-1).to(hidden.dtype)
      pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    else:
      pooled = outputs.pooler_output
  return pooled.cpu().numpy()

In [11]:
def vectorize_all(unique_data, embed_dim, batch_size, disable=False):
  """Encode every text in `unique_data` into a single float32 matrix.

  Args:
    unique_data: sliceable sequence of texts.
    embed_dim: number of columns of the result.
    batch_size: number of texts handed to `vectorize_batch` at a time.
    disable: forwarded to tqdm; True hides the progress bar.

  Returns:
    Array of shape (len(unique_data), embed_dim); row i holds the vector of
    unique_data[i].
  """
  total = len(unique_data)
  result = np.empty((total, embed_dim), dtype=np.float32)
  for start in tqdm(range(0, total, batch_size), disable=disable):
    stop = start + batch_size
    # Slices past the end are clipped, so the last (shorter) batch fits as well.
    result[start:stop] = vectorize_batch(unique_data[start:stop])
  return result

In [12]:
# Encode all unique queries with the dense encoder, using the default batch size.
query_embeds = vectorize_all(queries, embed_dim, batch_size)

  0%|          | 0/79 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 79/79 [02:30<00:00,  1.90s/it]


In [13]:
# Store the query matrix and its mapping at the paths built in the config cell
# (file format is defined by utils.save_vectors).
save_vectors(query_embeds, queries_vector_path, queries_mapping, queries_mapping_path)

/home/jupyter/work/resources/DiplomDimReduction//data/vectors/marco/dpr exists.
/home/jupyter/work/resources/DiplomDimReduction//data/vectors/marco/dpr exists.
30720128 -> 28473939


In [None]:
# Corpus passages are encoded 16 at a time instead of the default `batch_size`.
# NOTE(review): presumably to limit memory on the longer passage texts — confirm.
corpus_embeds = vectorize_all(corpus, embed_dim, batch_size=16)

100%|██████████| 6216/6216 [3:06:58<00:00,  1.80s/it]  


In [None]:
# Store the corpus matrix and its mapping at the paths built in the config cell.
save_vectors(corpus_embeds, corpus_vector_path, corpus_mapping, corpus_mapping_path)

/home/jupyter/work/resources/DiplomDimReduction//data/vectors/marco/dpr exists.
/home/jupyter/work/resources/DiplomDimReduction//data/vectors/marco/dpr exists.
305482880 -> 283112208


# ColBERT (late interaction)

In [4]:
# Switch the notebook to the ColBERT checkpoint; this rebinds `model_name`,
# `method_prefix` and all output paths that the dense section defined.
model_name = config_dict['colbert_model']
method_prefix = config_dict['late interaction prefix']

# Output locations for this (dataset, method, split) combination.
corpus_vector_path = config_dict['corpus_vector_template'].format(corpus_prefix, method_prefix, split_suffix)
# Extra path for the random subsample of corpus token vectors saved at the end.
corpus_sample_vector_path = config_dict['corpus_sample_vector_template'].format(corpus_prefix, method_prefix, split_suffix)
corpus_mapping_path = config_dict['corpus_mapping_template'].format(corpus_prefix, method_prefix, split_suffix)

queries_vector_path = config_dict['queries_vector_template'].format(corpus_prefix, method_prefix, split_suffix)
queries_mapping_path = config_dict['queries_mapping_template'].format(corpus_prefix, method_prefix, split_suffix)

In [8]:
# Replace the dense tokenizer/encoder with the pair named by `model_name`,
# place the encoder on `device`, and switch it to eval mode. The trailing
# expression displays the module structure.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)
model.eval()

  warn(
2025-05-08 10:09:59.608052: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-08 10:10:01.897087: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [9]:
# Number of texts encoded per forward pass in the ColBERT section.
batch_size = 16
# Per-token vector width, taken from the loaded model's hidden size.
embed_dim = model.config.hidden_size

In [10]:
def vectorize_batch(batch):
    """Encode texts into per-token, L2-normalised embeddings.

    Uses the notebook-level `tokenizer`, `model` and `device`.

    Args:
        batch: list of strings to encode.

    Returns:
        List with one numpy array per input text, of shape
        (kept_tokens, hidden_dim). Positions whose token id equals the
        tokenizer's [CLS], [SEP] or [PAD] id are dropped, and every remaining
        token vector is scaled to unit L2 norm.
    """
    encoded = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
    encoded.to(device)

    # Boolean mask [batch_size, seq_len]: True where the token should be kept.
    excluded_ids = [tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id]
    token_ids = encoded["input_ids"].cpu().numpy()
    keep_mask = np.logical_not(np.isin(token_ids, excluded_ids))

    with torch.no_grad():
        token_states = model(**encoded).last_hidden_state  # [batch_size, seq_len, hidden_dim]

    # For each text: select the kept tokens, normalise each token vector, move to numpy.
    return [
        torch.nn.functional.normalize(text_states[text_mask], p=2, dim=1).cpu().numpy()
        for text_states, text_mask in zip(token_states, keep_mask)
    ]

In [11]:
def vectorize_all(unique_data, batch_size, disable=False):
  """Encode every text in `unique_data` into a list of per-token matrices.

  Args:
    unique_data: sliceable sequence of texts.
    batch_size: number of texts handed to `vectorize_batch` at a time.
    disable: forwarded to tqdm; True hides the progress bar.

  Returns:
    Flat list with one entry per text, in input order, as produced by
    `vectorize_batch`.
  """
  per_text_embeddings = []
  batch_starts = range(0, len(unique_data), batch_size)
  for start in tqdm(batch_starts, disable=disable):
    per_text_embeddings += vectorize_batch(unique_data[start:start + batch_size])
  return per_text_embeddings

In [12]:
# Encode all unique queries into per-token matrices (one list entry per query).
query_embeds = vectorize_all(queries, batch_size)

100%|██████████| 625/625 [02:33<00:00,  4.08it/s]


In [13]:
# Disabled pickle export of the query embeddings; any output shown under this
# cell comes from an earlier run in which the call was active.
# save_vector_lists_pkl(query_embeds, queries_vector_path.replace('npz', 'pkl'), queries_mapping, queries_mapping_path)

/home/jupyter/work/resources/DiplomDimReduction//data/vectors/marco/colbert exists.
/home/jupyter/work/resources/DiplomDimReduction//data/vectors/marco/colbert exists.
82328 -> 215311090


In [14]:
# Store the per-query token matrices and the query mapping via the npz helper.
save_vector_lists_npz(query_embeds, queries_vector_path, queries_mapping, queries_mapping_path)

/home/jupyter/work/resources/DiplomDimReduction//data/vectors/marco/colbert exists.
/home/jupyter/work/resources/DiplomDimReduction//data/vectors/marco/colbert exists.
82328 -> 200272873


In [None]:
# Encode all unique passages into per-token matrices; the whole list is held in memory.
corpus_embeds = vectorize_all(corpus, batch_size)

100%|██████████| 6216/6216 [1:47:44<00:00,  1.04s/it]  


In [18]:
%%time
# Store the per-passage token matrices and the corpus mapping via the npz helper.
save_vector_lists_npz(corpus_embeds, corpus_vector_path, corpus_mapping, corpus_mapping_path)

/home/jupyter/work/resources/DiplomDimReduction//data/vectors/marco/colbert exists.
/home/jupyter/work/resources/DiplomDimReduction//data/vectors/marco/colbert exists.
876344 -> 21384739690
CPU times: user 19min 1s, sys: 30 s, total: 19min 31s
Wall time: 22min 53s


In [19]:
%%time
# Disabled pickle export of the corpus embeddings; any output and timing shown
# under this cell come from an earlier run in which the call was active.
# save_vector_lists_pkl(corpus_embeds, corpus_vector_path.replace('npz', 'pkl'), corpus_mapping, corpus_mapping_path)

/home/jupyter/work/resources/DiplomDimReduction//data/vectors/marco/colbert exists.
/home/jupyter/work/resources/DiplomDimReduction//data/vectors/marco/colbert exists.
876344 -> 23001168831
CPU times: user 24.4 s, sys: 15.5 s, total: 39.8 s
Wall time: 6min 35s


In [7]:
%%time
# Read the saved corpus token vectors back from disk and display the shape of
# the resulting 2-D matrix.
corpus_vectors = load_vectors(corpus_vector_path)
corpus_vectors.shape

CPU times: user 2min 20s, sys: 1min 9s, total: 3min 30s
Wall time: 6min 8s


(7485904, 768)

In [9]:
# Draw a reproducible random subset of corpus token vectors.
random.seed(seed)

SAMPLE_SIZE = 100_000  # number of rows kept in the sample

# random.sample returns distinct row indices (sampling without replacement);
# indexing with that list copies the selected rows into a new array, so no
# pre-allocated buffer is needed.
sample_rows = random.sample(range(corpus_vectors.shape[0]), SAMPLE_SIZE)
corpus_sample_vectors = corpus_vectors[sample_rows]
corpus_sample_vectors.shape

(100000, 768)

In [10]:
# Store the sampled vectors; unlike the other save_vectors calls, no mapping
# arguments are passed here.
save_vectors(corpus_sample_vectors, corpus_sample_vector_path)

/home/jupyter/work/resources/DiplomDimReduction//data/vectors/marco/colbert exists.
307200128 -> 285588639
