# Импорты

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from tqdm import tqdm
import json
import joblib
from sklearn.metrics import ndcg_score

import torch
import torch.nn as nn

import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/Diplom/')
import importlib

import config
importlib.reload(config)
from config import config_dict

import utils
importlib.reload(utils)
from utils import create_path

# Пути

In [None]:
# Name of the embedding model taken from the project config.
# Not referenced by the cells visible in this notebook section.
model_name = config_dict['colbert_model']

# Prefixes/suffix that are substituted into the path templates below.
# NOTE(review): this key contains spaces, unlike the other snake_case keys —
# confirm it matches the key declared in config.py.
method_prefix = config_dict['late interaction prefix']
corpus_prefix = config_dict['marco_prefix']
split_suffix = config_dict['test_suffix']

# Parquet file with the labelled (query, document) pairs.
data_path = config_dict['data_template'].format(corpus_prefix, split_suffix)

# Stored document embeddings and the JSON mapping used to index into them.
corpus_vector_path = config_dict['corpus_vector_template'].format(corpus_prefix, method_prefix, split_suffix)
corpus_mapping_path = config_dict['corpus_mapping_template'].format(corpus_prefix, method_prefix, split_suffix)

# Stored query embeddings and the JSON mapping used to index into them.
queries_vector_path = config_dict['queries_vector_template'].format(corpus_prefix, method_prefix, split_suffix)
queries_mapping_path = config_dict['queries_mapping_template'].format(corpus_prefix, method_prefix, split_suffix)

# Unformatted path templates for saved dimensionality-reduction models;
# the sklearn one is filled in later by reduct_sk().
sk_model_path_template = config_dict['reduction_sk_model_template']
ae_model_path_template = config_dict['reduction_ae_model_template']

# Данные

In [None]:
# Load the labelled evaluation pairs; the displayed frame has the columns
# query_id, corpus_id, label, corpus_text and query_text.
data = pd.read_parquet(data_path)
data

Unnamed: 0,query_id,corpus_id,label,corpus_text,query_text
0,19335,8412683,1,Ecological anthropology is defined as the stud...,anthropological definition of environment
1,19335,1729,1,Graduate Study in Anthropology. The graduate p...,anthropological definition of environment
2,19335,8412684,1,Ecological Anthropology. Ecological anthropolo...,anthropological definition of environment
3,19335,3683653,0,The branches of Earth Science are: 1 Geology ...,anthropological definition of environment
4,19335,342432,0,Five Disciplines of Anthropology. 1 Applied A...,anthropological definition of environment
...,...,...,...,...,...
425,1133167,6467520,0,"Climate data for ball mtn lake, Longitude: -72...",how is the weather in jamaica
426,1133167,4712274,0,"Re: Best Time of Year to Visit Jamaica Mar 17,...",how is the weather in jamaica
427,1133167,7115353,0,Hurricane season has ended over a month ago an...,how is the weather in jamaica
428,1133167,8415745,0,"The weather stations sit near sea level, with ...",how is the weather in jamaica


In [None]:
# Read the corpus mapping (its 'text2i' entry maps document text to an index
# into the corpus embedding array). The context manager closes the file
# handle, which the bare json.load(open(...)) form left open.
with open(corpus_mapping_path, encoding="utf-8") as corpus_mapping_file:
    corpus_mapping = json.load(corpus_mapping_file)

In [None]:
# Load the per-document embeddings stored under the 'data' key of the archive.
# The context manager closes the underlying archive file once the array has
# been read; indexing the archive materialises the array in memory first.
# NOTE: allow_pickle=True unpickles object arrays, which can execute arbitrary
# code — only load archives produced by this project.
with np.load(corpus_vector_path, allow_pickle=True) as corpus_archive:
    corpus_vectors = corpus_archive['data']
corpus_vectors.shape

(428,)

In [None]:
# Read the query mapping (its 'text2i' entry maps query text to an index into
# the query embedding array). The context manager closes the file handle,
# which the bare json.load(open(...)) form left open.
with open(queries_mapping_path, encoding="utf-8") as queries_mapping_file:
    queries_mapping = json.load(queries_mapping_file)

In [None]:
# Load the per-query embeddings stored under the 'data' key of the archive.
# The context manager closes the underlying archive file once the array has
# been read; indexing the archive materialises the array in memory first.
# NOTE: allow_pickle=True unpickles object arrays, which can execute arbitrary
# code — only load archives produced by this project.
with np.load(queries_vector_path, allow_pickle=True) as queries_archive:
    queries_vectors = queries_archive['data']
queries_vectors.shape

(43,)

In [None]:
def late_score(query_emb, doc_emb):
    """Late-interaction (MaxSim) relevance score of one document for one query.

    Builds the matrix of dot products between every row of ``query_emb`` and
    every row of ``doc_emb``, keeps the largest value in each query row and
    adds those maxima together.

    Args:
        query_emb: 2-D array with one row per query token embedding.
        doc_emb: 2-D array with one row per document token embedding; must
            have the same number of columns as ``query_emb``.

    Returns:
        The summed per-row maxima as a plain Python scalar.
    """
    # Rows index query tokens, columns index document tokens.
    return np.matmul(query_emb, doc_emb.T).max(axis=1).sum().item()

In [None]:
# Target embedding dimensionalities to evaluate; each value is passed as
# `red_dim` to the reduction function by score_dims().
red_dims = [512, 256, 128, 64]

In [None]:
def score_ndcg(data, query_embeds, queries_mapping, corpus_embeds, corpus_mapping):
  """Compute and print the mean nDCG@10 of late-interaction ranking.

  Rows of ``data`` are grouped by ``query_id``. For each group the query
  embedding and the embeddings of all its candidate documents are looked up
  by text, every document is scored with ``late_score`` and the scores are
  compared against the ``label`` column with ``ndcg_score(..., k=10)``.

  Args:
    data: DataFrame with the columns ``query_id``, ``query_text``,
      ``corpus_text`` and ``label``.
    query_embeds: indexable collection of per-query embeddings.
    queries_mapping: dict whose ``'text2i'`` entry maps a query text to its
      position in ``query_embeds``.
    corpus_embeds: indexable collection of per-document embeddings.
    corpus_mapping: dict whose ``'text2i'`` entry maps a document text to its
      position in ``corpus_embeds``.

  Returns:
    The mean nDCG@10 over all queries. The same value is also printed, so
    existing callers that ignore the return value behave as before.

  Raises:
    KeyError: if a query or document text is missing from its mapping.
  """
  ndcgs = []

  for _, group in tqdm(data.groupby('query_id')):
    # Every row of the group carries the same query text; take the first.
    q_text = group['query_text'].values[0]
    q_i = queries_mapping['text2i'][q_text]
    # Convert the query embedding once per query rather than once per document.
    q_vec = np.array(query_embeds[q_i])

    scores = [
      late_score(q_vec, np.array(corpus_embeds[corpus_mapping['text2i'][c_text]]))
      for c_text in group['corpus_text'].values.tolist()
    ]

    labels = group['label'].values
    # ndcg_score expects 2-D inputs, hence the single-row wrapping.
    ndcgs.append(ndcg_score([labels], [scores], k=10))

  mean_ndcg = np.mean(ndcgs)
  print(f"Mean nDCG: {mean_ndcg:.4f}")
  return mean_ndcg

In [None]:
# Baseline: evaluate the embeddings exactly as loaded, with no dimensionality reduction.
score_ndcg(data, queries_vectors, queries_mapping, corpus_vectors, corpus_mapping)

100%|██████████| 43/43 [00:00<00:00, 204.13it/s]

Mean nDCG: 0.8846





In [None]:
def reduct_sk(query_embeds, corpus_embeds, model_name, red_dim):
  """Reduce embeddings with a previously saved, joblib-serialised model.

  The model file path is built from the notebook-level
  ``sk_model_path_template``, ``corpus_prefix`` and ``method_prefix`` together
  with ``model_name`` and ``red_dim``. The loaded model's ``transform`` is
  applied separately to every query embedding and every document embedding.

  Args:
    query_embeds: iterable of per-query embedding matrices.
    corpus_embeds: iterable of per-document embedding matrices.
    model_name: name of the reduction method, substituted into the model path.
    red_dim: target dimensionality, substituted into the model path.

  Returns:
    A ``(reduced_queries, reduced_corpus)`` tuple of lists that keep the order
    of the inputs.
  """
  # NOTE: joblib.load unpickles the file — only load models saved by this project.
  red_model = joblib.load(
    sk_model_path_template.format(corpus_prefix, method_prefix, model_name, red_dim)
  )
  # TODO: do this the smart way, using sentence lengths, as was done earlier:
  #   lens = [vec.shape[0] for vec in c_vecs]
  #   c_vecs = np.vstack(c_vecs)
  #   <dimensionality reduction>
  #   c_vecs = np.split(c_vecs, np.cumsum(lens[:-1]))
  project = red_model.transform
  red_queries_vectors = list(map(project, query_embeds))
  red_corpus_vectors = list(map(project, corpus_embeds))
  return red_queries_vectors, red_corpus_vectors

In [None]:
def score_dims(red_fn, red_dims, method):
  """Evaluate nDCG after reducing the embeddings to each target dimension.

  For every value in ``red_dims`` the notebook-level ``queries_vectors`` and
  ``corpus_vectors`` are reduced with ``red_fn`` and the result is scored with
  ``score_ndcg`` against the notebook-level ``data``, ``queries_mapping`` and
  ``corpus_mapping``. Results are reported in the order of ``red_dims``.

  Args:
    red_fn: callable ``(query_embeds, corpus_embeds, method, red_dim)`` that
      returns a ``(reduced_queries, reduced_corpus)`` pair.
    red_dims: iterable of target dimensionalities.
    method: reduction method name forwarded to ``red_fn``.
  """
  for red_dim in red_dims:
    red_query_embeds, red_corpus_embeds = red_fn(queries_vectors, corpus_vectors, method, red_dim)
    score_ndcg(data, red_query_embeds, queries_mapping, red_corpus_embeds, corpus_mapping)

In [None]:
# Evaluate the saved "PCA" reduction models at every dimension in red_dims;
# one "Mean nDCG" line is printed per dimension, in list order.
score_dims(reduct_sk, red_dims, "PCA")

100%|██████████| 43/43 [00:00<00:00, 233.08it/s]


Mean nDCG: 0.8802


100%|██████████| 43/43 [00:00<00:00, 453.25it/s]


Mean nDCG: 0.8851


100%|██████████| 43/43 [00:00<00:00, 553.49it/s]


Mean nDCG: 0.8867


100%|██████████| 43/43 [00:00<00:00, 464.73it/s]

Mean nDCG: 0.8745



