In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
import numpy as np
import os
import sys
import json
import joblib

import sys
sys.path.append("/home/jupyter/work/resources/DiplomDimReduction/")
import importlib

import config
importlib.reload(config)
from config import config_dict

import utils
importlib.reload(utils)
from utils import create_path, cosine_similarity

from sklearn.metrics import ndcg_score

import torch
import torch.nn as nn

In [3]:
model_name = config_dict['dpr_model']
method_prefix = config_dict['dpr_prefix']

# model_name = config_dict['ance_model']
# method_prefix = config_dict['ance_prefix']

# model_name = config_dict['tas-b_model']
# method_prefix = config_dict['tas-b_prefix']

corpus_prefix = config_dict['marco_prefix']
split_suffix = config_dict['test_suffix']

data_path = config_dict['data_template'].format(corpus_prefix, split_suffix)

corpus_vector_path = config_dict['corpus_vector_template'].format(corpus_prefix, method_prefix, split_suffix)
corpus_mapping_path = config_dict['corpus_mapping_template'].format(corpus_prefix, method_prefix, split_suffix)

queries_vector_path = config_dict['queries_vector_template'].format(corpus_prefix, method_prefix, split_suffix)
queries_mapping_path = config_dict['queries_mapping_template'].format(corpus_prefix, method_prefix, split_suffix)

sk_model_path_template = config_dict['reduction_sk_model_template']
ae_model_path_template = config_dict['reduction_ae_model_template']

In [4]:
data = pd.read_parquet(data_path)
data

Unnamed: 0,query_id,corpus_id,label,corpus_text,query_text
0,19335,8412683,1,Ecological anthropology is defined as the stud...,anthropological definition of environment
1,19335,1729,1,Graduate Study in Anthropology. The graduate p...,anthropological definition of environment
2,19335,8412684,1,Ecological Anthropology. Ecological anthropolo...,anthropological definition of environment
3,19335,3683653,0,The branches of Earth Science are: 1 Geology ...,anthropological definition of environment
4,19335,342432,0,Five Disciplines of Anthropology. 1 Applied A...,anthropological definition of environment
...,...,...,...,...,...
425,1133167,6467520,0,"Climate data for ball mtn lake, Longitude: -72...",how is the weather in jamaica
426,1133167,4712274,0,"Re: Best Time of Year to Visit Jamaica Mar 17,...",how is the weather in jamaica
427,1133167,7115353,0,Hurricane season has ended over a month ago an...,how is the weather in jamaica
428,1133167,8415745,0,"The weather stations sit near sea level, with ...",how is the weather in jamaica


In [5]:
def reduct_sk(query_embeds, corpus_embeds, model_name, red_dim):
  model_path = sk_model_path_template.format(corpus_prefix, method_prefix, model_name, red_dim)
  red_model = joblib.load(model_path)
  red_embeddings = red_model.transform(np.vstack([query_embeds, corpus_embeds]))
  return np.split(red_embeddings, [len(query_embeds)])

In [6]:
query_embeds = np.load(queries_vector_path)['data']
queries_mapping = json.load(open(queries_mapping_path))

In [7]:
corpus_embeds = np.load(corpus_vector_path)['data']
corpus_mapping = json.load(open(corpus_mapping_path))

In [8]:
red_dims = [512, 256, 128, 64]

In [9]:
def score_ndcg(data, query_embeds, queries_mapping, corpus_embeds, corpus_mapping):
  ndcgs = []

  for q, g in tqdm(data.groupby('query_id')):

    q_text = g['query_text'].values[0]
    q_i = queries_mapping['text2i'][q_text]
    q_vec = query_embeds[q_i]

    c_texts = g['corpus_text'].values.tolist()
    c_vecs = np.empty((len(c_texts), corpus_embeds.shape[1]), dtype=np.float32)
    for i, c_text in enumerate(c_texts):
      c_i = corpus_mapping['text2i'][c_text]
      c_vec = corpus_embeds[c_i]
      c_vecs[i] = c_vec

    scores = cosine_similarity(q_vec, c_vecs)
    indices = np.argsort(-scores)

    labels = g['label'].values
    ndcg = ndcg_score([labels], [scores], k=10)
    ndcgs.append(ndcg)

  mean_ndcg = np.mean(ndcgs)
  print(f"Mean nDCG: {mean_ndcg:.4f}")

In [10]:
score_ndcg(data, query_embeds, queries_mapping, corpus_embeds, corpus_mapping)

100%|██████████| 43/43 [00:00<00:00, 1119.36it/s]

Mean nDCG: 0.8650





In [11]:
def score_dims(red_fn, red_dims, method):
  for red_dim in red_dims:
    red_query_embeds, red_corpus_embeds = red_fn(query_embeds, corpus_embeds, method, red_dim)
    embed_dim = red_dim
    score_ndcg(data, red_query_embeds, queries_mapping, red_corpus_embeds, corpus_mapping)

In [12]:
score_dims(reduct_sk, red_dims, "PCA")

100%|██████████| 43/43 [00:00<00:00, 641.79it/s]


Mean nDCG: 0.8503


100%|██████████| 43/43 [00:00<00:00, 755.75it/s]


Mean nDCG: 0.8496


100%|██████████| 43/43 [00:00<00:00, 514.17it/s]


Mean nDCG: 0.8503


100%|██████████| 43/43 [00:00<00:00, 726.06it/s]

Mean nDCG: 0.8377





In [14]:
score_dims(reduct_sk, red_dims, "KernelPCA")

FileNotFoundError: [Errno 2] No such file or directory: '/home/jupyter/work/resources/DiplomDimReduction//data/models/marco/dpr/KernelPCA_512.joblib'

In [13]:
score_dims(reduct_sk, red_dims, "TruncatedSVD")

100%|██████████| 43/43 [00:00<00:00, 778.06it/s]


Mean nDCG: 0.8637


100%|██████████| 43/43 [00:00<00:00, 800.00it/s]


Mean nDCG: 0.8637


100%|██████████| 43/43 [00:00<00:00, 740.61it/s]

Mean nDCG: 0.8550



100%|██████████| 43/43 [00:00<00:00, 791.97it/s]

Mean nDCG: 0.8350





In [15]:
score_dims(reduct_sk, red_dims, "FastICA")

100%|██████████| 43/43 [00:00<00:00, 792.10it/s]


Mean nDCG: 0.8401


100%|██████████| 43/43 [00:00<00:00, 718.01it/s]


Mean nDCG: 0.8538


100%|██████████| 43/43 [00:00<00:00, 642.12it/s]


Mean nDCG: 0.8465


100%|██████████| 43/43 [00:00<00:00, 776.95it/s]

Mean nDCG: 0.8326





In [16]:
score_dims(reduct_sk, red_dims, "LocallyLinearEmbedding")

FileNotFoundError: [Errno 2] No such file or directory: '/home/jupyter/work/resources/DiplomDimReduction//data/models/marco/dpr/LocallyLinearEmbedding_512.joblib'

In [17]:
class LinearAutoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        self.__name__ = 'LinearAutoEncoder'
        super(LinearAutoencoder, self).__init__()
        self.encoder = nn.Linear(input_dim, hidden_dim)
        self.decoder = nn.Linear(hidden_dim, input_dim)

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [21]:
def reduct_ae(query_embeds, corpus_embeds, model_name, red_dim):
  red_model = LinearAutoencoder(input_dim=query_embeds.shape[1], hidden_dim=red_dim)
  model_path = ae_model_path_template.format(corpus_prefix, method_prefix, red_model.__name__, red_dim)
  red_model.load_state_dict(torch.load(model_path, map_location=torch.device(device)))
  red_model.to(device)
  red_model.eval()
  with torch.no_grad():
        red_embeddings = red_model.encoder(torch.tensor(np.vstack([query_embeds, corpus_embeds]), dtype=torch.float32)).numpy()
  return np.split(red_embeddings, [len(query_embeds)])

In [22]:
score_dims(reduct_ae, red_dims, "LinearAutoEncoder")

100%|██████████| 43/43 [00:00<00:00, 1253.41it/s]


Mean nDCG: 0.8473


100%|██████████| 43/43 [00:00<00:00, 1239.76it/s]


Mean nDCG: 0.8429


100%|██████████| 43/43 [00:00<00:00, 1259.37it/s]


Mean nDCG: 0.8575


100%|██████████| 43/43 [00:00<00:00, 1258.85it/s]

Mean nDCG: 0.8410



