In [1]:
import sys

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
# NOTE(review): AutoModel / AutoTokenizer are not referenced in the cells
# visible here.
from transformers import AutoModel, AutoTokenizer

# Put the project directory on the import path so the local `config` and
# `utils` modules below can be imported.
# NOTE(review): absolute, machine-specific path.
sys.path.append("/home/jupyter/work/resources/DiplomDimReduction/")
import importlib

# NOTE(review): `random` is imported but not referenced in the cells visible here.
import random
# Seed handed to np.random.seed() before the query sample is drawn.
seed=42

import config

# Reload so that edits to config.py are picked up without restarting the kernel.
importlib.reload(config)
import utils
# Imported after the reload so config_dict comes from the reloaded module.
from config import config_dict

# Reload so that edits to utils.py are picked up without restarting the kernel.
importlib.reload(utils)
# NOTE(review): ndcg_score is not referenced in the cells visible here.
from sklearn.metrics import ndcg_score
from utils import (
    Autoencoder,
    LinearAutoencoder,
    cosine_similarity,
    create_path,
    load_mapping,
    load_sk_model,
    load_vectors,
    save_preds,
)



In [94]:
# Reduced dimensionality under evaluation: passed as `red_dim` to the
# reduct_* helpers and used as a field of the predictions path.
curr_dim = 64

In [95]:
# --- Retriever selection ---------------------------------------------------
# Exactly one (model, prefix) pair is active; swap in one of the commented
# pairs to run the notebook for a different retriever.
model_name, method_prefix = config_dict["dpr_model"], config_dict["dpr_prefix"]
# model_name, method_prefix = config_dict['ance_model'], config_dict['ance_prefix']
# model_name, method_prefix = config_dict['tas-b_model'], config_dict['tas-b_prefix']
# model_name, method_prefix = config_dict['s-bert_model'], config_dict['s-bert_prefix']

# --- Dataset and split -----------------------------------------------------
corpus_prefix, split_suffix = config_dict["marco_prefix"], config_dict["test_suffix"]
data_path = config_dict["data_template"].format(corpus_prefix, split_suffix)

# --- Precomputed embeddings and their text<->row mappings ------------------
# All four templates are filled with the same three fields.
_artefact_fields = (corpus_prefix, method_prefix, split_suffix)

corpus_vector_path = config_dict["corpus_vector_template"].format(*_artefact_fields)
corpus_mapping_path = config_dict["corpus_mapping_template"].format(*_artefact_fields)

queries_vector_path = config_dict["queries_vector_template"].format(*_artefact_fields)
queries_mapping_path = config_dict["queries_mapping_template"].format(*_artefact_fields)

# --- Saved reducers (templates are filled in later by reduct_sk / reduct_ae)
sk_model_path_template = config_dict["reduction_sk_model_template"]
ae_model_path_template = config_dict["reduction_ae_model_template"]

# --- Output ----------------------------------------------------------------
preds_path = config_dict["preds_template"].format(
    corpus_prefix, method_prefix, curr_dim
)

In [96]:
# Labelled query/passage pairs for the selected corpus and split. The displayed
# frame has columns query_id, corpus_id, label, corpus_text and query_text.
data = pd.read_parquet(data_path)
data

Unnamed: 0,query_id,corpus_id,label,corpus_text,query_text
0,19335,8412683,1,Ecological anthropology is defined as the stud...,anthropological definition of environment
1,19335,1729,1,Graduate Study in Anthropology. The graduate p...,anthropological definition of environment
2,19335,8412684,1,Ecological Anthropology. Ecological anthropolo...,anthropological definition of environment
3,19335,3683653,0,The branches of Earth Science are: 1 Geology ...,anthropological definition of environment
4,19335,342432,0,Five Disciplines of Anthropology. 1 Applied A...,anthropological definition of environment
...,...,...,...,...,...
425,1133167,6467520,0,"Climate data for ball mtn lake, Longitude: -72...",how is the weather in jamaica
426,1133167,4712274,0,"Re: Best Time of Year to Visit Jamaica Mar 17,...",how is the weather in jamaica
427,1133167,7115353,0,Hurricane season has ended over a month ago an...,how is the weather in jamaica
428,1133167,8415745,0,"The weather stations sit near sea level, with ...",how is the weather in jamaica


In [97]:
%%time
# Full-size corpus embeddings of the selected retriever; rows are looked up
# through corpus_mapping["text2i"] in predict_group.
corpus_vectors = load_vectors(corpus_vector_path)
corpus_vectors.shape

CPU times: user 7.02 ms, sys: 3.52 ms, total: 10.5 ms
Wall time: 10.2 ms


(428, 768)

In [98]:
# Lookup tables between passage text and row position in corpus_vectors.
corpus_mapping = load_mapping(corpus_mapping_path)
corpus_mapping.keys()

dict_keys(['i2text', 'text2i'])

In [99]:
%%time
# Full-size query embeddings of the selected retriever; rows are looked up
# through query_mapping["text2i"] in predict_group.
query_vectors = load_vectors(queries_vector_path)
query_vectors.shape

CPU times: user 1.97 ms, sys: 776 µs, total: 2.74 ms
Wall time: 2.4 ms


(43, 768)

In [100]:
# Lookup tables between query text and row position in query_vectors.
query_mapping = load_mapping(queries_mapping_path)
query_mapping.keys()

dict_keys(['i2text', 'text2i'])

In [101]:
# Draw a reproducible sample of queries to inspect. Seeding in this same cell
# makes the draw independent of earlier RNG use.
np.random.seed(seed)
sample_size = 10
# NOTE: despite the name, these are query_id values from `data`, not row
# positions in query_vectors; predict_group uses them to filter groups.
query_indices = np.random.choice(data['query_id'].unique(), sample_size, replace=False)
query_indices

array([1117099,  833860,  855410, 1115776, 1114646, 1121709,   87452,
        168216,  146187,   87181])

In [102]:
def reduct_sk(query_embeds, corpus_embeds, model_name, red_dim, model_kwargs={}):
    """Project query and corpus embeddings with a saved reduction model.

    The model file is located by filling ``sk_model_path_template`` with the
    notebook-level ``corpus_prefix`` and ``method_prefix`` plus ``model_name``
    and ``red_dim``; it is loaded through ``load_sk_model``.

    Query and corpus rows are stacked and pushed through a single
    ``transform`` call, then separated again at ``len(query_embeds)``.

    Args:
        query_embeds: 2-D array of query vectors.
        corpus_embeds: 2-D array of corpus vectors with the same width.
        model_name: Name field of the model file (e.g. "PCA", "UMAP").
        red_dim: Dimension field of the model file.
        model_kwargs: Not used by this function.

    Returns:
        The two-element list produced by ``np.split``: reduced queries
        first, reduced corpus second.
    """
    reducer = load_sk_model(
        sk_model_path_template.format(corpus_prefix, method_prefix, model_name, red_dim)
    )

    n_queries = len(query_embeds)
    stacked = np.vstack([query_embeds, corpus_embeds])
    projected = reducer.transform(stacked)

    # Rows [0, n_queries) are the queries; everything after is corpus.
    return np.split(projected, [n_queries])

In [103]:
# Reduce both embedding sets with the saved "PCA" model for curr_dim.
pca_queries, pca_corpus = reduct_sk(query_vectors, corpus_vectors, "PCA", curr_dim)

In [104]:
# Reduce both embedding sets with the saved "UMAP" model for curr_dim.
umap_queries, umap_corpus = reduct_sk(query_vectors, corpus_vectors, "UMAP", curr_dim)

In [105]:
# Reduce both embedding sets with the saved "UMAP_5" model for curr_dim.
# NOTE(review): the "_5" suffix presumably encodes a UMAP hyperparameter - confirm.
umap_5_queries, umap_5_corpus = reduct_sk(query_vectors, corpus_vectors, "UMAP_5", curr_dim)

In [106]:
# Reduce both embedding sets with the saved "UMAP_25" model for curr_dim.
# NOTE(review): the "_25" suffix presumably encodes a UMAP hyperparameter - confirm.
umap_25_queries, umap_25_corpus = reduct_sk(query_vectors, corpus_vectors, "UMAP_25", curr_dim)

In [107]:
# Use the GPU when torch can see one, otherwise the CPU. reduct_ae loads the
# autoencoder weights onto this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [108]:
# Maps the model name used in checkpoint file names to the class that must be
# instantiated before loading its state dict (see reduct_ae).
# Note the key "AutoEncoder+" resolves to the class `Autoencoder`.
name2model = {
    "LinearAutoencoder": LinearAutoencoder,
    "AutoEncoder+": Autoencoder,
}

In [109]:
def reduct_ae(query_embeds, corpus_embeds, model_name, red_dim, model_kwargs={}):
    """Encode query and corpus embeddings with a trained autoencoder.

    The class is looked up in ``name2model``, instantiated with
    ``input_dim=query_embeds.shape[1]`` and ``output_dim=red_dim``, and its
    weights are loaded from the file obtained by filling
    ``ae_model_path_template`` with the notebook-level ``corpus_prefix`` and
    ``method_prefix`` plus ``model_name`` and ``red_dim``. Only the model's
    ``encoder`` is applied.

    Args:
        query_embeds: 2-D array of query vectors.
        corpus_embeds: 2-D array of corpus vectors with the same width.
        model_name: Key into ``name2model``; also a field of the checkpoint path.
        red_dim: Output width of the encoder; also a field of the checkpoint path.
        model_kwargs: Extra keyword arguments forwarded to the model
            constructor. The default dict is only unpacked, never mutated.

    Returns:
        Tuple ``(query_embeddings, corpus_embeddings)`` of NumPy arrays.

    Raises:
        KeyError: If ``model_name`` is not a key of ``name2model``.
    """
    model_class = name2model[model_name]
    red_model = model_class(
        input_dim=query_embeds.shape[1], output_dim=red_dim, **model_kwargs
    )
    model_path = ae_model_path_template.format(
        corpus_prefix, method_prefix, model_name, red_dim
    )
    print(model_path)
    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    red_model.load_state_dict(torch.load(model_path, map_location=device))
    red_model.to(device)
    red_model.eval()

    def _encode(embeds):
        # Inputs must live on the same device as the model, and the result has
        # to be brought back to host memory before it can become a NumPy
        # array. Without both steps this function fails whenever `device` is
        # CUDA.
        batch = torch.tensor(embeds, dtype=torch.float32).to(device)
        return red_model.encoder(batch).cpu().numpy()

    with torch.no_grad():
        query_embeddings = _encode(query_embeds)
        corpus_embeddings = _encode(corpus_embeds)

    return query_embeddings, corpus_embeddings

In [110]:
# Encode both embedding sets with the saved "LinearAutoencoder" checkpoint for curr_dim.
lae_queries, lae_corpus = reduct_ae(query_vectors, corpus_vectors, "LinearAutoencoder", curr_dim)

/home/jupyter/work/resources/DiplomDimReduction/data/models/marco/dpr/LinearAutoencoder_64.pt


In [111]:
# Encode both embedding sets with the saved "AutoEncoder+" checkpoint for curr_dim.
ae_queries, ae_corpus = reduct_ae(query_vectors, corpus_vectors, "AutoEncoder+", curr_dim)

/home/jupyter/work/resources/DiplomDimReduction/data/models/marco/dpr/AutoEncoder+_64.pt


In [112]:
def predict_group(query_embeds, queries_mapping, corpus_embeds, corpus_mapping, n=10):
    """Rank each sampled query's candidate passages by cosine similarity.

    Reads two notebook globals: ``data`` (the labelled query/passage frame)
    and ``query_indices`` (the sampled ``query_id`` values). Only query groups
    whose id is in ``query_indices`` are processed. For each such query, the
    candidates are the passages listed for it in ``data``; they are re-ranked
    among themselves, not against the whole corpus.

    Args:
        query_embeds: Query vectors, indexed via ``queries_mapping["text2i"]``.
        queries_mapping: Mapping with a ``"text2i"`` dict (query text -> row).
        corpus_embeds: Passage vectors, indexed via ``corpus_mapping["text2i"]``.
        corpus_mapping: Mapping with a ``"text2i"`` dict (passage text -> row).
        n: Currently unused; every candidate of a query is returned.

    Returns:
        Dict mapping query text to a list of ``{"text": ..., "label": ...}``
        dicts ordered from highest to lowest similarity.

    Raises:
        KeyError: If a query or passage text is missing from its ``text2i`` map.
    """
    predictions = {}

    for q, g in tqdm(data.groupby("query_id")):
        if q not in query_indices:
            continue

        # Rank the candidates available for this query.
        q_text = g["query_text"].values[0]
        q_vec = query_embeds[queries_mapping["text2i"][q_text]]

        c_texts = g["corpus_text"].values.tolist()
        # Labels are kept positionally aligned with c_texts. Looking them up
        # by text afterwards would collapse duplicate passages that carry
        # different labels. tolist() yields plain Python scalars.
        c_labels = g["label"].tolist()

        text2i = corpus_mapping["text2i"]
        c_vecs = np.array(
            [corpus_embeds[text2i[c_text]] for c_text in c_texts], dtype=np.float32
        )

        scores = cosine_similarity(q_vec, c_vecs)
        order = np.argsort(-scores)  # descending similarity

        predictions[q_text] = [
            {"text": c_texts[i], "label": c_labels[i]} for i in order
        ]

    return predictions

In [113]:
# Spot-check the labelled candidates of one sampled query (query_id 1117099).
data[data['query_text'] == 'what is a active margin']

Unnamed: 0,query_id,corpus_id,label,corpus_text,query_text
370,1117099,52516,1,"In plate tectonics, a convergent boundary, als...",what is a active margin
371,1117099,859540,1,"Convergent boundary. In plate tectonics, a con...",what is a active margin
372,1117099,4393950,1,"(February 2016) In plate tectonics, a converge...",what is a active margin
373,1117099,5452829,0,The location where the sinking of a plate occu...,what is a active margin
374,1117099,3349607,0,57. ____________ continental margins occur whe...,what is a active margin
375,1117099,4040437,0,difference $ 5 margin $ 5 $ 55 9 1 % markup $ ...,what is a active margin
376,1117099,855067,0,1 Convergent Boundaries-A convergent boundary ...,what is a active margin
377,1117099,8202908,0,Subduction zones occur at collision boundaries...,what is a active margin
378,1117099,3951182,0,Tectonics and Landforms Plate Boundaries Along...,what is a active margin
379,1117099,1527742,0,When one plate is composed of oceanic lithosph...,what is a active margin


In [114]:
# Collects the rankings per method: method label -> {query text -> ranked candidates}.
predictions = {}

In [115]:
# Baseline: rank with the original, unreduced embeddings.
predictions['orig'] = predict_group(query_vectors, query_mapping, corpus_vectors, corpus_mapping)

100%|██████████| 43/43 [00:00<00:00, 7296.51it/s]


In [116]:
# Rank with the PCA-reduced embeddings; row order matches the originals, so the same mappings apply.
predictions['PCA'] = predict_group(pca_queries, query_mapping, pca_corpus, corpus_mapping)

100%|██████████| 43/43 [00:00<00:00, 9118.05it/s]


In [117]:
# Rank with the embeddings reduced by the "UMAP" model.
predictions['UMAP'] = predict_group(umap_queries, query_mapping, umap_corpus, corpus_mapping)

100%|██████████| 43/43 [00:00<00:00, 6133.07it/s]


In [118]:
# Rank with the embeddings reduced by the "UMAP_5" model.
predictions['UMAP_5'] = predict_group(umap_5_queries, query_mapping, umap_5_corpus, corpus_mapping)

100%|██████████| 43/43 [00:00<00:00, 7222.00it/s]


In [119]:
# Rank with the embeddings reduced by the "UMAP_25" model.
predictions['UMAP_25'] = predict_group(umap_25_queries, query_mapping, umap_25_corpus, corpus_mapping)

100%|██████████| 43/43 [00:00<00:00, 8133.63it/s]


In [120]:
# Rank with the embeddings encoded by the "LinearAutoencoder" checkpoint.
predictions['LAE'] = predict_group(lae_queries, query_mapping, lae_corpus, corpus_mapping)

100%|██████████| 43/43 [00:00<00:00, 7761.88it/s]


In [121]:
# Rank with the embeddings encoded by the "AutoEncoder+" checkpoint.
predictions['AE+'] = predict_group(ae_queries, query_mapping, ae_corpus, corpus_mapping)

100%|██████████| 43/43 [00:00<00:00, 7205.27it/s]


In [122]:
# Hand the rankings of all methods to save_preds together with the output path for this corpus / retriever / curr_dim.
save_preds(predictions, preds_path)

/home/jupyter/work/resources/DiplomDimReduction/data/predictions/marco/dpr exists.
360 -> 304441


In [123]:
    # for q_i, q_vec in tqdm(zip(query_indices, query_vectors)):
    #     scores = cosine_similarity(q_vec, corpus_vectors)
    #     indices = np.argsort(-scores).tolist()[:n]
    #     predictions[q_i] = indices
    # return predictions

In [124]:
# def map_preds(pred_ids):
#     predictions = {}
#     for q_id, c_ids in pred_ids.items():
#         q_text = query_mapping['i2text'][str(q_id)]
#         predictions[q_text] = [corpus_mapping['i2text'][str(c_id)] for c_id in c_ids]
#     return predictions

In [33]:
# orig_pred_ids = predict_n(orig_query_vectors, corpus_vectors, n=20)
# orig_pred_texts = map_preds(orig_pred_ids)