In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

# Seed used later in the notebook for the NumPy sampling of corpus rows.
seed = 42
import random
import sys
from collections import defaultdict

from tqdm import tqdm

# Extend the import path before importing `config` / `utils`.
# NOTE(review): presumably both modules live in this directory; the absolute
# path is machine-specific and will need changing on another host.
sys.path.append("/home/jupyter/work/resources/DiplomDimReduction/")
import importlib

import config

# Re-execute the module so edits made since its first import take effect
# without restarting the kernel.
importlib.reload(config)
import utils
from config import config_dict

# Same for `utils`; the names imported from it further down come from the
# reloaded module.
importlib.reload(utils)

import torch

from utils import (
    Autoencoder,
    LinearAutoencoder,
    load_mapping,
    load_sk_model,
    load_vectors,
)

In [2]:
# Corpus and split identifiers; they are substituted into the path templates
# read from `config_dict` in the following cells.
corpus_prefix = config_dict["marco_prefix"]
split_suffix = config_dict["train_suffix"]

In [3]:
# Method prefix under evaluation: exactly one assignment is active, the
# commented-out lines are the alternatives.
# method_prefix = config_dict["dpr_prefix"]
# method_prefix = config_dict["ance_prefix"]
method_prefix = config_dict['tas-b_prefix']
# method_prefix = config_dict['s-bert_prefix']
# method_prefix = config_dict["late interaction prefix"]

# Each path is a config template filled with (corpus, method, split).
corpus_vector_path = config_dict["corpus_vector_template"].format(
    corpus_prefix, method_prefix, split_suffix
)

# NOTE(review): not referenced by any cell visible in this notebook.
corpus_sample_vector_path = config_dict["corpus_sample_vector_template"].format(
    corpus_prefix, method_prefix, split_suffix
)

corpus_mapping_path = config_dict["corpus_mapping_template"].format(
    corpus_prefix, method_prefix, split_suffix
)

# Left as templates: `reduct_sk` / `reduct_ae` fill them with
# (corpus_prefix, method_prefix, model_name, red_dim).
sk_model_path_template = config_dict["reduction_sk_model_template"]
ae_model_path_template = config_dict["reduction_ae_model_template"]

In [37]:
# Target dimensionality; passed as `red_dim` to every reducer call below.
curr_dim = 128

In [38]:
%%time
# Load the corpus embeddings stored at `corpus_vector_path`; the bare last
# expression displays the matrix shape as the cell output.
corpus_vectors = load_vectors(corpus_vector_path)
corpus_vectors.shape

CPU times: user 1.76 s, sys: 251 ms, total: 2.01 s
Wall time: 2.8 s


(99441, 768)

In [39]:
# Load the mapping stored at `corpus_mapping_path` and display its top-level keys.
# NOTE(review): `corpus_mapping` is not used by any other cell visible here.
corpus_mapping = load_mapping(corpus_mapping_path)
corpus_mapping.keys()

dict_keys(['i2text', 'text2i'])

In [40]:
def reduct_sk(corpus_embeds, model_name, red_dim):
    """Project embeddings with a previously saved reducer exposing `transform`.

    The model file is located by filling `sk_model_path_template` with the
    notebook-level `corpus_prefix` and `method_prefix` plus the given
    `model_name` and `red_dim`.

    Args:
        corpus_embeds: Embeddings handed to the loaded model's `transform`.
        model_name: Name of the saved reducer, e.g. "PCA" or "UMAP".
        red_dim: Target dimensionality the reducer was saved for.

    Returns:
        Whatever the loaded model's `transform(corpus_embeds)` returns.
    """
    reducer = load_sk_model(
        sk_model_path_template.format(corpus_prefix, method_prefix, model_name, red_dim)
    )
    return reducer.transform(corpus_embeds)

In [41]:
# Reduce the full corpus with the saved "PCA" model at `curr_dim` dimensions.
pca_vectors = reduct_sk(corpus_vectors, "PCA", curr_dim)

In [42]:
# Reduce the full corpus with the saved "UMAP" model at `curr_dim` dimensions.
umap_vectors = reduct_sk(corpus_vectors, "UMAP", curr_dim)

In [43]:
# Same as above with the model saved under the name "UMAP_5".
# NOTE(review): presumably the suffix is a UMAP hyperparameter value — confirm.
umap_5_vectors = reduct_sk(corpus_vectors, "UMAP_5", curr_dim)

In [44]:
# Same as above with the model saved under the name "UMAP_25".
# NOTE(review): presumably the suffix is a UMAP hyperparameter value — confirm.
umap_25_vectors = reduct_sk(corpus_vectors, "UMAP_25", curr_dim)

In [45]:
# Use the GPU when CUDA is available, otherwise the CPU; `reduct_ae` loads the
# checkpoint onto this device and moves the model there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [46]:
# Registry from model name to autoencoder class. `reduct_ae` also substitutes
# the key into the checkpoint path, so each key must match the saved file name.
name2model = {
    "LinearAutoencoder": LinearAutoencoder,
    "AutoEncoder+": Autoencoder,
}

In [47]:
def reduct_ae(corpus_embeds, model_name, red_dim, model_kwargs=None):
    """Encode embeddings with a trained autoencoder checkpoint.

    The model class is looked up in `name2model`, instantiated for the input
    width of `corpus_embeds`, and its weights are loaded from the path obtained
    by filling `ae_model_path_template` with the notebook-level `corpus_prefix`
    and `method_prefix` plus `model_name` and `red_dim`. The path is printed.

    Args:
        corpus_embeds: 2-D array of embeddings; `shape[1]` is used as the
            model's `input_dim`.
        model_name: Key of `name2model`; also part of the checkpoint file name.
        red_dim: Size of the encoded representation (`output_dim`).
        model_kwargs: Optional extra keyword arguments for the model
            constructor. `None` (the default) means no extra arguments.

    Returns:
        NumPy array holding the output of the model's `encoder` for
        `corpus_embeds`.

    Raises:
        KeyError: If `model_name` is not a key of `name2model`.
    """
    # A literal `{}` default is a single dict shared by every call; build a
    # fresh one per call instead.
    model_kwargs = {} if model_kwargs is None else model_kwargs

    model_class = name2model[model_name]
    red_model = model_class(
        input_dim=corpus_embeds.shape[1], output_dim=red_dim, **model_kwargs
    )
    model_path = ae_model_path_template.format(
        corpus_prefix, method_prefix, model_name, red_dim
    )
    print(model_path)
    red_model.load_state_dict(torch.load(model_path, map_location=device))
    red_model.to(device)
    red_model.eval()
    with torch.no_grad():
        # The input must live on the same device as the model weights, and the
        # result has to be copied back to the CPU before `.numpy()`; without
        # both steps the call fails as soon as `device` is a CUDA device.
        inputs = torch.tensor(corpus_embeds, dtype=torch.float32, device=device)
        red_embeddings = red_model.encoder(inputs).cpu().numpy()
    return red_embeddings

In [48]:
# Encode the full corpus with the "LinearAutoencoder" checkpoint for `curr_dim`.
lae_vectors = reduct_ae(corpus_vectors, "LinearAutoencoder", curr_dim)

/home/jupyter/work/resources/DiplomDimReduction/data/models/marco/tas_b/LinearAutoencoder_128.pt


In [49]:
# Encode the full corpus with the "AutoEncoder+" checkpoint for `curr_dim`.
ae_vectors = reduct_ae(corpus_vectors, "AutoEncoder+", curr_dim)

/home/jupyter/work/resources/DiplomDimReduction/data/models/marco/tas_b/AutoEncoder+_128.pt


# Тесты

In [50]:
# Draw a reproducible random subset of corpus row indices (no repeats); the
# neighbour tests below run on this subset only.
np.random.seed(seed)
sample_size = 10000
indices = np.random.choice(corpus_vectors.shape[0], sample_size, replace=False)

In [51]:
# Restrict every representation to the same sampled rows so that row i refers
# to the same document in all of them.
# NOTE(review): all names except `orig_vectors` are overwritten in place, so
# this cell is not idempotent. Re-running it alone indexes the already-sampled
# arrays with corpus-sized indices (likely IndexError); re-run the reducer
# cells first.
orig_vectors = corpus_vectors[indices]
pca_vectors = pca_vectors[indices]
umap_vectors = umap_vectors[indices]
umap_5_vectors = umap_5_vectors[indices]
umap_25_vectors = umap_25_vectors[indices]
lae_vectors = lae_vectors[indices]
ae_vectors = ae_vectors[indices]

In [52]:
# Neighbourhood size used for the nearest-neighbour overlap metric.
k = 10

In [53]:
nn_orig = NearestNeighbors(n_neighbors=k+1).fit(orig_vectors).kneighbors(return_distance=False)[:, 1:]  # без самого себя

In [54]:
nn_orig.shape

(10000, 10)

In [55]:
nn_pca = NearestNeighbors(n_neighbors=k+1).fit(pca_vectors).kneighbors(return_distance=False)[:, 1:]  # без самого себя

In [56]:
nn_umap = NearestNeighbors(n_neighbors=k+1).fit(umap_vectors).kneighbors(return_distance=False)[:, 1:]  # без самого себя

In [57]:
nn_umap_5 = NearestNeighbors(n_neighbors=k+1).fit(umap_5_vectors).kneighbors(return_distance=False)[:, 1:]  # без самого себя

In [58]:
nn_umap_25 = NearestNeighbors(n_neighbors=k+1).fit(umap_25_vectors).kneighbors(return_distance=False)[:, 1:]  # без самого себя

In [59]:
nn_lae = NearestNeighbors(n_neighbors=k+1).fit(lae_vectors).kneighbors(return_distance=False)[:, 1:]  # без самого себя

In [60]:
nn_ae = NearestNeighbors(n_neighbors=k+1).fit(ae_vectors).kneighbors(return_distance=False)[:, 1:]  # без самого себя

In [61]:
def local_structure_preservation(nn_orig, nn_reduced, k):
    """Average share of original neighbours that survive the reduction.

    Rows of `nn_orig` and `nn_reduced` are paired positionally. For each pair
    the number of neighbour ids present in both rows is divided by `k`, and
    the mean of these fractions over all pairs is returned.

    Args:
        nn_orig: Per-point neighbour ids in the original space.
        nn_reduced: Per-point neighbour ids in the reduced space.
        k: Divisor for each overlap count (the neighbourhood size).

    Returns:
        Mean overlap fraction as computed by `np.mean`.
    """
    per_point_share = []
    for true_ids, reduced_ids in zip(nn_orig, nn_reduced):
        shared_ids = set(true_ids) & set(reduced_ids)
        per_point_share.append(len(shared_ids) / k)
    return np.mean(per_point_share)

In [62]:
# Mean share of original-space neighbours kept by the PCA representation.
local_structure_preservation(nn_orig, nn_pca, k)

0.6177199999999999

In [63]:
# Mean share of original-space neighbours kept by the "UMAP" representation.
local_structure_preservation(nn_orig, nn_umap, k)

0.17514000000000002

In [64]:
# Mean share of original-space neighbours kept by the "UMAP_5" representation.
local_structure_preservation(nn_orig, nn_umap_5, k)

0.15888999999999998

In [65]:
# Mean share of original-space neighbours kept by the "UMAP_25" representation.
local_structure_preservation(nn_orig, nn_umap_25, k)

0.17819000000000002

In [66]:
# Mean share of original-space neighbours kept by the linear autoencoder.
local_structure_preservation(nn_orig, nn_lae, k)

0.60193

In [67]:
# Mean share of original-space neighbours kept by the "AutoEncoder+" model.
local_structure_preservation(nn_orig, nn_ae, k)

0.4908

### Сами соседи

In [35]:
# random.seed(seed)
# points = random.sample(indices.tolist(), 10)
# points

In [36]:
# for point in points:
#     i = np.where(indices == point)[0][0]
#     orig_neighbours = nn_orig[i]
#     pca_neighbours = nn_pca[i]
#     umap_neighbours = nn_umap[i]
#     lae_neighbours = nn_lae[i]
#     ae_neighbours = nn_ae[i]
    
#     print(f'{point}:\n-{orig_neighbours}\n-{pca_neighbours}\n-{umap_neighbours}\n-{lae_neighbours}\n-{ae_neighbours}')
    