In [1]:
import sys

import numpy as np
from tqdm import tqdm

sys.path.append("/home/jupyter/work/resources/DiplomDimReduction/")
import importlib

import config

importlib.reload(config)
import utils
from config import config_dict

importlib.reload(utils)

import torch
import torch.nn as nn
from sklearn.decomposition import PCA, FastICA, KernelPCA, TruncatedSVD
from sklearn.manifold import MDS, Isomap, LocallyLinearEmbedding
from torch.utils.data import DataLoader, TensorDataset
from umap import UMAP
from utils import load_vectors, save_ae_model, save_sk_model

2025-05-14 16:40:11.734328: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-14 16:40:14.512047: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
corpus_prefix = config_dict["marco_prefix"]
split_suffix = config_dict["train_suffix"]

In [3]:
# method_prefix = config_dict['dpr_prefix']
# method_prefix = config_dict["ance_prefix"]
# method_prefix = config_dict['tas-b_prefix']
method_prefix = config_dict["late interaction prefix"]

corpus_vector_path = config_dict["corpus_vector_template"].format(
    corpus_prefix, method_prefix, split_suffix
)
corpus_sample_vector_path = config_dict["corpus_sample_vector_template"].format(
    corpus_prefix, method_prefix, split_suffix
)
queries_vector_path = config_dict["queries_vector_template"].format(
    corpus_prefix, method_prefix, split_suffix
)

sk_model_path_template = config_dict["reduction_sk_model_template"]
ae_model_path_template = config_dict["reduction_ae_model_template"]

In [4]:
%%time
corpus_vectors = load_vectors(corpus_sample_vector_path)
corpus_vectors.shape

CPU times: user 1.83 s, sys: 397 ms, total: 2.23 s
Wall time: 2.84 s


(100000, 768)

In [5]:
red_dims = [512, 256, 128, 64]

In [6]:
def train_sk_models(model_fn, red_dims):  # TODO: kwargs
    # with warnings.catch_warnings(record=True) as w:
    #   warnings.simplefilter("always", UserWarning)

    for red_dim in tqdm(red_dims):
        try:
            red_model = model_fn(n_components=red_dim)
            red_model.fit(corpus_vectors)
            model_path = sk_model_path_template.format(
                corpus_prefix, method_prefix, model_fn.__name__, red_dim
            )
            print(f"{red_dim} is saved")
            save_sk_model(red_model, model_path)
        except ValueError as ve:
            print(f"{red_dim} is passed because of '{ve}'")
            continue

In [7]:
%%time
train_sk_models(PCA, red_dims)

  0%|          | 0/4 [00:00<?, ?it/s]

512 is saved
/home/jupyter/work/resources/DiplomDimReduction//data/models/marco/colbert exists.


 25%|██▌       | 1/4 [00:14<00:43, 14.43s/it]

48 -> 1587149


 50%|█████     | 2/4 [00:21<00:20, 10.23s/it]

256 is saved
/home/jupyter/work/resources/DiplomDimReduction//data/models/marco/colbert exists.
48 -> 795597


 75%|███████▌  | 3/4 [00:26<00:07,  7.66s/it]

128 is saved
/home/jupyter/work/resources/DiplomDimReduction//data/models/marco/colbert exists.
48 -> 399821


100%|██████████| 4/4 [00:31<00:00,  7.81s/it]

64 is saved
/home/jupyter/work/resources/DiplomDimReduction//data/models/marco/colbert exists.
48 -> 201933
CPU times: user 1min 24s, sys: 23.5 s, total: 1min 47s
Wall time: 31.2 s





In [8]:
%%time
train_sk_models(KernelPCA, red_dims)

  0%|          | 0/4 [00:00<?, ?it/s]


MemoryError: Unable to allocate 37.3 GiB for an array with shape (100000, 100000) and data type float32

In [9]:
%%time
train_sk_models(TruncatedSVD, red_dims)

 25%|██▌       | 1/4 [00:17<00:52, 17.37s/it]

512 is saved
/home/jupyter/work/resources/DiplomDimReduction//data/models/marco/colbert exists.
48 -> 1579751


 50%|█████     | 2/4 [00:26<00:24, 12.42s/it]

256 is saved
/home/jupyter/work/resources/DiplomDimReduction//data/models/marco/colbert exists.
48 -> 790247


 75%|███████▌  | 3/4 [00:31<00:09,  9.28s/it]

128 is saved
/home/jupyter/work/resources/DiplomDimReduction//data/models/marco/colbert exists.
48 -> 395495


100%|██████████| 4/4 [00:35<00:00,  8.98s/it]

64 is saved
/home/jupyter/work/resources/DiplomDimReduction//data/models/marco/colbert exists.
48 -> 198119
CPU times: user 1min 37s, sys: 27 s, total: 2min 4s
Wall time: 35.9 s





In [10]:
%%time
train_sk_models(FastICA, red_dims)

 25%|██▌       | 1/4 [00:31<01:33, 31.28s/it]

512 is passed because of 'array must not contain infs or NaNs'


 50%|█████     | 2/4 [01:51<01:59, 59.87s/it]

256 is saved
/home/jupyter/work/resources/DiplomDimReduction//data/models/marco/colbert exists.
48 -> 2625319


 75%|███████▌  | 3/4 [02:37<00:53, 53.53s/it]

128 is saved
/home/jupyter/work/resources/DiplomDimReduction//data/models/marco/colbert exists.
48 -> 1249063


100%|██████████| 4/4 [03:16<00:00, 49.19s/it]

64 is saved
/home/jupyter/work/resources/DiplomDimReduction//data/models/marco/colbert exists.
48 -> 610087
CPU times: user 9min 57s, sys: 2min 54s, total: 12min 52s
Wall time: 3min 16s





In [None]:
%%time
train_sk_models(UMAP, red_dims)

  0%|          | 0/4 [00:00<?, ?it/s]

512 is saved
/home/jupyter/work/resources/DiplomDimReduction//data/models/marco/colbert exists.


 25%|██▌       | 1/4 [16:40<50:01, 1000.45s/it]

48 -> 1388481565
256 is saved
/home/jupyter/work/resources/DiplomDimReduction//data/models/marco/colbert exists.


 50%|█████     | 2/4 [23:59<22:20, 670.41s/it] 

48 -> 1285930845
128 is saved
/home/jupyter/work/resources/DiplomDimReduction//data/models/marco/colbert exists.


 75%|███████▌  | 3/4 [27:42<07:45, 465.86s/it]

48 -> 1234364621
64 is saved
/home/jupyter/work/resources/DiplomDimReduction//data/models/marco/colbert exists.


100%|██████████| 4/4 [30:08<00:00, 339.64s/it]

48 -> 1207727325


100%|██████████| 4/4 [30:08<00:00, 452.11s/it]

CPU times: user 3h 24min 56s, sys: 14min 13s, total: 3h 39min 9s
Wall time: 30min 8s





In [None]:
%%time
train_sk_models(Isomap, red_dims)

  0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
%%time
train_sk_models(MDS, red_dims)

In [None]:
%%time
train_sk_models(LocallyLinearEmbedding, red_dims)

In [None]:
class LinearAutoencoder(nn.Module):  #TODO: дропаут
    def __init__(self, input_dim, output_dim):
        self.__name__ = "LinearAutoEncoder"
        super(LinearAutoencoder, self).__init__()
        self.encoder = nn.Linear(input_dim, output_dim)
        self.decoder = nn.Linear(output_dim, input_dim)

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
batch_size = 16
dataloader = DataLoader(
    TensorDataset(torch.tensor(corpus_vectors, dtype=torch.float32)),
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
def train_ae_models(model_class, red_dims, n_epochs):
    for red_dim in tqdm(red_dims):
        model = model_class(input_dim=corpus_vectors.shape[1], output_dim=red_dim)
        model.to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        loss_fn = nn.MSELoss()

        for epoch in range(n_epochs):
            model.train()
            total_loss = 0
            for batch in dataloader:
                batch_inputs = batch[0].to(device)

                optimizer.zero_grad()
                _, decoded = model(batch_inputs)
                loss = loss_fn(decoded, batch_inputs)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

            avg_loss = total_loss / len(dataloader)
            if (epoch + 1) % 5 == 0:
                print(f"Epoch {epoch+1}, Average Loss: {avg_loss:.4f}")

            # model.eval()
            # with torch.no_grad():
            #   embeddings_tensor = torch.tensor(corpus_vectors, dtype=torch.float32).to(device)
            #   reduced_embeddings, _ = model(embeddings_tensor)
            #   reduced_embeddings = reduced_embeddings.cpu().numpy()

        ae_model_path = ae_model_path_template.format(
            corpus_prefix, method_prefix, model.__name__, red_dim
        )
        save_ae_model(model, ae_model_path)

In [None]:
train_ae_models(LinearAutoencoder, red_dims, n_epochs=20)

  0%|          | 0/4 [00:00<?, ?it/s]

Epoch 5, Average Loss: 0.0001
Epoch 10, Average Loss: 0.0001
Epoch 15, Average Loss: 0.0001
Epoch 20, Average Loss: 0.0001
/home/jupyter/work/resources/DiplomDimReduction//data/models/marco/colbert exists.


 25%|██▌       | 1/4 [04:59<14:59, 299.88s/it]

48 -> 3153513
Epoch 5, Average Loss: 0.0001
Epoch 10, Average Loss: 0.0001
Epoch 15, Average Loss: 0.0001


 50%|█████     | 2/4 [08:31<08:16, 248.02s/it]

Epoch 20, Average Loss: 0.0001
/home/jupyter/work/resources/DiplomDimReduction//data/models/marco/colbert exists.
48 -> 1579625
Epoch 5, Average Loss: 0.0002
Epoch 10, Average Loss: 0.0002
Epoch 15, Average Loss: 0.0002


 75%|███████▌  | 3/4 [11:35<03:38, 218.69s/it]

Epoch 20, Average Loss: 0.0002
/home/jupyter/work/resources/DiplomDimReduction//data/models/marco/colbert exists.
48 -> 792681
Epoch 5, Average Loss: 0.0004
Epoch 10, Average Loss: 0.0004
Epoch 15, Average Loss: 0.0004


100%|██████████| 4/4 [14:24<00:00, 216.10s/it]

Epoch 20, Average Loss: 0.0004
/home/jupyter/work/resources/DiplomDimReduction//data/models/marco/colbert exists.
48 -> 399199





In [None]:
class Autoencoder(nn.Module):  #TODO: дропаут
    def __init__(self, input_dim, output_dim, hidden_dim=256):
        self.__name__ = f"AutoEncoder{hidden_dim}"
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )
        self.decoder = nn.Sequential(
            nn.Linear(output_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
        )
        
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

In [None]:
train_ae_models(Autoencoder, red_dims, n_epochs=20)

  0%|          | 0/4 [00:00<?, ?it/s]


TypeError: super(type, obj): obj must be an instance or subtype of type