In [1]:
import sys

import numpy as np
from tqdm import tqdm

# Extend the import path so the project-local modules imported below
# (config, utils) can be found — presumably they live in this directory.
sys.path.append("/home/jupyter/work/resources/DiplomDimReduction/")
import importlib

# NOTE(review): `seed` is not referenced by any cell visible in this notebook
# (no RNG is seeded, no `random_state` is passed) — confirm whether it should be.
seed = 42

import config

# Reload so that edits to config.py are picked up without restarting the kernel;
# `config_dict` is imported only after the reload.
importlib.reload(config)
import utils
from config import config_dict

# Same for utils.py: reloaded before the names are imported from it below.
importlib.reload(utils)

import torch
from sklearn.decomposition import PCA, FastICA, KernelPCA, TruncatedSVD
from sklearn.manifold import MDS, Isomap, LocallyLinearEmbedding
from torch.utils.data import DataLoader, TensorDataset
from umap import UMAP
from utils import (
    Autoencoder,
    LinearAutoencoder,
    load_vectors,
    save_ae_model,
    save_sk_model,
)

2025-05-30 09:20:02.205817: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-30 09:20:02.257955: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Corpus identifier and the train/validation split suffixes; all three are
# plugged into the path templates built in the following cell.
corpus_prefix, train_split_suffix, val_split_suffix = (
    config_dict[key] for key in ("marco_prefix", "train_suffix", "val_suffix")
)

In [3]:
# Encoder whose vectors are being reduced — keep exactly one assignment active.
# method_prefix = config_dict["dpr_prefix"]
# method_prefix = config_dict["ance_prefix"]
method_prefix = config_dict['tas-b_prefix']
# method_prefix = config_dict['s-bert_prefix']
# method_prefix = config_dict["late interaction prefix"]

# Both vector templates are filled with the same (corpus, encoder, split) fields.
_train_fields = (corpus_prefix, method_prefix, train_split_suffix)
_val_fields = (corpus_prefix, method_prefix, val_split_suffix)

corpus_vector_path = config_dict["corpus_vector_template"].format(*_train_fields)
corpus_sample_vector_path = config_dict["corpus_sample_vector_template"].format(
    *_train_fields
)
val_vector_path = config_dict["corpus_vector_template"].format(*_val_fields)
val_sample_vector_path = config_dict["corpus_sample_vector_template"].format(
    *_val_fields
)

# Path templates for the fitted reducers; the training helpers fill them in
# with (corpus, encoder, model name, target dimension).
sk_model_path_template = config_dict["reduction_sk_model_template"]
ae_model_path_template = config_dict["reduction_ae_model_template"]

In [4]:
%%time
# Load the training-split vectors; every reducer below is fitted on these.
corpus_vectors = load_vectors(corpus_vector_path)
# Bare last expression: show the array shape as the cell output.
corpus_vectors.shape

CPU times: user 1.85 s, sys: 193 ms, total: 2.04 s
Wall time: 3.64 s


(99441, 768)

In [5]:
%%time
# Load the validation-split vectors; used further down to score the autoencoders.
val_vectors = load_vectors(val_vector_path)
# Bare last expression: show the array shape as the cell output.
val_vectors.shape

CPU times: user 1.3 s, sys: 138 ms, total: 1.44 s
Wall time: 3.2 s


(69529, 768)

In [6]:
# Target dimensionalities; one model per value is trained by the helpers below.
red_dims = [512, 256, 128, 64]

In [7]:
def train_sk_models(model_fn, red_dims, model_kwargs=None, fit_kwargs=None, model_name=""):
    """Fit one reducer per target dimensionality on ``corpus_vectors`` and save it.

    Reads the notebook-level ``corpus_vectors``, ``sk_model_path_template``,
    ``corpus_prefix`` and ``method_prefix``.

    Args:
        model_fn: Estimator class or factory, called as
            ``model_fn(n_components=red_dim, **model_kwargs)``; the returned
            object must provide ``fit``.
        red_dims: Iterable of target dimensionalities.
        model_kwargs: Extra keyword arguments for ``model_fn`` (default: none).
        fit_kwargs: Extra keyword arguments for ``fit`` (default: none).
        model_name: Name inserted into the saved-model path. When empty,
            ``model_fn.__name__`` is used.

    A ``ValueError`` raised while constructing, fitting or saving a model is
    printed and that dimensionality is skipped; any other exception propagates.
    """
    # None instead of `{}` defaults: a dict default is created once and shared
    # between all calls of the function.
    model_kwargs = {} if model_kwargs is None else model_kwargs
    fit_kwargs = {} if fit_kwargs is None else fit_kwargs

    # Resolve the name once, before any (possibly minutes-long) fit, so that a
    # factory without `__name__` cannot fail only after the first model is trained.
    if not model_name:
        model_name = getattr(model_fn, "__name__", type(model_fn).__name__)

    for red_dim in tqdm(red_dims):
        try:
            red_model = model_fn(n_components=red_dim, **model_kwargs)
            red_model.fit(corpus_vectors, **fit_kwargs)
            model_path = sk_model_path_template.format(
                corpus_prefix, method_prefix, model_name, red_dim
            )
            print(f"{red_dim} is trained")
            save_sk_model(red_model, model_path)

        except ValueError as ve:
            # Best effort: report the failure and move on to the next dimension.
            print(f"{red_dim} is passed because of '{ve}'")
            continue

In [8]:
%%time
# train_sk_models(PCA, red_dims)

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 9.06 µs


In [9]:
%%time
# train_sk_models(KernelPCA, red_dims)

CPU times: user 4 µs, sys: 2 µs, total: 6 µs
Wall time: 9.78 µs


In [10]:
%%time
# train_sk_models(TruncatedSVD, red_dims)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 8.58 µs


In [11]:
%%time
# train_sk_models(FastICA, red_dims)

CPU times: user 4 µs, sys: 2 µs, total: 6 µs
Wall time: 9.54 µs


In [12]:
%%time
# train_sk_models(UMAP, red_dims)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 8.82 µs


In [None]:
%%time
# UMAP with n_neighbors=5; the fitted models are saved under the name "UMAP_5".
train_sk_models(UMAP, red_dims, model_kwargs={"n_neighbors": 5}, model_name="UMAP_5")

  0%|          | 0/4 [00:00<?, ?it/s]

512 is trained
/home/jupyter/work/resources/DiplomDimReduction/data/models/marco/tas_b exists.


 25%|██▌       | 1/4 [06:21<19:04, 381.47s/it]

48 -> 1455261869
256 is trained
/home/jupyter/work/resources/DiplomDimReduction/data/models/marco/tas_b exists.


 50%|█████     | 2/4 [09:09<08:31, 255.83s/it]

48 -> 1354598109
128 is trained
/home/jupyter/work/resources/DiplomDimReduction/data/models/marco/tas_b exists.


100%|██████████| 4/4 [11:37<00:00, 174.47s/it]

48 -> 1276608877
CPU times: user 4h 3min 24s, sys: 37min 24s, total: 4h 40min 48s
Wall time: 11min 37s





In [None]:
%%time
# UMAP with n_neighbors=25; the fitted models are saved under the name "UMAP_25".
train_sk_models(UMAP, red_dims, model_kwargs={"n_neighbors": 25}, model_name="UMAP_25")

  0%|          | 0/4 [00:00<?, ?it/s]

512 is trained
/home/jupyter/work/resources/DiplomDimReduction/data/models/marco/tas_b exists.


 25%|██▌       | 1/4 [05:28<16:25, 328.42s/it]

48 -> 1333518557
256 is trained
/home/jupyter/work/resources/DiplomDimReduction/data/models/marco/tas_b exists.


 50%|█████     | 2/4 [08:16<07:47, 233.82s/it]

48 -> 1232076573
128 is trained
/home/jupyter/work/resources/DiplomDimReduction/data/models/marco/tas_b exists.


In [None]:
%%time
# train_sk_models(UMAP, red_dims, model_kwargs={"min_dist": 0.8}, model_name="UMAP_0.8")

CPU times: user 3 µs, sys: 2 µs, total: 5 µs
Wall time: 10.5 µs


In [None]:
%%time
# train_sk_models(
#     UMAP,
#     red_dims,
#     model_kwargs={"n_neighbors": 5, "min_dist": 0.8},
#     model_name="UMAP_5_0.8",
# )

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 10 µs


In [None]:
%%time
# train_sk_models(Isomap, red_dims)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 10 µs


In [None]:
%%time
# train_sk_models(MDS, red_dims)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 10.3 µs


In [None]:
%%time
# train_sk_models(LocallyLinearEmbedding, red_dims)

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 11.4 µs


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Mini-batch size shared by the training and validation loaders.
batch_size = 16
# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(
    TensorDataset(torch.tensor(corpus_vectors, dtype=torch.float32)),
    batch_size=batch_size,
    shuffle=True,
)
# Validation data is only scored, never trained on, so it keeps a fixed order.
# The validation loss is averaged per batch and the last batch can be shorter
# than the rest; shuffling would therefore make the reported loss vary between
# epochs for identical weights and add noise to best-epoch selection.
val_dataloader = DataLoader(
    TensorDataset(torch.tensor(val_vectors, dtype=torch.float32)),
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
def train_dim(model_class, red_dim, n_epochs, model_kwargs=None):
    """Train one autoencoder for a single target dimensionality.

    Reads the notebook-level ``corpus_vectors`` (for the input width),
    ``device``, ``train_dataloader`` and ``val_dataloader``. Optimises an MSE
    reconstruction loss with Adam (lr=1e-3) and prints the losses every fifth
    epoch.

    Args:
        model_class: Module class, built as
            ``model_class(input_dim=..., output_dim=red_dim, **model_kwargs)``.
            Calling the model must return a pair whose second element is the
            reconstruction of the input batch.
        red_dim: Size of the reduced representation.
        n_epochs: Number of passes over ``train_dataloader``.
        model_kwargs: Extra keyword arguments for ``model_class`` (default: none).

    Returns:
        The model carrying the weights of the epoch with the lowest average
        validation loss. If no epoch produced a comparable validation loss
        (``n_epochs == 0`` or the loss was never below infinity, e.g. NaN), the
        model is returned with whatever weights it currently has.
    """
    # None instead of a `{}` default: a dict default is shared between calls.
    model_kwargs = {} if model_kwargs is None else model_kwargs
    model = model_class(
        input_dim=corpus_vectors.shape[1], output_dim=red_dim, **model_kwargs
    )
    print(model)
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = torch.nn.MSELoss()

    best_val_loss = float("inf")
    best_model_state = None

    for epoch in range(n_epochs):
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            # TensorDataset yields 1-tuples; the tensor itself is element 0.
            batch_inputs = batch[0].to(device)

            optimizer.zero_grad()
            _, decoded = model(batch_inputs)
            loss = loss_fn(decoded, batch_inputs)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_dataloader)

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_dataloader:
                batch_inputs = batch[0].to(device)
                _, decoded = model(batch_inputs)
                loss = loss_fn(decoded, batch_inputs)
                val_loss += loss.item()
        avg_val_loss = val_loss / len(val_dataloader)

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # state_dict() hands out references to the live parameter tensors,
            # which the optimizer keeps updating in place. Clone them, otherwise
            # the "best" snapshot silently turns into the final weights.
            best_model_state = {
                name: value.detach().clone() if torch.is_tensor(value) else value
                for name, value in model.state_dict().items()
            }
            # print(f"Model state is updated on epoch {epoch + 1}")

        if (epoch + 1) % 5 == 0:
            print(
                f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}"
            )

    # No snapshot exists when no epoch ran or no validation loss was comparable.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return model

In [None]:
def train_ae_models(model_class, red_dims, n_epochs, model_kwargs=None, model_name=""):
    """Train and save one autoencoder per target dimensionality.

    Delegates the training to ``train_dim`` and reads the notebook-level
    ``ae_model_path_template``, ``corpus_prefix`` and ``method_prefix``.

    Args:
        model_class: Autoencoder class handed to ``train_dim``.
        red_dims: Iterable of target dimensionalities.
        n_epochs: Number of training epochs for each model.
        model_kwargs: Extra keyword arguments for ``model_class`` (default: none).
        model_name: Name inserted into the saved-model path. When empty,
            ``model_class.__name__`` is used.
    """
    # None instead of a `{}` default (a dict default is shared between calls);
    # a real dict is always forwarded to train_dim.
    model_kwargs = {} if model_kwargs is None else model_kwargs

    # Resolve the name once, before any training starts.
    if not model_name:
        model_name = getattr(model_class, "__name__", type(model_class).__name__)

    for red_dim in tqdm(red_dims):
        model = train_dim(model_class, red_dim, n_epochs, model_kwargs)
        print(f"{red_dim} is trained")
        ae_model_path = ae_model_path_template.format(
            corpus_prefix, method_prefix, model_name, red_dim
        )
        save_ae_model(model, ae_model_path)

In [None]:
# train_ae_models(LinearAutoencoder, red_dims, n_epochs=20)

In [None]:
# train_ae_models(Autoencoder, red_dims, n_epochs=20, model_name="AutoEncoder+")

In [None]:
# train_ae_models(Autoencoder, red_dims, n_epochs=20, model_kwargs={"hidden_dim": 256}, model_name="AutoEncoder256")