In [None]:
import torch
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from tqdm import tqdm
import time

In [None]:
# Run on the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if has_cuda else "cpu")

# Tokenisation / loading settings used by the cells below.
max_seq_len = 96    # every text is truncated or padded to this many tokens
batch_size = 2048   # number of texts per DataLoader batch
num_workers = 16    # DataLoader worker processes

In [None]:
# Single source of truth for the checkpoint so the tokenizer and the encoder
# can never be loaded from two different names by accident.
model_name = "DeepPavlov/rubert-base-cased"

# No force_download: it bypasses the local Hugging Face cache and re-fetches the
# whole checkpoint on every run, which makes "Restart & Run All" slow and
# impossible offline. Pass force_download=True once by hand only if the cached
# files are suspected to be corrupted.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name).to(device)

# Inference only: make sure dropout is disabled before any embeddings are computed.
model.eval()

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Indexable collection of texts. Items are expected to be ``str``;
            ``None`` and float NaN (what pandas produces for empty CSV cells)
            are treated as missing and tokenized as the empty string, and any
            other non-string value is converted with ``str()``.
        tokenizer: Callable with the Hugging Face tokenizer call signature
            (``max_length``, ``truncation``, ``padding``, ``return_tensors``).
        max_seq_len: Length every encoded text is truncated / padded to.
    """

    def __init__(self, texts, tokenizer, max_seq_len):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    @staticmethod
    def _as_text(value):
        """Coerce a raw item to ``str`` so the tokenizer never sees a non-string."""
        if isinstance(value, str):
            return value
        # NaN is the only float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            # Keep the row (as an empty text) instead of dropping it, so that
            # item i still corresponds to row i of the source data.
            return ""
        return str(value)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx``.

        Returns:
            dict mapping each tokenizer output name to a tensor with the
            leading batch dimension (added by ``return_tensors='pt'``) removed.
        """
        text = self._as_text(self.texts[idx])
        encoding = self.tokenizer(
            text,
            max_length=self.max_seq_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )
        return {k: v.squeeze(0) for k, v in encoding.items()}

In [None]:
# Read the paired corpus and pull each text column out as a plain Python list.
data = pd.read_csv("data.csv")
lit_texts, tg_texts = (data[column].tolist() for column in ("lit_text", "tg_text"))

In [None]:
# Both corpora are loaded with identical settings; shuffle stays off so that
# batch order follows the order of the source lists.
loader_options = dict(
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    pin_memory=True,
)

lit_dataset = TextDataset(lit_texts, tokenizer, max_seq_len)
lit_dataloader = DataLoader(lit_dataset, **loader_options)

tg_dataset = TextDataset(tg_texts, tokenizer, max_seq_len)
tg_dataloader = DataLoader(tg_dataset, **loader_options)

In [None]:
def vectorize_texts(dataloader, model, output_file):
    """Embed every text served by ``dataloader`` and save the result as ``.npy``.

    For each batch the model is run without gradients and the hidden state at
    sequence position 0 (the [CLS] token for BERT-style tokenizers) is kept as
    the text embedding.

    Relies on the notebook-level ``device``; ``model`` must already live there.

    Args:
        dataloader: Iterable of batches, each a mapping with ``'input_ids'``
            and ``'attention_mask'`` tensors.
        model: Encoder whose output exposes ``last_hidden_state``.
        output_file: Path handed to ``np.save``; also shown in the progress bar.

    Returns:
        numpy.ndarray of float32 with one row per text, in dataloader order.

    Raises:
        ValueError: from ``np.concatenate`` if the dataloader yields no batches.
    """
    # Mixed precision only pays off on CUDA; on CPU the context is a no-op.
    use_amp = device.type == "cuda"

    # Embeddings must be deterministic, so dropout has to be off. Remember the
    # caller's mode and restore it afterwards rather than silently changing it.
    was_training = model.training
    model.eval()

    chunks = []
    try:
        with torch.no_grad():
            for batch in tqdm(dataloader, desc=f"Векторизация {output_file}"):
                # non_blocking lets the copy overlap with compute when the
                # loader serves pinned memory; it is ignored otherwise.
                input_ids = batch['input_ids'].to(device, non_blocking=True)
                attention_mask = batch['attention_mask'].to(device, non_blocking=True)

                with torch.autocast(device_type=device.type, enabled=use_amp):
                    outputs = model(input_ids, attention_mask=attention_mask)
                    embed = outputs.last_hidden_state[:, 0, :]

                # Cast explicitly so the saved array has a stable float32 dtype
                # regardless of what precision autocast produced.
                chunks.append(embed.float().cpu().numpy())
                # No torch.cuda.empty_cache() here: under no_grad the caching
                # allocator reuses the freed activations on the next batch, and
                # flushing it every iteration only slows the loop down.
    finally:
        model.train(was_training)

    embeddings = np.concatenate(chunks, axis=0)
    np.save(output_file, embeddings)
    return embeddings

In [None]:
# Embed both corpora back to back and report the total wall-clock time.
start_time = time.time()

lit_embeddings = vectorize_texts(lit_dataloader, model, "lit_embeddings.npy")
tg_embeddings = vectorize_texts(tg_dataloader, model, "tg_embeddings.npy")

elapsed = time.time() - start_time
print(f"Векторизация завершена за {elapsed:.2f} секунд")

Векторизация lit_embeddings.npy: 100%|██████████| 676/676 [04:14<00:00,  2.65it/s]
Векторизация tg_embeddings.npy: 100%|██████████| 676/676 [04:14<00:00,  2.66it/s]


Векторизация завершена за 1066.50 секунд
