In [1]:
import sys
import os
import pandas as pd
import numpy as np

PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

In [5]:
DATA_PATH = os.path.join(PROJECT_ROOT, "data")
PATH_LEHA = os.path.join(DATA_PATH, "raw_data", "LEHABOOKS.csv")
PATH_EMBDS = os.path.join(DATA_PATH, "embeddings", "fs_embds.npz")

df_books = pd.read_csv(PATH_LEHA)
book_embds = np.load(PATH_EMBDS, allow_pickle=True)


In [None]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
import os

class EmbeddingsProducer:
    def __init__(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertModel.from_pretrained('bert-base-uncased').to(self.device)

    def create_embeddings_batch(self, texts: list[str]) -> list[np.ndarray]:
        encoded = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )
        input_ids = encoded['input_ids'].to(self.device)
        attention_mask = encoded['attention_mask'].to(self.device)
    
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            token_embeddings = outputs.last_hidden_state
    
            mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
            masked_embeddings = token_embeddings * mask
            summed = masked_embeddings.sum(dim=1)
            counts = mask.sum(dim=1)
            mean_pooled = summed / counts
    
        return [emb.cpu().numpy() for emb in mean_pooled]
    


def produce_embeddings_batched(data, embeddings_path,  batch_size=16):
    from tqdm import tqdm
    embeddings_producer = EmbeddingsProducer()

    data.loc[:, "description"] = data["description"].fillna("No description")
    descriptions = data["description"].tolist()
    titles = data["Title"].to_numpy()

    embeddings = []

    for i in tqdm(range(0, len(descriptions), batch_size), desc="Embedding..."):
        batch_texts = descriptions[i:i + batch_size]
        batch_embeddings = embeddings_producer.create_embeddings_batch(batch_texts)
        embeddings.extend(batch_embeddings)

    embeddings = np.stack(embeddings).astype(np.float32)
    print("Embeddings saved to embeddings_structured.npz")
    return embeddings



In [None]:
produce_embeddings_batched(df_books.iloc[[-1]], "z")

Embedding...: 100%|██████████| 1/1 [00:01<00:00,  1.30s/it]

Embeddings saved to embeddings_structured.npz





array([[-3.73451471e-01,  3.00747603e-01,  4.71702397e-01,
         1.11736439e-01,  6.75265849e-01, -2.86260664e-01,
        -1.59864649e-01,  6.43746138e-01, -2.00701788e-01,
        -1.98152333e-01, -2.55984068e-02, -2.03301057e-01,
        -1.22979358e-01,  2.79510468e-01, -3.63630168e-02,
         4.05110151e-01,  1.44481808e-01,  9.52998921e-02,
        -2.92794436e-01,  1.96829170e-01,  6.83939084e-02,
         8.21147934e-02, -4.47523035e-02,  4.23731297e-01,
         1.50708109e-01,  3.70594785e-02,  1.38306059e-02,
         1.61906369e-02, -3.66735697e-01, -6.19398169e-02,
         1.09753177e-01,  4.71100174e-02, -3.82557482e-01,
        -8.22815895e-02, -1.82167664e-02, -1.17012635e-01,
        -5.23914844e-02, -3.92733030e-02,  2.47950375e-01,
         6.52044639e-02, -4.70781475e-01, -4.12094533e-01,
         2.03163192e-01,  8.92881900e-02, -2.22306065e-02,
        -2.43837893e-01,  5.70016384e-01, -1.87473208e-01,
        -2.65151039e-02, -7.89240748e-02, -5.28674185e-0

In [10]:
titles = book_embds["titles"]
embds = book_embds["embeddings"]

titles = np.concatenate([titles, df_books.iloc[[-1]]["Title"].values])

# А embds — это массив эмбеддингов, например shape (N, D)
new_emb = produce_embeddings_batched(df_books.iloc[[-1]], "z")
embds = np.vstack([embds, new_emb])

Embedding...: 100%|██████████| 1/1 [00:00<00:00,  3.47it/s]

Embeddings saved to embeddings_structured.npz





In [None]:
np.savez("PATH_EMBDS", titles=np.array(titles), embeddings=embds)