In [1]:
# Move the kernel's working directory up one level; the cells below use relative "data/..." paths.
# NOTE(review): the path is relative, so re-running this cell moves up one more level each time.
%cd ..

/home/den/dev/git/ozon-e-cup-2025


In [None]:
import json
from textwrap import dedent

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [None]:
# Load the train and test tables from CSV.
train = pd.read_csv("data/1__train_with_flags.csv")
test = pd.read_csv("data/1__test_with_flags.csv")

# Display (rows, columns) of both frames as the cell output.
train.shape, test.shape

((197198, 48), (22760, 47))

In [None]:
def create_text(description: str | None, name_rus: str, brand_name: str | None, CommercialTypeName4: str) -> str:
    text = dedent(
        f"""\
    Заголовок товара: {name_rus}.
    Категория товара: {CommercialTypeName4}.
    """
    )
    if brand_name:
        text += f"Бренд: {brand_name}.\n"
    else:
        text += "Бренд: Не указан.\n"
    if description:
        text += f"Описание: {description}."
    else:
        text += "Описание: Не указано."
    return text

In [None]:
# Sanity check: render the text for the train row labelled 0.
print(create_text(**train.loc[0, ["description", "name_rus", "brand_name", "CommercialTypeName4"]].to_dict()))

Заголовок товара: Мешки для пылесоса PHILIPS TRIATLON, синтетические, многослойные, тип: HR 6947.
Категория товара: Пылесборник.
Бренд: ACTRUM.
Описание: Мешки пылесборники для пылесоса PHILIPS, 10 шт., синтетические, многослойные, бренд: ACTRUM, арт. AK-10/10, тип оригинального мешка: HR 6947.Подходят для пылесосов:PHILIPS: HR6955, HR6947, HR6888, HR6844 TRIATHLON, HR6843 TRIATHLON, HR6842 TRIATHLON, HR6841 TRIATHLON, HR6840 TRIATHLON, HR6839 TRIATHLON, HR6838 TRIATHLON, HR6837 TRIATHLON, HR6836 TRIATHLON, HR6835 TRIATHLON, HR6834 TRIATHLON, HR6833 TRIATHLON, HR6832 TRIATHLON, HR6831 TRIATHLON, HR6830 TRIATHLON, HR6829 TRIATHLON, HR6828 TRIATHLON, HR6827 TRIATHLON, HR6826 TRIATHLON, HR6825 TRIATHLON, HR6824 TRIATHLON, HR6823 TRIATHLON, HR6822 TRIATHLON, HR6821 TRIATHLON, HR6820 TRIATHLON, HR6819 TRIATHLON, HR6818 TRIATHLON, HR6817 TRIATHLON, HR6816 TRIATHLON, HR6815 TRIATHLON, HR6814 - HR6845 TRIATHLON, FC6844 TRIATHLON, FC6843 TRIATHLON, FC6842 TRIATHLON, FC6841 - FC6845 TRIATHLONОдн

In [None]:
import torch
from sentence_transformers import SentenceTransformer

In [None]:
# Default to the CPU and switch to the GPU only when CUDA is usable.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
device

'cpu'

In [None]:
# Load the "deepvk/USER2-small" sentence encoder onto the selected device.
model = SentenceTransformer("deepvk/USER2-small", device=device)

To estimate memory needs for a batch size and a SentenceTransformer model, consider:

1. **Input Tokenization:**  
    Each input text is tokenized. The memory for tokenized input is roughly:  
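    `batch_size × max_seq_length × 4 bytes` (for int32 tokens; PyTorch tokenizers typically return int64 IDs plus an attention mask, so the real figure is a few times larger but still negligible).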

2. **Model Forward Pass:**  
    The main memory usage comes from:
    - Model weights (fixed, depends on model size).
    - Activations (depends on batch_size, max_seq_length, hidden_size, number of layers).

    For activations (rough estimate):  
    `batch_size × max_seq_length × hidden_size × num_layers × 4 bytes` (float32).

3. **Output Embeddings:**  
    `batch_size × embedding_dim × 4 bytes`

**Example Calculation:**  
Suppose:
- `batch_size = 64`
- `max_seq_length = 512`
- `hidden_size = 384`
- `num_layers = 12`
- `embedding_dim = 384`

**Activations:**  
`64 × 512 × 384 × 12 × 4 ≈ 604 MB` (for activations, very rough upper bound)

**Tokenized Input:**  
`64 × 512 × 4 ≈ 128 KB`

**Output Embeddings:**  
`64 × 384 × 4 ≈ 96 KB`

**Model Weights:**  
Depends on the model, but for a small BERT (like 40M params):  
`40,000,000 × 4 ≈ 160 MB`

**Total:**  
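Roughly `~800 MB` while encoding one batch (plus some overhead). Only the activation, input and output terms scale with `batch_size`; the model weights are a fixed cost.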

**Note:**  
- Actual usage may be lower due to memory optimizations.
- Use smaller batch sizes if you get OOM errors.
- Monitor with `nvidia-smi` (GPU) or `psutil` (CPU).

**Tip:**  
Try a small batch size, monitor memory, and increase until you reach your hardware limit.

In [None]:
def create_text_embeddings(df, batch_size=64) -> np.ndarray:
    """Embed every row of ``df`` with the module-level ``model``.

    Each row is rendered to text with ``create_text`` and the texts are encoded
    in chunks of ``batch_size`` using the model's "classification" prompt.

    Args:
        df: DataFrame containing the columns ``description``, ``name_rus``,
            ``brand_name`` and ``CommercialTypeName4``.
        batch_size: Number of texts handed to ``model.encode`` per call.

    Returns:
        2-D array with one embedding row per row of ``df``, in the same order.
        An empty ``df`` yields an array with zero rows.
    """
    columns = ["description", "name_rus", "brand_name", "CommercialTypeName4"]
    # The two fields create_text declares as `str | None`.
    optional = ("description", "brand_name")

    texts = []
    # Zipping the columns avoids materialising a Series per row as iterrows() does.
    rows = zip(*(df[column].tolist() for column in columns))
    for values in tqdm(rows, total=len(df), desc="Creating texts"):
        fields = dict(zip(columns, values))
        for key in optional:
            # Empty CSV cells arrive as NaN, which is truthy; pass None so the
            # field is treated as missing rather than rendered as "nan".
            if pd.isna(fields[key]):
                fields[key] = None
        texts.append(create_text(**fields))

    if not texts:
        # np.vstack([]) raises ValueError, so return an explicit empty matrix.
        # NOTE(review): float32 assumes encode()'s default output dtype — confirm.
        dim = model.get_sentence_embedding_dimension() or 0
        return np.empty((0, dim), dtype=np.float32)

    embs = []
    n_batches = (len(texts) + batch_size - 1) // batch_size  # ceiling division
    for start in tqdm(range(0, len(texts), batch_size), total=n_batches, desc="Creating embeddings"):
        batch_texts = texts[start : start + batch_size]
        embs.append(model.encode(batch_texts, batch_size=batch_size, prompt_name="classification"))
    return np.vstack(embs)

In [None]:
# Embed every train row and save the resulting matrix as a .npy file.
train_embs = create_text_embeddings(train)
np.save("data/train_text_embs_user2_small_384dim.npy", train_embs)

In [None]:
# Embed every test row and save the resulting matrix as a .npy file.
test_embs = create_text_embeddings(test)
np.save("data/test_text_embs_user2_small_384dim.npy", test_embs)

In [None]:
# Map each product id to the position of its row, i.e. the row of its embedding in
# the saved matrices. enumerate() gives the position even when the frame's index is
# not the default 0..n-1 range, and the loop variable no longer shadows builtin `id`.
train_id2idx_text_embs = {item_id: row for row, item_id in enumerate(train["id"].tolist())}
test_id2idx_text_embs = {item_id: row for row, item_id in enumerate(test["id"].tolist())}

# Context managers ensure both files are flushed and closed once written.
with open("data/train_id2idx_text_embs_user2_small_384dim.json", "w") as train_file:
    json.dump(train_id2idx_text_embs, train_file)
with open("data/test_id2idx_text_embs_user2_small_384dim.json", "w") as test_file:
    json.dump(test_id2idx_text_embs, test_file)