In [None]:
import collections
import math
import pathlib

import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoModelForMaskedLM, AutoTokenizer

2025-05-26 07:48:09.366629: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748245689.566033      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748245689.626618      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
from transformers import default_data_collator

In [None]:
from transformers import Trainer, TrainingArguments

In [2]:
# Root of the data directory, relative to the notebook location.
DATA = pathlib.Path("../data")
# Parquet file holding the raw title/description table.
DESC_DATA = DATA.joinpath("raw", "rec_aaa_title_desc.pq")
# Output directory for the fine-tuned model.
TRAINED = pathlib.Path("../src/model/content/trained")

### Data preparation

In [3]:
# Minimum number of whitespace-separated tokens a combined
# "title % description" text must contain to be kept for training
# (compared against len(text.split()) in the filtering step).
MIN_LEN_DESCRIPTION = 20

In [4]:
# Load the raw title/description table from the parquet file at DESC_DATA
# and preview the first rows.
desc_df = pd.read_parquet(DESC_DATA)
desc_df.head()

Unnamed: 0,Item_id,Title,t_rn,DescriptionRu,d_rn
0,238750565,Сход-развал развал схождения hanter 3D,1,"Развал-схождения 3D качественно, большой опыт....",1
1,298854720,Новые Beats Powerbeats Pro Cloud Pink от Apple,1,Продам НОВЫЕ Беспроводные наушники с микрофоно...,1
2,10409000978,Новые джинсы - капри белые,1,Джинсы (капри) новые растягиваются (стрейч). О...,1
3,14669000687,Подкрылки передние honda accord 7,1,Продается передний правый подкрылок на HONDA A...,1
4,22767000267,Цепочка. Серебро 925 пробы,1,❗Не отправляю. Только самовывоз. \nЦепочка тон...,1


In [5]:
# Build the training text as "<Title> % <DescriptionRu>".
# Vectorised string concatenation replaces a row-wise `apply(axis=1)`, which
# invoked a Python lambda once per row and is much slower on a frame this size.
# NOTE: a missing Title/DescriptionRu now yields a missing value in the new
# column instead of raising inside the lambda.
# NOTE: the column keeps its original spelling ("full_decription") because
# later cells look it up by that exact key.
desc_df["full_decription"] = desc_df["Title"] + " % " + desc_df["DescriptionRu"]

In [6]:
# Drop the source columns that are no longer needed once the combined text
# column exists. Rebinding the name (instead of `inplace=True`) together with
# `errors="ignore"` makes this cell safe to re-run: a second execution finds
# the columns already gone and is a no-op rather than a KeyError.
desc_df = desc_df.drop(
    columns=["Title", "DescriptionRu", "Item_id", "t_rn", "d_rn"],
    errors="ignore",
)

In [7]:
%%time
# Wrap the prepared frame as the single "train" split of a DatasetDict.
dataset = DatasetDict(train=Dataset.from_pandas(desc_df))
dataset

CPU times: user 1.64 s, sys: 1.85 s, total: 3.49 s
Wall time: 3.48 s


DatasetDict({
    train: Dataset({
        features: ['full_decription'],
        num_rows: 435355
    })
})

In [8]:
# Keep only texts with at least MIN_LEN_DESCRIPTION whitespace-separated tokens.
dataset["train"] = dataset["train"].filter(
    lambda example: MIN_LEN_DESCRIPTION <= len(example["full_decription"].split())
)
dataset

Filter:   0%|          | 0/435355 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['full_decription'],
        num_rows: 283914
    })
})

In [9]:
# Identifier of the pretrained checkpoint; used for both the tokenizer and
# the masked-LM model loaded later.
pretrained_model_name = "cointegrated/rubert-tiny2"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
# Show the vocabulary size and whether a fast tokenizer was loaded
# (the MLM tokenization step only produces `word_ids` when it is fast).
tokenizer.vocab_size, tokenizer.is_fast

tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

(83828, True)

In [10]:
def tokenize_function(examples):
    """Tokenize a batch of combined title/description texts.

    Args:
        examples: Batch mapping as passed by ``Dataset.map(..., batched=True)``;
            only the ``"full_decription"`` column is read.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per text, as plain unpadded Python lists.
    """
    # Deliberately no `padding=True` / `return_tensors="pt"`: padding inside
    # `map` pads every row to the longest text of its batch and persists those
    # pad tokens, and tensors returned here are converted back to lists when
    # the batch is written to the dataset. Padding belongs in the collator.
    return tokenizer(
        examples["full_decription"],
        truncation=True,
        max_length=512,
    )

In [11]:
# Tokenize every split of the DatasetDict in batches, using 8 worker processes.
dataset = dataset.map(tokenize_function, batched=True, num_proc=8)
dataset

Map (num_proc=8):   0%|          | 0/283914 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['full_decription', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 283914
    })
})

In [12]:
# Load the pretrained checkpoint with a masked-language-modelling head.
# NOTE(review): `mlm_model` is loaded again from the same checkpoint right
# before the Trainer is built, so this instance appears to be unused —
# confirm and consider keeping only one of the two loads.
mlm_model = AutoModelForMaskedLM.from_pretrained(pretrained_model_name)

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

In [13]:
def mlm_tokenize_function(examples):
    """Tokenize a batch of texts and, for fast tokenizers, attach word ids.

    Args:
        examples: Batch mapping; only ``"full_decription"`` is read.

    Returns:
        The tokenizer output (no truncation or padding requested). When the
        tokenizer is fast, a ``"word_ids"`` entry is added with one list per
        text in the batch.
    """
    encoded = tokenizer(examples["full_decription"])
    if not tokenizer.is_fast:
        return encoded

    n_rows = len(encoded["input_ids"])
    encoded["word_ids"] = list(map(encoded.word_ids, range(n_rows)))
    return encoded


# Tokenize the raw training text again, this time without truncation or
# padding, and keep only the three columns needed for whole-word masking.
mlm_tokenized_datasets = (
    dataset["train"]
    .map(mlm_tokenize_function, batched=True, num_proc=2)
    .select_columns(["attention_mask", "input_ids", "word_ids"])
)
mlm_tokenized_datasets

Map (num_proc=2):   0%|          | 0/283914 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2860 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2605 > 2048). Running this sequence through the model will result in indexing errors


Dataset({
    features: ['attention_mask', 'input_ids', 'word_ids'],
    num_rows: 283914
})

In [14]:
def group_texts(examples, chunk_size=256):
    """Concatenate a batch of tokenized texts and cut it into fixed-size chunks.

    Args:
        examples: Batch mapping of column name -> list of per-example lists
            (for instance ``input_ids``, ``attention_mask``, ``word_ids``).
            Must contain ``"input_ids"``.
        chunk_size: Length of every produced chunk.

    Returns:
        A dict with the same columns, each a list of ``chunk_size``-long
        lists, plus ``"labels"`` holding an independent copy of the
        ``"input_ids"`` chunks. The trailing remainder of the batch that does
        not fill a whole chunk is dropped.
    """
    # Flatten with a comprehension: `sum(list_of_lists, [])` copies the growing
    # accumulator on every addition, which is quadratic in the batch length.
    concatenated = {
        key: [item for sequence in examples[key] for item in sequence]
        for key in examples.keys()
    }

    # Length is taken from the first column and rounded down to whole chunks.
    total_length = len(next(iter(concatenated.values())))
    total_length = (total_length // chunk_size) * chunk_size

    result = {
        key: [
            column[start : start + chunk_size]
            for start in range(0, total_length, chunk_size)
        ]
        for key, column in concatenated.items()
    }
    # Copy each chunk so that editing `input_ids` later (e.g. inserting mask
    # tokens) can never alter the labels through a shared list object.
    result["labels"] = [list(chunk) for chunk in result["input_ids"]]
    return result


# Concatenate and re-chunk the tokenized texts batch by batch (8 worker
# processes). Rows no longer correspond to individual texts afterwards, so
# the number of rows changes.
mlm_tokenized_datasets = mlm_tokenized_datasets.map(
    group_texts, batched=True, num_proc=8
)
mlm_tokenized_datasets

Map (num_proc=8):   0%|          | 0/283914 [00:00<?, ? examples/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'word_ids', 'labels'],
    num_rows: 163879
})

In [15]:
# Probability with which each whole word is selected for masking.
wwm_probability = 0.2


def _group_token_indices_by_word(word_ids):
    """Map a running word counter to the token positions forming that word."""
    mapping = collections.defaultdict(list)
    current_word_index = -1
    current_word = None
    for idx, word_id in enumerate(word_ids):
        if word_id is None:
            # Special token: it belongs to no word, and whatever follows it
            # starts a new word. Resetting here keeps two neighbouring texts
            # inside one chunk from being merged when the first ends and the
            # second begins with the same word id.
            current_word = None
            continue
        if word_id != current_word:
            current_word = word_id
            current_word_index += 1
        mapping[current_word_index].append(idx)
    return mapping


def whole_word_masking_data_collator(features):
    """Collate features into a batch, masking randomly chosen whole words.

    Each word (a run of tokens sharing one ``word_ids`` value) is selected
    independently with probability ``wwm_probability``. All tokens of a
    selected word are replaced by the tokenizer's mask token in ``input_ids``
    and keep their original id in ``labels``; every other label position is
    set to -100.

    Args:
        features: List of dicts with ``"input_ids"``, ``"labels"`` and
            ``"word_ids"``. The dicts and their lists are not modified.

    Returns:
        The result of ``default_data_collator`` on the masked features with
        the ``"word_ids"`` entry removed.
    """
    masked_features = []
    for feature in features:
        mapping = _group_token_indices_by_word(feature["word_ids"])

        # One Bernoulli draw per word decides whether it is masked.
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))

        # Work on a copy so the caller's feature is left untouched.
        input_ids = list(feature["input_ids"])
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_index in np.where(mask)[0]:
            for idx in mapping[word_index.item()]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id

        masked = {k: v for k, v in feature.items() if k != "word_ids"}
        masked["input_ids"] = input_ids
        masked["labels"] = new_labels
        masked_features.append(masked)

    return default_data_collator(masked_features)

In [16]:
# Split the chunks roughly 90/10 into train and test with a fixed seed.
# Both sizes are truncated to integers, so their sum can be one row short of
# the full dataset.
# NOTE(review): this rebinds `mlm_tokenized_datasets` from the chunked dataset
# to the split result (indexed by "train"/"test" below), so re-running this
# cell on its own would act on the split object — presumably an error; re-run
# from the chunking cell instead.
mlm_tokenized_datasets = mlm_tokenized_datasets.train_test_split(
    train_size=int(len(mlm_tokenized_datasets) * 0.9),
    test_size=int(len(mlm_tokenized_datasets) * 0.1),
    seed=42,
)

### Train

In [17]:
# Start from a freshly loaded copy of the pretrained checkpoint.
mlm_model = AutoModelForMaskedLM.from_pretrained(pretrained_model_name)

# Trainer for one epoch of masked-LM fine-tuning with whole-word masking.
trainer = Trainer(
    model=mlm_model,
    args=TrainingArguments(
        output_dir=TRAINED,
        num_train_epochs=1,
        warmup_steps=100,
        optim="adamw_torch",
        learning_rate=1e-4,
        weight_decay=1e-2,
        fp16=True,
        max_grad_norm=1.0,
        # 16 samples x 4 accumulation steps = 64 samples per optimizer step
        # on each device.
        gradient_accumulation_steps=4,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        do_eval=True,
        # Evaluate on the test split every 500 steps.
        eval_strategy="steps",
        eval_steps=500,
        # Keep the `word_ids` column: the custom collator reads it to decide
        # which tokens belong to the same word.
        remove_unused_columns=False,
        report_to="none",
    ),
    train_dataset=mlm_tokenized_datasets["train"],
    eval_dataset=mlm_tokenized_datasets["test"],
    # Masks are drawn at collation time, for training and evaluation alike.
    data_collator=whole_word_masking_data_collator,
)

In [18]:
# Baseline: perplexity on the test split before any training step has run.
# The collator draws random masks on every call, so the figure can vary
# between evaluations.
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 481.89


In [19]:
# Run the fine-tuning configured above (one epoch, evaluation every 500 steps).
trainer.train()

Step,Training Loss,Validation Loss,Model Preparation Time
500,5.1544,4.622911,0.0012
1000,4.7016,4.413905,0.0012
1500,4.5703,4.316596,0.0012
2000,4.4891,4.262186,0.0012


TrainOutput(global_step=2304, training_loss=4.693500836690267, metrics={'train_runtime': 1766.1313, 'train_samples_per_second': 83.511, 'train_steps_per_second': 1.305, 'total_flos': 562671485190144.0, 'train_loss': 4.693500836690267, 'epoch': 0.9996745850959974})

In [20]:
# Perplexity on the test split after fine-tuning, for comparison with the
# baseline measured before training.
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 70.10


In [21]:
# Persist the fine-tuned model. The Trainer was built without a tokenizer, so
# the tokenizer files are written explicitly to make the output directory
# loadable on its own. The trailing `;` hides the returned file list.
trainer.save_model(TRAINED)
tokenizer.save_pretrained(TRAINED);