In [4]:
!nvidia-smi

Wed Jul 31 10:06:31 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100 80G...  Off  | 00000000:CA:00.0 Off |                   On |
| N/A   35C    P0    62W / 300W |                  N/A |     N/A      Default |
|                               |                      |              Enabled |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| MIG devices:                                                                |
+------

In [None]:
import os
import torch
import warnings
import collections
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoModelForMaskedLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
from transformers import default_data_collator

# Environment flags, set before any training object is created: switch off the
# W&B and MLflow integrations and the tokenizers library's own parallelism.
os.environ.update(
    {
        "WANDB_DISABLED": "true",
        "DISABLE_MLFLOW_INTEGRATION": "true",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Allow TF32 for CUDA matmuls and for cuDNN operations.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

In [None]:
# Load the tokenizer from the location stored in TOKENIZER_PATH.
# os.environ[...] is used instead of os.getenv(...) so that an unset variable
# fails immediately with a KeyError naming it, rather than passing None on to
# from_pretrained.
tokenizer = AutoTokenizer.from_pretrained(os.environ['TOKENIZER_PATH'])

# Read the local parquet file as the 'train' split.
dataset = load_dataset('parquet', data_files={'train': 'df_train.parquet'})

In [None]:
# Collect the word_id of every token

def mlm_tokenize_function(examples):
    """Tokenize the 'description' column of a batch.

    When the global ``tokenizer`` is a fast tokenizer, a 'word_ids' entry is
    added that holds, for each row of the batch, the per-token word ids
    reported by the encoding. For a non-fast tokenizer the encoding is
    returned without that entry.
    """
    encoded = tokenizer(examples['description'])
    if not tokenizer.is_fast:
        return encoded

    num_rows = len(encoded['input_ids'])
    encoded['word_ids'] = list(map(encoded.word_ids, range(num_rows)))
    return encoded

# Tokenize the train split in batches with 8 worker processes, then keep only
# the columns that the later steps read.
_mlm_columns = ['attention_mask', 'input_ids', 'word_ids']
mlm_tokenized_datasets = (
    dataset['train']
    .map(mlm_tokenize_function, batched=True, num_proc=8)
    .select_columns(_mlm_columns)
)

In [None]:
# Pack the texts into chunks of 256 tokens to avoid wasted computation on PAD tokens

def group_texts(examples, chunk_size=256):
    """Concatenate each column of a batch and re-split it into fixed-size chunks.

    Args:
        examples: mapping from column name to a list of per-example sequences.
            Must contain an "input_ids" column.
        chunk_size: number of items in every output chunk.

    Returns:
        A dict with the same keys as ``examples``, where each value is a list
        of chunks holding exactly ``chunk_size`` items. The tail that does not
        fill a whole chunk is dropped (its length is measured on the first
        column). A "labels" entry is added that holds independent copies of
        the "input_ids" chunks.

    Raises:
        IndexError: if ``examples`` has no columns.
        KeyError: if ``examples`` has no "input_ids" column.
    """
    # Flatten every column in a single linear pass; sum(lists, []) rebuilds the
    # accumulated list on each addition and is quadratic in the batch size.
    concatenated_examples = {
        k: [item for sequence in examples[k] for item in sequence]
        for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Round down to a whole number of chunks.
    total_length = (total_length // chunk_size) * chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Copy each chunk so that an in-place edit of input_ids (e.g. masking)
    # cannot silently change the labels as well.
    result["labels"] = [chunk[:] for chunk in result["input_ids"]]
    return result

# Re-chunk the tokenized data in batches with 8 worker processes.
# Note: this rebinds the same name it reads from, so running this statement a
# second time would feed the already-chunked data back into group_texts.
mlm_tokenized_datasets = mlm_tokenized_datasets.map(
    function=group_texts,
    batched=True,
    num_proc=8,
)

In [None]:
# Mask whole words, each one chosen independently with probability 15%

wwm_probability = 0.15

def whole_word_masking_data_collator(features):
    """Collate features into a batch after masking randomly chosen whole words.

    For every feature, consecutive tokens that share a word id are grouped
    into one word. Each word is selected independently with probability
    ``wwm_probability``; all tokens of a selected word are replaced by
    ``tokenizer.mask_token_id`` in "input_ids". "labels" keeps the original
    label at the masked positions and is -100 everywhere else.

    The incoming feature dicts and their lists are not modified.

    Args:
        features: sequence of dicts, each with index-aligned "input_ids",
            "labels" and "word_ids" entries. A ``None`` word id marks a token
            that belongs to no word and is never masked.

    Returns:
        The result of ``default_data_collator`` applied to the masked
        features with the "word_ids" entry removed.
    """
    masked_features = []
    for feature in features:
        word_ids = feature["word_ids"]

        # Map a running word index to the token positions of that word.
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is None:
                # A token without a word id ends the current word. Without
                # this reset, equal word ids on both sides of such tokens
                # (e.g. word 0 of two adjacent texts in one chunk) would be
                # merged into a single word.
                current_word = None
                continue
            if word_id != current_word:
                current_word = word_id
                current_word_index += 1
            mapping[current_word_index].append(idx)

        # One Bernoulli draw per word.
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))

        # Work on a copy so the caller's feature keeps its unmasked tokens.
        input_ids = list(feature["input_ids"])
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_index in np.where(mask)[0].tolist():
            for idx in mapping[word_index]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id

        masked = {k: v for k, v in feature.items() if k != 'word_ids'}
        masked["input_ids"] = input_ids
        masked["labels"] = new_labels
        masked_features.append(masked)

    return default_data_collator(masked_features)

In [None]:
# Train / validation split

# 90% of the rows go to 'train' and 10% to 'test' (both counts rounded down);
# the fixed seed keeps the split the same between runs.
_num_rows = len(mlm_tokenized_datasets)
mlm_tokenized_datasets = mlm_tokenized_datasets.train_test_split(
    train_size=int(_num_rows * 0.9),
    test_size=int(_num_rows * 0.1),
    seed=42,
)

In [None]:
# MLM training
# os.environ[...] is used instead of os.getenv(...) so that an unset
# MODEL_PATH fails immediately with a KeyError naming it, rather than passing
# None on to from_pretrained.
mlm_model = AutoModelForMaskedLM.from_pretrained(os.environ['MODEL_PATH'])

training_args = TrainingArguments(
    # NOTE(review): OUTPUT_DIR is still read with getenv; how a None
    # output_dir is treated depends on the installed transformers version.
    output_dir=os.getenv('OUTPUT_DIR'),
    # Schedule and optimizer.
    num_train_epochs=1,
    warmup_steps=100,
    optim='adamw_torch',
    learning_rate=1e-4,
    weight_decay=1e-2,
    max_grad_norm=1.0,
    # Mixed precision; 16 sequences per device x 4 accumulation steps per
    # optimizer update.
    fp16=True,
    gradient_accumulation_steps=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluate on the held-out split every 100 steps.
    do_eval=True,
    eval_strategy='steps',
    eval_steps=100,
    # The collator reads the 'word_ids' column, so it must not be dropped.
    remove_unused_columns=False,
)

trainer = Trainer(
    model=mlm_model,
    args=training_args,
    train_dataset=mlm_tokenized_datasets['train'],
    eval_dataset=mlm_tokenized_datasets['test'],
    data_collator=whole_word_masking_data_collator,
)

In [None]:
# Start training with the `trainer` built in the previous cell; the call is
# the last expression of the cell, so its return value is shown as the output.
trainer.train()