In [85]:
import warnings
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

warnings.filterwarnings("ignore", category=UserWarning)

In [86]:
import torch
from transformers import BertTokenizerFast, BertForMultipleChoice, Trainer, TrainingArguments
from datasets import load_dataset


In [87]:
# Функция подготовки выборок для multiple choice task
def prepare_features(batch):
    # Конструируем prompt для каждого варианта выбора
    choices = [batch['choice1'], batch['choice2']]
    premise = [batch['premise']] * len(choices)

    question_with_choices = [
        f"{batch['question']} {choice}" for choice in choices
    ]

    # Токенизируем с сохранением связи индексов
    inputs = tokenizer(premise, question_with_choices, truncation=True, max_length=128, padding="max_length")

    # Добавляем метку правильного ответа
    label = batch.get('label')  # Если label отсутствует, значит тестовая выборка
    if label is not None:
        inputs['labels'] = label
    return inputs

In [88]:
# Загружаем предварительно подготовленный датасет
ds = load_dataset("PARus")

model_name = "DeepPavlov/rubert-base-cased"
# Загружаем предобученную модель
model = BertForMultipleChoice.from_pretrained(model_name)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    model = model.to("cuda")
# Подготавливаем токенизатор
tokenizer = BertTokenizerFast.from_pretrained(model_name)

# # Обрабатываем датасеты
for split_name in ['train', 'validation']:
    ds[split_name] = ds[split_name].map(prepare_features,
                                        batched=False,
                                        remove_columns=['premise', 'choice1', 'choice2', 'question', 'label']
                                        )


Some weights of BertForMultipleChoice were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 400/400 [00:00<00:00, 3348.08 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 3293.63 examples/s]


In [89]:
# Преобразуем объект в PyTorch TensorDataset
ds["train"].set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'labels'])
ds["validation"].set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'labels'])

ds["train"]

Dataset({
    features: ['idx', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 400
})

In [90]:
# Определение метрик точности (accuracy)
from sklearn.metrics import accuracy_score


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = logits.argmax(axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

In [91]:
# Аргументы обучения
training_args = TrainingArguments(
    output_dir='./rubert_finetuned_PARus',
    eval_strategy="epoch",
    save_strategy='epoch',
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=2e-5,
    num_train_epochs=10,
    warmup_ratio=0.1,
    weight_decay=0.01,
    save_total_limit=1,
    greater_is_better=True,
    disable_tqdm=False,
    load_best_model_at_end=True,
)

# Создаем тренера
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['validation'],
    compute_metrics=compute_metrics
)



In [92]:
# Начинаем обучение
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.689627,0.51
2,No log,0.67998,0.58
3,No log,0.648276,0.6
4,No log,0.685157,0.65
5,No log,0.699222,0.64
6,No log,0.73163,0.62
7,No log,0.819332,0.65
8,No log,0.875768,0.62
9,No log,0.883407,0.65
10,0.343200,0.922288,0.64


TrainOutput(global_step=500, training_loss=0.3431547546386719, metrics={'train_runtime': 4212.3607, 'train_samples_per_second': 0.95, 'train_steps_per_second': 0.119, 'total_flos': 526217385984000.0, 'train_loss': 0.3431547546386719, 'epoch': 10.0})

In [93]:
# Оцениваем модель на валидирующем наборе
trainer.evaluate()


{'eval_loss': 0.9222877621650696,
 'eval_accuracy': 0.64,
 'eval_runtime': 30.9749,
 'eval_samples_per_second': 3.228,
 'eval_steps_per_second': 0.42,
 'epoch': 10.0}

In [94]:
# Сохранение финальной модели
trainer.save_model("./final_model")