In [43]:
import warnings
import os

from torch.utils.data import Dataset

os.environ["TOKENIZERS_PARALLELISM"] = "false"

warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
import torch
import numpy as np
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset


In [3]:
# Загружаем предварительно подготовленный датасет
ds = load_dataset("DaNetQA")

model_name = "DeepPavlov/rubert-base-cased-conversational"
# Загружаем предобученную модель
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    model = model.to("cuda")
# Подготавливаем токенизатор
tokenizer = BertTokenizerFast.from_pretrained(model_name)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Функция подготовки выборок для multiple choice task
def prepare_features(example):
    # Конструируем prompt для каждого варианта выбора
    inputs = example['question'] + '[SEP]' + example['passage']
    # Токенизируем с сохранением связи индексов
    inputs = tokenizer(inputs,
                       truncation=True,
                       max_length=512,
                       padding="max_length"
                       )

    # Добавляем метку правильного ответа
    label = example.get('label')  # Если label отсутствует, значит тестовая выборка
    if label is not None:
        inputs['labels'] = int(label)
    return inputs

In [5]:
# Обрабатываем датасеты
for split_name in ['train', 'validation']:
    ds[split_name] = ds[split_name].map(
        prepare_features,
        batched=False,
        remove_columns=['passage', 'question', 'label']
    )

ds["validation"] = ds["validation"].shuffle().select(range(300))
ds["train"] = ds["train"].shuffle().select(range(1000))

Map:   0%|          | 0/821 [00:00<?, ? examples/s]

In [6]:
# Преобразуем объект в PyTorch TensorDataset
ds["train"].set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'labels'])
ds["validation"].set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'labels'])

ds["train"]

Dataset({
    features: ['idx', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [7]:
# Определение метрик точности (accuracy)
from sklearn.metrics import accuracy_score


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=1)
    return {"accuracy": accuracy_score(labels, predictions)}

In [8]:
# Аргументы обучения
training_args = TrainingArguments(
    output_dir='./rubert_finetuned_DaNetQA',
    eval_strategy="epoch",
    save_strategy='epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    num_train_epochs=7,
    weight_decay=0.01,
    greater_is_better=True,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Создаем тренера
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['validation'],
    compute_metrics=compute_metrics,
)

In [9]:
# Начинаем обучение
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.602574,0.663333
2,No log,0.824856,0.62
3,No log,1.382486,0.626667
4,0.396900,1.74493,0.633333
5,0.396900,1.623381,0.7
6,0.396900,1.969519,0.673333
7,0.396900,2.004319,0.68


TrainOutput(global_step=875, training_loss=0.24507098824637277, metrics={'train_runtime': 26723.3126, 'train_samples_per_second': 0.262, 'train_steps_per_second': 0.033, 'total_flos': 1841777387520000.0, 'train_loss': 0.24507098824637277, 'epoch': 7.0})

In [10]:
# Оцениваем модель на валидирующем наборе
eval_results = trainer.evaluate()
print(f'Точность в наборе для проверки: {eval_results["eval_accuracy"]:.3f}')


Точность в наборе для проверки: 0.700


In [11]:
# Сохранение финальной модели
trainer.save_model("./rubert_finetuned_DaNetQA/final_model")

In [12]:
# Загружаем предобученную модель
saved_model_path = "./rubert_finetuned_DaNetQA/final_model"
# Загрузить модель из сохранённого каталога:
loaded_model = BertForSequenceClassification.from_pretrained(saved_model_path)
# Также загрузить токенизатор из сохранённого каталога:
loaded_tokenizer = BertTokenizerFast.from_pretrained(model_name)

In [15]:
# Обрабатываем датасеты
ds["test"] = ds["test"].map(
        prepare_features,
        batched=False,
        remove_columns=['passage', 'question', 'label'],
    )

ds["test"]

Map:   0%|          | 0/805 [00:00<?, ? examples/s]

Dataset({
    features: ['idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 805
})

In [16]:
loaded_model.eval()

training_args = TrainingArguments(
    output_dir='./rubert_finetuned_DaNetQA',
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = 4,
    dataloader_drop_last = False,
)

# Создаем тренера
trainer = Trainer(
    model = loaded_model,
    args = training_args,
)

In [17]:
test_results_content = trainer.predict(ds["test"])[0]
test_results_content

array([[-3.436156 ,  3.3206792],
       [-2.9944618,  2.7741835],
       [-3.441501 ,  3.286354 ],
       ...,
       [-0.9691386,  1.0794065],
       [-3.43276  ,  3.2861784],
       [ 2.7329412, -2.4606788]], dtype=float32)

In [29]:
# Написание функции для получения предикта
def get_prediction():
    test_pred = trainer.predict(ds["test"])
    labels = np.argmax(test_pred.predictions, axis=-1)
    return labels


pred = get_prediction()

In [41]:
# Обрабатываем результат
pred_label = ['true' if l else "false" for l in pred]
result_test = ds["test"]
result_test = result_test.add_column("label", pred_label)
result_test = result_test.remove_columns(['input_ids', 'token_type_ids', 'attention_mask'])
result_test

Dataset({
    features: ['idx', 'label'],
    num_rows: 805
})

In [42]:
result_test.to_json('DaNetQA.jsonl')

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

21833