# Load and Prepare Data

In [None]:
!pip install transformers -q
!pip install datasets -q
!pip install evaluate -q
!pip install optuna -q
!pip install --upgrade accelerate



In [None]:
# Import the required packages
import collections
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import time
import os

from tqdm import tqdm

# pytorch libraries
import torch
import torch.nn as nn

import transformers
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering
from transformers import TrainingArguments, Trainer
from datasets import load_dataset, Dataset, DatasetDict

import evaluate
import optuna

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the dataset registered under the name "sberquad" via `datasets.load_dataset`.
dataset = load_dataset("sberquad")
# Bare expression: the notebook displays the object's repr (splits, features, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 45328
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 5036
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 23936
    })
})

In [None]:
# Checkpoint identifier shared by the tokenizer here and the model loaded further down.
model_name = "timpal0l/mdeberta-v3-base-squad2"
# Tokenizer matching the checkpoint above; used by the preprocessing function and the Trainer.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Sanity check: encode one (question, context) pair from the training split and
# decode the ids back to text to see how the tokenizer lays out the two segments.
context, question = (dataset["train"][2][field] for field in ("context", "question"))

inputs = tokenizer(question, context)
tokenizer.decode(inputs["input_ids"])

'[CLS] что встречается в протерозойских отложениях?[SEP] В протерозойских отложениях органические остатки встречаются намного чаще, чем в архейских. Они представлены известковыми выделениями сине-зелёных водорослей, ходами червей, остатками кишечнополостных. Кроме известковых водорослей, к числу древнейших растительных остатков относятся скопления графито-углистого вещества, образовавшегося в результате разложения Corycium enigmaticum. В кремнистых сланцах железорудной формации Канады найдены нитевидные водоросли, грибные нити и формы, близкие современным кокколитофоридам. В железистых кварцитах Северной Америки и Сибири обнаружены железистые продукты жизнедеятельности бактерий.[SEP]'

In [None]:
stride = 128
max_seq_length = 384


def _find_context_span(sequence_ids):
    """Return (first, last) token indices of the run whose sequence id is 1.

    If no token has sequence id 1 the returned pair satisfies first > last.
    """
    idx = 0
    while idx < len(sequence_ids) and sequence_ids[idx] != 1:
        idx += 1
    first = idx
    while idx < len(sequence_ids) and sequence_ids[idx] == 1:
        idx += 1
    return first, idx - 1


def _answer_token_span(offsets, context_start, context_end, start_char, end_char):
    """Map a character span to (start_token, end_token) inside the context window.

    Returns (0, 0) when the answer is not fully covered by the tokens
    context_start..context_end (for example because the context was truncated).
    """
    if context_start > context_end:
        return 0, 0
    # The answer must start at or after the first context token and end at or
    # before the last one; anything else cannot be labelled correctly.
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return 0, 0

    # Walk forward to the last token that starts at or before the answer start.
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= start_char:
        idx += 1
    start_token = idx - 1

    # Walk backward to the first token that ends at or after the answer end.
    idx = context_end
    while idx >= context_start and offsets[idx][1] >= end_char:
        idx -= 1
    return start_token, idx + 1


def preprocess_examples(examples):
    """Tokenize a batch of question/context pairs and attach answer token labels.

    Args:
        examples: batch dict with the keys 'question', 'context' and 'answers';
            each entry of 'answers' is a dict with 'text' and 'answer_start' lists,
            of which only the first element is used.

    Returns:
        The tokenizer output with two added keys, 'start_positions' and
        'end_positions' (one int per example). Examples whose answer is missing
        or not fully inside the tokenized context get the label (0, 0).

    NOTE(review): `stride` is passed but `return_overflowing_tokens` is not, so
    this presumably yields exactly one feature per example (long contexts are
    truncated). The metric functions in this notebook zip predictions with
    dataset rows and depend on that 1:1 alignment - confirm before changing it.
    """
    questions = [q.strip() for q in examples['question']]
    inputs = tokenizer(
        questions,
        examples['context'],
        max_length=max_seq_length,
        truncation='only_second',
        stride=stride,
        return_offsets_mapping=True,
        padding='max_length',
    )

    start_positions = []
    end_positions = []
    for i, (offsets, answer) in enumerate(zip(inputs['offset_mapping'], examples['answers'])):
        # No annotated answer at all: fall back to the (0, 0) label.
        if not answer['text'] or not answer['answer_start']:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer['answer_start'][0]
        end_char = start_char + len(answer['text'][0])

        context_start, context_end = _find_context_span(inputs.sequence_ids(i))
        start_token, end_token = _answer_token_span(
            offsets, context_start, context_end, start_char, end_char)
        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions
    return inputs

In [None]:
# To speed up the hyperparameter search, work on a small random subset of each split.
part_of_data = 0.05


def _sample_then_preprocess(split, fraction, rng):
    """Pick a random `fraction` of `split` (without replacement), then tokenize it.

    Selecting before mapping means only the kept rows are tokenized; the rows
    and their order are the same as when mapping first and selecting afterwards.
    """
    subset_size = int(len(split) * fraction)
    picked_rows = rng.choice(len(split), size=subset_size, replace=False)
    return split.select(picked_rows).map(preprocess_examples, batched=True)


# Fixed seed so the same subset is drawn on every Restart & Run All.
_optuna_rng = np.random.default_rng(42)
DATASETS_for_optuna = DatasetDict({
    split_name: _sample_then_preprocess(dataset[split_name], part_of_data, _optuna_rng)
    for split_name in ('train', 'validation', 'test')
})

In [None]:
# Full-size datasets: every split is tokenized with preprocess_examples and then
# reordered by a random permutation of all of its rows (i.e. shuffled).
# The generator is seeded so the row order is reproducible across runs.
_full_rng = np.random.default_rng(42)
DATASETS = DatasetDict({
    split_name: dataset[split_name]
    .map(preprocess_examples, batched=True)
    .select(_full_rng.permutation(len(dataset[split_name])))
    for split_name in ('train', 'validation', 'test')
})

# Metrics

In [None]:
metric = evaluate.load("squad")


def compute_metrics_for_optuna(eval_preds):
    """Compute F1 and exact match on DATASETS_for_optuna['validation'].

    Args:
        eval_preds: evaluation output whose first element holds the start and
            end logits. Assumes it stacks to (2, num_examples, seq_len) so that
            the argmax/transpose below yields one (start_token, end_token) row
            per example - TODO confirm.

    Returns:
        Dict with 'f1' and 'exact_match', both scaled to the 0-1 range.

    Predictions are paired with dataset rows purely by position, so the rows of
    DATASETS_for_optuna['validation'] must be in the same order as the
    predictions. The function is hard-wired to that split; predictions for any
    other dataset would be scored against the wrong rows.
    """
    eval_dataset = DATASETS_for_optuna['validation']
    y_pred = np.argmax(eval_preds[0], -1).T

    predictions = []
    references = []
    for row_idx, (data, pred) in enumerate(zip(eval_dataset, y_pred)):
        # Convert answer start and end into character positions in the
        # original text using the offset mapping list.
        start_char = data['offset_mapping'][pred[0]][0]
        end_char = data['offset_mapping'][pred[1]][1]
        # The id only has to pair a prediction with its reference, so the row
        # position is used; it stays unique even if dataset ids repeat.
        predictions.append({'prediction_text': data['context'][start_char:end_char],
                            'id': str(row_idx)})
        references.append({'answers': data['answers'],
                           'id': str(row_idx)})

    if not predictions:
        return {'f1': 0.0, 'exact_match': 0.0}

    # One metric call for the whole set instead of one call per example.
    results = metric.compute(predictions=predictions, references=references)
    # The metric reports 0-100 averaged over the scored pairs; rescale to 0-1
    # and keep the original normalisation by the full dataset length.
    scale = len(predictions) / (100 * len(eval_dataset))
    return {'f1': results['f1'] * scale,
            'exact_match': results['exact_match'] * scale}

In [None]:
def compute_metrics(eval_preds):
    """Compute F1 and exact match on DATASETS['validation'].

    Args:
        eval_preds: evaluation output whose first element holds the start and
            end logits. Assumes it stacks to (2, num_examples, seq_len) so that
            the argmax/transpose below yields one (start_token, end_token) row
            per example - TODO confirm.

    Returns:
        Dict with 'f1' and 'exact_match', both scaled to the 0-1 range.

    Predictions are paired with dataset rows purely by position, so the rows of
    DATASETS['validation'] must be in the same order as the predictions. The
    function is hard-wired to that split; predictions for any other dataset
    would be scored against the wrong rows.
    """
    eval_dataset = DATASETS['validation']
    y_pred = np.argmax(eval_preds[0], -1).T

    predictions = []
    references = []
    for row_idx, (data, pred) in enumerate(zip(eval_dataset, y_pred)):
        # Convert answer start and end into character positions in the
        # original text using the offset mapping list.
        start_char = data['offset_mapping'][pred[0]][0]
        end_char = data['offset_mapping'][pred[1]][1]
        # The id only has to pair a prediction with its reference, so the row
        # position is used; it stays unique even if dataset ids repeat.
        predictions.append({'prediction_text': data['context'][start_char:end_char],
                            'id': str(row_idx)})
        references.append({'answers': data['answers'],
                           'id': str(row_idx)})

    if not predictions:
        return {'f1': 0.0, 'exact_match': 0.0}

    # One metric call for the whole set instead of one call per example.
    results = metric.compute(predictions=predictions, references=references)
    # The metric reports 0-100 averaged over the scored pairs; rescale to 0-1
    # and keep the original normalisation by the full dataset length.
    scale = len(predictions) / (100 * len(eval_dataset))
    return {'f1': results['f1'] * scale,
            'exact_match': results['exact_match'] * scale}

# Model

In [None]:
# Hyperparameters
# NOTE(review): the *_MIN / *_CEIL and MIN_* / MAX_* pairs look like search-range
# bounds for a tuner; the code consuming them is not in this part of the
# notebook - confirm.
LR_MIN, LR_CEIL = 4e-5, 0.01
WD_MIN, WD_CEIL = 4e-5, 0.01
WR_MIN, WR_CEIL = 0.01, 0.2
MIN_GRAD_ACC, MAX_GRAD_ACC = 1, 5
MIN_EPOCHS, MAX_EPOCHS = 2, 5
# Per-device batch sizes handed to TrainingArguments.
PER_DEVICE_EVAL_BATCH = PER_DEVICE_TRAIN_BATCH = 10
NUM_TRIALS = 3
SAVE_DIR, NAME_OF_MODEL = 'opt-test', 'huggingoptunaface'

In [None]:
# Load the question-answering model for the checkpoint named in `model_name`.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Release cached, currently unused GPU memory held by PyTorch before placing the model.
torch.cuda.empty_cache()
# Move the model to the chosen device; as the last expression it also displays the model.
model.to(device)

DebertaV2ForQuestionAnswering(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(251000, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True

In [None]:
# Hyperparameter values used for the final training run below.
# NOTE(review): these look like the output of a tuning run - confirm their origin.
learning_rate, weight_decay = 0.0002839561415811, 6.811462411625139e-05
warmup_ratio = 0.1283623708167592
gradient_accumulation_step, epoch = 1, 3

In [None]:
# Training configuration for the final fine-tuning run.
# Only warmup_ratio is passed: a positive warmup_steps overrides warmup_ratio,
# so the former warmup_steps=200 silently disabled the warmup_ratio chosen above.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; adjust once the installed version is pinned.
training_args = TrainingArguments("mdeberta-v3-base-squad2",
                                  evaluation_strategy="steps",
                                  eval_steps=1000,
                                  logging_steps=1000,
                                  save_steps=5000,
                                  optim="adamw_torch",
                                  learning_rate=learning_rate,
                                  per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH,
                                  per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
                                  lr_scheduler_type='cosine',
                                  weight_decay=weight_decay,
                                  warmup_ratio=warmup_ratio,
                                  gradient_accumulation_steps=gradient_accumulation_step,
                                  num_train_epochs=epoch)

# Trainer over the full train/validation splits; compute_metrics scores each
# evaluation pass against DATASETS['validation'].
trainer = Trainer(model,
                  training_args,
                  train_dataset=DATASETS['train'],
                  eval_dataset=DATASETS['validation'],
                  tokenizer=tokenizer,
                  compute_metrics=compute_metrics)

In [None]:
# Release cached, currently unused GPU memory before starting the training loop.
torch.cuda.empty_cache()
# Run training; as the last expression the returned training summary is displayed.
trainer.train()

Step,Training Loss,Validation Loss,F1,Exact Match
1000,2.5336,2.395486,0.63488,0.420969
2000,2.4297,2.206887,0.674568,0.455123
3000,2.3681,2.205993,0.671488,0.462669
4000,2.3176,2.207917,0.708822,0.487887
5000,2.2062,2.095395,0.713921,0.49444
6000,2.0839,2.037686,0.716817,0.499404
7000,1.9888,1.937767,0.722898,0.515687
8000,1.9127,1.903542,0.739591,0.532764
9000,1.831,1.818703,0.752485,0.549047
10000,1.6048,1.847346,0.737164,0.53336


TrainOutput(global_step=13599, training_loss=1.976883267819842, metrics={'train_runtime': 16222.0027, 'train_samples_per_second': 8.383, 'train_steps_per_second': 0.838, 'total_flos': 2.6649614865752064e+16, 'train_loss': 1.976883267819842, 'epoch': 3.0})

# Get Predictions on Test Data

In [None]:
# Run the trainer's prediction loop over the test split.
# NOTE(review): the Trainer was built with compute_metrics, which scores against
# DATASETS['validation']; any metrics it attaches to this call are discarded here.
test_dataset = DATASETS['test']
test_logits, _, _ = trainer.predict(test_dataset)

# Best start/end token per example. argmax over the stacked (start, end) logits
# gives shape (2, num_examples); the transpose turns that into one
# (start_token, end_token) row per example so it can be zipped with the dataset.
preds = np.argmax(test_logits, axis=-1).T

# Build the prediction/reference records for the SQuAD metric.
predictions = []
references = []
for row_idx, (data, pred) in enumerate(zip(test_dataset, preds)):
    # Token indices -> character offsets in the original context.
    start_char = data['offset_mapping'][pred[0]][0]
    end_char = data['offset_mapping'][pred[1]][1]

    # The id only pairs a prediction with its reference, so the row position is used.
    predictions.append({'prediction_text': data['context'][start_char:end_char],
                        'id': str(row_idx)})
    references.append({'answers': data['answers'],
                       'id': str(row_idx)})

example_predictions = [item['prediction_text'] for item in predictions]
example_references = [item['answers']['text'][0] for item in references]

# Average scores over the test set, converted from the metric's 0-100 range to 0-1.
if predictions:
    results = metric.compute(predictions=predictions, references=references)
    f1_score = results['f1'] / 100
    exact_match = results['exact_match'] / 100
else:
    f1_score = exact_match = 0.0

print(f"F1 Score: {f1_score}")
print(f"Exact Match: {exact_match}")

# Scores are meaningless when the split ships without reference answers.
if example_references and not any(example_references):
    print("Warning: every reference answer in the test split is empty; "
          "the scores above do not reflect model quality.")

# Show the first five predictions next to their reference answers.
print("\nRandom Predicted Examples:")
for pred, ref in zip(example_predictions[:5], example_references[:5]):
    print(f"Ref Answer: {ref}\nPred Answer: {pred}\n")

F1 Score: 0.0
Exact Match: 0.0

Random Predicted Examples:
Ref Answer: 
Pred Answer: Наряду с кровеносной системой у позвоночных есть другая, связанная с ней, сосудистая система — лимфатическая. Она состоит из лимфатических сосудов и лимфатических желёз. Ли

Ref Answer: 
Pred Answer:  преобразований в области лексики

