# Load and Prepare Data

In [None]:
!pip install transformers -q
!pip install datasets -q
!pip install evaluate -q
!pip install optuna -q
#!pip install --upgrade accelerate

In [None]:
import collections
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import time
import os

from tqdm import tqdm

# pytorch libraries
import torch
import torch.nn as nn

import transformers
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering
from transformers import TrainingArguments, Trainer
from datasets import load_dataset, Dataset, DatasetDict

import evaluate
import optuna

import matplotlib.pyplot as plt

In [None]:
# Identifiers of the pretrained question-answering checkpoints; one
# tokenizer/model pair is loaded per entry, in this order.
chekpoints = [
    "timpal0l/mdeberta-v3-base-squad2",
    "bert-large-uncased-whole-word-masking-finetuned-squad",
    "deepset/roberta-base-squad2",
]

# Parallel lists: models[k] and tokenizers[k] both come from chekpoints[k].
models, tokenizers = [], []
for chekpoint in chekpoints:
    # These are module-level names, so once the loop finishes `tokenizer`
    # and `model` remain bound to the objects of the LAST checkpoint.
    # Later cells call `tokenizer` directly and therefore pick up that one.
    tokenizer = AutoTokenizer.from_pretrained(chekpoint)
    model = AutoModelForQuestionAnswering.from_pretrained(chekpoint)
    tokenizers += [tokenizer]
    models += [model]

In [None]:
from datasets import load_dataset

# Load the "sberquad" dataset; later cells index its "train", "validation"
# and "test" splits.
dataset = load_dataset("sberquad")

In [None]:
# Sanity check: encode one training example as a (question, context) pair and
# decode the ids back to text to inspect what the tokenizer produces.
sample = dataset["train"][2]
context, question = sample["context"], sample["question"]

inputs = tokenizer(question, context)
tokenizer.decode(inputs["input_ids"])

In [None]:
# 64 1024

# Tokenization settings used by preprocess_examples.
# NOTE(review): as far as the tokenizer documentation describes it, `stride`
# only affects overflowing windows, and `return_overflowing_tokens` is not
# requested below, so every example yields exactly one (possibly truncated)
# feature - confirm that this is intended.
stride = 128
max_seq_length = 384


def _answer_token_span(offset, sequence_ids, start_char, end_char):
    """Map a character-level answer span to token indices of one feature.

    Args:
        offset: per-token (char_start, char_end) pairs of the feature.
        sequence_ids: per-token sequence id; context tokens are marked with 1.
        start_char: index of the first answer character in the context.
        end_char: index one past the last answer character in the context.

    Returns:
        (start_token, end_token), or (0, 0) when the feature has no context
        tokens or the answer is not fully contained in the tokenized context.
    """
    n_tokens = len(sequence_ids)

    # Locate the first and last context token. The bounds checks keep the scan
    # inside the list even if no non-context token follows the context.
    idx = 0
    while idx < n_tokens and sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while idx < n_tokens and sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    if context_start > context_end:
        return 0, 0

    # The answer must lie completely inside the context window. An answer
    # that is cut off by truncation is treated like a missing answer, so the
    # end label can never land on the token following the context.
    if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
        return 0, 0

    # Last context token whose start is <= start_char.
    idx = context_start
    while idx <= context_end and offset[idx][0] <= start_char:
        idx += 1
    start_token = idx - 1

    # First context token (scanning from the right) whose end is >= end_char.
    idx = context_end
    while idx >= context_start and offset[idx][1] >= end_char:
        idx -= 1
    end_token = idx + 1

    return start_token, end_token


def preprocess_examples(examples, tok=None):
    """Tokenize a batch of QA examples and attach answer token positions.

    Args:
        examples: batch dict with the keys 'question', 'context' and
            'answers'; each answers entry is a dict with 'text' and
            'answer_start' lists, of which only the first element is used.
        tok: optional tokenizer callable. When omitted, the module-level
            `tokenizer` is used, which keeps `dataset.map(preprocess_examples,
            batched=True)` working unchanged.

    Returns:
        The tokenizer output with two extra entries, 'start_positions' and
        'end_positions' (one int per example). Examples without an answer, or
        whose answer does not fit into the truncated context, get (0, 0).
    """
    tok = tokenizer if tok is None else tok

    questions = [q.strip() for q in examples['question']]
    inputs = tok(
        questions,
        examples['context'],
        max_length=max_seq_length,
        truncation='only_second',
        stride=stride,
        return_offsets_mapping=True,
        padding='max_length',
    )

    answers = examples['answers']
    start_positions = []
    end_positions = []
    for i, offset in enumerate(inputs['offset_mapping']):
        answer = answers[i]

        # No annotated answer for this example: use the (0, 0) label.
        if len(answer['answer_start']) == 0 or len(answer['text']) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer['answer_start'][0]
        end_char = start_char + len(answer['text'][0])
        start_token, end_token = _answer_token_span(
            offset, inputs.sequence_ids(i), start_char, end_char
        )
        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions
    return inputs

In [None]:
# To speed up the hyperparameter search, work on a small random subset of
# every split.
part_of_data = 0.05


def _sample_then_preprocess(split, fraction):
    """Draw a random `fraction` of `split` (without replacement) and preprocess it.

    The rows are selected BEFORE `preprocess_examples` runs, so only the kept
    rows are tokenized instead of tokenizing the whole split and discarding
    most of the result. The sampled indices come from `np.random.choice`, so
    the subset differs between runs unless NumPy's global seed is fixed.
    """
    n_rows = len(split)
    keep = np.random.choice(n_rows, int(n_rows * fraction), replace=False)
    return split.select(keep).map(preprocess_examples, batched=True)


DATASETS_for_optuna = DatasetDict({
    split_name: _sample_then_preprocess(dataset[split_name], part_of_data)
    for split_name in ('train', 'validation', 'test')
})
DATASETS_for_optuna

In [None]:
def _preprocess_and_permute(split):
    """Preprocess every row of `split`, then reorder the rows randomly.

    The permutation is drawn with `np.random.choice` over all row indices
    (sample size equal to the split size, without replacement).
    """
    features = split.map(preprocess_examples, batched=True)
    row_count = len(split)
    order = np.random.choice(range(row_count), int(row_count), replace=False)
    return features.select(order)


# Full-size datasets: all rows of each split, preprocessed and shuffled.
DATASETS = DatasetDict({
    split_name: _preprocess_and_permute(dataset[split_name])
    for split_name in ('train', 'validation', 'test')
})
DATASETS

# Metrics

In [None]:
metric = evaluate.load("squad")


def compute_metrics_for_optuna(eval_preds):
    """Score predictions against the validation split of DATASETS_for_optuna.

    Args:
        eval_preds: evaluation output whose first element holds the model
            logits (presumably the start and end logits - confirm against the
            Trainer that calls this).

    Returns:
        Dict with 'f1' and 'exact_match', each averaged over the number of
        validation rows and rescaled from the metric's 0-100 range to 0-1.
    """
    validation_rows = DATASETS_for_optuna['validation']

    # Argmax over the last axis, then transpose so that each row of
    # `token_spans` is one example's (start_token, end_token) pair.
    token_spans = np.argmax(eval_preds[0], -1).T

    f1_score = 0
    exact_match = 0
    for row, span in zip(validation_rows, token_spans):
        # Translate the token indices back into character positions of the
        # original context through the stored offset mapping.
        offsets = row['offset_mapping']
        answer_text = row['context'][offsets[span[0]][0]:offsets[span[1]][1]]
        row_id = str(row['id'])

        # The metric is evaluated for one example at a time.
        results = metric.compute(
            predictions=[{'prediction_text': answer_text, 'id': row_id}],
            references=[{'answers': row['answers'], 'id': row_id}],
        )

        # Accumulate on a 0-1 scale; the averages are taken after the loop.
        f1_score += results['f1'] / 100
        exact_match += results['exact_match'] / 100

    row_count = len(validation_rows)
    f1_score /= row_count
    exact_match /= row_count
    return {'f1': f1_score, 'exact_match': exact_match}

In [None]:
def compute_metrics(eval_preds):
    """Score predictions against the validation split of DATASETS.

    Args:
        eval_preds: evaluation output whose first element holds the model
            logits (presumably the start and end logits - confirm against the
            Trainer that calls this).

    Returns:
        Dict with 'f1' and 'exact_match', each averaged over the number of
        validation rows and rescaled from the metric's 0-100 range to 0-1.
    """
    validation_rows = DATASETS['validation']

    # One (start_token, end_token) row per example after the transpose.
    token_spans = np.argmax(eval_preds[0], -1).T

    f1_score = 0
    exact_match = 0
    for row, span in zip(validation_rows, token_spans):
        # Character span of the predicted answer inside the original context,
        # recovered from the offset mapping saved during preprocessing.
        offsets = row['offset_mapping']
        first_char = offsets[span[0]][0]
        last_char = offsets[span[1]][1]
        row_id = str(row['id'])

        # Uses the module-level `metric`, one example per call.
        results = metric.compute(
            predictions=[{'prediction_text': row['context'][first_char:last_char],
                          'id': row_id}],
            references=[{'answers': row['answers'], 'id': row_id}],
        )

        # Keep running sums on a 0-1 scale.
        f1_score += results['f1'] / 100
        exact_match += results['exact_match'] / 100

    # Turn the running sums into averages.
    row_count = len(validation_rows)
    f1_score /= row_count
    exact_match /= row_count
    return {'f1': f1_score, 'exact_match': exact_match}

# Model

In [None]:
# Hyperparameters
# NOTE(review): the *_MIN / *_CEIL and MIN_* / MAX_* pairs look like
# search-space bounds for an Optuna study (learning rate, weight decay,
# warmup ratio, gradient accumulation, epochs); the code that consumes them,
# NUM_TRIALS, SAVE_DIR and NAME_OF_MODEL is not visible in this part of the
# notebook - confirm.
LR_MIN = 4e-5
LR_CEIL = 0.01
WD_MIN = 4e-5
WD_CEIL = 0.01
WR_MIN = 0.01
WR_CEIL = 0.2
MIN_GRAD_ACC = 1
MAX_GRAD_ACC = 5
MIN_EPOCHS = 2
MAX_EPOCHS = 5
# The training cell integer-divides both batch sizes by 4 before passing
# them to TrainingArguments.
PER_DEVICE_EVAL_BATCH = 10
PER_DEVICE_TRAIN_BATCH = 10
NUM_TRIALS = 3
SAVE_DIR = 'opt-test'
NAME_OF_MODEL = 'huggingoptunaface'

In [None]:
!pip install --upgrade transformers[torch] accelerate

In [None]:
! pip install -U accelerate
! pip install -U transformers

In [None]:
import accelerate
import transformers

# Display the installed library versions as the cell output (presumably to
# check the result of the pip upgrades in the preceding cells).
transformers.__version__, accelerate.__version__

In [None]:
# Fixed hyperparameters for the final training run.
# NOTE(review): the long decimals presumably come from an earlier
# hyperparameter search - confirm.
# Only `weight_decay` and `epoch` are passed to TrainingArguments in the
# training cell; `learning_rate`, `warmup_ratio` and
# `gradient_accumulation_step` are not referenced there.
learning_rate = 0.0002839561415811
weight_decay = 6.811462411625139e-05
warmup_ratio = 0.1283623708167592
gradient_accumulation_step = 1
epoch = 3

In [None]:
import gc
model_names = ['model1', 'model2', 'model3']

from transformers import TrainingArguments, Trainer

# Mixed precision is only enabled when a CUDA device is present, so the CPU
# fallback selected below stays usable instead of failing on fp16=True.
use_fp16 = torch.cuda.is_available()
device = torch.device("cuda" if use_fp16 else "cpu")

# NOTE(review): DATASETS is built before this loop by preprocess_examples,
# which reads the module-level `tokenizer`. If the cells ran top to bottom,
# that is the tokenizer of the last checkpoint loaded, so models with a
# different vocabulary are fed token ids that are not their own. Each model
# probably needs datasets produced with its own tokenizer - confirm.
# NOTE(review): `learning_rate`, `warmup_ratio` and
# `gradient_accumulation_step` from the hyperparameter cell are not passed to
# TrainingArguments here (a fixed warmup_steps=200 is used) - confirm whether
# that is intended.

trainers = []
for model_name, model, tokenizer in zip(model_names, models, tokenizers):
    # Separate output/log directories per model: with one shared directory
    # the periodic checkpoints of successive models would overwrite each other.
    training_args = TrainingArguments(
        output_dir=f'./results/{model_name}',  # checkpoints of this model
        num_train_epochs=epoch,  # total number of training epochs
        per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH // 4,  # batch size per device during training
        per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH // 4,  # batch size for evaluation
        warmup_steps=200,  # number of warmup steps for learning rate scheduler
        weight_decay=weight_decay,  # strength of weight decay
        logging_dir=f'./logs/{model_name}',  # logs of this model
        logging_steps=1000,
        save_steps=5000,
        fp16=use_fp16,  # mixed precision training, CUDA only
    )

    torch.cuda.empty_cache()
    model.to(device)

    trainer = Trainer(
        model=model,  # the instantiated Transformers model to be trained
        args=training_args,  # per-model training arguments, defined above
        train_dataset=DATASETS['train'],  # training dataset
        eval_dataset=DATASETS['validation'],  # evaluation dataset
        tokenizer=tokenizer,  # ensures the tokenizer is saved along with the model
        compute_metrics=compute_metrics,
    )

    trainer.train()
    trainers.append(trainer)

    # Final weights go to their own '<name>_chekpoint' directory.
    trainer.save_model(f'{model_name}_chekpoint')

    # The model stays referenced by `models` and by the trainer kept in
    # `trainers` (needed for prediction later), so unbinding the loop
    # variable would not release it; only cached allocator blocks and
    # unreachable objects are reclaimed here.
    torch.cuda.empty_cache()
    gc.collect()

# Get Prediction

In [None]:
import random

def ensemble_predictions(*predictions):
    """Return the element-wise mean of several models' predictions.

    Each argument may be a NumPy array or a tuple/list of equally shaped
    arrays (for example a (start_logits, end_logits) pair); a tuple is stacked
    into one array with a leading axis. Summing such tuples with the builtin
    `sum` fails, because it starts from the integer 0, so the inputs are
    converted to arrays before averaging.

    NOTE(review): averaging position-wise is presumably only meaningful when
    all models were run on identically tokenized inputs - confirm.

    Args:
        *predictions: one prediction object per model, all of the same shape.

    Returns:
        NumPy array with the shape of a single (stacked) prediction.

    Raises:
        ValueError: if no predictions are given or their shapes differ.
    """
    # Simple mean; a weighted mean or voting could be substituted here.
    if not predictions:
        raise ValueError("ensemble_predictions() needs at least one prediction")
    stacked = np.stack([np.asarray(p) for p in predictions])
    return stacked.mean(axis=0)

# Collect the raw test-set predictions of every trained model.
all_predictions = [trainer.predict(DATASETS['test'])[0] for trainer in trainers]

# Combine the per-model predictions into one ensemble prediction.
ensemble_preds = ensemble_predictions(*all_predictions)

# Convert to answer positions: one (start_token, end_token) row per example.
y_pred = np.argmax(ensemble_preds, -1).T

test_rows = DATASETS['test']
f1_score = 0
exact_match = 0
example_predictions = []
example_references = []
for row, span in zip(test_rows, y_pred):
    # Character span of the predicted answer inside the original context.
    offsets = row['offset_mapping']
    answer_text = row['context'][offsets[span[0]][0]:offsets[span[1]][1]]
    row_id = str(row['id'])

    # Keep the predicted text and the first reference answer for display.
    example_predictions.append(answer_text)
    example_references.append(row['answers']['text'][0])

    # The metric is evaluated for one example at a time.
    results = metric.compute(
        predictions=[{'prediction_text': answer_text, 'id': row_id}],
        references=[{'answers': row['answers'], 'id': row_id}],
    )

    f1_score += results['f1'] / 100
    exact_match += results['exact_match'] / 100

# Average the accumulated scores over the test set.
test_size = len(test_rows)
f1_score /= test_size
exact_match /= test_size

print(f"Ensemble F1 Score: {f1_score}")
print(f"Ensemble Exact Match: {exact_match}")

# Show a few example predictions (the first five collected above).
print("\nRandom Predicted Examples from Ensemble:")
for pred, ref in zip(example_predictions[:5], example_references[:5]):
    print(f"Ref Answer: {ref}\nPred Answer: {pred}\n")
