In [1]:
import torch
from datasets import load_from_disk
from transformers import (
    BartTokenizer,
    BartForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Настройки
MODEL_NAME = "facebook/bart-base"
MODEL_PATH = "models/bart-testcase"
BATCH_SIZE = 16
MAX_INPUT_LENGTH = 32
MAX_TARGET_LENGTH = 88
LEARNING_RATE = 2e-5
EPOCHS = 20
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Загрузка датасета
dataset = load_from_disk('dataset')
train_dataset = dataset['train']
test_dataset = dataset['test']

# Инициализация токенизатора и модели
tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)
model = BartForConditionalGeneration.from_pretrained(MODEL_NAME).to(DEVICE)

In [3]:
# Функция для анализа длин в токенах
def analyze_token_lengths(dataset, column_name):
    token_lengths = [len(tokenizer.encode(text)) for text in dataset[column_name]]
    max_length = max(token_lengths)
    min_length = min(token_lengths)
    avg_length = sum(token_lengths) / len(token_lengths)
    p95_length = sorted(token_lengths)[int(len(token_lengths) * 0.95)]
    return {
        "max": max_length,
        "min": min_length,
        "avg": avg_length,
        "p95": p95_length
    }

# Анализ длин входных данных в токенах
input_token_stats = analyze_token_lengths(train_dataset, "test_scenario")
print("Токенизированные длины для test_scenario:")
print(input_token_stats)

# Анализ длин выходных данных в токенах
target_token_stats = analyze_token_lengths(train_dataset, "test_steps")
print("\nТокенизированные длины для test_steps:")
print(target_token_stats)

Токенизированные длины для test_scenario:
{'max': 31, 'min': 5, 'avg': 15.937637969094922, 'p95': 22}

Токенизированные длины для test_steps:
{'max': 86, 'min': 11, 'avg': 40.184878587196465, 'p95': 62}


In [6]:

# Предобработка данных
def preprocess_function(examples):
    inputs = examples['test_scenario']
    targets = examples['test_steps']
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding='max_length'
    )
    labels = tokenizer(
        text_target=targets,
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding='max_length'
    )
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 3624/3624 [00:01<00:00, 2987.60 examples/s]
Map: 100%|██████████| 906/906 [00:00<00:00, 2694.74 examples/s]


In [7]:
# Настройка data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Конфигурация обучения
training_args = Seq2SeqTrainingArguments(
    output_dir='./models',
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_steps=10,
    save_total_limit=2,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=2,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    predict_with_generate=True,
    bf16=True,  # Включаем bfloat16
    fp16=False,
    report_to="none",
    load_best_model_at_end=True
)

# Инициализация тренера
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    tokenizer=tokenizer
)

  trainer = Seq2SeqTrainer(


In [8]:
# Обучение модели
trainer.train()

# Сохранение модели
trainer.save_model(MODEL_PATH)

Epoch,Training Loss,Validation Loss
1,1.7942,1.181428
2,0.624,0.511679
3,0.4906,0.441208
4,0.439,0.408015
5,0.4113,0.3848
6,0.3857,0.36934
7,0.3726,0.355075
8,0.3461,0.344816
9,0.3272,0.336054
10,0.2804,0.32784


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


In [9]:
# Проверка результатов
def generate_steps(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=MAX_INPUT_LENGTH, truncation=True).to(DEVICE)
    output = model.generate(**inputs, max_length=MAX_TARGET_LENGTH)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Тестирование на примерах
sample = test_dataset.select(range(5))
for example in sample:
    print(f"Scenario: {example['test_scenario']}")
    print(f"Generated steps: {generate_steps(example['test_scenario'])}")
    print(f"Actual steps: {example['test_steps']}\n{'='*50}")

Scenario: Verify that the event management platform can integrate with the mobile app successfully.
Generated steps: 1. Launch the event management platform on the mobile app.
2. Navigate to the integration settings.
3. Click on the "Integrate" button for the integrated platform.
Actual steps: 1. Launch the event management platform on the mobile app.
2. Log in using valid credentials.
3. Navigate to the event integration feature.
4. Attempt to integrate a sample event with the mobile app.
Scenario: Test the platform's performance during peak usage times on a specific day (e.g., Black Friday sales).
Generated steps: 1. Simulate high traffic by increasing the number of concurrent users on the platform.
2. Monitor the platform's response time and performance during peak usage times.
Actual steps: 1. Simulate high traffic volume during peak hours on a designated day.
2. Analyze the platform's response time and server capacity under heavy load.
Scenario: Verify that the platform recommends