In [1]:
import torch
from datasets import load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
import numpy as np
import random

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Константы
EPOCHS = 10
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
MODEL_NAME = "meta-llama/Llama-3.2-1B"
MODEL_PATH = "./models/llama3.2-testcase"

# Загрузка датасета
dataset = load_from_disk("dataset")
train_dataset = dataset["train"]
test_dataset = dataset["test"]

In [3]:
# Загрузка и настройка токенизатора
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# Загрузка модели
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

In [4]:
# Определение максимальной длины последовательности в токенах
def get_max_length(dataset, tokenizer):
    max_len = 0
    for example in dataset:
        scenario = example['test_scenario']
        steps = example['test_steps']
        text = f"test_scenario:{scenario}\ntest_steps:{steps}"
        tokens = tokenizer(text, return_tensors='pt', truncation=False, padding=False)
        current_len = tokens['input_ids'].shape[1]
        if current_len > max_len:
            max_len = current_len
    return max_len

max_length_train = get_max_length(train_dataset, tokenizer)
max_length_test = get_max_length(test_dataset, tokenizer)
print(f"Max train_dataset sequence length: {max_length_train}")
print(f"Max test_dataset sequence length: {max_length_test}")

Max train_dataset sequence length: 98
Max test_dataset sequence length: 92


In [5]:
# Препроцессинг
def preprocess_function(examples):
    inputs = [f"test_scenario:\n{scenario}\ntest_steps:\n{step}<|end_of_text|>" 
             for scenario, step in zip(examples['test_scenario'], examples['test_steps'])]
    
    model_inputs = tokenizer(inputs, max_length=98, truncation=True, padding=True)
    return model_inputs

# Применяем предобработку
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names
)
test_dataset = test_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=test_dataset.column_names
)

Map: 100%|██████████| 3624/3624 [00:00<00:00, 18211.15 examples/s]
Map: 100%|██████████| 906/906 [00:00<00:00, 18875.62 examples/s]


In [8]:
# Настройка Data Collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Тренировочные аргументы
training_args = TrainingArguments(
    output_dir=MODEL_PATH,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=2,
    evaluation_strategy="steps",
    eval_steps=50,
    log_level='info',
    logging_dir="./logs",
    logging_steps=50,
    fp16=False,
    bf16=True,
    bf16_full_eval=True,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01, # Регуляризация
    warmup_steps=100,
    save_total_limit=2,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",
    dataloader_num_workers=4,
    remove_unused_columns=False,
    load_best_model_at_end=True
)

# Инициализация тренера
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

PyTorch: setting up devices
  trainer = Trainer(
Using auto half precision backend


In [9]:
# Обучение модели
trainer.train()

# Сохранение модели
model.save_pretrained(MODEL_PATH)
tokenizer.save_pretrained(MODEL_PATH)

***** Running training *****
  Num examples = 3,624
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 2,260
  Number of trainable parameters = 1,235,814,400


Step,Training Loss,Validation Loss
50,1.9351,1.377245
100,1.1724,1.032737
150,0.9874,0.946957
200,0.9079,0.869639
250,0.7632,0.827481
300,0.6618,0.781371
350,0.6241,0.764081
400,0.5956,0.726053
450,0.5806,0.687277
500,0.409,0.71023



***** Running Evaluation *****
  Num examples = 906
  Batch size = 8

***** Running Evaluation *****
  Num examples = 906
  Batch size = 8

***** Running Evaluation *****
  Num examples = 906
  Batch size = 8

***** Running Evaluation *****
  Num examples = 906
  Batch size = 8

***** Running Evaluation *****
  Num examples = 906
  Batch size = 8

***** Running Evaluation *****
  Num examples = 906
  Batch size = 8

***** Running Evaluation *****
  Num examples = 906
  Batch size = 8

***** Running Evaluation *****
  Num examples = 906
  Batch size = 8

***** Running Evaluation *****
  Num examples = 906
  Batch size = 8

***** Running Evaluation *****
  Num examples = 906
  Batch size = 8
Saving model checkpoint to ./models/llama3.2-testcase\checkpoint-500
Configuration saved in ./models/llama3.2-testcase\checkpoint-500\config.json
Configuration saved in ./models/llama3.2-testcase\checkpoint-500\generation_config.json
Model weights saved in ./models/llama3.2-testcase\checkpoint-500\m

('./models/llama3.2-testcase\\tokenizer_config.json',
 './models/llama3.2-testcase\\special_tokens_map.json',
 './models/llama3.2-testcase\\tokenizer.json')

In [16]:
# Генерация примеров
def generate_examples(model, tokenizer, dataset, num_examples=5):
    device = model.device
    model.eval()
    results = []
    
    for _ in range(num_examples):
        sample = random.choice(dataset)
        inputs = tokenizer(f"test_scenario:\n{sample["test_scenario"]}\ntest_steps:\n", return_tensors="pt").to(device)
        
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                num_beams=5,
                no_repeat_ngram_size=3,
                temperature=0.9,
                early_stopping=True,
                pad_token_id=tokenizer.eos_token_id
            )
        
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        results.append({
            "Input": sample["test_scenario"],
            "Generated": generated,
            "Expected": sample["test_steps"]
        })
    
    return results

# Проверка результатов
examples = generate_examples(model, tokenizer, test_dataset)
for i, ex in enumerate(examples, 1):
    print(f"Пример {i}:")
    print(f"Вход: {ex['Input']}")
    print(f"Ожидаемый результат: {ex['Expected']}")
    print(f"Сгенерированный результат: {ex['Generated']}")
    print("="*80)

Пример 1:
Вход: Verify that users can view suggested groups based on events they've attended.
Ожидаемый результат: 1. Log in to the social networking app using valid credentials.
2. Navigate to the account management section.
3. Check for the "Suggested Groups" tab or section.
4. Verify that the suggested groups are based on events the user has attended.
Сгенерированный результат: test_scenario:
Verify that users can view suggested groups based on events they've attended.
test_steps:
1. Log in to the social networking app using valid credentials.
2. Navigate to the "Events" section.
3. Click on an event that the user has attended.
4. Check if suggested groups are displayed based on the attended event.
Пример 2:
Вход: User tries to change username without entering any value.
Ожидаемый результат: 1. Log in to the social networking app.
2. Navigate to the account settings.
3. Click on the "Edit Profile" option.
4. Leave the username field blank.
5. Save the changes.
Сгенерированный резуль