In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, get_linear_schedule_with_warmup, DataCollatorForSeq2Seq
from datasets import load_from_disk
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Настройки
MODEL_NAME = "google/flan-t5-small"
MODEL_PATH = "models/t5-test-steps"
BATCH_SIZE = 12
MAX_INPUT_LENGTH = 32
MAX_TARGET_LENGTH = 256
LEARNING_RATE = 3e-4
EPOCHS = 10
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Проверка поддержки bf16
if DEVICE.type == "cuda" and torch.cuda.is_bf16_supported():
    print("BF16 is supported! Using bfloat16 mixed precision")
    BF16_AVAILABLE = True
else:
    raise ValueError("BF16 is not supported on your device")

# Загрузка датасета
dataset = load_from_disk("dataset")
train_dataset = dataset["train"].select_columns(["test_scenario", "test_steps"])
test_dataset = dataset["test"].select_columns(["test_scenario", "test_steps"])

# Загрузка токенизатора
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

# Загрузка модели
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(DEVICE)

# Токенизация данных
def preprocess_function(examples):
    inputs = [f"generate test steps: {scenario}" for scenario in examples["test_scenario"]]
    targets = examples["test_steps"]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length"
    )
    labels = tokenizer(
        targets,
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_tokenized = train_dataset.map(preprocess_function, batched=True)
test_tokenized = test_dataset.map(preprocess_function, batched=True)

# Добавляем форматирование в тензоры
train_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Используем специальный коллектор для seq2seq
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding="longest",
    return_tensors="pt"
)

# Создание DataLoader
train_dataloader = DataLoader(train_tokenized, shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator)
test_dataloader = DataLoader(test_tokenized, batch_size=BATCH_SIZE, collate_fn=data_collator)



# Оптимизатор и scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

BF16 is supported! Using bfloat16 mixed precision


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
# Обучение с прогресс-баром
progress_bar = tqdm(range(total_steps), desc="Training progress")

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    total_loss = 0
    model.train()
    
    for batch in train_dataloader:
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        
        with torch.autocast(device_type=DEVICE.type, dtype=torch.bfloat16, enabled=True):
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
        
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        total_loss += loss.item()
        progress_bar.update(1)
    
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Training loss: {avg_train_loss:.4f}")

# Сохранение модели
model.save_pretrained(MODEL_PATH)
tokenizer.save_pretrained(MODEL_PATH)

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)



Epoch 1/10


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Training progress:  10%|█         | 313/3120 [00:43<05:57,  7.86it/s]

Training loss: 1.6383

Epoch 2/10


Training progress:  20%|██        | 625/3120 [01:24<05:04,  8.19it/s]

Training loss: 0.2914

Epoch 3/10


Training progress:  30%|███       | 937/3120 [02:05<04:34,  7.95it/s]

Training loss: 0.2600

Epoch 4/10


Training progress:  40%|████      | 1249/3120 [02:46<03:58,  7.84it/s]

Training loss: 0.2413

Epoch 5/10


Training progress:  50%|█████     | 1561/3120 [03:28<03:11,  8.14it/s]

Training loss: 0.2277

Epoch 6/10


Training progress:  60%|██████    | 1873/3120 [04:09<02:45,  7.54it/s]

Training loss: 0.2174

Epoch 7/10


Training progress:  70%|███████   | 2185/3120 [04:49<01:58,  7.89it/s]

Training loss: 0.2092

Epoch 8/10


Training progress:  80%|████████  | 2497/3120 [05:31<01:19,  7.85it/s]

Training loss: 0.2026

Epoch 9/10


Training progress:  90%|█████████ | 2809/3120 [06:12<00:39,  7.84it/s]

Training loss: 0.1982

Epoch 10/10


Training progress: 100%|█████████▉| 3119/3120 [06:53<00:00,  7.69it/s]

Training loss: 0.1945


('models/t5-test-steps\\tokenizer_config.json',
 'models/t5-test-steps\\special_tokens_map.json',
 'models/t5-test-steps\\spiece.model',
 'models/t5-test-steps\\added_tokens.json')

In [3]:
# Загрузка дообученной модели
model = T5ForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path=MODEL_PATH,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
# Загрузка токенизатора
tokenizer = T5Tokenizer.from_pretrained(MODEL_PATH)

# Перемещение модели на GPU или CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [4]:
# Проверка генерации
model.eval()
generation_config = {
    "max_length": MAX_TARGET_LENGTH,
    "num_beams": 4,
    "early_stopping": True,
    "no_repeat_ngram_size": 2,  # Запрет повторяющихся биграмм
    "temperature": 0.7,         # Добавление случайности
    "top_k": 50,                # Ограничение топ-k токенов
    "top_p": 0.95,              # Ядерная выборка
    "do_sample": True
}

def generate_test_steps(scenario):
    inputs = tokenizer(f"generate test steps: {scenario}", return_tensors="pt").to(DEVICE)
    outputs = model.generate(
        **inputs,
        **generation_config
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Пример генерации на тестовых данных
sample = test_dataset[200]
print("Вход:", sample["test_scenario"])
print("Ожидаемый результат:", sample["test_steps"])
print("Сгенерированный результат:", generate_test_steps(sample["test_scenario"]))

Вход: Verify that the content ratings are correctly displayed for different movies and TV shows.
Ожидаемый результат: 1. Navigate to the movie section.
2. Check the content rating displayed for a PG-13 movie.
Сгенерированный результат: 1. Navigate to the entertainment streaming platform. 2. Check the content ratings displayed for different movies and TV shows.
