In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, get_linear_schedule_with_warmup, DataCollatorForSeq2Seq
from datasets import load_from_disk
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Настройки
MODEL_NAME = "google/flan-t5-small"
MODEL_PATH = "models/t5-testcase"
BATCH_SIZE = 16
MAX_INPUT_LENGTH = 32
MAX_TARGET_LENGTH = 80
LEARNING_RATE = 3e-4
EPOCHS = 20
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Проверка поддержки bf16
if DEVICE.type == "cuda" and torch.cuda.is_bf16_supported():
    print("BF16 is supported! Using bfloat16 mixed precision")
else:
    raise ValueError("BF16 is not supported on your device")

# Загрузка датасета
dataset = load_from_disk("dataset")
train_dataset = dataset["train"].select_columns(["test_scenario", "test_steps"])
test_dataset = dataset["test"].select_columns(["test_scenario", "test_steps"])

# Загрузка токенизатора
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

# Загрузка модели
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(DEVICE)

BF16 is supported! Using bfloat16 mixed precision


In [3]:
# Функция для анализа длин в токенах
def analyze_token_lengths(dataset, column_name):
    token_lengths = [len(tokenizer.encode(text)) for text in dataset[column_name]]
    max_length = max(token_lengths)
    min_length = min(token_lengths)
    avg_length = sum(token_lengths) / len(token_lengths)
    p95_length = sorted(token_lengths)[int(len(token_lengths) * 0.95)]
    return {
        "max": max_length,
        "min": min_length,
        "avg": avg_length,
        "p95": p95_length
    }

# Анализ длин входных данных в токенах
input_token_stats = analyze_token_lengths(train_dataset, "test_scenario")
print("Токенизированные длины для test_scenario:")
print(input_token_stats)

# Анализ длин выходных данных в токенах
target_token_stats = analyze_token_lengths(train_dataset, "test_steps")
print("\nТокенизированные длины для test_steps:")
print(target_token_stats)

Токенизированные длины для test_scenario:
{'max': 32, 'min': 4, 'avg': 16.223233995584987, 'p95': 23}

Токенизированные длины для test_steps:
{'max': 80, 'min': 9, 'avg': 36.45115894039735, 'p95': 56}


In [12]:

# Токенизация данных
def preprocess_function(examples):
    inputs = [f"generate test steps: {scenario}" for scenario in examples["test_scenario"]]
    targets = examples["test_steps"]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length"
    )
    labels = tokenizer(
        targets,
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_tokenized = train_dataset.map(preprocess_function, batched=True)
test_tokenized = test_dataset.map(preprocess_function, batched=True)

# Добавляем форматирование в тензоры
train_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Используем специальный коллектор для seq2seq
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding="longest",
    return_tensors="pt"
)

# Создание DataLoader
train_dataloader = DataLoader(train_tokenized, shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator)
test_dataloader = DataLoader(test_tokenized, batch_size=BATCH_SIZE, collate_fn=data_collator)



# Оптимизатор и scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [14]:
# Обучение с прогресс-баром
progress_bar = tqdm(range(total_steps), desc="Training progress")

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    total_loss = 0
    model.train()
    
    for batch in train_dataloader:
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        
        with torch.autocast(device_type=DEVICE.type, dtype=torch.bfloat16, enabled=True):
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
        
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        total_loss += loss.item()
        progress_bar.update(1)
    
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Training loss: {avg_train_loss:.4f}")

    # Валидация после эпохи
    model.eval()
    val_loss = 0
    for batch in test_dataloader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            val_loss += outputs.loss.item()
    
    avg_val_loss = val_loss / len(test_dataloader)
    print(f"Validation Loss: {avg_val_loss:.4f}")
    model.train()

# Сохранение модели
model.save_pretrained(MODEL_PATH)
tokenizer.save_pretrained(MODEL_PATH)

Training progress:   5%|▍         | 218/4540 [00:29<09:39,  7.45it/s]



Epoch 1/20


Training progress:   5%|▌         | 227/4540 [00:22<07:27,  9.64it/s]

Training loss: 0.7463


Training progress:   5%|▌         | 228/4540 [00:24<42:11,  1.70it/s]

Validation Loss: 0.5956

Epoch 2/20


Training progress:  10%|▉         | 453/4540 [00:46<06:40, 10.21it/s]

Training loss: 0.6544


Training progress:  10%|█         | 455/4540 [00:48<28:43,  2.37it/s]

Validation Loss: 0.5465

Epoch 3/20


Training progress:  15%|█▍        | 680/4540 [01:11<06:44,  9.54it/s]

Training loss: 0.5947


Training progress:  15%|█▌        | 682/4540 [01:13<33:02,  1.95it/s]

Validation Loss: 0.5099

Epoch 4/20


Training progress:  20%|██        | 908/4540 [01:35<05:57, 10.17it/s]

Training loss: 0.5469
Validation Loss: 0.4833

Epoch 5/20


Training progress:  25%|██▌       | 1135/4540 [01:59<05:52,  9.66it/s]

Training loss: 0.5077


Training progress:  25%|██▌       | 1136/4540 [02:01<34:30,  1.64it/s]

Validation Loss: 0.4591

Epoch 6/20


Training progress:  30%|███       | 1362/4540 [02:23<05:37,  9.41it/s]

Training loss: 0.4733


Training progress:  30%|███       | 1363/4540 [02:25<34:01,  1.56it/s]

Validation Loss: 0.4431

Epoch 7/20


Training progress:  35%|███▍      | 1588/4540 [02:48<04:35, 10.71it/s]

Training loss: 0.4432


Training progress:  35%|███▌      | 1590/4540 [02:50<18:05,  2.72it/s]

Validation Loss: 0.4251

Epoch 8/20


Training progress:  40%|████      | 1816/4540 [03:12<04:11, 10.82it/s]

Training loss: 0.4173
Validation Loss: 0.4100

Epoch 9/20


Training progress:  45%|████▌     | 2043/4540 [03:36<03:48, 10.92it/s]

Training loss: 0.3929


Training progress:  45%|████▌     | 2045/4540 [03:38<15:10,  2.74it/s]

Validation Loss: 0.3993

Epoch 10/20


Training progress:  50%|█████     | 2270/4540 [04:00<03:31, 10.71it/s]

Training loss: 0.3744
Validation Loss: 0.3915

Epoch 11/20


Training progress:  55%|█████▌    | 2497/4540 [04:24<03:07, 10.90it/s]

Training loss: 0.3560
Validation Loss: 0.3789

Epoch 12/20


Training progress:  60%|██████    | 2724/4540 [04:48<02:43, 11.12it/s]

Training loss: 0.3402
Validation Loss: 0.3747

Epoch 13/20


Training progress:  65%|██████▌   | 2951/4540 [05:13<02:27, 10.80it/s]

Training loss: 0.3257
Validation Loss: 0.3675

Epoch 14/20


Training progress:  70%|██████▉   | 3177/4540 [05:37<02:09, 10.56it/s]

Training loss: 0.3139


Training progress:  70%|███████   | 3179/4540 [05:39<08:49,  2.57it/s]

Validation Loss: 0.3643

Epoch 15/20


Training progress:  75%|███████▍  | 3404/4540 [06:01<01:59,  9.49it/s]

Training loss: 0.3025


Training progress:  75%|███████▌  | 3406/4540 [06:03<09:08,  2.07it/s]

Validation Loss: 0.3592

Epoch 16/20


Training progress:  80%|████████  | 3632/4540 [06:25<01:31,  9.93it/s]

Training loss: 0.2951


Training progress:  80%|████████  | 3633/4540 [06:27<07:31,  2.01it/s]

Validation Loss: 0.3548

Epoch 17/20


Training progress:  85%|████████▍ | 3858/4540 [06:49<01:10,  9.61it/s]

Training loss: 0.2884


Training progress:  85%|████████▌ | 3860/4540 [06:51<05:14,  2.16it/s]

Validation Loss: 0.3527

Epoch 18/20


Training progress:  90%|█████████ | 4086/4540 [07:13<00:45,  9.87it/s]

Training loss: 0.2846


Training progress:  90%|█████████ | 4087/4540 [07:15<04:09,  1.82it/s]

Validation Loss: 0.3519

Epoch 19/20


Training progress:  95%|█████████▍| 4312/4540 [07:37<00:23,  9.78it/s]

Training loss: 0.2807


Training progress:  95%|█████████▌| 4314/4540 [07:39<01:39,  2.26it/s]

Validation Loss: 0.3514

Epoch 20/20


Training progress: 100%|█████████▉| 4539/4540 [08:01<00:00,  9.19it/s]

Training loss: 0.2785
Validation Loss: 0.3514


('models/t5-testcase\\tokenizer_config.json',
 'models/t5-testcase\\special_tokens_map.json',
 'models/t5-testcase\\spiece.model',
 'models/t5-testcase\\added_tokens.json')

In [15]:
# Загрузка дообученной модели
model = T5ForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path=MODEL_PATH,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
# Загрузка токенизатора
tokenizer = T5Tokenizer.from_pretrained(MODEL_PATH)

# Перемещение модели на GPU или CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [16]:
# Проверка генерации
model.eval()
generation_config = {
    "max_length": MAX_TARGET_LENGTH,
    "num_beams": 4,
    "early_stopping": True,
    "no_repeat_ngram_size": 2,  # Запрет повторяющихся биграмм
    "temperature": 0.7,         # Добавление случайности
    "top_k": 50,                # Ограничение топ-k токенов
    "top_p": 0.95,              # Ядерная выборка
    "do_sample": True
}

def generate_test_steps(scenario):
    inputs = tokenizer(f"generate test steps: {scenario}", return_tensors="pt").to(DEVICE)
    outputs = model.generate(
        **inputs,
        **generation_config
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Пример генерации на тестовых данных
sample = test_dataset.shuffle().select([0])[0]
print("Вход:", sample["test_scenario"])
print("Ожидаемый результат:", sample["test_steps"])
print("Сгенерированный результат:", generate_test_steps(sample["test_scenario"]))

Вход: Verify that a user cannot download content that does not support offline viewing.
Ожидаемый результат: 1. Navigate to content that does not have offline viewing support.
2. Check for the download button.
Сгенерированный результат: 1. Log in to the streaming platform. 2. Navigate to a content that does not support offline viewing. 3. Attempt to download the content.
