In [1]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, get_scheduler
from datasets import load_from_disk
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
MODEL_PATH = 'models/gpt2-testcase'
DATASET_PATH = 'dataset'

In [3]:
# Загрузка предобработанного датасета
dataset = load_from_disk(DATASET_PATH)

In [4]:
# Разделение датасета на тренировочный и проверочный
train_dataset = dataset['train']
val_dataset = dataset['test']

In [5]:
# Загрузка токенизатора и модели
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
model = GPT2LMHeadModel.from_pretrained('distilgpt2')

# Настройка токенизатора (у GPT-2 нет pad_token по умолчанию)
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

In [6]:
# Определение максимальной длины последовательности в токенах
def get_max_length(dataset, tokenizer):
    max_len = 0
    for example in dataset:
        scenario = example['test_scenario']
        steps = example['test_steps']
        text = f"test_scenario:{scenario}\ntest_steps:{steps}"
        tokens = tokenizer(text, return_tensors='pt', truncation=False, padding=False)
        current_len = tokens['input_ids'].shape[1]
        if current_len > max_len:
            max_len = current_len
    return min(max_len, model.config.n_positions)  # Ограничение максимальной длины модели

max_length = get_max_length(train_dataset, tokenizer)
print(f"Max sequence length: {max_length}")

Max sequence length: 109


In [7]:
# Функция предобработки данных
def preprocess_function(examples):
    scenarios = examples['test_scenario']
    steps = examples['test_steps']
    texts = [f"test_scenario:\n{scenario}\ntest_steps:\n{step}" for scenario, step in zip(scenarios, steps)]
    model_inputs = tokenizer(texts, max_length=max_length, padding='max_length', truncation=True)
    
    # Создание labels с маскированием сценария
    labels = []
    for i in range(len(texts)):
        # Токенизация сценария для определения его длины
        scenario_tokens = tokenizer(scenarios[i], add_special_tokens=False)
        scenario_length = len(scenario_tokens['input_ids'])
        # Добавляем 1 для токена новой строки
        sep_token = '\n'
        sep_token_ids = tokenizer(sep_token, add_special_tokens=False)['input_ids']
        total_scenario_length = scenario_length + len(sep_token_ids)
        
        # Маскируем сценарий и разделитель
        label = [-100] * total_scenario_length + model_inputs['input_ids'][i][total_scenario_length:]
        labels.append(label)
    
    model_inputs['labels'] = labels
    return model_inputs

In [8]:
# Применение предобработки
train_dataset_p = train_dataset.map(preprocess_function, batched=True, remove_columns=['test_scenario', 'test_steps'])
val_dataset_p = val_dataset.map(preprocess_function, batched=True, remove_columns=['test_scenario', 'test_steps'])

Map: 100%|██████████| 3624/3624 [00:01<00:00, 2135.92 examples/s]
Map: 100%|██████████| 906/906 [00:00<00:00, 2026.85 examples/s]


In [9]:
# Функция для коллекции батчей
def collate_fn(batch):
    input_ids = [torch.tensor(item['input_ids']) for item in batch]
    attention_mask = [torch.tensor(item['attention_mask']) for item in batch]
    labels = [torch.tensor(item['labels']) for item in batch]
    
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

# Создание DataLoader'ов с collate_fn
train_dataloader = DataLoader(train_dataset_p, batch_size=6, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset_p, batch_size=6, collate_fn=collate_fn)

# Настройка устройства и модели
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Оптимизатор и планировщик
optimizer = optim.Adam(model.parameters(), lr=5e-5)
num_epochs = 20
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [11]:
# Смешанная точность для ускорения обучения
scaler = torch.amp.GradScaler('cuda')

In [12]:
# Обучение модели
progress_bar = tqdm(range(num_training_steps))
model.train()

for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.amp.autocast('cuda'):
            outputs = model(**batch)
            loss = outputs.loss
        
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    
    # Валидация после эпохи
    model.eval()
    val_loss = 0
    for batch in val_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            val_loss += outputs.loss.item()
    
    avg_val_loss = val_loss / len(val_dataloader)
    print(f"\nEpoch {epoch+1}/{num_epochs}, Validation Loss: {avg_val_loss:.4f}")
    model.train()

# Сохранение модели и токенизатора
model.save_pretrained(MODEL_PATH)
tokenizer.save_pretrained(MODEL_PATH)

  0%|          | 0/12080 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
  5%|▌         | 606/12080 [00:35<1:23:07,  2.30it/s]


Epoch 1/20, Validation Loss: 0.6099


 10%|█         | 1211/12080 [01:08<59:32,  3.04it/s] 


Epoch 2/20, Validation Loss: 0.5359


 15%|█▌        | 1814/12080 [01:40<56:38,  3.02it/s]


Epoch 3/20, Validation Loss: 0.4870


 20%|██        | 2419/12080 [02:13<53:08,  3.03it/s]


Epoch 4/20, Validation Loss: 0.4470


 25%|██▌       | 3021/12080 [02:46<49:56,  3.02it/s]


Epoch 5/20, Validation Loss: 0.4202


 30%|███       | 3627/12080 [03:19<46:52,  3.01it/s]


Epoch 6/20, Validation Loss: 0.4011


 35%|███▌      | 4229/12080 [03:52<43:04,  3.04it/s]


Epoch 7/20, Validation Loss: 0.3865


 40%|████      | 4834/12080 [04:25<39:49,  3.03it/s]


Epoch 8/20, Validation Loss: 0.3726


 45%|████▌     | 5440/12080 [04:57<27:16,  4.06it/s]


Epoch 9/20, Validation Loss: 0.3597


 50%|█████     | 6041/12080 [05:30<33:16,  3.02it/s]


Epoch 10/20, Validation Loss: 0.3518


 55%|█████▌    | 6647/12080 [06:03<24:15,  3.73it/s]


Epoch 11/20, Validation Loss: 0.3472


 60%|██████    | 7249/12080 [06:35<26:35,  3.03it/s]


Epoch 12/20, Validation Loss: 0.3418


 65%|██████▌   | 7855/12080 [07:08<23:17,  3.02it/s]


Epoch 13/20, Validation Loss: 0.3407


 70%|███████   | 8459/12080 [07:41<16:11,  3.73it/s]


Epoch 14/20, Validation Loss: 0.3364


 75%|███████▌  | 9063/12080 [08:14<16:36,  3.03it/s]


Epoch 15/20, Validation Loss: 0.3322


 80%|████████  | 9666/12080 [08:46<13:18,  3.02it/s]


Epoch 16/20, Validation Loss: 0.3329


 85%|████████▌ | 10269/12080 [09:19<09:57,  3.03it/s]


Epoch 17/20, Validation Loss: 0.3339


 90%|█████████ | 10875/12080 [09:52<06:38,  3.03it/s]


Epoch 18/20, Validation Loss: 0.3303


 95%|█████████▌| 11479/12080 [10:25<03:19,  3.01it/s]


Epoch 19/20, Validation Loss: 0.3312


100%|█████████▉| 12079/12080 [10:54<00:00, 20.27it/s]


Epoch 20/20, Validation Loss: 0.3307


('models/gpt2-testcase\\tokenizer_config.json',
 'models/gpt2-testcase\\special_tokens_map.json',
 'models/gpt2-testcase\\vocab.json',
 'models/gpt2-testcase\\merges.txt',
 'models/gpt2-testcase\\added_tokens.json')

100%|██████████| 12080/12080 [11:06<00:00, 20.27it/s]

In [13]:
# Загрузка токенизатора
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_PATH)

# Загрузка модели
model = GPT2LMHeadModel.from_pretrained(
    pretrained_model_name_or_path=MODEL_PATH,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
# Перемещение модели на GPU или CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [14]:
# Проверка генерации
model.eval()
# Берем случайный пример из валидационного датасета
random_sample = val_dataset.shuffle().select([0])['test_scenario'][0]
# Токенизация с настройкой максимальной длины контекста
inputs = tokenizer(
    f"test_scenario:\n{random_sample}\ntest_steps:\n", 
    return_tensors='pt',
    max_length=model.config.n_positions,  # Максимальная длина для модели
    truncation=True
).to(device)

with torch.no_grad():
    generated_ids = model.generate(
        **inputs,
        max_length=512,  # Увеличенная максимальная длина
        min_length=50,   # Минимальная длина ответа
        num_beams=5,     # Лучший поиск для качества
        no_repeat_ngram_size=3,
        early_stopping=False,  # Отключаем преждевременную остановку
        do_sample=True,        # Включаем сэмплирование
        temperature=0.7,       # Температура для разнообразия
        top_p=0.9,             # Ядерная выборка
        top_k=50               # Ограничение топ-k токенов
    )

print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


test_scenario:
Verify that a user can successfully subscribe to the sports and fitness equipment website.
test_steps:
1. Navigate to the subscription page.
2. Select a subscription plan.
3. Enter valid payment details.
4. Click on the "Subscribe" button.
