In [3]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, get_scheduler
from datasets import load_from_disk
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence


In [None]:
MODEL_PATH = 'models/distilgpt2-finetuned'
DATASET_PATH = 'dataset'

In [4]:
# Загрузка предобработанного датасета
dataset = load_from_disk(DATASET_PATH)

In [5]:
# Разделение датасета на тренировочный и проверочный
train_dataset = dataset['train']
val_dataset = dataset['test']

In [6]:
# Загрузка токенизатора и модели
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
model = GPT2LMHeadModel.from_pretrained('distilgpt2')

# Настройка токенизатора (у GPT-2 нет pad_token по умолчанию)
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

In [7]:
# Определение максимальной длины последовательности
def get_max_length(dataset, tokenizer):
    max_len = 0
    for example in dataset:
        scenario = example['test_scenario']
        steps = example['test_steps']
        text = f"test_scenario:{scenario}\ntest_steps:{steps}"
        tokens = tokenizer(text, return_tensors='pt', truncation=False, padding=False)
        current_len = tokens['input_ids'].shape[1]
        if current_len > max_len:
            max_len = current_len
    return min(max_len, model.config.n_positions)  # Ограничение максимальной длины модели

max_length = get_max_length(train_dataset, tokenizer)
print(f"Max sequence length: {max_length}")

Max sequence length: 326


In [8]:
# Функция предобработки данных
def preprocess_function(examples):
    scenarios = examples['test_scenario']
    steps = examples['test_steps']
    texts = [f"test_scenario:\n{scenario}\ntest_steps:\n{step}" for scenario, step in zip(scenarios, steps)]
    model_inputs = tokenizer(texts, max_length=max_length, padding='max_length', truncation=True)
    
    # Создание labels с маскированием сценария
    labels = []
    for i in range(len(texts)):
        # Токенизация сценария для определения его длины
        scenario_tokens = tokenizer(scenarios[i], add_special_tokens=False)
        scenario_length = len(scenario_tokens['input_ids'])
        # Добавляем 1 для токена новой строки
        sep_token = '\n'
        sep_token_ids = tokenizer(sep_token, add_special_tokens=False)['input_ids']
        total_scenario_length = scenario_length + len(sep_token_ids)
        
        # Маскируем сценарий и разделитель
        label = [-100] * total_scenario_length + model_inputs['input_ids'][i][total_scenario_length:]
        labels.append(label)
    
    model_inputs['labels'] = labels
    return model_inputs

In [9]:
# Применение предобработки
train_dataset_p = train_dataset.map(preprocess_function, batched=True, remove_columns=['test_scenario', 'test_steps'])
val_dataset_p = val_dataset.map(preprocess_function, batched=True, remove_columns=['test_scenario', 'test_steps'])

Map: 100%|██████████| 3738/3738 [00:02<00:00, 1774.95 examples/s]
Map: 100%|██████████| 935/935 [00:00<00:00, 1785.79 examples/s]


In [10]:
# Функция для коллекции батчей
def collate_fn(batch):
    input_ids = [torch.tensor(item['input_ids']) for item in batch]
    attention_mask = [torch.tensor(item['attention_mask']) for item in batch]
    labels = [torch.tensor(item['labels']) for item in batch]
    
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

# Создание DataLoader'ов с collate_fn
train_dataloader = DataLoader(train_dataset_p, batch_size=6, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset_p, batch_size=6, collate_fn=collate_fn)

# Настройка устройства и модели
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Оптимизатор и планировщик
optimizer = optim.Adam(model.parameters(), lr=5e-5)
num_epochs = 20
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [11]:
# Смешанная точность для ускорения обучения
scaler = torch.amp.GradScaler('cuda')

In [12]:
# Обучение модели
progress_bar = tqdm(range(num_training_steps))
model.train()

for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.amp.autocast('cuda'):
            outputs = model(**batch)
            loss = outputs.loss
        
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    
    # Валидация после эпохи
    model.eval()
    val_loss = 0
    for batch in val_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            val_loss += outputs.loss.item()
    
    avg_val_loss = val_loss / len(val_dataloader)
    print(f"\nEpoch {epoch+1}/{num_epochs}, Validation Loss: {avg_val_loss:.4f}")
    model.train()

# Сохранение модели и токенизатора
model.save_pretrained(MODEL_PATH)
tokenizer.save_pretrained(MODEL_PATH)

  0%|          | 0/12460 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
  5%|▌         | 624/12460 [01:17<8:56:15,  2.72s/it]


Epoch 1/20, Validation Loss: 0.2026


 10%|█         | 1247/12460 [02:32<8:25:31,  2.71s/it]


Epoch 2/20, Validation Loss: 0.1770


 15%|█▌        | 1870/12460 [03:48<7:56:49,  2.70s/it]


Epoch 3/20, Validation Loss: 0.1612


 20%|██        | 2493/12460 [05:03<7:29:04,  2.70s/it]


Epoch 4/20, Validation Loss: 0.1509


 25%|██▌       | 3116/12460 [06:18<7:00:55,  2.70s/it]


Epoch 5/20, Validation Loss: 0.1412


 30%|███       | 3739/12460 [07:32<6:32:37,  2.70s/it]


Epoch 6/20, Validation Loss: 0.1341


 35%|███▌      | 4362/12460 [08:47<6:05:00,  2.70s/it]


Epoch 7/20, Validation Loss: 0.1274


 40%|████      | 4985/12460 [10:02<5:36:51,  2.70s/it]


Epoch 8/20, Validation Loss: 0.1238


 45%|████▌     | 5608/12460 [11:17<5:09:43,  2.71s/it]


Epoch 9/20, Validation Loss: 0.1195


 50%|█████     | 6231/12460 [12:32<4:42:08,  2.72s/it]


Epoch 10/20, Validation Loss: 0.1159


 55%|█████▌    | 6854/12460 [13:47<4:13:56,  2.72s/it]


Epoch 11/20, Validation Loss: 0.1125


 60%|██████    | 7477/12460 [15:02<3:44:50,  2.71s/it]


Epoch 12/20, Validation Loss: 0.1109


 65%|██████▌   | 8100/12460 [16:17<3:17:29,  2.72s/it]


Epoch 13/20, Validation Loss: 0.1096


 70%|███████   | 8723/12460 [17:32<2:49:02,  2.71s/it]


Epoch 14/20, Validation Loss: 0.1075


 75%|███████▌  | 9346/12460 [18:47<2:21:03,  2.72s/it]


Epoch 15/20, Validation Loss: 0.1064


 80%|████████  | 9969/12460 [20:02<1:52:52,  2.72s/it]


Epoch 16/20, Validation Loss: 0.1049


 85%|████████▌ | 10592/12460 [21:17<1:24:36,  2.72s/it]


Epoch 17/20, Validation Loss: 0.1053


 90%|█████████ | 11215/12460 [22:32<56:25,  2.72s/it]  


Epoch 18/20, Validation Loss: 0.1046


 95%|█████████▌| 11838/12460 [23:48<28:15,  2.73s/it]


Epoch 19/20, Validation Loss: 0.1045


100%|██████████| 12460/12460 [24:54<00:00,  9.39it/s]


Epoch 20/20, Validation Loss: 0.1042


('./distilgpt2-finetuned\\tokenizer_config.json',
 './distilgpt2-finetuned\\special_tokens_map.json',
 './distilgpt2-finetuned\\vocab.json',
 './distilgpt2-finetuned\\merges.txt',
 './distilgpt2-finetuned\\added_tokens.json')

In [13]:
# Загрузка токенизатора
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_PATH)

# Загрузка модели
model = GPT2LMHeadModel.from_pretrained(
    pretrained_model_name_or_path=MODEL_PATH,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
# Перемещение модели на GPU или CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [36]:
# Проверка генерации
model.eval()
# Берем случайный пример из валидационного датасета
random_sample = val_dataset.shuffle().select([0])['test_scenario'][0]
# Токенизация с настройкой максимальной длины контекста
inputs = tokenizer(
    f"test_scenario:\n{random_sample}\ntest_steps:\n", 
    return_tensors='pt',
    max_length=model.config.n_positions,  # Максимальная длина для модели
    truncation=True
).to(device)

with torch.no_grad():
    generated_ids = model.generate(
        **inputs,
        max_length=512,  # Увеличенная максимальная длина
        min_length=50,   # Минимальная длина ответа
        num_beams=5,     # Лучший поиск для качества
        no_repeat_ngram_size=3,
        early_stopping=False,  # Отключаем преждевременную остановку
        do_sample=True,        # Включаем сэмплирование
        temperature=0.7,       # Температура для разнообразия
        top_p=0.9,             # Ядерная выборка
        top_k=50               # Ограничение топ-k токенов
    )

print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


test_scenario:
Test the functionality for recording the live class session.
test_steps:
1. Access the live classes section.
2. Click on a specific recording feature.
3. Record the live session using the recorded feature.
