In [1]:
!pip install transformers # Сначала установим библиотеку transformers



In [27]:
# import
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

import os
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

In [3]:
# Compute device: use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(f"Используется устройство: {DEVICE}")

Используется устройство: cuda


In [26]:
import kagglehub

# Fetch the latest version of the poems dataset; `path` is the local directory
# that kagglehub reports for the downloaded files.
path = kagglehub.dataset_download(
    "grafstor/19-000-russian-poems"
)
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/grafstor/19-000-russian-poems?dataset_version_number=1...


100%|██████████| 13.3M/13.3M [00:00<00:00, 87.1MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/grafstor/19-000-russian-poems/versions/1


In [30]:
# Derive the CSV location from the directory returned by kagglehub instead of
# hard-coding the cache path, so the cell keeps working on another machine or
# when a newer dataset version is downloaded.
data_path = os.path.join(path, "poems.csv")

In [31]:
# Load the poems CSV file at `data_path` as a dataset.
dataset = load_dataset("csv", data_files=data_path)

Generating train split: 0 examples [00:00, ? examples/s]

In [44]:
# Display the loaded dataset object (splits, column names, row count).
dataset

DatasetDict({
    train: Dataset({
        features: ['writer', 'poem', 'text'],
        num_rows: 19316
    })
})

In [45]:
# 1. Format each example as a question/answer prompt
def format_example(example):
    """Turn one dataset row into a single Q&A training string.

    Args:
        example: Mapping with optional "writer", "poem" and "text" entries.
            Missing, ``None``, empty or whitespace-only values are treated
            as absent.

    Returns:
        A dict with a single "text" key holding the prompt
        ("Вопрос: ...") followed by the stripped poem body ("Ответ: ...").
    """
    # Strip before applying the fallback so that a whitespace-only author or
    # title does not slip through as a blank in the prompt.
    writer = (example.get("writer") or "").strip() or "неизвестный автор"
    poem = (example.get("poem") or "").strip() or "без названия"
    text = (example.get("text") or "").strip()

    formatted = f'Вопрос: Напиши стихотворение в стиле {writer} под названием "{poem}"\nОтвет: {text}'
    return {"text": formatted}

# Run format_example over the dataset and rebind `dataset` to the result.
# NOTE(review): because `dataset` is reassigned, re-running this cell would
# apply the formatter to already formatted rows.
dataset = dataset.map(format_example)

Map:   0%|          | 0/19316 [00:00<?, ? examples/s]

In [41]:
# Load the pretrained tokenizer and model weights from the same checkpoint
model_name = "ai-forever/rugpt3medium_based_on_gpt2"

tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

In [46]:
# 2. Drop rows without any poem text
def filter_empty(example):
    """Return True when the example carries a non-empty poem body.

    The "text" field may be either a raw poem or a string already wrapped by
    the Q&A formatter. In the wrapped case the prompt prefix is always
    present, so only the part after the "Ответ:" marker is inspected;
    otherwise the whole string is checked.

    Args:
        example: Mapping with a "text" entry (string or ``None``).

    Returns:
        bool: False for ``None``, empty or whitespace-only poem bodies.
    """
    text = example["text"]
    if text is None:
        return False
    # Split once on the answer marker; when it is absent the text is a raw poem.
    _, marker, answer = text.partition("\nОтвет:")
    body = answer if marker else text
    return len(body.strip()) > 0

# Keep only the rows for which filter_empty returns True.
dataset = dataset.filter(filter_empty)

Filter:   0%|          | 0/19316 [00:00<?, ? examples/s]

In [48]:
# 3. Tokenization
def tokenize_function(examples, max_length=256):
    """Tokenize a batch of formatted examples.

    Sequences are truncated to ``max_length`` tokens but not padded here:
    padding is left to the batch collator, so each batch is padded only to
    its own longest sequence instead of every sample being stored and
    processed at the full ``max_length``.

    Args:
        examples: Batch mapping with a "text" list of strings.
        max_length: Maximum number of tokens kept per example.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
    )

# Tokenize in batches and drop the original columns of the train split.
raw_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(tokenize_function, remove_columns=raw_columns, batched=True)

# 4. Expose the model inputs as PyTorch tensors
tokenized_dataset.set_format(columns=["input_ids", "attention_mask"], type="torch")

Map:   0%|          | 0/19316 [00:00<?, ? examples/s]

In [51]:

# Batch collator for causal (non-masked) language modelling
data_collator = DataCollatorForLanguageModeling(
    mlm=False,  # GPT does not use masked language modelling
    tokenizer=tokenizer,
)

# Training configuration
training_args = TrainingArguments(
    # -- output and checkpoints --
    output_dir="./results",
    overwrite_output_dir=True,
    save_steps=500,
    save_total_limit=2,
    # -- schedule and batch sizes --
    num_train_epochs=1,             # try a single epoch first
    per_device_train_batch_size=2,  # can be raised on a larger GPU
    per_device_eval_batch_size=2,
    # -- logging and evaluation --
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",                # do not send metrics to wandb
    # NOTE(review): the evaluation strategy is left disabled below, so
    # eval_steps presumably has no effect as written -- confirm.
    # evaluation_strategy="steps",
    eval_steps=500,
    # -- precision --
    fp16=torch.cuda.is_available(),  # speed-up on GPU
)


In [None]:
# Evaluate on the "test" split only when the tokenized dataset has one.
eval_split = tokenized_dataset["test"] if "test" in tokenized_dataset else None

# Trainer wiring: model, arguments, data and collator.
# NOTE(review): newer transformers releases appear to deprecate `tokenizer=`
# in favour of `processing_class=` -- confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=eval_split,
    tokenizer=tokenizer,
)

# Start training
trainer.train()

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,4.2219
200,3.5161
300,3.492
400,3.463
500,3.3927
600,3.3018
700,3.3419
800,3.3415
900,3.3172
1000,3.2757


ls: cannot access './output/': No such file or directory


In [None]:
# Switch to inference mode: after trainer.train() the model is still in
# training mode, which keeps dropout active and adds noise to sampling.
model.eval()

# The prompt follows the same "Вопрос/Ответ" template used for training.
prompt = 'Вопрос: Напиши стихотворение в стиле Пушкин под названием "Зимний вечер"\nОтвет:'
# Place the inputs on whatever device the model actually lives on.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Sample a continuation; max_length counts the prompt tokens as well.
output = model.generate(
    **inputs,
    max_length=120,
    temperature=0.9,
    top_k=50,
    top_p=0.95,
    do_sample=True,
    pad_token_id=tokenizer.pad_token_id,  # explicit, matches the tokenizer's padding token
)

print(tokenizer.decode(output[0], skip_special_tokens=True))