In [5]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from torch.utils.data import Dataset
from sklearn.metrics import mean_absolute_error
import re

# === Шаг 1. Загрузка предобученной модели GPT-2 ===
model_name = "gpt2-large"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Установка pad_token
tokenizer.pad_token = tokenizer.eos_token

# Устройство для обучения
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# === Шаг 2. Загрузка и сэмплирование небольшого датасета ===
dataset = load_dataset("ashraq/movielens_ratings", split="train")

# Берем небольшой подмножество
n = 10000
small_dataset = dataset.select(range(n))
train_dataset = small_dataset.select(range(int(0.8 * n)))
eval_dataset = small_dataset.select(range(int(0.8 * n), n))

# === Шаг 3. Преобразование датасета для обучения модели ===
class MovieLensDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length=128):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        row = self.dataset[idx]
        user_id = row["user_id"]
        movie_title = row["title"]
        genres = row["genres"]
        rating = row["rating"]

        # Формируем prompt для модели
        prompt = (
            f"User ID: {user_id}\n"
            f"Movie title: {movie_title}\n"
            f"Genres: {genres}\n"
            "Predict the user's rating (1-5): Answer only with a number between 1 and 5."
        )
        target = f" {rating}"

        # Токенизация prompt и целевого значения
        inputs = self.tokenizer(
            prompt,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )
        labels = self.tokenizer(
            target,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )["input_ids"]

        # Добавляем метки в inputs
        inputs["labels"] = labels.squeeze(0)
        return {key: val.squeeze(0) for key, val in inputs.items()}

# Преобразуем датасет для обучения
train_dataset = MovieLensDataset(train_dataset, tokenizer)
eval_dataset = MovieLensDataset(eval_dataset, tokenizer)

# === Шаг 4. Настройка обучения ===
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,  # Уменьшаем до 1 эпохи
    per_device_train_batch_size=4,
    save_steps=50,  # Сохранение модели каждые 10 шагов
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=25,  # Логирование каждые 5 шагов
    eval_strategy="steps",
    eval_steps=50,  # Оценка каждые 10 шагов
    report_to="none",
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# === Шаг 5. Обучение модели ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

print("Starting training on small dataset...")
trainer.train()
print("Training completed!")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Starting training on small dataset...




Step,Training Loss,Validation Loss
50,0.5714,0.538517
100,0.5317,0.507674
150,0.5114,0.486699
200,0.4909,0.474364
250,0.4915,0.469081




Training completed!


In [6]:
# === Шаг 6. Функция для предсказания рейтинга ===
from tqdm import tqdm
def clean_prediction(prediction):
    """Извлечение числа из строки"""
    match = re.search(r"\b[1-5]\b", prediction)
    return int(match.group(0)) if match else None

def predict_from_eval_dataset(eval_dataset):
    true_ratings = []
    predicted_ratings = []

    for idx in tqdm(range(len(eval_dataset.dataset))):
        row = eval_dataset.dataset[idx]
        user_id = row["user_id"]
        movie_title = row["title"]
        genres = row["genres"]
        true_rating = row["rating"]

        prompt = (
            f"User ID: {user_id}\n"
            f"Movie title: {movie_title}\n"
            f"Genres: {genres}\n"
            "Predict the user's rating (1-5): Answer only with a number between 1 and 5."
        )
        inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
        outputs = model.generate(
            inputs,
            max_new_tokens=5,  # Ограничиваем длину только предсказания
            pad_token_id=tokenizer.pad_token_id
        )
        predicted_rating = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Очищаем вывод модели
        predicted_rating = clean_prediction(predicted_rating)
        if predicted_rating is not None:
            true_ratings.append(true_rating)
            predicted_ratings.append(predicted_rating)
        else:
            print(f"Invalid prediction: {predicted_rating}, skipping this example.")
    
    return true_ratings, predicted_ratings

In [7]:
# === Шаг 7. Тестирование модели и расчет метрики ===
true_ratings, predicted_ratings = predict_from_eval_dataset(train_dataset)

# Расчет метрики MAE
if true_ratings and predicted_ratings:
    mae = mean_absolute_error(true_ratings, predicted_ratings)
    print(f"Mean Absolute Error (MAE) for train: {mae}")
else:
    print("No valid predictions were made.")

# === Шаг 7. Тестирование модели и расчет метрики ===
true_ratings, predicted_ratings = predict_from_eval_dataset(eval_dataset)

# Расчет метрики MAE
if true_ratings and predicted_ratings:
    mae = mean_absolute_error(true_ratings, predicted_ratings)
    print(f"Mean Absolute Error (MAE) for val: {mae}")
else:
    print("No valid predictions were made.")

  0%|          | 0/8000 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 8000/8000 [13:10<00:00, 10.12it/s]


Mean Absolute Error (MAE) for train: 2.5455


100%|██████████| 2000/2000 [03:17<00:00, 10.11it/s]

Mean Absolute Error (MAE) for val: 2.58025



