In [12]:
import torch
from src.data_utils import clean_text

In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [14]:
# Read txt-file
with open('data/raw_dataset.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# delete \n
texts = [line.strip() for line in lines if line.strip()]

# clean data
cleaned_dataset = [clean_text(text) for text in texts]

In [15]:
n = len(cleaned_dataset)
train_size = int(0.8 * n)
val_size = int(0.1 * n)

train_texts = cleaned_dataset[:train_size]
val_texts = cleaned_dataset[train_size:train_size + val_size]
test_texts = cleaned_dataset[train_size + val_size:]

In [16]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# GPT2 не имеет padding token по умолчанию, добавим
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [29]:
from torch.utils.data import DataLoader, Dataset

from torch.utils.data import Dataset
import torch

class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len=128):
        """
        texts: list of items, где item может быть:
            - str  (сырой текст)
            - list[int] (токены)
            - torch.Tensor (1D tensor of token ids)
        tokenizer: transformers tokenizer (used if item is str)
        """
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def _convert_item_to_ids(self, item):
        # 1) если это уже torch.Tensor -> вернуть long tensor
        if isinstance(item, torch.Tensor):
            return item.long()
        # 2) если это список/кортеж
        if isinstance(item, (list, tuple)):
            # если элементы списка — числа -> считаем это list of ids
            if len(item) == 0:
                return torch.tensor([], dtype=torch.long)
            first = item[0]
            # если список целых — преобразуем напрямую
            if isinstance(first, (int,)):
                return torch.tensor(list(item), dtype=torch.long)
            # если список строк (например, список слов/символов) — соберём в строку и токенизируем
            if isinstance(first, str):
                # небольшая эвристика: если элементы — одиночные символы (len==1), склеим без пробелов
                if all(isinstance(t, str) and len(t) == 1 for t in item):
                    text = "".join(item)
                else:
                    # иначе предполагаем список слов -> соединяем пробелом
                    text = " ".join(item)
                enc = self.tokenizer.encode(text, truncation=True, max_length=self.max_len)
                return torch.tensor(enc, dtype=torch.long)
            # иначе — попробуем сконвертировать элементарно
            return torch.tensor(list(item), dtype=torch.long)

        # 3) если это строка — токенизируем
        if isinstance(item, str):
            enc = self.tokenizer.encode(item, truncation=True, max_length=self.max_len)
            return torch.tensor(enc, dtype=torch.long)

        # 4) если ни одно из выше — бросаем ошибку
        raise TypeError(f"Unsupported item type: {type(item)}")

    def __getitem__(self, idx):
        # поддержка слайса: возвращаем список элементов в том же формате, как обычный Python slicing
        if isinstance(idx, slice):
            indices = range(*idx.indices(len(self)))
            return [self._convert_item_to_ids(self.texts[i]) for i in indices]

        # single index
        item = self.texts[idx]
        ids = self._convert_item_to_ids(item)
        return ids



train_dataset = TextDataset(train_texts, tokenizer)
val_dataset = TextDataset(val_texts, tokenizer)

In [26]:
def generate_completion(model, tokenizer, input_ids, gen_fraction=0.25, max_len=None):
    """
    input_ids: torch.Tensor [L]
    """
    L = input_ids.size(0)
    L_gen = int(L * gen_fraction)
    seed = input_ids[:L - L_gen].unsqueeze(0).to(device)  # batch 1

    gen_output = model.generate(
        seed,
        max_length=(seed.size(1) + L_gen),
        do_sample=False,       # greedy
        pad_token_id=tokenizer.pad_token_id
    )
    return gen_output.squeeze(0)

In [None]:
from rouge_score import rouge_scorer

def compute_rouge_gpt(model, dataset, tokenizer, gen_fraction=0.25):
    model.eval()
    scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)
    scores_list = []

    for input_ids in dataset:
        input_ids = input_ids.to(device)
        L = input_ids.size(0)
        L_gen = int(L * gen_fraction)

        gen_ids = generate_completion(model, tokenizer, input_ids, gen_fraction=gen_fraction)
        gen_text = tokenizer.decode(gen_ids[L - L_gen:], skip_special_tokens=True)
        tgt_text = tokenizer.decode(input_ids[L - L_gen:], skip_special_tokens=True)

        scores = scorer.score(tgt_text, gen_text)
        scores_list.append(scores)

    # усреднение
    avg_scores = {}
    for key in ['rouge1','rouge2','rougeL']:
        avg_scores[key] = sum([s[key].fmeasure for s in scores_list]) / len(scores_list)
    return avg_scores

In [30]:
train_dataset = TextDataset(train_texts, tokenizer)
val_dataset   = TextDataset(val_texts, tokenizer)

# безопасный перебор первых 5 примеров:
for i in range(5):
    input_ids = val_dataset[i].unsqueeze(0).to(device)  # shape [1, L]
    gen_ids = model.generate(
        input_ids,
        max_new_tokens=50,
        do_sample=True,
        temperature=0.8,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )
    L = input_ids.size(1)
    seed_len = int(0.75 * L)
    print("Input:", tokenizer.decode(input_ids[0, :seed_len].cpu().tolist(), skip_special_tokens=True))
    print("Generated:", tokenizer.decode(gen_ids[0, seed_len:].cpu().tolist(), skip_special_tokens=True))
    print("---")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Input:  quot girl with beautiful eyes quot joined as user 's colleague in ascendas chennai
Generated:  he wants the name help him ㅎㅎㅎ









































---
Input: peeing a river ahh my best game
Generated:  of beer pong


















































---
Input:  user she is so fucking adorable aww
Generated:  i love kittens.
I love you.
---
Input:  user love that
Generated:  thing, and I really love it. And I've heard that the thing is you're just gonna want to do it to do it, because if you do it then you'll want to do it because it's a fantastic idea. So the problem is
---
Input: good morning from
Generated:  italy,” he said. …It’s good to have the good people and the bad people.”


…There’s a whole other story about the night,” he said. …We
---


In [32]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# 1️⃣ Настройка модели и устройства
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# 2️⃣ Подготовка данных (используем твой класс TextDataset)
val_dataset = TextDataset(val_texts, tokenizer, max_len=128)

# 3️⃣ Функция генерации (используем простую версию для GPT)
def generate_completion(model, tokenizer, input_ids, gen_fraction=0.25, max_new_tokens=50):
    """Генерация продолжения текста на основе части входа"""
    model.eval()
    input_ids = input_ids.unsqueeze(0).to(device)
    L = input_ids.size(1)
    seed_len = int(L * (1 - gen_fraction))

    gen_ids = model.generate(
        input_ids[:, :seed_len],
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.8,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    return gen_ids[0].cpu()

# 4️⃣ Вставляем твою функцию compute_rouge_gpt
from rouge_score import rouge_scorer

def compute_rouge_gpt(model, dataset, tokenizer, gen_fraction=0.25):
    model.eval()
    scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)
    scores_list = []

    for input_ids in dataset[:20]:  # ограничим 20 примерами для скорости
        input_ids = input_ids.to(device)
        L = input_ids.size(0)
        L_gen = int(L * gen_fraction)

        gen_ids = generate_completion(model, tokenizer, input_ids, gen_fraction=gen_fraction)
        gen_text = tokenizer.decode(gen_ids[L - L_gen:], skip_special_tokens=True)
        tgt_text = tokenizer.decode(input_ids[L - L_gen:], skip_special_tokens=True)

        scores = scorer.score(tgt_text, gen_text)
        scores_list.append(scores)

    # усреднение
    avg_scores = {}
    for key in ['rouge1','rouge2','rougeL']:
        avg_scores[key] = sum([s[key].fmeasure for s in scores_list]) / len(scores_list)
    return avg_scores

# 5️⃣ Запуск вычисления метрик
rouge_scores = compute_rouge_gpt(model, val_dataset, tokenizer, gen_fraction=0.25)
print(f"ROUGE-1={rouge_scores['rouge1']:.4f}  ROUGE-2={rouge_scores['rouge2']:.4f}  ROUGE-L={rouge_scores['rougeL']:.4f}")


ROUGE-1=0.0224  ROUGE-2=0.0022  ROUGE-L=0.0224
