In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR

from tqdm.auto import tqdm
import nltk
import string
from collections import Counter
from typing import List

In [4]:
# A tiny artificial vocabulary: four special tokens followed by the words
# that appear in the toy sentences below.
vocab = ['<pad>', '<bos>', '<eos>', '<unk>', 'hello', 'love', 'you', 'more', 'then', 'live', 'my']
# Lookup tables in both directions; a word's index is its position in `vocab`.
word2ind = dict(zip(vocab, range(len(vocab))))
ind2word = dict(enumerate(vocab))

# Toy dataset: every sentence is stored as a list of vocabulary indices.
sentences = [
    [word2ind[word] for word in words]
    for words in (
        ('love', 'you', 'more', 'then', 'live'),
        ('love', 'live'),
        ('love', 'you'),
        ('hello', 'my', 'love'),
        ('love', 'you', 'more'),
    )
]
print('sentences', sentences)

# Dataset wrapper around an in-memory collection of samples
class SimpleDataset(Dataset):
    """Map-style dataset that serves items straight from an indexable container.

    Items are returned exactly as stored (no copy, no tensor conversion);
    turning them into tensors is left to the DataLoader's collate function.
    """

    def __init__(self, data):
        # Keep a reference to the caller's container rather than copying it.
        self.data = data

    def __len__(self):
        """Number of samples in the wrapped container."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        return self.data[idx]

# Collate function that pads a batch to a common length
def collate_fn_with_padding(input_batch, pad_id=word2ind['<pad>']):
    """Pad a batch of index lists and split it into inputs and shifted targets.

    Args:
        input_batch: list of sequences, each a list of token indices.
        pad_id: index appended to shorter sequences; defaults to the
            ``<pad>`` entry of ``word2ind``.

    Returns:
        A dict with two LongTensor views of the padded batch:
        ``'input_ids'`` drops the last position of every row and
        ``'target_ids'`` drops the first, so the target at position ``t``
        is the token that follows the input at position ``t``.
    """
    longest = max(len(sequence) for sequence in input_batch)
    # Right-pad every sequence up to the length of the longest one.
    padded_rows = [
        sequence + [pad_id] * (longest - len(sequence))
        for sequence in input_batch
    ]
    sequences = torch.LongTensor(padded_rows)
    inputs, targets = sequences[:, :-1], sequences[:, 1:]
    return {'input_ids': inputs, 'target_ids': targets}
# Build the train / eval / test datasets. The eval set is the first two
# sentences and the test set is the remaining three, so both overlap with the
# training set (which holds all five sentences).
train_dataset = SimpleDataset(sentences)
eval_dataset = SimpleDataset(sentences[:2])
test_dataset = SimpleDataset(sentences[2:])

# All three loaders share the same padding collate function and batch size.
train_dataloader, eval_dataloader, test_dataloader = (
    DataLoader(dataset, collate_fn=collate_fn_with_padding, batch_size=2)
    for dataset in (train_dataset, eval_dataset, test_dataset)
)

# Dump every batch of each loader to show the padded inputs and shifted targets.
for i, batch in enumerate(train_dataloader):
    print(f"в train Batch {i + 1}:")
    print("в train Input и Target IDs состоит из элементов:", batch)

for i, batch in enumerate(eval_dataloader):
    print(f"в eval Batch {i + 1}:")
    print("в eval Input IDs состоит из элементов:", batch)

for i, batch in enumerate(test_dataloader):
    print(f"в test Batch {i + 1}:")
    print("в test Input IDs состоит из элементов:", batch)

# Model
class LanguageModel(nn.Module):
    """Single-layer LSTM language model producing per-position vocabulary logits.

    Pipeline: embedding -> LSTM -> tanh -> linear -> dropout -> tanh ->
    projection onto the vocabulary.

    Args:
        hidden_dim: size of the embeddings and of every hidden representation.
        vocab_size: number of rows in the embedding table and number of
            output logits per position.
    """

    def __init__(self, hidden_dim: int, vocab_size: int):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=hidden_dim)
        # batch_first=True: tensors are laid out as [batch_size, seq_len, features].
        self.lstm = nn.LSTM(
            input_size=hidden_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            batch_first=True,
        )
        self.linear = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.projection = nn.Linear(in_features=hidden_dim, out_features=vocab_size)
        self.non_lin = nn.Tanh()
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, input_batch: torch.Tensor) -> torch.Tensor:
        """Map token indices [batch_size, seq_len] to logits [batch_size, seq_len, vocab_size]."""
        embedded = self.embedding(input_batch)           # [batch_size, seq_len, hidden_dim]
        lstm_out, _ = self.lstm(embedded)                # [batch_size, seq_len, hidden_dim]
        hidden = self.linear(self.non_lin(lstm_out))     # [batch_size, seq_len, hidden_dim]
        hidden = self.dropout(hidden)                    # active only in train mode
        logits = self.projection(self.non_lin(hidden))   # [batch_size, seq_len, vocab_size]
        return logits

# Model, loss function, optimizer and learning-rate scheduler setup.
# Seed PyTorch's global RNG first: weight initialisation and dropout are
# random, so without a seed every "Restart & Run All" gives different
# losses and perplexities.
SEED = 42
torch.manual_seed(SEED)

hidden_dim = 16
model = LanguageModel(hidden_dim, len(vocab)).to('cpu')  # CPU is enough for this demo
# Padded target positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=word2ind['<pad>'])
optimizer = torch.optim.Adam(model.parameters())
# Multiply the learning rate by 0.8 on every scheduler.step() call.
scheduler = StepLR(optimizer, step_size=1, gamma=0.8)

# Helper that prints the contents of one batch
def print_batch_info(batch, batch_num):
    """Print a header with the batch number, then the batch's inputs and targets.

    Args:
        batch: mapping with ``'input_ids'`` and ``'target_ids'`` entries.
        batch_num: number shown in the header line.
    """
    print(f"\n--- Batch {batch_num} ---")
    input_ids = batch['input_ids']
    print(f"Input IDs (input_batch):\n{input_ids}")
    target_ids = batch['target_ids']
    print(f"Target IDs (target_batch):\n{target_ids}")

# Training loop with detailed per-batch logging
def train_model_verbose(model, criterion, optimizer, scheduler, train_dataloader, eval_dataloader, epochs=5):
    """Train ``model`` for ``epochs`` epochs, printing every batch, its logits and its loss.

    Args:
        model: module called on ``batch["input_ids"]``; its output is
            flattened over the first two dimensions before the loss.
        criterion: loss called as ``criterion(flat_logits, flat_targets)``.
        optimizer: optimizer stepped once per training batch.
        scheduler: learning-rate scheduler stepped once per epoch.
        train_dataloader: iterable of batches with ``"input_ids"`` and
            ``"target_ids"`` tensors.
        eval_dataloader: iterable passed to ``evaluate`` after every epoch.
        epochs: number of passes over ``train_dataloader``.

    Returns:
        ``(perplexities, losses)``: two lists with one entry per epoch, the
        value returned by ``evaluate`` and the mean training loss. The mean
        is ``nan`` for an epoch in which ``train_dataloader`` yielded no
        batches.
    """
    perplexities = []
    losses = []
    for epoch in tqdm(range(epochs), desc="epoch"):
        print(f"\nEpoch {epoch+1}/{epochs}")
        model.train()  # enable dropout; evaluate() switches the model to eval mode
        epoch_losses = []

        for batch_num, train_batch in enumerate(tqdm(train_dataloader, desc="train"), 1):
            print_batch_info(train_batch, batch_num)

            # Clear gradients BEFORE the backward pass so that anything left
            # over from earlier cells or calls cannot leak into this update.
            optimizer.zero_grad()

            # One row of logits per (sequence, position); each row has one
            # score per vocabulary entry.
            output = model(train_batch["input_ids"]).flatten(start_dim=0, end_dim=1)

            # Logits are raw, unnormalized scores (they can be negative), not
            # probabilities; the label says so explicitly.
            print(f"\nLogits (unnormalized scores) for Batch {batch_num}:\n{output}")

            loss = criterion(output, train_batch["target_ids"].flatten())
            print(f"Loss for Batch {batch_num}: {loss.item()}")

            loss.backward()
            optimizer.step()
            epoch_losses.append(loss.item())

        # Learning-rate schedule advances once per epoch.
        scheduler.step()

        # Guard against an empty train loader instead of dividing by zero.
        if epoch_losses:
            average_loss = sum(epoch_losses) / len(epoch_losses)
        else:
            average_loss = float("nan")
        losses.append(average_loss)
        print(f"Average loss for Epoch {epoch+1}: {average_loss}")

        # Validation after every epoch.
        perplexity = evaluate(model, criterion, eval_dataloader)
        perplexities.append(perplexity)
        print(f"Perplexity after Epoch {epoch+1}: {perplexity}")

    return perplexities, losses

# Evaluation: token-level perplexity over a whole dataloader
def evaluate(model, criterion, dataloader) -> float:
    """Compute the perplexity of ``model`` over every batch of ``dataloader``.

    Perplexity is ``exp`` of the mean negative log-likelihood per target
    token, so per-batch losses are combined weighted by how many
    non-ignored targets each batch has. Averaging per-batch perplexities
    instead over-weights small batches and is biased upward.

    Args:
        model: module called on ``batch['input_ids']``; switched to eval mode.
        criterion: loss called as ``criterion(flat_logits, flat_targets)``.
            Its ``ignore_index`` (default -100) and ``reduction`` (default
            ``'mean'``) attributes are read when present. With ``'mean'`` the
            loss is taken to be an unweighted average over non-ignored
            targets; class weights or label smoothing would break that
            assumption.
        dataloader: iterable of batches with ``'input_ids'`` and
            ``'target_ids'`` tensors.

    Returns:
        The perplexity as a float, or ``nan`` when there are no scorable
        targets (empty dataloader or nothing but ignored positions).
    """
    model.eval()
    ignore_index = getattr(criterion, "ignore_index", -100)
    reduction = getattr(criterion, "reduction", "mean")
    total_nll = 0.0
    total_tokens = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="validation"):
            targets = batch['target_ids'].flatten()
            n_tokens = int((targets != ignore_index).sum())
            if n_tokens == 0:
                # A mean-reduced loss over zero targets is NaN; skip the batch
                # rather than poison the running total.
                continue
            logits = model(batch['input_ids']).flatten(start_dim=0, end_dim=1)
            loss = criterion(logits, targets).item()
            # Recover the summed NLL of the batch from the reduced loss.
            total_nll += loss if reduction == "sum" else loss * n_tokens
            total_tokens += n_tokens
    if total_tokens == 0:
        return float("nan")
    # torch.exp saturates to inf on overflow instead of raising.
    return torch.exp(torch.tensor(total_nll / total_tokens)).item()

# Run the verbose training loop for two epochs and keep the per-epoch metrics
perplexities, losses = train_model_verbose(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    train_dataloader=train_dataloader,
    eval_dataloader=eval_dataloader,
    epochs=2,
)

sentences [[5, 6, 7, 8, 9], [5, 9], [5, 6], [4, 10, 5], [5, 6, 7]]
в train Batch 1:
в train Input и Target IDs состоит из элементов: {'input_ids': tensor([[5, 6, 7, 8],
        [5, 9, 0, 0]]), 'target_ids': tensor([[6, 7, 8, 9],
        [9, 0, 0, 0]])}
в train Batch 2:
в train Input и Target IDs состоит из элементов: {'input_ids': tensor([[ 5,  6],
        [ 4, 10]]), 'target_ids': tensor([[ 6,  0],
        [10,  5]])}
в train Batch 3:
в train Input и Target IDs состоит из элементов: {'input_ids': tensor([[5, 6]]), 'target_ids': tensor([[6, 7]])}
в eval Batch 1:
в eval Input IDs состоит из элементов: {'input_ids': tensor([[5, 6, 7, 8],
        [5, 9, 0, 0]]), 'target_ids': tensor([[6, 7, 8, 9],
        [9, 0, 0, 0]])}
в test Batch 1:
в test Input IDs состоит из элементов: {'input_ids': tensor([[ 5,  6],
        [ 4, 10]]), 'target_ids': tensor([[ 6,  0],
        [10,  5]])}
в test Batch 2:
в test Input IDs состоит из элементов: {'input_ids': tensor([[5, 6]]), 'target_ids': tensor([[6, 

epoch:   0%|          | 0/2 [00:00<?, ?it/s]


Epoch 1/2


train:   0%|          | 0/3 [00:00<?, ?it/s]


--- Batch 1 ---
Input IDs (input_batch):
tensor([[5, 6, 7, 8],
        [5, 9, 0, 0]])
Target IDs (target_batch):
tensor([[6, 7, 8, 9],
        [9, 0, 0, 0]])

Logits (predicted probabilities) for Batch 1:
tensor([[ 2.2649e-02,  4.8745e-02, -2.5378e-02,  1.6534e-01, -3.1969e-02,
          2.9414e-03, -1.0806e-01, -2.2709e-01,  1.6811e-01, -3.8105e-02,
         -2.3821e-01],
        [-2.4355e-02,  4.2928e-02,  6.3947e-02,  1.2785e-01, -7.1539e-02,
         -1.1502e-02, -1.3770e-01, -1.4866e-01,  2.0306e-01, -3.5826e-02,
         -2.9012e-01],
        [-3.9662e-02, -5.9555e-02, -3.9900e-02,  1.0237e-01,  7.6742e-02,
          7.8711e-02, -2.0173e-01, -1.7646e-01,  1.2664e-01,  2.2367e-05,
         -2.3598e-01],
        [-6.8962e-03,  7.6402e-02,  1.0503e-01,  2.1888e-01, -1.1318e-01,
         -4.6649e-02, -2.3362e-01, -3.7980e-01,  2.9704e-01, -1.1371e-01,
         -3.2133e-01],
        [-4.1908e-03,  5.0093e-04,  2.7977e-02,  2.1820e-01, -3.3155e-02,
          3.2853e-03, -1.0180e-01, -

validation:   0%|          | 0/1 [00:00<?, ?it/s]

Perplexity after Epoch 1: 11.107569694519043

Epoch 2/2


train:   0%|          | 0/3 [00:00<?, ?it/s]


--- Batch 1 ---
Input IDs (input_batch):
tensor([[5, 6, 7, 8],
        [5, 9, 0, 0]])
Target IDs (target_batch):
tensor([[6, 7, 8, 9],
        [9, 0, 0, 0]])

Logits (predicted probabilities) for Batch 1:
tensor([[-0.0126, -0.0461, -0.0167,  0.1815,  0.0053,  0.0047, -0.1081, -0.2115,
          0.1295, -0.0074, -0.2651],
        [-0.0827,  0.0321,  0.0438,  0.2062, -0.0495,  0.0266, -0.1262, -0.2281,
          0.1598, -0.1143, -0.3977],
        [-0.0605,  0.0287, -0.0723,  0.1264, -0.0788,  0.0507,  0.0047, -0.2469,
          0.1747, -0.1361, -0.2692],
        [ 0.0285,  0.0511,  0.0481,  0.1811, -0.1197, -0.0602, -0.2020, -0.3259,
          0.2700, -0.0500, -0.2635],
        [-0.0192, -0.0217,  0.1156,  0.2797, -0.0067, -0.0338, -0.1672, -0.2790,
          0.1315, -0.0851, -0.3773],
        [ 0.0373, -0.0085,  0.1068,  0.1785, -0.0738, -0.0406, -0.2073, -0.2156,
          0.0934, -0.1445, -0.4001],
        [ 0.0117,  0.0182, -0.0992,  0.0847, -0.0901,  0.0791, -0.1122, -0.2501,
     

validation:   0%|          | 0/1 [00:00<?, ?it/s]

Perplexity after Epoch 2: 10.983153343200684
