### Лабораторная работа №3: **Генерация текста с использованием рекуррентных нейронных сетей (RNN)**

#### Теоретическая часть

**Рекуррентные нейронные сети (RNN)** – это вид нейронных сетей, специально разработанный для работы с последовательностями данных, такими как текст. Они способны запоминать предыдущие состояния и использовать эту информацию для прогнозирования следующего элемента последовательности.

#### Практическая задача

Создайте модель RNN для генерации текста на основе корпуса данных литературных произведений:

1. Загрузите и предобработайте данные: токенизируйте текст, создайте словарь и преобразуйте текст в индексы.
2. Определите архитектуру RNN: количество слоев, размер скрытого слоя, функции активации.
3. Обучите модель на тренировочном наборе данных.
4. Проверьте способность модели генерировать осмысленный текст.
5. Оцените качество генерируемого текста вручную и с помощью метрик, таких как BLEU.

#### Указания по выполнению

1. Используйте библиотеку `TensorFlow` или `PyTorch` для построения и обучения модели.
2. Экспериментируйте с различными гиперпараметрами модели (размер батча, количество эпох, скорость обучения).
3. Проведите сравнение производительности вашей модели с другими подходами, такими как LSTM или GRU.
4. Подготовьте отчет, включающий код, примеры сгенерированного текста и анализ результатов.

In [None]:
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import string
import torch.nn.functional as F

### 1. Загрузите и предобработайте данные: токенизируйте текст, создайте словарь и преобразуйте текст в индексы.

In [2]:
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [3]:
file_ids = gutenberg.fileids()
texts_tokens = {} 

for fileid in file_ids:
    text = gutenberg.raw(fileid)
    tokens = [token.lower() for token in word_tokenize(text) if token not in string.punctuation]
    texts_tokens[fileid] = tokens

all_tokens = [token for tokens in texts_tokens.values() for token in tokens]
vocab = {token: idx for idx, token in enumerate(sorted(set(all_tokens)))}

print("Размер словаря:", len(vocab))

Размер словаря: 52494


In [4]:
texts_indices = {}
for fileid, tokens in texts_tokens.items():
    indices = [vocab[token] for token in tokens]
    texts_indices[fileid] = indices

print(texts_indices[file_ids[0]][:20])

[18621, 11210, 27723, 8239, 1914, 50340, 25779, 12151, 25779, 18621, 51871, 23995, 12903, 7047, 39668, 51719, 5517, 13357, 25209, 7047]


### 2. Определите архитектуру RNN: количество слоев, размер скрытого слоя, функции активации.


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers):
        super(RNNModel, self).__init__()
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(
            input_size=embedding_dim, 
            hidden_size=hidden_size, 
            num_layers=num_layers, 
            batch_first=True, 
            nonlinearity='tanh'
        )
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

    def forward(self, x, hidden):
        embed = self.embedding(x)  
        output, hidden = self.rnn(embed, hidden)  
        output = self.fc(output)  
        return output, hidden

    def init_hidden(self, batch_size):
        return torch.zeros(self.num_layers, batch_size, self.hidden_size)


vocab_size = len(vocab)
embedding_dim = 128
hidden_size = 256
num_layers = 2    

model = RNNModel(vocab_size, embedding_dim, hidden_size, num_layers).to(device)

### 3. Обучите модель на тренировочном наборе данных.

In [6]:
sample_file = list(texts_indices.keys())[0]
data = texts_indices[sample_file]
seq_length = 30

class TextDataset(Dataset):
    def __init__(self, data, seq_length):
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        return len(self.data) - self.seq_length

    def __getitem__(self, idx):
        x = self.data[idx:idx+self.seq_length]
        y = self.data[idx+1: idx+self.seq_length+1]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

dataset = TextDataset(data, seq_length)
batch_size = 64
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [7]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001)
num_epochs = 5

model.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        optimizer.zero_grad()
        inputs = inputs.to(device)
        targets = targets.to(device)

        current_batch_size = inputs.size(0)
        hidden = model.init_hidden(current_batch_size).to(device)

        outputs, hidden = model(inputs, hidden)
        outputs = outputs.reshape(-1, vocab_size)
        targets = targets.reshape(-1)

        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        if batch_idx % 100 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Batch {batch_idx}, Loss: {loss.item():.4f}")
    avg_loss = epoch_loss / len(dataloader)
    print(f"Epoch {epoch+1} average loss: {avg_loss:.4f}")

Epoch 1/5, Batch 0, Loss: 10.8809
Epoch 1/5, Batch 100, Loss: 6.2510
Epoch 1/5, Batch 200, Loss: 6.1821
Epoch 1/5, Batch 300, Loss: 5.6563
Epoch 1/5, Batch 400, Loss: 5.3170
Epoch 1/5, Batch 500, Loss: 5.0477
Epoch 1/5, Batch 600, Loss: 4.8388
Epoch 1/5, Batch 700, Loss: 4.7114
Epoch 1/5, Batch 800, Loss: 4.5842
Epoch 1/5, Batch 900, Loss: 4.5493
Epoch 1/5, Batch 1000, Loss: 4.4937
Epoch 1/5, Batch 1100, Loss: 4.3641
Epoch 1/5, Batch 1200, Loss: 4.1875
Epoch 1/5, Batch 1300, Loss: 4.1423
Epoch 1/5, Batch 1400, Loss: 4.0706
Epoch 1/5, Batch 1500, Loss: 3.9702
Epoch 1/5, Batch 1600, Loss: 3.9397
Epoch 1/5, Batch 1700, Loss: 3.7653
Epoch 1/5, Batch 1800, Loss: 3.6708
Epoch 1/5, Batch 1900, Loss: 3.6410
Epoch 1/5, Batch 2000, Loss: 3.4713
Epoch 1/5, Batch 2100, Loss: 3.4618
Epoch 1/5, Batch 2200, Loss: 3.3776
Epoch 1/5, Batch 2300, Loss: 3.3132
Epoch 1/5, Batch 2400, Loss: 3.2149
Epoch 1/5, Batch 2500, Loss: 3.0893
Epoch 1/5, Batch 2600, Loss: 3.0636
Epoch 1 average loss: 4.3042
Epoch 2/5,

### 4. Проверьте способность модели генерировать осмысленный текст.

In [8]:
def generate_text(model, start_text, vocab, inv_vocab, device, gen_length=10):
    model.eval()
    tokens = word_tokenize(start_text)
    input_indices = [vocab.get(token, 0) for token in tokens]
    
    input_tensor = torch.tensor(input_indices, dtype=torch.long).unsqueeze(0).to(device)
    hidden = model.init_hidden(input_tensor.size(0)).to(device)
    
    generated_tokens = tokens.copy()
    
    for _ in range(gen_length):
        outputs, hidden = model(input_tensor, hidden)
        logits = outputs[:, -1, :] 
        probs = F.softmax(logits, dim=-1)
        next_token_idx = torch.multinomial(probs, num_samples=1).item()
        generated_tokens.append(inv_vocab[next_token_idx])
        input_tensor = torch.tensor([[next_token_idx]], dtype=torch.long).to(device)
    
    return " ".join(generated_tokens)

inv_vocab = {idx: token for token, idx in vocab.items()}

start_text = "Alice was beginning to get very tired"
generated = generate_text(model, start_text, vocab, inv_vocab, device, gen_length=50)
print("Сгенерированный текст:\n", generated)

Сгенерированный текст:
 Alice was beginning to get very tired they were gone so very painful when the baby was fetched on and for it seemed as if ever willing to be constantly with her at possible and trying for their state she never met with her without carrying on isabella 's happiness had been prepared to have received a


Сгенерированный текст демонстрирует, что модель способна продолжать текст на основе заданного начала. Однако, как видно, результат может быть несколько бессвязным или фрагментированным, что часто бывает при ограниченном обучении или недостаточной мощности модели.

In [9]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers):
        super(LSTMModel, self).__init__()
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

    def forward(self, x, hidden):
        embed = self.embedding(x)
        output, hidden = self.lstm(embed, hidden)
        output = self.fc(output)
        return output, hidden

    def init_hidden(self, batch_size):
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size)
        return (h0, c0)

vocab_size = len(vocab)
embedding_dim = 128
hidden_size = 256
num_layers = 2

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_lstm = LSTMModel(vocab_size, embedding_dim, hidden_size, num_layers).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model_lstm.parameters(), lr=0.001)
num_epochs = 5

model_lstm.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        optimizer.zero_grad()
        inputs = inputs.to(device)
        targets = targets.to(device)
        current_batch_size = inputs.size(0)
        hidden = model_lstm.init_hidden(current_batch_size)
        hidden = (hidden[0].to(device), hidden[1].to(device))
        outputs, hidden = model_lstm(inputs, hidden)
        outputs = outputs.reshape(-1, vocab_size)
        targets = targets.reshape(-1)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        if batch_idx % 100 == 0:
            print(f"LSTM Epoch {epoch+1}/{num_epochs}, Batch {batch_idx}, Loss: {loss.item():.4f}")
    avg_loss = epoch_loss / len(dataloader)
    print(f"LSTM Epoch {epoch+1} average loss: {avg_loss:.4f}")

LSTM Epoch 1/5, Batch 0, Loss: 10.8700
LSTM Epoch 1/5, Batch 100, Loss: 6.4026
LSTM Epoch 1/5, Batch 200, Loss: 6.2967
LSTM Epoch 1/5, Batch 300, Loss: 6.1839
LSTM Epoch 1/5, Batch 400, Loss: 6.3843
LSTM Epoch 1/5, Batch 500, Loss: 6.2551
LSTM Epoch 1/5, Batch 600, Loss: 6.0833
LSTM Epoch 1/5, Batch 700, Loss: 5.9969
LSTM Epoch 1/5, Batch 800, Loss: 5.9000
LSTM Epoch 1/5, Batch 900, Loss: 5.8403
LSTM Epoch 1/5, Batch 1000, Loss: 5.6221
LSTM Epoch 1/5, Batch 1100, Loss: 5.7723
LSTM Epoch 1/5, Batch 1200, Loss: 5.5979
LSTM Epoch 1/5, Batch 1300, Loss: 5.5823
LSTM Epoch 1/5, Batch 1400, Loss: 5.5244
LSTM Epoch 1/5, Batch 1500, Loss: 5.3660
LSTM Epoch 1/5, Batch 1600, Loss: 5.3731
LSTM Epoch 1/5, Batch 1700, Loss: 5.4039
LSTM Epoch 1/5, Batch 1800, Loss: 5.3224
LSTM Epoch 1/5, Batch 1900, Loss: 5.2762
LSTM Epoch 1/5, Batch 2000, Loss: 5.2308
LSTM Epoch 1/5, Batch 2100, Loss: 5.1516
LSTM Epoch 1/5, Batch 2200, Loss: 5.1651
LSTM Epoch 1/5, Batch 2300, Loss: 5.1178
LSTM Epoch 1/5, Batch 2400,

In [None]:
def generate_text_lstm(model, start_text, vocab, inv_vocab, device, gen_length=30):
    model.eval()
    tokens = word_tokenize(start_text)
    input_indices = [vocab.get(token, 0) for token in tokens]
    input_tensor = torch.tensor(input_indices, dtype=torch.long).unsqueeze(0).to(device)
    hidden = model.init_hidden(input_tensor.size(0))
    hidden = (hidden[0].to(device), hidden[1].to(device))
    generated_tokens = tokens.copy()
    for _ in range(gen_length):
        outputs, hidden = model(input_tensor, hidden)
        logits = outputs[:, -1, :]
        probs = F.softmax(logits, dim=-1)
        next_token_idx = torch.multinomial(probs, num_samples=1).item()
        generated_tokens.append(inv_vocab[next_token_idx])
        input_tensor = torch.tensor([[next_token_idx]], dtype=torch.long).to(device)
    return " ".join(generated_tokens)

generated_lstm = generate_text_lstm(model_lstm, "Alice was beginning to get very tired", vocab, inv_vocab, device, gen_length=30)
print("Сгенерированный текст LSTM:\n", generated_lstm)

Сгенерированный текст LSTM:
 Alice was beginning to get very tired of his affections were he could returning her but with many people -- no present to sit for the help of a little deal of intimacy first in the drawing-room


In [13]:
class GRUModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers):
        super(GRUModel, self).__init__()
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

    def forward(self, x, hidden):
        embed = self.embedding(x)
        output, hidden = self.gru(embed, hidden)
        output = self.fc(output)
        return output, hidden

    def init_hidden(self, batch_size):
        return torch.zeros(self.num_layers, batch_size, self.hidden_size)

model_gru = GRUModel(vocab_size, embedding_dim, hidden_size, num_layers).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model_gru.parameters(), lr=0.001)
num_epochs = 5

model_gru.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        optimizer.zero_grad()
        inputs = inputs.to(device)
        targets = targets.to(device)
        current_batch_size = inputs.size(0)
        hidden = model_gru.init_hidden(current_batch_size).to(device)
        outputs, hidden = model_gru(inputs, hidden)
        outputs = outputs.reshape(-1, vocab_size)
        targets = targets.reshape(-1)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        if batch_idx % 100 == 0:
            print(f"GRU Epoch {epoch+1}/{num_epochs}, Batch {batch_idx}, Loss: {loss.item():.4f}")
    avg_loss = epoch_loss / len(dataloader)
    print(f"GRU Epoch {epoch+1} average loss: {avg_loss:.4f}")

GRU Epoch 1/5, Batch 0, Loss: 10.8745
GRU Epoch 1/5, Batch 100, Loss: 6.2945
GRU Epoch 1/5, Batch 200, Loss: 6.2676
GRU Epoch 1/5, Batch 300, Loss: 6.2466
GRU Epoch 1/5, Batch 400, Loss: 6.1190
GRU Epoch 1/5, Batch 500, Loss: 6.0919
GRU Epoch 1/5, Batch 600, Loss: 6.1945
GRU Epoch 1/5, Batch 700, Loss: 5.9153
GRU Epoch 1/5, Batch 800, Loss: 5.8092
GRU Epoch 1/5, Batch 900, Loss: 5.8010
GRU Epoch 1/5, Batch 1000, Loss: 5.7039
GRU Epoch 1/5, Batch 1100, Loss: 5.7064
GRU Epoch 1/5, Batch 1200, Loss: 5.6133
GRU Epoch 1/5, Batch 1300, Loss: 5.5367
GRU Epoch 1/5, Batch 1400, Loss: 5.4066
GRU Epoch 1/5, Batch 1500, Loss: 5.3310
GRU Epoch 1/5, Batch 1600, Loss: 5.3405
GRU Epoch 1/5, Batch 1700, Loss: 5.2307
GRU Epoch 1/5, Batch 1800, Loss: 5.1176
GRU Epoch 1/5, Batch 1900, Loss: 4.8999
GRU Epoch 1/5, Batch 2000, Loss: 4.9798
GRU Epoch 1/5, Batch 2100, Loss: 4.8942
GRU Epoch 1/5, Batch 2200, Loss: 4.7802
GRU Epoch 1/5, Batch 2300, Loss: 4.7155
GRU Epoch 1/5, Batch 2400, Loss: 4.6195
GRU Epoch 1

In [14]:
def generate_text_gru(model, start_text, vocab, inv_vocab, device, gen_length=10):
    model.eval()
    tokens = word_tokenize(start_text)
    input_indices = [vocab.get(token, 0) for token in tokens]
    input_tensor = torch.tensor(input_indices, dtype=torch.long).unsqueeze(0).to(device)
    hidden = model.init_hidden(input_tensor.size(0)).to(device)
    generated_tokens = tokens.copy()
    for _ in range(gen_length):
        outputs, hidden = model(input_tensor, hidden)
        logits = outputs[:, -1, :]
        probs = F.softmax(logits, dim=-1)
        next_token_idx = torch.multinomial(probs, num_samples=1).item()
        generated_tokens.append(inv_vocab[next_token_idx])
        input_tensor = torch.tensor([[next_token_idx]], dtype=torch.long).to(device)
    return " ".join(generated_tokens)

generated_gru = generate_text_gru(model_gru, "Alice was beginning to get very tired", vocab, inv_vocab, device, gen_length=30)
print("Сгенерированный текст GRU:\n", generated_gru)

Сгенерированный текст GRU:
 Alice was beginning to get very tired that could really be feeling reasonable he would be so simple since his father 's marriage talk as possible undoubtedly man has notice her father and he seemed to have
