# Двунаправленная LSTM

Импорт необходимых библиотек:

In [7]:
import numpy as np
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import Sequence
import re
import nltk
from nltk.tokenize import word_tokenize

Получаю текст для обучения модели, а также удаляю все лишние символы:

In [8]:
with open('/Users/dr0ozd/coding/AI_Lab_Generative/model_text.txt', 'r', encoding='utf-8') as file:
    text = file.read()

text = re.sub(r'[^a-zA-Z\s]', '', text)

Просмотрю получившийся текст (первые несколько строк):

In [10]:
# Собираю текст в строки
text_lines = text.splitlines()

# Вывод первых 10 строк
for line in text_lines[:10]:
    print(line)

The devil was looking out of the window Yet the traffic in the streets
was unchecked The cablecars whizzed past with a clanging clamor
Great rumbling vans laden with freight alternated with carriages
rolling noiselessly on rubbertired wheels The sidewalks were crowded
with pedestrians Men and boys ladies and little children boldly came
and went over the neighboring crossing although they could plainly see
the devils head poking out of a high window in the newspaper building
and hear the shrill tones of the devils voice as he discoursed to his
friend within



Для создадания токенизатора и словаря слов нужно скачать и установить пакет токенизации ```punkt```, при решении данной задачи возникли некоторые проблемы с сертификатами, а также определением местонахождения пакета, поэтому было принять решение поставить их вручную через графический интерфейс ```nltk```:

In [29]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [11]:
import nltk
from nltk.tokenize import word_tokenize

tokens = word_tokenize(text)

word_dict_test = {}
for word in tokens:
    if word not in word_dict_test:
        word_dict_test[word] = len(word_dict_test) + 1

reversed_word_dict = {v: k for k, v in word_dict_test.items()}

sequences = [word_dict_test[word] for word in tokens]

Класс ```DataGenerator```:  
Создаём класс генератора данных для последовательной загрузки батчей (частей) данных, что в конечном итоге улучшит итоговую производительность:

In [12]:
class DataGenerator(Sequence):
    def __init__(self, sequences, sequence_length, batch_size, vocab_size):
        self.sequences = sequences
        self.sequence_length = sequence_length
        self.batch_size = batch_size
        self.vocab_size = vocab_size

    def __len__(self):
        # Убедимся, что результат всегда >= 1
        return max((len(self.sequences) - self.sequence_length) // self.batch_size, 1)

    def __getitem__(self, idx):
        batch_x = []
        batch_y = []
        start_idx = idx * self.batch_size
        end_idx = min(start_idx + self.batch_size, len(self.sequences) - self.sequence_length)  # Корректируем, чтобы не выйти за пределы

        for i in range(start_idx, end_idx):
            x = self.sequences[i:i + self.sequence_length]
            y = self.sequences[i + self.sequence_length]
            batch_x.append(x)
            batch_y.append(y)

        batch_x = pad_sequences(batch_x, maxlen=self.sequence_length)
        batch_y = to_categorical(batch_y, num_classes=self.vocab_size)

        return np.array(batch_x), np.array(batch_y)


In [13]:
sequence_length = 50
batch_size = 128
vocab_size = len(word_dict_test) + 1

# Инициализация генератора данных
data_gen = DataGenerator(sequences, sequence_length, batch_size, vocab_size)

Создадим и обучим модель

In [17]:
# Построение модели
embedding_dim = 50
hidden_units = 128

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=sequence_length))
model.add(Bidirectional(LSTM(hidden_units)))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Вместо model.fit используется model.fit(data_gen), чтобы модель обучалась на батчах данных, а не на всех данных сразу.

In [66]:
# Обучение модели
model.fit(data_gen, epochs=20)

Epoch 1/20
[1m361/361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 183ms/step - accuracy: 0.0716 - loss: 6.9150
Epoch 2/20
[1m361/361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 190ms/step - accuracy: 0.0882 - loss: 6.5659
Epoch 3/20
[1m361/361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 173ms/step - accuracy: 0.0955 - loss: 6.3011
Epoch 4/20
[1m361/361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 172ms/step - accuracy: 0.1162 - loss: 6.0248
Epoch 5/20
[1m361/361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 168ms/step - accuracy: 0.1248 - loss: 5.7817
Epoch 6/20
[1m361/361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 170ms/step - accuracy: 0.1385 - loss: 5.4655
Epoch 7/20
[1m361/361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 169ms/step - accuracy: 0.1417 - loss: 5.1704
Epoch 8/20
[1m361/361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 170ms/step - accuracy: 0.1530 - loss: 4.8385
Epoch 9/20
[1m3

<keras.src.callbacks.history.History at 0x2d721e3d0>

In [14]:
def generate_text(seed_text):
    seed_tokens = word_tokenize(seed_text)
    seed_sequence = [word_dict_test[word] for word in seed_tokens]

    for _ in range(100):
        input_sequence = pad_sequences([seed_sequence], maxlen=sequence_length)
        predicted_probs = model.predict(input_sequence)[0]
        predicted_word_index = np.random.choice(np.argsort(predicted_probs)[-5:][::-1])
        seed_sequence.append(predicted_word_index)
        seed_sequence = seed_sequence[-sequence_length:]

    return seed_text + ' ' + ' '.join([reversed_word_dict[index] for index in seed_sequence])

In [18]:
input_text = "Peter Bateman"
generated_text = generate_text(input_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 638ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2

In [19]:
print(generated_text)

Peter Bateman pride turning oblivious reveal reveal accepted quick realized greatly jobs Batemans nowthe message reciprocated message outer uncommunicativeness Whenever arraying week sequence libel suspicious discover Cambridge grip Silence defunct nettling overzealous city lurid No DESPOT dabbles startled backward memories outright march propelling Refund unmistakably Yes unfamiliar shameless surly imagine designs imagine
