# Recurrent Neural Network (Генерация текста)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences, to_categorical

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.layers import LSTM, GRU, Dropout, Embedding
from tensorflow.keras.regularizers import l2

In [2]:
# Location of the raw text file that the notebook loads and trains on
DATA_PATH = './data/alice_in_wonderland.txt'

## Загрузка и подготовка данных

In [3]:
# Read the whole text file into memory
with open(DATA_PATH, encoding='utf-8') as source:
    data = source.read()

# Lower-case the text, cut it into lines and keep only the non-blank ones
corpus = [line for line in data.lower().split('\n') if line.strip()]

In [4]:
# Fit the tokenizer on our text
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(corpus)

# Turn every line into several growing word sequences, where the last word
# of each sequence is the one to be predicted:
# Mama [washed]
# Mama washed [the frame]
# ...
input_sequences = []
# Tokenize the whole corpus in one call instead of once per line
for token_list in tokenizer.texts_to_sequences(corpus):
    for i in range(1, len(token_list)):
        input_sequences.append(token_list[:i + 1])

# Bring all sequences to the same length (zeros are added on the left).
# pad_sequences already returns a NumPy array, so no extra np.array() wrap.
max_sequence_len = max(len(sequence) for sequence in input_sequences)
input_sequences = pad_sequences(input_sequences,
                                maxlen=max_sequence_len,
                                padding='pre')

# Split the sequences into inputs (all but the last token) and targets
predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
# Fix the one-hot width to the full vocabulary size (+1 for index 0).
# Without num_classes the width is inferred as max(label) + 1, which is too
# narrow whenever the highest-index word never occurs as a target.
label = to_categorical(label, num_classes=len(tokenizer.word_index) + 1)

In [5]:
# Vocabulary size: one slot per known word plus one extra for index 0
total_words = 1 + len(tokenizer.word_index)

# Settings passed to model.compile()
loss, optimizer = 'categorical_crossentropy', 'adam'
metrics = ['accuracy']

In [6]:
# Embedding -> LSTM -> regularised dense layer -> softmax over the vocabulary
model = Sequential([
    Embedding(total_words, 128, input_length=max_sequence_len-1),
    LSTM(256, activity_regularizer=l2(0.01)),
    Dropout(0.4),
    Dense(1024, activation='leaky_relu', kernel_regularizer=l2(0.01)),
    Dropout(0.3),
    Dense(total_words, activation='softmax'),
])

model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 17, 128)           435072    
                                                                 
 lstm (LSTM)                 (None, 256)               394240    
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense (Dense)               (None, 1024)              263168    
                                                                 
 dropout_1 (Dropout)         (None, 1024)              0         
                                                                 
 dense_1 (Dense)             (None, 3399)              3483975   
                                                                 
Total params: 4,576,455
Trainable params: 4,576,455
Non-

In [7]:
# Reverse vocabulary lookup: token index -> word
from_index = {value: key for key, value in tokenizer.word_index.items()}

def predict(text, count=10):
    """Greedily extend ``text`` with ``count`` words generated by the model.

    On every step the current text is tokenized, left-padded to the model's
    input length (``max_sequence_len - 1``), fed to ``model`` and the most
    probable word is appended to the text.

    Args:
        text: Seed string to continue.
        count: Number of words to append.

    Returns:
        The seed string followed by the generated words, separated by spaces.
    """
    for _ in range(count):
        tokens = tokenizer.texts_to_sequences([text])
        tokens = pad_sequences(tokens, maxlen=max_sequence_len-1, padding='pre')
        probabilities = model.predict(tokens, verbose=0)[0]
        # Index 0 is the padding value and has no entry in from_index, so a
        # plain argmax landing on it would raise KeyError (possible with an
        # untrained model). Pick the best class among real word indices only.
        word_index = int(probabilities[1:].argmax()) + 1
        text += ' ' + from_index[word_index]

    return text

In [8]:
text = 'here were doors all round the hall'
iterations, epochs = 5, 5

# Alternate between showing what the model currently generates for the seed
# text and training it for another `epochs` epochs
for i in range(iterations):
    sample = predict(text)
    print(f'Iteration {i}: {sample}\n')
    model.fit(predictors, label, epochs=epochs, validation_split=0.2, verbose=1)

# Sample once more after the last training round
print(f'Iteration {iterations}: {predict(text)}')

Iteration 0: here were doors all round the hall quiver pennyworth croqueting pennyworth leave leave wherever wherever shrieked identification

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Iteration 1: here were doors all round the hall and the mock turtle was the little voice and the

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Iteration 2: here were doors all round the hall and she had not a little thing to be a

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Iteration 3: here were doors all round the hall and the gryphon said the gryphon in a little tone

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Iteration 4: here were doors all round the hall but she was to find it out down it was

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Iteration 5: here were doors all round the hall but she had to herself in a minute or two
