In [1]:
import keras
import numpy as np
import os
import warnings
# Suppress every Python warning raised from this point on.
warnings.filterwarnings('ignore')

# Fetch the Nietzsche corpus; `get_file` returns the local path of the file.
path = keras.utils.get_file('nietzsche.txt',
                            origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')

# Read inside a context manager so the handle is closed deterministically, and
# decode explicitly as UTF-8 instead of relying on the platform default.
# The corpus is lower-cased as a whole.
with open(path, encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()
print('Corpus length:', len(text))

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Corpus length: 600901


In [2]:
# Sliding-window settings: each extracted sequence is `maxlen` characters long
# and a new sequence starts every `step` characters.
maxlen, step = 60, 3

# Containers for the extracted sequences and for their targets
# (the character that follows each sequence).
sentences, next_chars = [], []

In [3]:
# Slide a window of `maxlen` characters over the corpus, advancing `step`
# characters at a time. Both lists are rebuilt from scratch (instead of being
# appended to) so that re-running this cell does not duplicate the samples.
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]  # target = char right after the window
print('Number of sequences:', len(sentences))

chars = sorted(set(text))  # unique characters of the corpus, sorted
print('Unique characters:', len(chars))

# Map every unique character to its position in `chars`
# (built in one pass instead of calling `chars.index` per character).
char_indices = {char: index for index, char in enumerate(chars)}

print('Vectorization...')
# One-hot encode the characters into boolean arrays:
#   x[i, t, c] is True when character t of sequence i is chars[c]
#   y[i, c]    is True when the character following sequence i is chars[c]
# The builtin `bool` is used because the `np.bool` alias is gone from recent NumPy.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    # One target per sequence, so it is set once here, outside the per-character loop.
    y[i, char_indices[next_chars[i]]] = 1

Number of sequences: 200281
Unique characters: 59
Vectorization...


In [4]:
from keras import layers
# Network: one LSTM layer with 128 units reading (maxlen, len(chars)) inputs,
# followed by a softmax Dense layer with one output per unique character.
model = keras.models.Sequential([
    layers.LSTM(128, input_shape=(maxlen, len(chars))),
    layers.Dense(len(chars), activation='softmax'),
])

In [5]:
# Compile with RMSprop (learning rate 0.01) and categorical cross-entropy.
optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
)

In [6]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability distribution reweighted by temperature.

    The probabilities are converted to log space, divided by `temperature`,
    renormalized with a softmax, and a single index is drawn from the result.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: Positive scaling factor applied to the log-probabilities.
            Values below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        The sampled index (as returned by `np.argmax`).

    Raises:
        ValueError: If `temperature` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive, got %r' % (temperature,))

    preds = np.asarray(preds).astype('float64')
    # log(0) legitimately yields -inf for impossible entries; silence that warning.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift by the maximum before exponentiating: the softmax is unchanged, but
    # at low temperatures this keeps every term from underflowing to 0 (which
    # would otherwise give 0/0 = NaN probabilities).
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    # One multinomial trial gives a one-hot row; its argmax is the drawn index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [7]:
import random
import sys
# Train for 60 epochs. The upper bound of `range` is exclusive, so 61 is
# required to actually run epochs 1..60.
for epoch in range(1, 61):
    print('epoch', epoch)
    model.fit(x, y, batch_size=128, epochs=1)  # one training pass per iteration

# After training, pick a random `maxlen`-character slice of the corpus as the seed.
start_index = random.randint(0, len(text) - maxlen - 1)
seed_text = text[start_index: start_index + maxlen]
print('--- Generating with seed: "' + seed_text + '"')

for temperature in [0.2, 0.5, 1.0, 1.2]:
    print('------ temperature:', temperature)
    # Restart from the same seed for every temperature so that the samples
    # are comparable and match the seed printed above.
    generated_text = seed_text
    sys.stdout.write(generated_text)
    for i in range(400):  # generate 400 characters, starting from the seed text
        # One-hot encode the current `maxlen`-character context.
        sampled = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(generated_text):
            sampled[0, t, char_indices[char]] = 1.

        # Predict the next-character distribution and sample from it.
        preds = model.predict(sampled, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_char = chars[next_index]

        # Slide the context window: append the new character, drop the oldest.
        generated_text += next_char
        generated_text = generated_text[1:]

        sys.stdout.write(next_char)
    # End the generated sample with a newline so the next header starts on its own line.
    print()

epoch 1

Epoch 1/1
epoch 2
Epoch 1/1
epoch 3
Epoch 1/1
epoch 4
Epoch 1/1
epoch 5
Epoch 1/1
epoch 6
Epoch 1/1
epoch 7
Epoch 1/1
epoch 8
Epoch 1/1
epoch 9
Epoch 1/1
epoch 10
Epoch 1/1
epoch 11
Epoch 1/1
epoch 12
Epoch 1/1
epoch 13
Epoch 1/1
epoch 14
Epoch 1/1
epoch 15
Epoch 1/1
epoch 16
Epoch 1/1
epoch 17
Epoch 1/1
epoch 18
Epoch 1/1
epoch 19
Epoch 1/1
epoch 20
Epoch 1/1
epoch 21
Epoch 1/1
epoch 22
Epoch 1/1
epoch 23
Epoch 1/1
epoch 24
Epoch 1/1
epoch 25
Epoch 1/1
epoch 26
Epoch 1/1
epoch 27
Epoch 1/1
epoch 28
Epoch 1/1
epoch 29
Epoch 1/1
epoch 30
Epoch 1/1
epoch 31
Epoch 1/1
epoch 32
Epoch 1/1
epoch 33
Epoch 1/1
epoch 34
Epoch 1/1
epoch 35
Epoch 1/1
epoch 36
Epoch 1/1
epoch 37
Epoch 1/1
epoch 38
Epoch 1/1
epoch 39
Epoch 1/1
epoch 40
Epoch 1/1
epoch 41
Epoch 1/1
epoch 42
Epoch 1/1
epoch 43
Epoch 1/1
epoch 44
Epoch 1/1
epoch 45
Epoch 1/1
epoch 46
Epoch 1/1
epoch 47
Epoch 1/1
epoch 48
Epoch 1/1
epoch 49
Epoch 1/1
epoch 50
Epoch 1/1
epoch 51
Epoch 1/1
epoch 52
Epoch 1/1
epoch 53
Epoch 1/1
e