
 ### Генерация текста с помощью LSTM RNN на примере пацанских цитат/юморесок.



#### Делаем word2vec модель для подачи на вход сети

In [71]:
!pip install --upgrade gensim


Requirement already up-to-date: gensim in /usr/local/lib/python3.6/dist-packages (3.6.0)


In [0]:
from gensim.models import Word2Vec


In [0]:
class SentencesGenerator():
  """Re-iterable stream of tokenized sentences read from a text file.

  Every line of the file is lower-cased and split into tokens: runs of word
  characters/apostrophes, and the single punctuation marks ``. , ! ? ;``.
  A new pass over the file starts on each ``iter()`` call, so the object can
  be consumed more than once.
  """

  def __init__(self, _filepath):
    """Store the corpus path; the statistics are filled in while iterating.

    Args:
      _filepath: path to a UTF-8 encoded text file, one sentence per line.
    """
    self.filepath = _filepath
    self.length = 0  # number of lines in the file; set when iteration starts
    # Longest line seen so far, counted in characters of the raw line
    # (not in tokens), including its line ending.
    self.max_sentence_len = 0

  def __iter__(self):
    """Yield the list of tokens for each line of the file, in file order."""
    import re
    regex = re.compile(r"[\w']+|[.,!?;]")
    # Explicit encoding so decoding does not depend on the platform default.
    with open(self.filepath, 'r', encoding='utf-8') as f:
      # Count the lines lazily instead of materialising them with readlines().
      self.length = sum(1 for _ in f)
      f.seek(0)
      for line in f:
        if len(line) > self.max_sentence_len:
          self.max_sentence_len = len(line)
        yield regex.findall(line.lower())
        


In [0]:
def get_sentences(filepath):
  """Read a UTF-8 text file and tokenize it line by line.

  Each line is lower-cased and split into runs of word characters/apostrophes
  plus the single punctuation marks ``. , ! ? ;``.

  Args:
    filepath: path of the text file to read.

  Returns:
    A list with one token list per line of the file, in file order. A line
    without any matching token yields an empty list.
  """
  import re
  token_pattern = re.compile(r"[\w']+|[.,!?;]")
  with open(filepath, 'r', encoding='utf-8') as corpus:
    return [token_pattern.findall(raw_line.lower()) for raw_line in corpus]


In [0]:
filepath = "HumourResult.txt"
# Keep only the lines that produced at least one token. A new list is built
# instead of deleting from `sentences` while enumerating it: deleting during
# iteration skips the element that follows each deletion, so runs of
# consecutive empty sentences were only partly removed.
sentences = [sentence for sentence in get_sentences(filepath) if sentence]
# Length of the longest sentence, measured in tokens.
# max() raises ValueError if the corpus contains no non-empty sentence.
max_sentence_len = max(len(sentence) for sentence in sentences)

In [140]:
!pip install Cython # so that multithreading works



In [0]:
# Train the word2vec embeddings on the tokenized sentences.
word_model = Word2Vec(
    sentences,
    size=100,
    iter=100,
    min_count=1,
    workers=4,
)

In [0]:
#word_model.save('word2vec.model')

In [143]:
# Embedding matrix of the trained word2vec model. `wv.syn0` is a deprecated
# alias of `wv.vectors` and emits a DeprecationWarning, so read `vectors`.
word_weights = word_model.wv.vectors

  """Entry point for launching an IPython kernel.


In [0]:
# Split the 2-D shape of the embedding matrix into its two dimensions;
# both values are reused below to size the Keras layers.
vocab_size, embedding_size = word_weights.shape

In [145]:
# Show the two dimensions of the embedding matrix.
print(vocab_size, embedding_size)

124359 100


In [0]:
def word2idx(word):
  """Return the index stored for `word` in the word2vec vocabulary.

  The lookup fails if `word` is not a key of `word_model.wv.vocab`
  (presumably with KeyError, assuming a dict-like vocabulary).
  """
  vocab_entry = word_model.wv.vocab[word]
  return vocab_entry.index
def idx2word(idx):
  """Return the word found at position `idx` of `word_model.wv.index2word`."""
  index_to_word = word_model.wv.index2word
  return index_to_word[idx]

### Объявляем keras-модель.

In [1]:
from keras.callbacks import LambdaCallback
from keras.layers.recurrent import LSTM
from keras.layers import CuDNNLSTM, Bidirectional
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Activation
from keras.models import Sequential
from keras.utils.data_utils import get_file

Using TensorFlow backend.


In [0]:
# Next-word model: embedding layer initialised with the word2vec weights,
# a bidirectional LSTM, and a softmax over the whole vocabulary.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_size,
              weights=[word_weights]),
    # LSTM(units=embedding_size),  # alternative layer for running on CPU
    Bidirectional(CuDNNLSTM(units=embedding_size)),
    Dense(units=vocab_size),
    Activation('softmax'),
])
# Targets are integer word indices, hence the sparse variant of the loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

### Подготавливаем тренировочные данные

In [0]:
import numpy as np

In [0]:
# One training sample per sentence: every token except the last is the input
# sequence, the last token is the target.
# NOTE(review): unused trailing positions of a row keep the initial value 0,
# which looks like it is also a regular word index - confirm this is intended.
num_samples = len(sentences)
train_x = np.zeros((num_samples, max_sentence_len), dtype=np.int64)
train_y = np.zeros((num_samples,), dtype=np.int64)
for row, tokens in enumerate(sentences):
  context = tokens[:-1]
  train_x[row, :len(context)] = [word2idx(token) for token in context]
  train_y[row] = word2idx(tokens[-1])

In [0]:
def sample(preds, temperature=1.0):
  """Draw an index from a probability vector after temperature rescaling.

  Args:
    preds: 1-D sequence of probabilities.
    temperature: divisor applied to the log-probabilities before they are
      re-normalised. Values <= 0 disable sampling and select the arg-max.

  Returns:
    The index of the selected element.
  """
  if temperature <= 0:
    return np.argmax(preds)
  preds = np.asarray(preds).astype('float64')
  # log(0) is -inf, which maps back to probability 0 after exp(); only the
  # divide-by-zero warning is silenced here.
  with np.errstate(divide='ignore'):
    logits = np.log(preds) / temperature
  # Shift by the maximum before exponentiating. Without the shift a small
  # temperature underflows every term to 0 and the normalisation becomes 0/0.
  exp_preds = np.exp(logits - np.max(logits))
  preds = exp_preds / np.sum(exp_preds)
  probas = np.random.multinomial(1, preds, 1)
  return np.argmax(probas)

def generate_next(text, num_generated=10):
  """Extend `text` with `num_generated` words sampled from the model.

  Args:
    text: seed text. It is lower-cased and tokenized with the same pattern
      that is used for the training corpus.
    num_generated: number of words to append to the seed.

  Returns:
    The seed tokens followed by the generated words, joined by single spaces.

  Raises:
    ValueError: if `text` contains no token.
  """
  import re
  # Same token pattern as the corpus tokenizer, so punctuation attached to a
  # word does not end up as an out-of-vocabulary token.
  tokens = re.findall(r"[\w']+|[.,!?;]", text.lower())
  if not tokens:
    raise ValueError('text must contain at least one token')
  word_idxs = [word2idx(word) for word in tokens]
  for _ in range(num_generated):
    # Keras treats the first axis as the batch axis. The running sequence is
    # wrapped in an outer list so it is passed as ONE sample of shape
    # (1, len(word_idxs)) rather than as len(word_idxs) one-token samples.
    prediction = model.predict(x=np.array([word_idxs]))
    idx = sample(prediction[0], temperature=0.5)
    word_idxs.append(idx)
  return ' '.join(idx2word(idx) for idx in word_idxs)

def on_epoch_end(epoch, _):
  """Epoch-end hook: print generated text for five randomly chosen seed words.

  Both arguments are accepted to match the callback signature and are unused.
  """
  from random import choice
  # Each seed is the first token of a randomly picked sentence; all five
  # seeds are drawn before any text is generated.
  seeds = []
  for _unused in range(5):
    seeds.append(choice(sentences)[0])
  for seed in seeds:
    generated = generate_next(seed, num_generated=100)
    print('%s... -> %s' % (seed, generated))
    
    

In [0]:
# Training hyper-parameters, passed to model.fit below.
batch_size = 32
epochs = 60

In [0]:
# Train the network; the callback runs on_epoch_end after every epoch.
model.fit(
    x=train_x,
    y=train_y,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[LambdaCallback(on_epoch_end=on_epoch_end)],
)

In [0]:
# Generate 50 words starting from a one-word seed.
generate_next("я", num_generated=50)