# 相关包调取

In [1]:
import tensorflow as tf
import os
from six.moves import cPickle
import collections
import codecs
import numpy as np
import jieba

ModuleNotFoundError: No module named 'tensorflow'

# 读取数据

In [4]:
# Load the book as a string
FILE_PATH = '/content/drive/My Drive/mine/dataset/zhuxian.txt'

# Raw corpus of the book
corpus_raw = u""

with codecs.open(FILE_PATH, 'r', 'utf-8') as book_file:
    corpus_raw += book_file.read()

print("Corpus is {} characters long".format(len(corpus_raw)))

Corpus is 3126568 characters long


In [0]:
USE_SPLIT = True

# 分词

In [6]:
def create_lookup_tables(text, use_split=USE_SPLIT):
  """
  Create lookup tables for vocab
  :param text: The corpus text split into words
  :return: A tuple of dicts (vocab_to_int, int_to_vocab)
  """
  words = list(jieba.cut(text))
  vocab = set(words) if use_split else set(text)
    
  int_to_vocab = {key: word for key, word in enumerate(vocab)}
  vocab_to_int = {word: key for key, word in enumerate(vocab)}
    
  if use_split:
      text_index = [vocab_to_int[word] for word in words]
  else:
      text_index = [vocab_to_int[word] for word in text]
    
  return vocab_to_int, int_to_vocab, text_index


vocab_to_int, int_to_vocab, corpus_int = create_lookup_tables(corpus_raw)
print("Vocabulary size: {}, number of Chinese words in text: {}".format(len(corpus_int), len(vocab_to_int)))

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.838 seconds.
Prefix dict has been built successfully.


Vocabulary size: 2044284, number of Chinese words in text: 38094


# 构造网络

## 参数设置

In [0]:
# Hyperparameters
num_epochs = 400        # 将诛仙从头到尾训练多少遍
batch_size = 512        # 每次训练feed的batch大小
rnn_size = 128         # RNN Cell的Hidden Units的大小
num_layers = 2         # RNN的层数
keep_prob = 0.7         # Dropout保留率
embed_dim = 128         # 词向量的维度，这个要和RNN Hidden Units的大小一致
seq_length = 30         # Sequence的长度
learning_rate = 0.001      # 学习率
save_dir = '/content/drive/My Drive/mine/model/'       # 保存模型的位置

## 网络

In [8]:
# TensorFlow Train Graph
train_graph = tf.Graph()

with train_graph.as_default():
  # Initialize input placeholders
  input_text = tf.placeholder(tf.int32, [None, None], name='input')
  targets = tf.placeholder(tf.int32, [None, None], name='targets')
  lr = tf.placeholder(tf.float32, name='learning_rate')
    
  # Calculate text attributes
  vocab_size = len(int_to_vocab)
  input_text_shape = tf.shape(input_text)
    
  # Build the RNN cell
  lstm = tf.contrib.rnn.BasicLSTMCell(num_units=rnn_size)
  drop_cell = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
  cell = tf.contrib.rnn.MultiRNNCell([drop_cell] * num_layers)
    
  # Set the initial state
  initial_state = cell.zero_state(input_text_shape[0], tf.float32)
  initial_state = tf.identity(initial_state, name='initial_state')
    
  # Create word embedding as input to RNN
  embed = tf.contrib.layers.embed_sequence(input_text, vocab_size, embed_dim)
    
  # Build RNN
  outputs, final_state = tf.nn.dynamic_rnn(cell, embed, dtype=tf.float32)
  final_state = tf.identity(final_state, name='final_state')
    
  # Take RNN output and make logits
  logits = tf.contrib.layers.fully_connected(outputs, vocab_size, activation_fn=None)
    
  # Calculate the probability of generating each word
  probs = tf.nn.softmax(logits, name='probs')
    
  # Define loss function
  cost = tf.contrib.seq2seq.sequence_loss(logits,
                       targets,
                       tf.ones([input_text_shape[0], input_text_shape[1]]))
    
  # Learning rate optimizer
  optimizer = tf.train.AdamOptimizer(learning_rate)
    
  # Gradient clipping to avoid exploding gradients
  gradients = optimizer.compute_gradients(cost)
  capped_gradients = [(tf.clip_by_value(grad, -1., 1.), value) for grad, value in gradients if grad is not None]
  train_op = optimizer.apply_gradients(capped_gradients)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Please use `layer.__call__` method instead.


In [0]:
## create data batch

In [0]:
def get_batches(int_text, batch_size, seq_length):
  """
  Return batches of input and target data
  :param int_text: text with words replaced by their ids
  :param batch_size: the size that each batch of data should be
  :param seq_length: the length of each sequence
  :return: batches of data as a numpy array
  """
  words_per_batch = batch_size * seq_length
  num_batches = len(int_text)//words_per_batch
  int_text = int_text[:num_batches*words_per_batch]
  y = np.array(int_text[1:] + [int_text[0]])
  x = np.array(int_text)
    
  x_batches = np.split(x.reshape(batch_size, -1), num_batches, axis=1)
  y_batches = np.split(y.reshape(batch_size, -1), num_batches, axis=1)
    
  batch_data = list(zip(x_batches, y_batches))
  return np.array(batch_data)

## train network

In [0]:
import time

batches = get_batches(corpus_int, batch_size, seq_length)
num_batches = len(batches)
start_time = time.time()

print("Num Batches per Epoche : {}, Total Epochs : {}".format(num_batches, num_epochs))

with tf.Session(graph=train_graph) as sess:
  sess.run(tf.global_variables_initializer())
  for epoch in range(num_epochs):
    state = sess.run(initial_state, {input_text: batches[0][0]})
    for batch_index, (x, y) in enumerate(batches):
      batch_start_time = time.time()
      feed_dict = {input_text: x,
              targets: y,
              initial_state: state,
              lr: learning_rate}
      train_loss, state, _ = sess.run([cost, final_state, train_op], feed_dict)
      
      if batch_index % 100 == 0:
          time_elapsed   = time.time() - start_time
          time_per_batch = time.time() - batch_start_time
          num_batches_remaining = (num_epochs - epoch) * num_batches + num_batches - batch_index 
          print('Epoch {:>3} Batch {:>4}/{} train_loss = {:.3f} time_per_batch = {:.3f} time_elapsed = {:.3f}   time_remaining = {:.0f}'.format(
                    epoch + 1,
                    batch_index + 1,
                    num_batches,
                    train_loss,
                    time_per_batch,
                    time_elapsed,
                    num_batches_remaining * time_per_batch))
          
          # save model every 100 batches
          saver = tf.train.Saver()
          saver.save(sess, save_dir)
          print('Model Trained and Saved')

Num Batches per Epoche : 133, Total Epochs : 400
Epoch   1 Batch    1/133 train_loss = 10.548 time_per_batch = 2.202 time_elapsed = 10.080   time_remaining = 117431
Model Trained and Saved
Epoch   1 Batch  101/133 train_loss = 6.633 time_per_batch = 0.446 time_elapsed = 54.668   time_remaining = 23730
Model Trained and Saved
Epoch   2 Batch    1/133 train_loss = 6.636 time_per_batch = 0.449 time_elapsed = 69.636   time_remaining = 23893
Model Trained and Saved
Epoch   2 Batch  101/133 train_loss = 6.572 time_per_batch = 0.444 time_elapsed = 114.686   time_remaining = 23550
Model Trained and Saved
Epoch   3 Batch    1/133 train_loss = 6.595 time_per_batch = 0.448 time_elapsed = 129.708   time_remaining = 23784
Model Trained and Saved
Epoch   3 Batch  101/133 train_loss = 6.563 time_per_batch = 0.444 time_elapsed = 174.466   time_remaining = 23510
Model Trained and Saved
Epoch   4 Batch    1/133 train_loss = 6.585 time_per_batch = 0.434 time_elapsed = 189.961   time_remaining = 22955
Mod

## 文本生成

In [0]:
gen_length = 1000
prime_words = '一阵轻风吹过，屋檐下的铃铛迎风而响，绿色的衣角轻轻飘起，仿佛也带着几分笑意；清脆的铃声，随着风儿飘然而上，回荡在天地之间。'

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load the saved model
    loader = tf.train.import_meta_graph(save_dir + '.meta')
    loader.restore(sess, save_dir)
    
    # Get tensors from loaded graph
    input_text = loaded_graph.get_tensor_by_name('input:0')
    initial_state = loaded_graph.get_tensor_by_name('initial_state:0')
    final_state = loaded_graph.get_tensor_by_name('final_state:0')
    probs = loaded_graph.get_tensor_by_name('probs:0')
    
    # Sentences generation setup
    gen_sentences = list(jieba.cut(prime_words)) if USE_SPLIT else prime_words.split()
    prev_state = sess.run(initial_state, {input_text: np.array([[1 for word in gen_sentences]])})
    
    # Generate sentences
    for n in range(gen_length):
        # Dynamic Input
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])

        # Get Prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})
        
        # Get predict word
        word_probs = probabilities[0][dyn_seq_length-1]
        pred_word = pick_word(word_probs, int_to_vocab)

        gen_sentences.append(pred_word)
        
    # Remove tokens
    chapter_text = ''.join(gen_sentences)
        
    print(chapter_text)