In [6]:
import tensorflow as tf
from tensorflow.python.layers.core import Dense

import numpy as np
import time
import tensorflow as tf
import os


In [21]:
#  数据处理

source_path="/Users/zhouwencheng/Desktop/Grass/data/txt/letters/letters_source.txt"
target_path="/Users/zhouwencheng/Desktop/Grass/data/txt/letters/letters_up.txt"

# 打开文件
with open(source_path, 'r') as f:
    source_text=f.read()
    source_texts=source_text.split('\n') # 进行分句后的句子
with open(target_path, 'r') as f:
    target_text=f.read()
    target_texts=target_text.split('\n')

#====== 特殊标记定义 =========#
start_flag='<GO>'
end_flag='<EOS>'
pad_flag='<PAD>'
unk_flag='<UNK>'

start_index=0
end_index=1
pad_index=2
unk_index=3
    
all_text=source_text.replace("\n", "")+target_text.replace("\n", "") # 获取全部文本数据
char_list=sorted(list(set(all_text))) # 获取字符的数组
char_list=[start_flag, end_flag, pad_flag, unk_flag]+char_list # 把特殊标记加入到数组中
index_to_char={idx: char for idx, char in enumerate(char_list)} # 建立 index_to_char字典
char_to_index={char:idx for idx, char in enumerate(char_list)} # 建立char_to_index字典

def text_to_index(texts, char_to_index): # 把数据转化为Index的形式
    texts_indexs=[]
    for item in texts:
        texts_indexs.append([char_to_index.get(char, unk_index) for char in item])
    return texts_indexs

source_indexs=text_to_index(source_texts, char_to_index) # 原句子转化为Index形式
target_indexs=text_to_index(target_texts, char_to_index)   # 目标句子转化为index形式

vocab_len=len(index_to_char) # 字典大小(包含多少个字符)
source_max_len=max([len(item) for item in source_indexs]) # 原数据句子最大长度
target_max_len=max([len(item) for item in target_indexs]) # 目标句子最大长度


def pad_sentence_batch(sentence_batch, pad_int): 
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [pad_int] * (max_sentence - len(sentence)) for sentence in sentence_batch]
def get_batches(batch_size=32):
    for batch_i in range(0, len(source_indexs)//batch_size):
            start_i = batch_i * batch_size
            sources_batch = source_indexs[start_i:start_i + batch_size]
            targets_batch  = target_indexs[start_i:start_i + batch_size]
            
            # 补全序列
            pad_sources_batch = np.array(pad_sentence_batch(sources_batch, pad_index))
            pad_targets_batch = np.array(pad_sentence_batch(targets_batch, pad_index))
            
            # 记录每条记录的长度
            targets_lengths = []
            for target in targets_batch:
                targets_lengths.append(len(target))

            source_lengths = []
            for source in sources_batch:
                source_lengths.append(len(source))
            yield pad_targets_batch, pad_sources_batch, targets_lengths, source_lengths
(valid_targets_batch, valid_sources_batch, valid_targets_lengths, valid_sources_lengths) = next(get_batches()) 

In [24]:
def get_inputs(): 
    inputs = tf.placeholder(tf.int32, (None, None), name='inputs') # 输入原句 (None, None)
    targets = tf.placeholder(tf.int32, (None, None), name='targets') # 目标句子 (None, None)
    learning_rate = tf.placeholder(tf.float32, name='learning_rate') # 学习率 
     
    target_sequence_length = tf.placeholder(tf.int32, (None,), name='target_sequence_length') # 目标数据长度 (None, )
    max_target_sequence_length = tf.reduce_max(target_sequence_length, name='max_target_len') # 最大目标长度
    source_sequence_length = tf.placeholder(tf.int32, (None,), name='source_sequence_length') # 原数据长度-(None,)
    
    return inputs, targets, learning_rate, target_sequence_length, max_target_sequence_length, source_sequence_length

In [25]:
def get_encoder_layer(input_data,     # 输入tensor
                                rnn_size,         # rnn隐层结点数量   
                                num_layers,     # 堆叠的rnn cell数量
                                source_sequence_length, # 源数据的序列长度
                                source_vocab_size,         # 源数据的词典大小
                                encoding_embedding_size): # embedding的大小
    encoder_embed_input = tf.contrib.layers.embed_sequence(input_data, source_vocab_size, encoding_embedding_size)
    def get_lstm_cell(rnn_size):  
        lstm_cell = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return lstm_cell
    cell = tf.contrib.rnn.MultiRNNCell([get_lstm_cell(rnn_size) for _ in range(num_layers)])
    encoder_output, encoder_state = tf.nn.dynamic_rnn(cell, encoder_embed_input, 
                                                      sequence_length=source_sequence_length, dtype=tf.float32)
    return encoder_output, encoder_state

In [28]:
def process_decoder_input(data, vocab_to_int, batch_size):
    ''' 补充<GO>，并移除最后一个字符 '''
    # cut掉最后一个字符
    ending = tf.strided_slice(data, [0, 0], [batch_size, -1], [1, 1])
    decoder_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)
    return decoder_input

In [27]:
def decoding_layer(target_letter_to_int,            #target数据的映射表
                            decoding_embedding_size,  # embed向量大小
                            num_layers,                       # 堆叠的RNN单元数量
                            rnn_size,                           # RNN单元的隐层结点数量
                            target_sequence_length,     # target数据序列长度
                            max_target_sequence_length, # target数据序列最大长度
                            encoder_state,                   # encoder端编码的状态向量
                            decoder_input,                   # decoder端输入
                            encoder_outputs,               # 添加一个注意力机制
                            source_sequence_length,    # 源数据长度
                            ):
     
    # 1. Embedding
    target_vocab_size = len(target_letter_to_int)
    decoder_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    decoder_embed_input = tf.nn.embedding_lookup(decoder_embeddings, decoder_input)

    # 2. 构造Decoder中的RNN单元
    def get_decoder_cell(rnn_size):
        decoder_cell = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return decoder_cell
    
    #2.1 添加注意力机制的RNN 单元
    def get_decoder_cell_attention(rnn_size): 
        attention_states=encoder_outputs
        # Create an attention mechanism
        attention_mechanism = tf.contrib.seq2seq.LuongAttention(rnn_size, attention_states, memory_sequence_length=source_sequence_length)
        decoder_cell = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism, attention_layer_size=rnn_size)
        return decoder_cell 
    
    cell = tf.contrib.rnn.MultiRNNCell([get_decoder_cell_attention(rnn_size) for _ in range(num_layers)])
    
    # 3. Output全连接层
    output_layer = Dense(target_vocab_size,
                         kernel_initializer = tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))

    # 4. Training decoder
    with tf.variable_scope("decode"):
        # 得到help对象
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_embed_input,
                                                            sequence_length=target_sequence_length,
                                                            time_major=False)
        # 构造decoder
        training_decoder = tf.contrib.seq2seq.BasicDecoder(cell,
                                                           training_helper,
                                                           initial_state=cell.zero_state(dtype=tf.float32,batch_size=batch_size)
                                                           ,output_layer=output_layer) 

        training_decoder_output, _ ,_= tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                                       impute_finished=True,
                                                                       maximum_iterations=max_target_sequence_length)
    # 5. Predicting decoder
    # 与training共享参数
    with tf.variable_scope("decode", reuse=True):
        # 创建一个常量tensor并复制为batch_size的大小
        start_tokens = tf.tile(tf.constant([target_letter_to_int['<GO>']], dtype=tf.int32), [batch_size], 
                               name='start_tokens')
        predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embeddings,
                                                                start_tokens,
                                                                target_letter_to_int['<EOS>'])
        predicting_decoder = tf.contrib.seq2seq.BasicDecoder(cell,
                                                        predicting_helper,
                                                        initial_state=cell.zero_state(dtype=tf.float32,batch_size=batch_size)
                                                            ,output_layer=output_layer)
        predicting_decoder_output, _,_  = tf.contrib.seq2seq.dynamic_decode(predicting_decoder,
                                                            impute_finished=True,
                                                            maximum_iterations=max_target_sequence_length)
    
    return training_decoder_output, predicting_decoder_output

In [36]:
def seq2seq_model(input_data, 
                  targets, 
                  lr, 
                  target_sequence_length, 
                  max_target_sequence_length, 
                  source_sequence_length,
                  source_vocab_size,
                  target_vocab_size,
                  encoder_embedding_size, 
                  decoder_embedding_size, 
                  rnn_size, 
                  num_layers): 
    # 获取encoder的状态输出
    encoder_outputs, encoder_state = get_encoder_layer(input_data, 
                                  rnn_size, 
                                  num_layers, 
                                  source_sequence_length,
                                  source_vocab_size, 
                                  encoding_embedding_size) 
    decoder_input = process_decoder_input(targets, char_to_index, batch_size) # 预处理后的decoder输入
    
    # 将状态向量与输入传递给decoder
    training_decoder_output, predicting_decoder_output = decoding_layer(char_to_index, 
                                                                       decoding_embedding_size, 
                                                                       num_layers, 
                                                                       rnn_size,
                                                                       target_sequence_length,
                                                                       max_target_sequence_length,
                                                                       encoder_state, 
                                                                       decoder_input,
                                                                        encoder_outputs,
                                                                        source_sequence_length
                                                                       ) 
    return training_decoder_output, predicting_decoder_output

In [37]:
# 超参数
# Number of Epochs
epochs =50
# Batch Size
batch_size =32
# RNN Size
rnn_size = 50
# Number of Layers
num_layers = 2
# Embedding Size
encoding_embedding_size = 128
decoding_embedding_size = 128
# Learning Rate
learning_rate = 0.05

In [51]:
# 构造graph
train_graph = tf.Graph()

with train_graph.as_default():
    
    # 获得模型输入    
    input_data, targets, lr, target_sequence_length, max_target_sequence_length, source_sequence_length = get_inputs()
    
    training_decoder_output, predicting_decoder_output = seq2seq_model(input_data, 
                                                                      targets, 
                                                                      lr, 
                                                                      target_sequence_length, 
                                                                      max_target_sequence_length, 
                                                                      source_sequence_length,
                                                                      len(char_to_index),
                                                                      len(char_to_index),
                                                                      encoding_embedding_size, 
                                                                      decoding_embedding_size, 
                                                                      rnn_size, 
                                                                      num_layers)    
    
    training_logits = tf.identity(training_decoder_output.rnn_output, 'logits')
    predicting_logits = tf.identity(predicting_decoder_output.sample_id, name='predictions')
    masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks')
    with tf.name_scope("optimization"):
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks)
        # Optimizer
        optimizer = tf.train.AdamOptimizer(lr)
        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

In [55]:
display_step = 50 # 每隔50轮输出loss  
checkpoint = "model/trained_model.ckpt" 
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)
    for epoch_i in range(1, epochs+1):
        for batch_i, (targets_batch, sources_batch, targets_lengths, sources_lengths) in enumerate(
                get_batches()):
            _, loss = sess.run(
                [train_op, cost],
                {input_data: sources_batch,
                 targets: targets_batch,
                 lr: learning_rate,
                 target_sequence_length: targets_lengths,
                 source_sequence_length: sources_lengths}) 
            if batch_i % display_step == 0: 
                # 计算validation loss
                validation_loss = sess.run(
                [cost],
                {input_data: valid_sources_batch,
                 targets: valid_targets_batch,
                 lr: learning_rate,
                 target_sequence_length: valid_targets_lengths,
                 source_sequence_length: valid_sources_lengths})
                
                print('Epoch {:>3}/{} Batch {:>4}/{} - Training Loss: {:>6.3f}  - Validation loss: {:>6.3f}'
                      .format(epoch_i,
                              epochs, 
                              batch_i, 
                              '未知', 
                              loss, 
                              validation_loss[0])) 
                saver = tf.train.Saver()
                saver.save(sess, "model/trained_model.ckpt")
                print('Model Trained and Saved')

Epoch   1/50 Batch    0/未知 - Training Loss:  2.133  - Validation loss:  1.824


ValueError: At least two variables have the same name: EmbedSequence/embeddings/Adam

In [49]:
# 输入一个单词
input_word =  "zxcv"
text = [char_to_index.get(char, unk_index) for char in input_word]

checkpoint = "model/trained_model.ckpt" 

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # 加载模型
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)

    input_data = loaded_graph.get_tensor_by_name('inputs:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    source_sequence_length = loaded_graph.get_tensor_by_name('source_sequence_length:0')
    target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_length:0')
    
    answer_logits = sess.run(logits, {input_data: [text]*batch_size, 
                                      target_sequence_length: [len(input_word)]*batch_size, 
                                      source_sequence_length: [len(input_word)]*batch_size})[0] 


pad = 2
print(input_word)
print('{}'.format(" ".join([index_to_char[i] for i in answer_logits if i != pad])))

zxcv
Z X C C
