In [2]:
import tensorflow as tf
from tensorflow.python.layers.core import Dense

import numpy as np
import time
import tensorflow as tf

In [21]:
class Seq2Seq(object):
    """Sequence-to-sequence model with attention, built on TensorFlow 1.x ``tf.contrib`` APIs.

    Creating an instance immediately builds the whole graph: placeholders,
    encoder, attention decoder (training and greedy-prediction variants that
    share weights), masked sequence loss and a clipped Adam update op.

    The batch size is read from the fed tensors at run time, so batches of any
    size can be fed. ``batch_size`` is still accepted and stored for backward
    compatibility, but it is no longer baked into the graph.
    """

    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 target_start_flag_index=0,
                 target_end_flag_index=1,
                 target_pad_flag_index=2,
                 batch_size=32,
                 encode_embed_dim=128,
                 decode_embed_dim=128,
                 rnn_size=128,
                 num_layers=2,
                 trian_mode=True,
                ):
        """Store the hyper-parameters and build the graph.

        Args:
            source_vocab_size: number of rows of the encoder embedding.
            target_vocab_size: number of rows of the decoder embedding and
                width of the output projection.
            target_start_flag_index: id prepended to every decoder input and
                used as the first token during greedy decoding.
            target_end_flag_index: id that stops greedy decoding.
            target_pad_flag_index: stored only; not read by this class.
            batch_size: stored only (see class docstring).
            encode_embed_dim: encoder embedding width.
            decode_embed_dim: decoder embedding width.
            rnn_size: LSTM units per layer (also the attention depth).
            num_layers: number of stacked LSTM layers in encoder and decoder.
            trian_mode: stored only; not read by this class. The misspelled
                name is kept so existing callers keep working.
        """
        self.source_vocab_size=source_vocab_size
        self.target_vocab_size=target_vocab_size
        self.target_start_flag_index=target_start_flag_index
        self.target_end_flag_index=target_end_flag_index
        self.target_pad_flag_index=target_pad_flag_index
        self.batch_size=batch_size
        self.encode_embed_dim=encode_embed_dim
        self.decode_embed_dim=decode_embed_dim
        self.rnn_size=rnn_size
        self.num_layers=num_layers
        self.trian_mode=trian_mode

        self.build_model()

    def get_inputs(self):
        """Create the placeholders fed at every ``session.run`` call."""
        self.inputs = tf.placeholder(tf.int32, (None, None), name='inputs')  # source token ids, shape (None, None)
        self.targets = tf.placeholder(tf.int32, (None, None), name='targets')  # target token ids, shape (None, None)
        self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')  # optimizer learning rate
        self.source_sequence_length = tf.placeholder(tf.int32, (None,), name='source_sequence_length')  # source lengths, shape (None,)
        self.target_sequence_length = tf.placeholder(tf.int32, (None,), name='target_sequence_length')  # target lengths, shape (None,)
        # Longest target in the fed batch; bounds decoding and the loss mask.
        self.max_target_sequence_length = tf.reduce_max(self.target_sequence_length, name='max_target_len')

    def get_encoder_layer(self,
                          input_data,
                          source_sequence_length):
        """Embed the source ids and run them through a stacked LSTM.

        Args:
            input_data: source token-id tensor.
            source_sequence_length: length of each source sequence.

        Returns:
            ``(encoder_output, encoder_state)`` as returned by ``tf.nn.dynamic_rnn``.
        """
        encoder_embed_input = tf.contrib.layers.embed_sequence(input_data, self.source_vocab_size, self.encode_embed_dim)

        def get_lstm_cell(rnn_size):
            lstm_cell = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            return lstm_cell

        cell = tf.contrib.rnn.MultiRNNCell([get_lstm_cell(self.rnn_size) for _ in range(self.num_layers)])
        encoder_output, encoder_state = tf.nn.dynamic_rnn(cell, encoder_embed_input, sequence_length=source_sequence_length, dtype=tf.float32)
        return encoder_output, encoder_state

    def process_decoder_input(self, data):
        """Prepend the start flag to every row and drop each row's last column.

        Args:
            data: target token-id tensor.

        Returns:
            Tensor with the same number of columns as ``data``.
        """
        # Read the batch size from the tensor itself so that any number of rows
        # can be fed (previously this was fixed to self.batch_size).
        batch_size = tf.shape(data)[0]
        ending = data[:, :-1]  # every row without its last column
        go_column = tf.fill(tf.stack([batch_size, 1]), self.target_start_flag_index)
        decoder_input = tf.concat([go_column, ending], 1)
        return decoder_input

    def decoding_layer(self,
                       source_sequence_length,
                       target_sequence_length,
                       max_target_sequence_length,
                       encoder_state,
                       decoder_input,
                       encoder_outputs):
        """Build the attention decoder in training and greedy-prediction form.

        Args:
            source_sequence_length: length of each source sequence; masks the
                attention memory and provides the run-time batch size.
            target_sequence_length: length of each target sequence.
            max_target_sequence_length: upper bound on decoding steps.
            encoder_state: accepted for interface compatibility but not used;
                the decoder starts from the cells' zero state and sees the
                encoder only through attention.
            decoder_input: target ids already shifted by ``process_decoder_input``.
            encoder_outputs: encoder outputs used as attention memory.

        Returns:
            ``(training_decoder_output, predicting_decoder_output)``, the first
            elements returned by ``tf.contrib.seq2seq.dynamic_decode``.
        """
        # Run-time batch size; replaces the former hard-coded self.batch_size.
        batch_size = tf.size(source_sequence_length)

        # 1. Embedding
        decoder_embeddings = tf.Variable(tf.random_uniform([self.target_vocab_size, self.decode_embed_dim]))
        decoder_embed_input = tf.nn.embedding_lookup(decoder_embeddings, decoder_input)

        # 2. Decoder RNN cell wrapped with Luong attention over the encoder outputs
        def get_decoder_cell_attention(rnn_size):
            attention_states=encoder_outputs
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(rnn_size, attention_states, memory_sequence_length=source_sequence_length)
            decoder_cell = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism, attention_layer_size=rnn_size)
            return decoder_cell
        cell = tf.contrib.rnn.MultiRNNCell([get_decoder_cell_attention(self.rnn_size) for _ in range(self.num_layers)])

        # 3. Output projection to the target vocabulary
        output_layer = Dense(self.target_vocab_size, kernel_initializer = tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))

        # 4. Training decoder
        with tf.variable_scope("decode"):
            # The helper feeds the ground-truth embeddings at every step.
            training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_embed_input,
                                                                sequence_length=target_sequence_length,
                                                                time_major=False)
            training_decoder = tf.contrib.seq2seq.BasicDecoder(cell,
                                                               training_helper,
                                                               initial_state=cell.zero_state(dtype=tf.float32, batch_size=batch_size),
                                                               output_layer=output_layer)

            training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                                              impute_finished=True,
                                                                              maximum_iterations=max_target_sequence_length)
        # 5. Predicting decoder: shares its parameters with the training decoder
        with tf.variable_scope("decode", reuse=True):
            # One start token per row of the batch.
            start_tokens = tf.tile(tf.constant([self.target_start_flag_index], dtype=tf.int32), tf.stack([batch_size]),
                                   name='start_tokens')
            predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embeddings,
                                                                         start_tokens,
                                                                         self.target_end_flag_index)
            predicting_decoder = tf.contrib.seq2seq.BasicDecoder(cell,
                                                                 predicting_helper,
                                                                 initial_state=cell.zero_state(dtype=tf.float32, batch_size=batch_size),
                                                                 output_layer=output_layer)
            predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(predicting_decoder,
                                                                                impute_finished=True,
                                                                                maximum_iterations=max_target_sequence_length)
        return training_decoder_output, predicting_decoder_output

    def build_model(self):
        """Assemble placeholders, encoder, decoder, loss and the update op."""
        self.get_inputs()
        encoder_outputs, encoder_state = self.get_encoder_layer(input_data=self.inputs,
                                                                source_sequence_length=self.source_sequence_length)
        decoder_input = self.process_decoder_input(self.targets)  # start-flag-shifted decoder input
        self.training_decoder_output, self.predicting_decoder_output=self.decoding_layer(
            source_sequence_length=self.source_sequence_length,
            target_sequence_length=self.target_sequence_length,
            encoder_state=encoder_state,
            max_target_sequence_length=self.max_target_sequence_length,
            decoder_input=decoder_input,
            encoder_outputs=encoder_outputs)
        # Weight 1.0 for positions inside each target's length, 0.0 for the rest.
        self.masks = tf.sequence_mask(self.target_sequence_length, self.max_target_sequence_length, dtype=tf.float32, name='masks')
        self.loss = tf.contrib.seq2seq.sequence_loss(logits=self.training_decoder_output.rnn_output,
                                                     targets=self.targets,
                                                     weights=self.masks,
                                                     )
        self.opt = tf.train.AdamOptimizer(self.learning_rate)
        gradients = self.opt.compute_gradients(self.loss)
        # Clip every gradient value to [-5, 5]; variables without a gradient are skipped.
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        self.update = self.opt.apply_gradients(capped_gradients)

    def train(self,
              sess,
              encoder_inputs,
              encoder_inputs_length,
              decoder_inputs,
              decoder_inputs_length,
              learn_rate):
        """Run one optimization step and return the loss.

        Args:
            sess: session whose ``run(fetches, feed_dict)`` executes the step.
            encoder_inputs: value fed to the ``inputs`` placeholder.
            encoder_inputs_length: value fed to ``source_sequence_length``.
            decoder_inputs: value fed to the ``targets`` placeholder.
            decoder_inputs_length: value fed to ``target_sequence_length``.
            learn_rate: value fed to the ``learning_rate`` placeholder.

        Returns:
            The loss value returned by ``sess.run`` for this batch.
        """
        input_feed={
            self.inputs.name:encoder_inputs,
            self.source_sequence_length.name:encoder_inputs_length,
            self.targets.name:decoder_inputs,
            self.target_sequence_length.name:decoder_inputs_length,
            self.learning_rate.name:learn_rate
        }

        output_feed = [
            self.update,
            self.loss ]
        _, loss = sess.run(output_feed, input_feed)
        return loss

In [22]:
# Data preparation: read the parallel source/target corpora and build a
# shared character vocabulary.

# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
source_path="/Users/zhouwencheng/Desktop/Grass/data/txt/letters/letters_source.txt"
target_path="/Users/zhouwencheng/Desktop/Grass/data/txt/letters/letters_up.txt"

# Read the files with an explicit encoding so the result does not depend on
# the platform's default locale encoding.
with open(source_path, 'r', encoding='utf-8') as f:
    source_text=f.read()
    source_texts=source_text.split('\n') # one entry per line of the source file
with open(target_path, 'r', encoding='utf-8') as f:
    target_text=f.read()
    target_texts=target_text.split('\n') # one entry per line of the target file

#====== Special tokens =========#
start_flag='<GO>'
end_flag='<EOS>'
pad_flag='<PAD>'
unk_flag='<UNK>'

# These indices must match the position of each special token at the front of char_list.
start_index=0
end_index=1
pad_index=2
unk_index=3

all_text=source_text.replace("\n", "")+target_text.replace("\n", "") # all characters of both corpora, newlines removed
char_list=sorted(list(set(all_text))) # sorted list of the distinct characters
char_list=[start_flag, end_flag, pad_flag, unk_flag]+char_list # special tokens occupy indices 0-3
index_to_char={idx: char for idx, char in enumerate(char_list)} # index -> character lookup
char_to_index={char:idx for idx, char in enumerate(char_list)} # character -> index lookup

def text_to_index(texts, char_to_index):
    """Convert every text into the list of its characters' indices.

    Characters that are missing from ``char_to_index`` are mapped to the
    module-level ``unk_index``.

    Args:
        texts: iterable of strings.
        char_to_index: mapping from character to integer index.

    Returns:
        A list with one list of indices per input text, in the same order.
    """
    def encode(text):
        return [char_to_index.get(symbol, unk_index) for symbol in text]

    return list(map(encode, texts))

# Encode both corpora as lists of vocabulary indices.
source_indexs = text_to_index(source_texts, char_to_index)
target_indexs = text_to_index(target_texts, char_to_index)

# Vocabulary size (number of entries in the index -> character table).
vocab_len = len(index_to_char)
# Length of the longest encoded sentence on each side.
source_max_len = max(map(len, source_indexs))
target_max_len = max(map(len, target_indexs))


def pad_sentence_batch(sentence_batch, pad_int):
    """Right-pad every sentence to the length of the longest one in the batch.

    Args:
        sentence_batch: iterable of sequences (lists, tuples, ...) of token ids.
        pad_int: value appended to the shorter sentences.

    Returns:
        A new list of lists, all of equal length. The inputs are not modified.
        An empty batch yields an empty list.
    """
    # Materialize once so that any iterable of sequences can be accepted.
    sentences = [list(sentence) for sentence in sentence_batch]
    # default=0 keeps an empty batch from raising ValueError in max().
    max_sentence = max((len(sentence) for sentence in sentences), default=0)
    return [sentence + [pad_int] * (max_sentence - len(sentence)) for sentence in sentences]
def get_batches(batch_size=32):
    """Yield padded training batches from the module-level encoded corpora.

    Only full batches are produced: trailing sentences that do not fill a
    whole batch are skipped.

    Args:
        batch_size: number of sentence pairs per batch.

    Yields:
        ``(pad_targets_batch, pad_sources_batch, targets_lengths, source_lengths)``:
        two numpy arrays padded with ``pad_index`` and the unpadded lengths of
        the sentences in each array.
    """
    num_full_batches = len(source_indexs) // batch_size
    for batch_i in range(num_full_batches):
        window = slice(batch_i * batch_size, batch_i * batch_size + batch_size)
        sources_batch = source_indexs[window]
        targets_batch = target_indexs[window]

        # Pad every sentence to the longest one in its batch.
        pad_sources_batch = np.array(pad_sentence_batch(sources_batch, pad_index))
        pad_targets_batch = np.array(pad_sentence_batch(targets_batch, pad_index))

        # Original (unpadded) length of every sentence.
        targets_lengths = [len(target) for target in targets_batch]
        source_lengths = [len(source) for source in sources_batch]

        yield pad_targets_batch, pad_sources_batch, targets_lengths, source_lengths
# Take the first batch from the generator and keep it as a validation batch.
first_batch = next(get_batches())
valid_targets_batch, valid_sources_batch, valid_targets_lengths, valid_sources_lengths = first_batch

In [23]:
# Start from an empty default graph so that re-running this cell does not
# pile up duplicate variables.
tf.reset_default_graph()
# Size both embeddings from the vocabulary computed above rather than a
# hard-coded number, and pass the special-token indices explicitly so the
# model and the data pipeline cannot drift apart.
seq2seq=Seq2Seq(source_vocab_size=vocab_len,
                target_vocab_size=vocab_len,
                target_start_flag_index=start_index,
                target_end_flag_index=end_index,
                target_pad_flag_index=pad_index)

In [25]:
# Train for nine passes over the data; after each pass print the loss of the
# last batch that was processed.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for index in range(1, 10):
        for batch_i, batch in enumerate(get_batches()):
            targets_batch, sources_batch, targets_lengths, sources_lengths = batch
            loss = seq2seq.train(
                sess,
                encoder_inputs=sources_batch,
                encoder_inputs_length=sources_lengths,
                decoder_inputs=targets_batch,
                decoder_inputs_length=targets_lengths,
                learn_rate=0.001,
            )
        print("loss:", loss)

loss: 0.3533422
loss: 0.030368052
loss: 0.0041936245
loss: 0.0029274183
loss: 0.0039266176
loss: 0.0009129228
loss: 0.00044267913
loss: 0.00027995108
loss: 0.00019522585
