In [2]:
import tensorflow as tf
from tensorflow.python.layers.core import Dense

import numpy as np
import time
import tensorflow as tf

In [3]:
import tensorflow as tf
from tensorflow.python.layers.core import Dense

class Seq2Seq(object):
    """LSTM encoder-decoder with Luong attention, built with the TensorFlow 1.x graph API.

    Instantiating the class builds the complete graph in the current default
    graph. In training mode (``train_mode=True``) the instance exposes
    ``loss`` and ``update``; in prediction mode it exposes
    ``predicting_decoder_output`` (greedily decoded token ids). A
    ``tf.train.Saver`` is always created as ``saver``.

    The batch dimension is read from the fed tensors at run time, so a batch
    of any size can be fed regardless of the ``batch_size`` constructor
    argument.
    """

    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 target_start_flag_index=0,
                 target_end_flag_index=1,
                 batch_size=32,
                 encode_embed_dim=128,
                 decode_embed_dim=128,
                 max_pred_len=128,
                 rnn_size=128,
                 num_layers=2,
                 learning_rate=0.001,
                 train_mode=True,
                 ):
        """Store the hyper-parameters and build the graph.

        :param source_vocab_size: int, size of the source vocabulary
        :param target_vocab_size: int, size of the target vocabulary
        :param target_start_flag_index: int, id of the target start-of-sequence token
        :param target_end_flag_index: int, id of the target end-of-sequence token
        :param batch_size: int, stored for backward compatibility; the graph
            itself derives the batch size from the fed inputs
        :param encode_embed_dim: int, encoder embedding dimension
        :param decode_embed_dim: int, decoder embedding dimension
        :param max_pred_len: int, maximum number of decoding steps in prediction mode
        :param rnn_size: int, number of units in each LSTM layer
        :param num_layers: int, number of stacked LSTM layers
        :param learning_rate: float, Adam learning rate
        :param train_mode: bool, build the training graph when True, the
            prediction graph otherwise
        """
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.target_start_flag_index = target_start_flag_index
        self.target_end_flag_index = target_end_flag_index
        self.batch_size = batch_size
        self.encode_embed_dim = encode_embed_dim
        self.decode_embed_dim = decode_embed_dim
        self.rnn_size = rnn_size
        self.num_layers = num_layers
        self.learning_rate = learning_rate
        self.train_mode = train_mode
        self.max_pred_len = max_pred_len

        self.build_model()  # build the graph

    def get_inputs(self):
        """Create the placeholders; target placeholders exist only in training mode."""
        # Source sentences as token ids, shape (None, None).
        self.inputs = tf.placeholder(tf.int32, (None, None), name='inputs')
        # Length of every source sentence, shape (None,).
        self.source_sequence_length = tf.placeholder(tf.int32, (None,), name='source_sequence_length')
        if self.train_mode:
            # Target sentences as token ids, shape (None, None).
            self.targets = tf.placeholder(tf.int32, (None, None), name='targets')
            # Length of every target sentence, shape (None,).
            self.target_sequence_length = tf.placeholder(tf.int32, (None,), name='target_sequence_length')
            # Longest target length in the fed batch.
            self.max_target_sequence_length = tf.reduce_max(self.target_sequence_length, name='max_target_len')

    def get_encoder_layer(self,
                          input_data,
                          source_sequence_length):
        """Build the encoder: embedding followed by a stacked LSTM.

        :param input_data: int32 tensor of source token ids, shape (None, None)
        :param source_sequence_length: int32 tensor of source lengths, shape (None,)
        :return: (encoder_output, encoder_state) as returned by ``tf.nn.dynamic_rnn``
        """
        # Embedded source tokens; the last dimension is encode_embed_dim.
        encoder_embed_input = tf.contrib.layers.embed_sequence(ids=input_data,
                                                               vocab_size=self.source_vocab_size,
                                                               embed_dim=self.encode_embed_dim)

        def get_lstm_cell(rnn_size):
            return tf.contrib.rnn.LSTMCell(num_units=rnn_size,
                                           initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))

        cell = tf.contrib.rnn.MultiRNNCell([get_lstm_cell(self.rnn_size) for _ in range(self.num_layers)])
        encoder_output, encoder_state = tf.nn.dynamic_rnn(cell=cell,
                                                          inputs=encoder_embed_input,
                                                          sequence_length=source_sequence_length,
                                                          dtype=tf.float32)

        # encoder_output: per-step outputs of the top layer (last dim rnn_size).
        # encoder_state: final state of the stacked cell, one entry per layer.
        return encoder_output, encoder_state

    def process_decoder_input(self, data):
        """Turn the target batch into the decoder input (teacher forcing).

        The last column is dropped and a column of start tokens is prepended:
            A B C D <EOS>   -->   <GO> A B C D

        The number of rows is taken from ``data`` itself, so any batch size
        can be fed.

        :param data: int32 tensor of target token ids, shape (batch, time)
        :return: int32 tensor with the same shape as ``data``
        """
        batch_size = tf.shape(data)[0]
        ending = data[:, :-1]  # drop the last column
        go_column = tf.fill(tf.stack([batch_size, 1]), self.target_start_flag_index)
        decoder_input = tf.concat([go_column, ending], 1)
        return decoder_input

    def decoding_layer(self,
                       source_sequence_length,
                       encoder_outputs,
                       encoder_state,
                       decoder_input=None,
                       target_sequence_length=None,
                       max_target_sequence_length=None, ):
        """Build the attention decoder.

        In training mode this sets ``self.training_decoder_output`` (the
        result of ``dynamic_decode`` driven by a ``TrainingHelper``); in
        prediction mode it sets ``self.predicting_decoder_output`` to the
        greedily decoded ``sample_id`` tensor.

        :param source_sequence_length: source lengths, used to mask the attention memory
        :param encoder_outputs: encoder outputs, used as the attention memory
        :param encoder_state: final encoder state, used to initialise the decoder cell
        :param decoder_input: decoder input token ids (training mode only)
        :param target_sequence_length: target lengths (training mode only)
        :param max_target_sequence_length: decoding step limit (training mode only)
        """
        # NOTE: left unnamed on purpose so the variable name stored in
        # existing checkpoints does not change.
        decoder_embeddings = tf.Variable(tf.random_uniform([self.target_vocab_size, self.decode_embed_dim]))

        def get_decoder_cell(rnn_size):
            decoder_cell = tf.contrib.rnn.LSTMCell(num_units=rnn_size,
                                                   initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            return decoder_cell

        cell = tf.contrib.rnn.MultiRNNCell([get_decoder_cell(self.rnn_size) for _ in range(self.num_layers)])

        # Attention over the encoder outputs.
        attention_mechanism = tf.contrib.seq2seq.LuongAttention(num_units=self.rnn_size,
                                                                memory=encoder_outputs,
                                                                memory_sequence_length=source_sequence_length)
        cell = tf.contrib.seq2seq.AttentionWrapper(cell=cell,
                                                   attention_mechanism=attention_mechanism,
                                                   attention_layer_size=self.rnn_size)

        # Batch size comes from the fed data instead of the constructor
        # argument, so the graph is not tied to a single batch size.
        batch_size = tf.shape(encoder_outputs)[0]
        decoder_initial_state = cell.zero_state(batch_size=batch_size,
                                                dtype=tf.float32).clone(cell_state=encoder_state)

        # Projection from decoder output to vocabulary logits.
        output_layer = Dense(units=self.target_vocab_size,
                             kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))

        with tf.variable_scope("decode"):
            if self.train_mode:
                # Embed the (shifted) target tokens.
                decoder_embed_input = tf.nn.embedding_lookup(params=decoder_embeddings,
                                                             ids=decoder_input)
                # The helper feeds the ground-truth tokens at every step.
                training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_embed_input,
                                                                    sequence_length=target_sequence_length,
                                                                    time_major=False)
                training_decoder = tf.contrib.seq2seq.BasicDecoder(cell=cell,
                                                                   helper=training_helper,
                                                                   initial_state=decoder_initial_state,
                                                                   output_layer=output_layer)
                training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=training_decoder,
                                                                                  impute_finished=True,
                                                                                  maximum_iterations=max_target_sequence_length)
                self.training_decoder_output = training_decoder_output
            else:
                # One start token per row of the fed batch.
                start_tokens = tf.tile(tf.constant([self.target_start_flag_index], dtype=tf.int32),
                                       tf.stack([batch_size]),
                                       name='start_tokens')
                # The helper feeds back the arg-max token of the previous step.
                predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding=decoder_embeddings,
                                                                             start_tokens=start_tokens,
                                                                             end_token=self.target_end_flag_index)
                predicting_decoder = tf.contrib.seq2seq.BasicDecoder(cell,
                                                                     predicting_helper,
                                                                     initial_state=decoder_initial_state,
                                                                     output_layer=output_layer)
                predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=predicting_decoder,
                                                                                    impute_finished=True,
                                                                                    maximum_iterations=self.max_pred_len)
                # Only the decoded token ids are exposed.
                self.predicting_decoder_output = predicting_decoder_output.sample_id

    def build_model(self):
        """Assemble placeholders, encoder, decoder and (in training mode) loss and optimizer."""
        self.get_inputs()
        encoder_outputs, encoder_state = self.get_encoder_layer(input_data=self.inputs,
                                                                source_sequence_length=self.source_sequence_length)

        if self.train_mode:
            decoder_input = self.process_decoder_input(self.targets)  # shifted targets
            self.decoding_layer(
                source_sequence_length=self.source_sequence_length,
                encoder_outputs=encoder_outputs,
                encoder_state=encoder_state,
                target_sequence_length=self.target_sequence_length,
                max_target_sequence_length=self.max_target_sequence_length,
                decoder_input=decoder_input, )
            # Weight 1.0 on real target positions, 0.0 on padding.
            self.masks = tf.sequence_mask(lengths=self.target_sequence_length,
                                          maxlen=self.max_target_sequence_length,
                                          dtype=tf.float32, name='masks')
            self.loss = tf.contrib.seq2seq.sequence_loss(logits=self.training_decoder_output.rnn_output,
                                                         targets=self.targets,
                                                         weights=self.masks,
                                                         )
            self.opt = tf.train.AdamOptimizer(self.learning_rate)
            gradients = self.opt.compute_gradients(self.loss)
            # Clip every gradient element to [-5, 5]; variables without a gradient are skipped.
            capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
            self.update = self.opt.apply_gradients(capped_gradients)
        else:
            self.decoding_layer(
                source_sequence_length=self.source_sequence_length,
                encoder_outputs=encoder_outputs,
                encoder_state=encoder_state)
        self.saver = tf.train.Saver()

    def train(self,
              sess,
              encoder_inputs,
              encoder_inputs_length,
              decoder_inputs,
              decoder_inputs_length):
        """Run one optimisation step and return the batch loss.

        :param sess: tensorflow Session
        :param encoder_inputs: 2-D array of padded source token ids
        :param encoder_inputs_length: 1-D array of source lengths
        :param decoder_inputs: 2-D array of padded target token ids
        :param decoder_inputs_length: 1-D array of target lengths
        :return: loss value fetched from the session
        """
        input_feed = {
            self.inputs.name: encoder_inputs,
            self.source_sequence_length.name: encoder_inputs_length,
            self.targets.name: decoder_inputs,
            self.target_sequence_length.name: decoder_inputs_length
        }

        output_feed = [
            self.update,
            self.loss]
        _, loss = sess.run(output_feed, input_feed)
        return loss

    def predict(self, sess, encoder_inputs, encoder_inputs_length):
        """Greedily decode a batch of source sentences.

        :param sess: tensorflow Session
        :param encoder_inputs: 2-D array of padded source token ids
        :param encoder_inputs_length: 1-D array of source lengths

        :return: 2-D array of predicted token ids, one row per input; decoding
            runs for at most ``max_pred_len`` steps
        """
        input_feed = {
            self.inputs.name: encoder_inputs,
            self.source_sequence_length.name: encoder_inputs_length
        }
        pred = sess.run(self.predicting_decoder_output, input_feed)
        return pred

    def save(self, sess, save_path):
        """Save the model variables.

        :param sess: tensorflow Session
        :param save_path: checkpoint path prefix to write to
        :return: None
        """
        self.saver.save(sess, save_path=save_path)

    def load(self, sess, save_path):
        """Restore the model variables.

        :param sess: tensorflow Session
        :param save_path: checkpoint path prefix to read from
        :return: None
        """
        self.saver.restore(sess, save_path)

# Marker output: shows that the cell defining Seq2Seq ran to completion.
print("OK")



OK


In [4]:
import numpy as np

# Data preparation: read the parallel corpus and build a shared character vocabulary.

source_path="/Users/zhouwencheng/Desktop/Grass/data/txt/letters/letters_source.txt"
target_path="/Users/zhouwencheng/Desktop/Grass/data/txt/letters/letters_up.txt"

# Read both files with an explicit encoding so the result does not depend on
# the platform's default locale. Each line is one sentence.
with open(source_path, 'r', encoding='utf-8') as f:
    source_text=f.read()
    source_texts=source_text.split('\n') # source sentences, one per line
with open(target_path, 'r', encoding='utf-8') as f:
    target_text=f.read()
    target_texts=target_text.split('\n') # target sentences, one per line

#====== special tokens =========#
start_flag='<GO>'
end_flag='<EOS>'
pad_flag='<PAD>'
unk_flag='<UNK>'

# These ids must match the position of each special token at the head of
# char_list below.
start_index=0
end_index=1
pad_index=2
unk_index=3

all_text=source_text.replace("\n", "")+target_text.replace("\n", "") # all characters of both files, newlines removed
char_list=sorted(list(set(all_text))) # distinct characters in a deterministic order
char_list=[start_flag, end_flag, pad_flag, unk_flag]+char_list # special tokens occupy ids 0-3
index_to_char={idx: char for idx, char in enumerate(char_list)} # id -> character
char_to_index={char:idx for idx, char in enumerate(char_list)} # character -> id

def text_to_index(texts, char_to_index):
    """Convert each text in ``texts`` into a list of character ids.

    Characters that are missing from ``char_to_index`` are mapped to the
    module-level ``unk_index``.
    """
    return [
        [char_to_index.get(symbol, unk_index) for symbol in text]
        for text in texts
    ]

vocab_len = len(index_to_char)  # vocabulary size, special tokens included

# Encode both sides of the corpus as lists of character ids.
source_indexs = text_to_index(source_texts, char_to_index)
target_indexs = text_to_index(target_texts, char_to_index)

# Longest sentence on each side, counted in characters.
source_max_len = max(len(encoded) for encoded in source_indexs)
target_max_len = max(len(encoded) for encoded in target_indexs)


def pad_sentence_batch(sentence_batch, pad_int):
    """Append the end token to every sentence and pad the batch to equal length.

    Each returned row is ``sentence + [end_index] + padding`` where
    ``end_index`` is the module-level end-token id, so every row has length
    ``longest sentence + 1``. An empty batch yields an empty list.

    :param sentence_batch: list of lists of token ids
    :param pad_int: id used for padding
    :return: new list of padded lists; the inputs are not modified
    """
    # default=0 keeps an empty batch from raising inside max().
    max_sentence = max((len(sentence) for sentence in sentence_batch), default=0)
    return [sentence + [end_index] + [pad_int] * (max_sentence - len(sentence)) for sentence in sentence_batch]
def get_batches(batch_size=32):
    """Yield consecutive full mini-batches from the encoded corpus.

    Reads the module-level ``source_indexs`` / ``target_indexs``. A trailing
    partial batch is not produced. Each item is
    ``(padded_targets, padded_sources, target_lengths, source_lengths)``,
    where the padded batches are numpy arrays and every length counts the
    appended end token.
    """
    num_batches = len(source_indexs) // batch_size
    for batch_i in range(num_batches):
        lo = batch_i * batch_size
        hi = lo + batch_size
        sources_batch = source_indexs[lo:hi]
        targets_batch = target_indexs[lo:hi]

        # Pad every sequence of the batch to a common length.
        pad_sources_batch = np.array(pad_sentence_batch(sources_batch, pad_index))
        pad_targets_batch = np.array(pad_sentence_batch(targets_batch, pad_index))

        # True lengths, +1 for the end token added while padding.
        targets_lengths = [len(target) + 1 for target in targets_batch]
        source_lengths = [len(source) + 1 for source in sources_batch]

        yield pad_targets_batch, pad_sources_batch, targets_lengths, source_lengths
# Pull the first mini-batch out of a fresh generator.
valid_targets_batch, valid_sources_batch, valid_targets_lengths, valid_sources_lengths = next(
    get_batches()
)

print("OK")

OK


In [6]:
# Build a fresh training graph.
tf.reset_default_graph()
seq2seq = Seq2Seq(source_vocab_size=vocab_len, target_vocab_size=vocab_len)

model_save_path = "/Users/zhouwencheng/Desktop/Grass/data/model/101seq2seqModel/203_s2s_at"
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Resume from the latest checkpoint when one exists.
    checkpoint = tf.train.latest_checkpoint(model_save_path)
    if checkpoint:
        seq2seq.load(sess, checkpoint)

    for index in range(1, 5):
        for targets_batch, sources_batch, targets_lengths, sources_lengths in get_batches():
            loss = seq2seq.train(sess,
                                 encoder_inputs=sources_batch,
                                 encoder_inputs_length=sources_lengths,
                                 decoder_inputs=targets_batch,
                                 decoder_inputs_length=targets_lengths)
        # The printed loss is the one of the last batch of the epoch.
        print("第几个周期:", index, "loss:", loss)
        # Write a checkpoint after every epoch.
        seq2seq.save(sess, model_save_path+"/tf_s2s_at_203.ckpt")

print("OK")

W0814 00:52:48.345741 4321588096 deprecation.py:323] From /Users/zhouwencheng/Desktop/Grass/02Study/02PythonEnv/envpy3.7/lib/python3.7/site-packages/tensorflow/python/training/saver.py:1276: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with this prefix.


第几个周期: 1 loss: 0.019352896
第几个周期: 2 loss: 0.059628222
第几个周期: 3 loss: 0.0008716241
第几个周期: 4 loss: 0.0004938206
OK


In [7]:
# Helper functions for prediction

def pred_text_to_ids(texts, char_to_index):
    """Encode and pad raw texts for prediction.

    Every text is mapped to character ids (unknown characters become the
    ``<UNK>`` id 3), terminated with the end id 1 and right-padded with the
    pad id 2 to the length of the longest text plus one.

    :param texts: non-empty list of strings
    :param char_to_index: dict mapping a character to its id
    :return: ``(inputs_pad, lengs, max_input_len)`` - the padded id lists, the
        length of every text including its end token, and the largest of
        those lengths
    :raises ValueError: if ``texts`` is empty
    """
    unk_index=3
    pad_int=2
    end_index=1
    texts_indexs=[[char_to_index.get(char, unk_index) for char in item] for item in texts]
    if not texts_indexs:
        raise ValueError("texts must contain at least one string")
    max_sentence = max(len(sentence) for sentence in texts_indexs)
    inputs_pad= [sentence + [end_index] + [pad_int] * (max_sentence - len(sentence)) for sentence in texts_indexs]
    lengs=[len(sentence)+1 for sentence in texts_indexs]
    # Derived from the `texts` argument itself, not from a notebook global.
    max_input_len=max_sentence+1
    return inputs_pad, lengs, max_input_len
    
def index_to_text(ids, index_to_char):
    """Decode rows of token ids back into strings.

    Each row is read up to (not including) the first end id 1; ids missing
    from ``index_to_char`` are rendered as ``'<UNK>'``.

    :param ids: iterable of iterables of token ids
    :param index_to_char: dict mapping an id to its character
    :return: list with one decoded string per row
    """
    eos_id = 1
    decoded = []
    for row in ids:
        pieces = []
        for token_id in row:
            if token_id == eos_id:
                break
            pieces.append(index_to_char.get(token_id, '<UNK>'))
        decoded.append("".join(pieces))
    return decoded

In [8]:
# Prediction
text_tests=["lkjgdaa", "af"]
inputs_pad, lengs, max_input_len = pred_text_to_ids(text_tests, char_to_index)
bath_size=len(text_tests)
model_save_path="/Users/zhouwencheng/Desktop/Grass/data/model/101seq2seqModel/203_s2s_at"

# Build a fresh prediction graph sized for this batch.
tf.reset_default_graph()
seq2seq=Seq2Seq(source_vocab_size=vocab_len, target_vocab_size=vocab_len,
                batch_size=bath_size, max_pred_len = max_input_len,train_mode=False)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    checkpoint = tf.train.latest_checkpoint(model_save_path)
    if checkpoint is None:
        # Without trained weights the model would decode from random
        # initial values, so fail loudly instead.
        raise FileNotFoundError("no checkpoint found in " + model_save_path)
    seq2seq.load(sess, checkpoint)
    pred=seq2seq.predict(sess, inputs_pad, lengs)

pred_text=index_to_text(pred, index_to_char)
print(text_tests)
print(pred_text)

['lkjgdaa', 'af']
['LKJGDAA', 'AF']


In [9]:
# Prediction on a longer input, with the preprocessing spelled out step by step.
text_tests=["lkjgdaaaaaa", "af"]
text_indexs=text_to_index(text_tests, char_to_index)
inputs=pad_sentence_batch(text_indexs, pad_index)
lengs=[len(item)+1 for item in text_tests]  # +1 for the appended end token
max_input_len=max([len(item) for item in text_tests])+1
bath_size=len(text_tests)

model_save_path="/Users/zhouwencheng/Desktop/Grass/data/model/101seq2seqModel/203_s2s_at"
tf.reset_default_graph()
seq2seq=Seq2Seq(source_vocab_size=vocab_len, target_vocab_size=vocab_len, batch_size=bath_size,
                max_pred_len = max_input_len,train_mode=False)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    checkpoint = tf.train.latest_checkpoint(model_save_path)
    if checkpoint is None:
        # Without trained weights the model would decode from random
        # initial values, so fail loudly instead.
        raise FileNotFoundError("no checkpoint found in " + model_save_path)
    seq2seq.load(sess, checkpoint)
    pred=seq2seq.predict(sess, inputs, lengs)

print(text_tests)
# index_to_text cuts each row at the first end token and still returns rows
# that never produced one, so every prediction gets printed.
for text in index_to_text(pred, index_to_char):
    print(text)

['lkjgdaaaaaa', 'af']
LKJGDAAAA
AF
