In [1]:
%config ZMQInteractiveShell.ast_node_interactivity='all'

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras import activations
from tensorflow.keras.layers import Layer, Input, Embedding, LSTM, Dense, Attention
from tensorflow.keras.models import Model

In [2]:
class Config(object):
    raw_source_data =  "./data/ch_source_data_seg.txt"
    raw_target_data = "data/ch_target_data_seg.txt"
    vocab_file = "./data/ch_word_vocab.txt"
    num_sample = 10000
    
    maxlen = 10
    embedding_dim = 50
    hidden_units = 64
    
    epochs = 5
    batch_size = 32
    val_rate = 0.2
    
    model_file = "./output/seq2seq_attention_weights.h5"
    
config = Config()

## 加载数据

In [3]:
def read_data(data_path):
    datas = []
    with open(data_path, "r", encoding="utf8") as f:
        for line in f:
            words = line.strip().split()
            datas.append(words)
    return datas

source_data = read_data(config.raw_source_data)[:config.num_sample]
target_data = read_data(config.raw_target_data)[:config.num_sample]
source_data[10]
target_data[10]

['许兵', '是', '谁']

['是', '我', '善良', '可爱', '的', '主人', '的', '老公', '啊']

## 加载vocab

In [4]:
def read_vocab(vocab_file):
    vocab_words = []
    with open(vocab_file, "r", encoding="utf8") as f:
        for line in f:
            vocab_words.append(line.strip())
    return vocab_words

vocab_words = read_vocab(config.vocab_file)
special_words = ["<PAD>", "<UNK>", "<GO>", "<EOS>"]
vocab_words = special_words + vocab_words

vocab2id = {word: i for i, word in enumerate(vocab_words)}
id2vocab = {i: word for i, word in enumerate(vocab_words)}
print("vocab test: ", [id2vocab[i] for i in range(10)])

vocab test:  ['<PAD>', '<UNK>', '<GO>', '<EOS>', '呵呵', '不是', '怎么', '了', '开心', '点']


## 数值化

In [5]:
def process_data_index(datas, vocab2id):
    data_indexs = []
    for words in datas:
        line_index = [vocab2id[w] if w in vocab2id else vocab2id["<UNK>"] for w in words]
        data_indexs.append(line_index)
    return data_indexs

source_data_ids = process_data_index(source_data, vocab2id)
target_data_ids = process_data_index(target_data, vocab2id)
source_data_ids[10]
target_data_ids[10]

[26, 27, 24]

[27, 16, 9572, 436, 45, 452, 45, 274, 111]

## 构造模型

In [6]:
# encoder: 一个Embedding层，加上LSTM层
class Encoder(keras.Model):
    def __init__(self, vocab_size, embedding_dim, hidden_units):
        super(Encoder, self).__init__()
        self.embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)
        self.encoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True, name="encode_lstm")
        
    def call(self, inputs):
        encoder_embed = self.embedding(inputs)
        encoder_outputs, state_h, state_c = self.encoder_lstm(encoder_embed)
        return encoder_outputs, state_h, state_c

In [7]:
# decoder: 有三部分输入，一是encoder部分的每个时刻输出，二是encoder的隐藏状态输出，三是decoder的目标输入
# decoder还包含一个Attention层，计算decoder每个输入与encoder的注意力
class Decoder(keras.Model):
    def __init__(self, vocab_size, embedding_dim, hidden_units):
        super(Decoder, self).__init__()
        self.embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)
        self.decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True, name="decode_lstm")
        self.attention = Attention()
    
    def call(self, enc_outputs, dec_inputs, states_inputs):
        decoder_embed = self.embedding(dec_inputs)
        dec_outputs, dec_state_h, dec_state_c = self.decoder_lstm(decoder_embed, initial_state=states_inputs)
        attention_output = self.attention([dec_outputs, enc_outputs])
        
        return attention_output, dec_state_h, dec_state_c

In [8]:
def Seq2Seq(maxlen, embedding_dim, hidden_units, vocab_size):
    # Input Layer
    encoder_inputs = Input(shape=(maxlen,), name="encode_input")
    decoder_inputs = Input(shape=(None,), name="decode_input")
    # Encoder Layer
    encoder = Encoder(vocab_size, embedding_dim, hidden_units)
    enc_outputs, enc_state_h, enc_state_c = encoder(encoder_inputs)
    dec_states_inputs = [enc_state_h, enc_state_c]
    # Decoder Layer
    decoder = Decoder(vocab_size, embedding_dim, hidden_units)
    attention_output, dec_state_h, dec_state_c = decoder(enc_outputs, decoder_inputs, dec_states_inputs)
    # Dense Layer
    dense_outputs = Dense(vocab_size, activation='softmax', name="dense")(attention_output)
    # seq2seq model
    model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=dense_outputs)
    
    return model

## 模型输入预处理

In [9]:
def process_input_data(source_data_ids, target_data_ids, vocab2id):
    """
    输入输出加上开始结束标识
    """
    source_inputs = []
    decoder_inputs, decoder_outputs = [], []
    for source, target in zip(source_data_ids, target_data_ids):
        source_inputs.append([vocab2id["<GO>"]] + source + [vocab2id["<EOS>"]])
        decoder_inputs.append([vocab2id["<GO>"]] + target)
        decoder_outputs.append(target + [vocab2id["<EOS>"]])
    return source_inputs, decoder_inputs, decoder_outputs

source_input_ids, target_input_ids, target_output_ids = process_input_data(source_data_ids, target_data_ids, vocab2id)
len(source_input_ids)
len(target_input_ids)
len(target_output_ids)
print("encoder inputs: ", source_input_ids[:2])
print("decoder inputs: ", target_input_ids[:2])
print("decoder outputs: ", target_output_ids[:2])

10000

10000

10000

encoder inputs:  [[2, 4, 3], [2, 5, 3]]
decoder inputs:  [[2, 27, 37846, 756, 45, 180], [2, 38, 27, 84, 49272]]
decoder outputs:  [[27, 37846, 756, 45, 180, 3], [38, 27, 84, 49272, 3]]


In [10]:
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

source_input_ids = pad_sequences(source_input_ids, padding='post', maxlen=config.maxlen)
target_input_ids = pad_sequences(target_input_ids, padding='post',  maxlen=config.maxlen)
target_output_ids = pad_sequences(target_output_ids, padding='post',  maxlen=config.maxlen)
source_input_ids.shape
target_input_ids.shape
target_output_ids.shape
print(source_data_ids[:5])
print(target_input_ids[:5])
print(target_output_ids[:5])

(10000, 10)

(10000, 10)

(10000, 10)

[[4], [5], [6, 7], [8, 9, 10, 11, 12, 13, 14, 15], [16, 17, 18, 19, 11, 20]]
[[    2    27 37846   756    45   180     0     0     0     0]
 [    2    38    27    84 49272     0     0     0     0     0]
 [    2    16  6692    82 49273   320    16   518     0     0]
 [    2   526     0     0     0     0     0     0     0     0]
 [   16   438    22   328    19 49272 15817   254  1764 49272]]
[[   27 37846   756    45   180     3     0     0     0     0]
 [   38    27    84 49272     3     0     0     0     0     0]
 [   16  6692    82 49273   320    16   518     3     0     0]
 [  526     3     0     0     0     0     0     0     0     0]
 [  438    22   328    19 49272 15817   254  1764 49272     3]]


## 模型训练

In [11]:
K.clear_session()

vocab_size = len(vocab2id)
model = Seq2Seq(config.maxlen, config.embedding_dim, config.hidden_units, vocab_size)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encode_input (InputLayer)       [(None, 10)]         0                                            
__________________________________________________________________________________________________
encoder (Encoder)               ((None, 10, 64), (No 3536140     encode_input[0][0]               
__________________________________________________________________________________________________
decode_input (InputLayer)       [(None, None)]       0                                            
__________________________________________________________________________________________________
decoder (Decoder)               ((None, None, 64), ( 3536140     encoder[0][0]                    
______________________________________________________________________________________________

In [12]:
loss_fn = keras.losses.SparseCategoricalCrossentropy()
model.compile(loss=loss_fn, optimizer='adam')
model.fit([source_input_ids, target_input_ids], target_output_ids, 
          batch_size=config.batch_size, epochs=config.epochs, validation_split=config.val_rate)

Train on 8000 samples, validate on 2000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f04640ea890>

In [13]:
model.save_weights(config.model_file)
del model

## 加载模型

In [14]:
K.clear_session()

# 新加载时，需要先重新定义模型，然后load_weights加载权重
model = Seq2Seq(config.maxlen, config.embedding_dim, config.hidden_units, vocab_size)
model.load_weights(config.model_file)
print(model.summary())

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encode_input (InputLayer)       [(None, 10)]         0                                            
__________________________________________________________________________________________________
encoder (Encoder)               ((None, 10, 64), (No 3536140     encode_input[0][0]               
__________________________________________________________________________________________________
decode_input (InputLayer)       [(None, None)]       0                                            
__________________________________________________________________________________________________
decoder (Decoder)               ((None, None, 64), ( 3536140     encoder[0][0]                    
______________________________________________________________________________________________

### 获取Encoder子模块

In [15]:
# encoder_model用于对输入编码
def encoder_infer(model):
    encoder_model = Model(inputs=model.get_layer('encoder').inputs, 
                        outputs=model.get_layer('encoder').outputs)
    return encoder_model

encoder_model = encoder_infer(model)
print(encoder_model.summary())

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encode_input (InputLayer)    [(None, 10)]              0         
_________________________________________________________________
encoder (Encoder)            ((None, 10, 64), (None, 6 3536140   
Total params: 3,536,140
Trainable params: 3,536,140
Non-trainable params: 0
_________________________________________________________________
None


### 获取Decoder子模块

In [16]:
# decoder_model用于对输出按每个时刻解码
def decoder_infer(model, encoder_model):
    encoder_output = encoder_model.get_layer('encoder').output[0]
    maxlen, hidden_units = encoder_output.shape[1:]
    
    dec_input = model.get_layer('decode_input').input
    enc_output = Input(shape=(maxlen, hidden_units), name='enc_output')
    dec_input_state_h = Input(shape=(hidden_units,), name='input_state_h')
    dec_input_state_c = Input(shape=(hidden_units,), name='input_state_c')
    dec_input_states = [dec_input_state_h, dec_input_state_c]

    decoder = model.get_layer('decoder')
    dec_outputs, out_state_h, out_state_c = decoder(enc_output, dec_input, dec_input_states)
    dec_output_states = [out_state_h, out_state_c]

    decoder_dense = model.get_layer('dense')
    dense_output = decoder_dense(dec_outputs)

    decoder_model = Model(inputs=[enc_output, dec_input, dec_input_states], 
                          outputs=[dense_output]+dec_output_states)
    return decoder_model

decoder_model = decoder_infer(model, encoder_model)
print(decoder_model.summary())

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
enc_output (InputLayer)         [(None, 10, 64)]     0                                            
__________________________________________________________________________________________________
decode_input (InputLayer)       [(None, None)]       0                                            
__________________________________________________________________________________________________
input_state_h (InputLayer)      [(None, 64)]         0                                            
__________________________________________________________________________________________________
input_state_c (InputLayer)      [(None, 64)]         0                                            
____________________________________________________________________________________________

## 模型预测

In [19]:
def infer_predict(input_text, encoder_model, decoder_model, maxlen):
    """
    seq2seq模型预测时，与一般的模型是有区别的，这里Encoder部分直接预测得到输出，
    Decoder部分需要一次预测一个位置的输出，并把当前位置的输出作为下一时刻的输入。
    所以decoder部分需要自己写一个循环，预测每个时刻输出，最后拼接到一起。结束的判断要么达到最大长度，要么预测出结束符"<EOS>"。
    
    """
    text_words = input_text.split()[:maxlen]
    input_id = [vocab2id[w] if w in vocab2id else vocab2id["<UNK>"] for w in text_words]
    input_id = [vocab2id["<GO>"]] + input_id + [vocab2id["<EOS>"]]
    if len(input_id) < maxlen:
        input_id = input_id + [vocab2id["<PAD>"]] * (maxlen-len(input_id))

    input_source = np.array([input_id])
    input_target = np.array([vocab2id["<GO>"]])
    
    # 编码器encoder预测输出
    enc_outputs, enc_state_h, enc_state_c = encoder_model.predict([input_source])
    dec_inputs = input_target
    dec_states_inputs = [enc_state_h, enc_state_c]

    result_id = []
    result_text = []
    for i in range(maxlen):
        # 解码器decoder预测输出
        dense_outputs, dec_state_h, dec_state_c = decoder_model.predict([enc_outputs, dec_inputs]+dec_states_inputs)
        pred_id = np.argmax(dense_outputs[0][0])
        result_id.append(pred_id)
        result_text.append(id2vocab[pred_id])
        if id2vocab[pred_id] == "<EOS>":
            break
        dec_inputs = np.array([[pred_id]])
        dec_states_inputs = [dec_state_h, dec_state_c]
    return result_id, result_text

In [20]:
input_text = "你 在 干 什么 呢"
result_id, result_text = infer_predict(input_text, encoder_model, decoder_model, config.maxlen)

print("Input: ", input_text)
print("Output: ", result_text, result_id)

Input:  你 在 干 什么 呢
Output:  ['<EOS>'] [3]


In [23]:
import heapq

def infer_encoder_output(input_text, encoder, maxlen=10):
    text_words = input_text.split()[:maxlen]
    input_id = [vocab2id[w] if w in vocab2id else vocab2id["<UNK>"] for w in text_words]
    input_id = [vocab2id["<GO>"]] + input_id + [vocab2id["<EOS>"]]
    if len(input_id) < maxlen:
        input_id = input_id + [vocab2id["<PAD>"]] * (maxlen-len(input_id))
    input_source = np.array([input_id])
    # 编码器encoder预测输出
    enc_outputs, enc_state_h, enc_state_c = encoder.predict([input_source])
    enc_state_outputs = [enc_state_h, enc_state_c]
    return enc_outputs, enc_state_outputs


def infer_beam_search(enc_outputs, enc_state_outputs, decoder, maxlen=10, k=5):
    dec_inputs = [vocab2id["<GO>"]]
    states_curr = {0: enc_state_outputs}
    seq_scores = [[dec_inputs, 1.0, 0]]
    
    for _ in range(maxlen):
        cands = list()
        states_prev = states_curr
        for i in range(len(seq_scores)):
            seq, score, state_id = seq_scores[i]
            dec_inputs = np.array(seq[-1:])
            dec_states_inputs = states_prev[state_id]
            # 解码器decoder预测输出
            dense_outputs, dec_state_h, dec_state_c = decoder.predict([enc_outputs, dec_inputs]+dec_states_inputs)
            prob = dense_outputs[0][0]
            states_curr[i] = [dec_state_h, dec_state_c]


            for j in range(len(prob)):
                cand = [seq + [j], score * prob[j], i]
                cands.append(cand)
            
        seq_scores = heapq.nlargest(k, cands, lambda d: d[1])
            
    res = " ".join([id2vocab[i] for i in seq_scores[0][0]])
    return res

In [25]:
input_text = "你 在 干 什么 呢"

enc_outputs, enc_state_outputs = infer_encoder_output(input_text, encoder_model)
res = infer_beam_search(enc_outputs, enc_state_outputs, decoder_model)

print("Input: ", input_text)
print("Output: ", res)

Input:  你 在 干 什么 呢
Output:  <GO> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
