# 12.3 实现Seq2Seq中英翻译

In [1]:
import tensorflow as tf
# Short alias for the Keras layers namespace.
L = tf.keras.layers

# Sequential API: the layers are stacked in the order they are added.
# A 7-feature input feeds two ReLU layers (12 and 6 units) and a single
# sigmoid output unit.
model = tf.keras.Sequential()
model.add(L.Dense(12, input_dim=7, activation='relu', name='dense_layer0'))
model.add(L.Dense(6, activation='relu', name='dense_layer1'))
model.add(L.Dense(1, activation='sigmoid', name='output_layer'))

In [2]:
# --- Layer definitions ---
# Input is a special layer that only declares the input shape; constructing
# it yields a tensor.
input_layer = L.Input(shape=(7,), name='input_layer')
# Every other layer object is called with a tensor and returns a new tensor.
dense_layer0 = L.Dense(units=12, activation='relu', name='dense_layer0')
dense_layer1 = L.Dense(units=6, activation='relu', name='dense_layer1')
output_layer = L.Dense(units=1, activation='sigmoid', name='output_layer')

# --- Forward computation ---
# The Input layer object is itself the input tensor.
inputs = input_layer
# Feed the input through dense_layer0, then its result through dense_layer1;
# x is the tensor produced by the second hidden layer.
x = dense_layer1(dense_layer0(inputs))
# The output layer turns x into the model's prediction tensor.
predictions = output_layer(x)

# A functional model is defined by its input tensor and its output tensor.
model = tf.keras.Model(inputs=inputs, outputs=predictions)

In [4]:
# 引入该章节全部依赖
import collections
import operator
import random
from typing import List, Dict

import numpy as np
import pandas as pd
from hanziconv import HanziConv
from segtok.tokenizer import word_tokenizer
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Tab-separated sentence pairs, read without a header row.
data_path = 'data/cmn-eng/cmn.txt'
df = pd.read_csv(data_path, header=None, sep='\t')

# The raw file has no header, so name the three columns explicitly.
df.columns = ['en', 'cn', 'cc']
# To speed up training only the first 5000 rows are kept (use the whole frame
# to train on everything). The explicit copy makes the column assignments
# below write to an independent frame instead of a slice of the full table,
# which would otherwise raise pandas' SettingWithCopyWarning.
df = df[:5000].copy()

# Convert Traditional Chinese to Simplified Chinese.
df['cn'] = df['cn'].apply(HanziConv.toSimplified)

# Tokenize English with segtok, lower-casing the text first.
df['en_cutted'] = df['en'].apply(lambda x: word_tokenizer(x.lower()))
# Character-level tokenization for Chinese, wrapped in start/end markers.
df['cn_cutted'] = df['cn'].apply(lambda x: ['<BOS>'] + list(x) + ['<EOS>'])
df.head()

Unnamed: 0,en,cn,cc,en_cutted,cn_cutted
0,Hi.,嗨。,CC-BY 2.0 (France) Attribution: tatoeba.org #5...,"[hi, .]","[<BOS>, 嗨, 。, <EOS>]"
1,Hi.,你好。,CC-BY 2.0 (France) Attribution: tatoeba.org #5...,"[hi, .]","[<BOS>, 你, 好, 。, <EOS>]"
2,Run.,你用跑的。,CC-BY 2.0 (France) Attribution: tatoeba.org #4...,"[run, .]","[<BOS>, 你, 用, 跑, 的, 。, <EOS>]"
3,Wait!,等等！,CC-BY 2.0 (France) Attribution: tatoeba.org #1...,"[wait, !]","[<BOS>, 等, 等, ！, <EOS>]"
4,Wait!,等一下！,CC-BY 2.0 (France) Attribution: tatoeba.org #1...,"[wait, !]","[<BOS>, 等, 一, 下, ！, <EOS>]"


In [6]:
class Processor(object):
    """Vocabulary helper: builds token/index mappings and encodes token lists."""

    def build_token_dict(self, corpus: List[List[str]]):
        """
        Build the vocabulary for a tokenized corpus.

        Every token of every sentence is counted, and each distinct token is
        assigned an index in descending order of frequency. Indices 0-3 are
        reserved for '<PAD>', '<UNK>', '<BOS>' and '<EOS>'.

        Args:
            corpus: all tokenized sentences, one list of tokens per sentence
        Returns:
            A (token2idx, token2count) tuple: the token -> index dict, and an
            OrderedDict of token -> count sorted by descending count.
        """
        frequencies = collections.Counter(
            token for sentence in corpus for token in sentence
        )
        # most_common() sorts by count, highest first; tokens with equal
        # counts keep the order in which they were first seen.
        token2count = collections.OrderedDict(frequencies.most_common())

        token2idx = {'<PAD>': 0, '<UNK>': 1, '<BOS>': 2, '<EOS>': 3}
        for token in token2count:
            # Tokens that already have an index (e.g. the reserved markers)
            # are left untouched; new ones get the next free index.
            token2idx.setdefault(token, len(token2idx))
        return token2idx, token2count

    @staticmethod
    def numerize_sequences(sequence: List[str],
                           token2index: Dict[str, int]) -> List[int]:
        """
        Convert a list of tokens into the list of their indices,
        e.g. ['a', 'b', 'c'] -> [10, 313, 233].

        Args:
            sequence: tokenized sentence
            token2index: token -> index dictionary
        Returns: the indices of the tokens; tokens missing from the
            dictionary are mapped to the index of '<UNK>'
        """
        def lookup(token: str) -> int:
            index = token2index.get(token)
            return token2index['<UNK>'] if index is None else index

        return [lookup(token) for token in sequence]

In [10]:
p = Processor()

# Vocabulary and token counts for the English (input) side.
p.input2idx, p.input2count = p.build_token_dict(df['en_cutted'].to_list())
# Vocabulary and token counts for the Chinese (output) side.
p.output2idx, p.output2count = p.build_token_dict(df['cn_cutted'].to_list())

# Reverse mapping (index -> token) for the output vocabulary.
p.idx2output = {index: token for token, index in p.output2idx.items()}

In [11]:
# Vocabulary sizes of the input (English) and output (Chinese) sides.
ENCODER_DIM = len(p.input2idx)
DECODER_DIM = len(p.output2idx)

# Length of the longest tokenized sentence on each side, used for padding.
EN_SEQ_LEN = max(len(seq) for seq in df['en_cutted'])
CN_SEQ_LEN = max(len(seq) for seq in df['cn_cutted'])

# Size of the LSTM hidden state (number of units).
HIDDEN_LAYER_DIM = 512

In [13]:
# Turn every tokenized sentence into a list of vocabulary indices.
tokenized_en = [p.numerize_sequences(input_seq, p.input2idx)
                for input_seq in df.en_cutted.to_list()]
tokenized_cn = [p.numerize_sequences(output_seq, p.output2idx)
                for output_seq in df.cn_cutted.to_list()]

# Pad (or truncate) at the end so all sequences on a side share one length.
padded_en = pad_sequences(tokenized_en, maxlen=EN_SEQ_LEN,
                          padding='post', truncating='post')
padded_cn = pad_sequences(tokenized_cn, maxlen=CN_SEQ_LEN,
                          padding='post', truncating='post')

encoder_input_data = padded_en
# Decoder input: time steps 0 .. second-to-last of the target sequence.
decoder_input_data = padded_cn[:, :-1]
# Decoder output (training target): time steps 1 .. last, i.e. the input
# shifted by one step. The output layer is trained with categorical
# cross-entropy, so the targets are one-hot encoded.
decoder_output_data = to_categorical(padded_cn[:, 1:], num_classes=DECODER_DIM)

In [None]:
# Build and train the encoder-decoder (Seq2Seq) model.
# The layer objects and tensors defined here are kept as module-level names
# because the inference cell below reuses them.
L = keras.layers

# Encoder input: sequences of token indices; the sequence length is left
# unspecified (None).
encoder_inputs = L.Input(shape=(None,),
                         name='encoder_inputs')

# Encoder embedding layer: maps each input token index to a 64-dim vector.
encoder_embedding_layer = L.Embedding(input_dim=ENCODER_DIM,
                                      output_dim=64,
                                      name='encoder_embedding')

# Encoder LSTM layer.
encoder_lstm_layer = L.LSTM(HIDDEN_LAYER_DIM,
                            return_state=True,  # also return the encoder's final states
                            name='encoder_lstm')

encoder_embeddings = encoder_embedding_layer(encoder_inputs)
# Run the encoder LSTM and keep its final hidden state and cell state.
encoder_outputs, state_h, state_c = encoder_lstm_layer(encoder_embeddings)

# The encoder's final states are what gets handed over to the decoder.
encoder_states = [state_h, state_c]

# Decoder input: sequences of target token indices, length unspecified.
decoder_inputs = L.Input(shape=(None,),
                         name='decoder_inputs')
# Decoder embedding layer: maps each output token index to a 64-dim vector.
decoder_embedding_layer = L.Embedding(input_dim=DECODER_DIM,
                                      output_dim=64,
                                      name='decoder_embedding')
# Decoder LSTM layer.
decoder_lstm_layer = L.LSTM(HIDDEN_LAYER_DIM,
                            return_sequences=True,  # return the output of every time step
                            return_state=True,  # also return the decoder's final states
                            name='decoder_lstm')

# Fully connected output layer of the decoder: a softmax over the output
# vocabulary.
decoder_dense_layer = L.Dense(DECODER_DIM,
                              activation='softmax',
                              name='decoder_dense')

decoder_embeddings = decoder_embedding_layer(decoder_inputs)
# Use the encoder's final states as the decoder's initial state.
# NOTE(review): this rebinds state_h / state_c to the decoder's states;
# encoder_states was built above and still holds the encoder's tensors.
decoder_lstm_output, state_h, state_c = decoder_lstm_layer(decoder_embeddings,
                                                           initial_state=encoder_states)
decoder_outputs = decoder_dense_layer(decoder_lstm_output)

# Assemble the training model: its inputs are the encoder input and the
# decoder input, its output is the decoder output.
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

# Train for 100 epochs.
model.fit([encoder_input_data, decoder_input_data],
          decoder_output_data,
          epochs=100,
          batch_size=64,
          callbacks=[])

In [17]:
# Inference-time encoder: takes the encoder input and outputs the encoder's
# final hidden and cell states.
encoder_model = keras.Model(encoder_inputs, encoder_states)
encoder_model.summary()

# Inference-time decoder: besides the decoder token input it takes the
# previous hidden state and cell state as explicit inputs.
decoder_state_input_h = L.Input(shape=(HIDDEN_LAYER_DIM,))
decoder_state_input_c = L.Input(shape=(HIDDEN_LAYER_DIM,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the decoder LSTM layer object from the training model, but start it
# from the states supplied through the inputs above.
decoder_lstm_outputs, h, c = decoder_lstm_layer(decoder_embeddings,
                                                initial_state=decoder_states_inputs)

# The decoder model outputs the token distribution together with the updated
# hidden and cell states.
decoder_states = [h, c]
# NOTE(review): this rebinds decoder_outputs, which the training cell also
# defines; here it is the output of the inference graph.
decoder_outputs = decoder_dense_layer(decoder_lstm_outputs)

decoder_model = keras.Model([decoder_inputs] + decoder_states_inputs,
                            [decoder_outputs] + decoder_states)
decoder_model.summary()


def translate_sentence(sentence: List[str], max_output_len: int = 30) -> str:
    """
    Translate a tokenized sentence with greedy decoding.

    The sentence is encoded once; the decoder is then run one token at a
    time, always feeding back the most probable token, until it emits
    '<EOS>' or the length limit is reached.

    Args:
        sentence: the tokenized source sentence
        max_output_len: decoding stops once more than this many tokens have
            been generated, so at most ``max_output_len + 1`` tokens are
            returned. Defaults to 30.

    Returns:
        The translation: the predicted tokens joined without a separator.
    """
    # Convert the input sentence to an index sequence and pad it.
    vec_sen = p.numerize_sequences(sentence, p.input2idx)
    vec_sen = pad_sequences([vec_sen], EN_SEQ_LEN, padding='post', truncating='post')
    # Encoder states (the "thought vector"). verbose=0 keeps Keras from
    # printing a progress bar for every predict call.
    h1, c1 = encoder_model.predict(vec_sen, verbose=0)

    eos_index = p.output2idx['<EOS>']
    # Start decoding from the <BOS> marker.
    target_seq = np.array([[p.output2idx['<BOS>']]])

    outputs: List[int] = []

    while True:
        # Predict the next token and update the decoder states.
        output_tokens, h1, c1 = decoder_model.predict([target_seq, h1, c1], verbose=0)
        # argmax returns a numpy integer; convert it so `outputs` really
        # holds plain ints as annotated.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # Stop on the end marker or when the output has become too long.
        if sampled_token_index == eos_index or len(outputs) > max_output_len:
            break

        outputs.append(sampled_token_index)
        # The predicted token becomes the next decoder input.
        target_seq = np.array([[sampled_token_index]])

    return ''.join(p.idx2output[output] for output in outputs)

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_inputs (InputLayer)  [(None, None)]            0         
_________________________________________________________________
encoder_embedding (Embedding (None, None, 64)          142528    
_________________________________________________________________
encoder_lstm (LSTM)          [(None, 512), (None, 512) 1181696   
Total params: 1,324,224
Trainable params: 1,324,224
Non-trainable params: 0
_________________________________________________________________
Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
decoder_inputs (InputLayer)     [(None, None)]       0                                            
_________________________________________________________________

In [18]:
# Translate 10 randomly chosen sentences from the data set. The candidate
# list is built once instead of on every iteration.
candidate_sentences = df.en_cutted.to_list()
for i in range(10):
    sentence = random.choice(candidate_sentences)
    res = translate_sentence(sentence)
    print(f"{' '.join(sentence):30}-> {res}")

feel free to stay .           -> 欢迎留下来。
i use twitter .               -> 我用Twitter。
their eyes met .              -> 他们目光相接。
please eat some cake .        -> 请吃点蛋糕。
it can't be true .            -> 那不可能是真的。
may we swim here ?            -> 我们能在这里游泳吗？
yes , of course .             -> 是的，当然。
who is that old man ?         -> 那个老男人是谁？
you've been had .             -> 你们被骗了。
this is a pencil .            -> 这是一支铅笔。
