In [1]:
import numpy as np
import torch
import torch.nn as nn

d_k = 64 # Q的维度
d_v = 64 # V的维度
d_embedding = 512 # embedding的维度
n_heads = 8 # 多头注意力的个数
batch_size = 10
n_layers = 6 # 编码器和解码器的层数

In [2]:
class ScaledDotProductAttention(nn.Module):
    """
    缩放点积注意力
    简单理解 ScaledDotProductAttention，目的是计算Query和Key的相似权重，作用于Value
    结果是
    Query1: {Value1: w11, Value2: w12, Value3: w13}
    Query2: {Value1: w21, Value2: w22, Value3: w23}
    """
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        # 维度信息
        # Q: [batch_size, n_heads, len_q, d_k]
        # K: [batch_size, n_heads, len_k, d_k]
        # V: [batch_size, n_heads, len_v(=len_k), d_v]
        # attn_mask: [batch_size, n_heads, len_q, len_k]
        # 计算注意力分数
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores : [batch_size, n_heads, len_q, len_k]
        # scores: [batch_size, n_heads, len_q, len_k]
        # 加上注意力掩码, 将attn_mask中为True的位置的分数设置为极小值
        scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is True.
        # softmax归一化 => 注意力权重
        weights = nn.Softmax(dim=-1)(scores)
        # weights: [batch_size, n_heads, len_q, len_k]
        context = torch.matmul(weights, V) 
        # context: [batch_size, n_heads, len_q, d_v]
        return context, weights # 返回上下文变量 和 注意力分数

In [3]:


class MultiHeadAttention(nn.Module):
    """
    多头注意力
    简单理解，先放大维度，提取Q、K、V的各个维度的信息，再缩小维度，得到最终的结果
    黑盒的看是 (Q、K、V) -> Q
    """
    def __init__(self, d_embedding=d_embedding, n_heads=n_heads):
        super(MultiHeadAttention, self).__init__()
        self.d_embedding = d_embedding
        self.n_heads = n_heads

        self.W_Q = nn.Linear(d_embedding, n_heads * d_k)
        self.W_K = nn.Linear(d_embedding, n_heads * d_k)
        self.W_V = nn.Linear(d_embedding, n_heads * d_v)
        self.linear = nn.Linear(n_heads * d_v, d_embedding)
        self.layer_norm = nn.LayerNorm(d_embedding)

    def forward(self, Q, K, V, attn_mask):
        # 维度信息
        # Q: [batch_size, len_q, d_embedding]
        # K: [batch_size, len_k, d_embedding]
        # V: [batch_size, len_v(=len_k), d_embedding]
        # attn_mask: [batch_size, len_q, len_k]
        
        residual, batch_size = Q, Q.size(0)
        # 线性层，维度提升，为了捕捉更多信息
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2) 
        # q_s: [batch_size, n_heads, len_q, d_k]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)
        # k_s: [batch_size, n_heads, len_k, d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)
        # v_s: [batch_size, n_heads, len_v(=len_k), d_v]

        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)
        # attn_mask: [batch_size, n_heads, len_q, len_k]

        # 点积缩放注意力
        context, weights = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        # context: [batch_size, n_heads, len_q, d_v]
        # weights: [batch_size, n_heads, len_q, len_k]
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v)
        # context: [batch_size, len_q, n_heads * d_v]

        # 线性层，降维成 Q 原始的维度
        output = self.linear(context) 
        # output: [batch_size, len_q, d_embedding]
        
        # 残差连接，并做归一化（方便将当前Q往下层传递，所以做了残差）
        output = self.layer_norm(output + residual) 
        # output: [batch_size, len_q, d_embedding]
        return output, weights

In [4]:
class PoswiseFeedForwardNet(nn.Module):
    """
    前馈神经网络，目标是优化每个标记（单词）的表征
    对每个位置的d_embedding维度进行升维 => 降维 => 残差归一化
    """
    def __init__(self, d_ff=2048):
        super(PoswiseFeedForwardNet, self).__init__()
        # 输入升维
        self.conv1 = nn.Conv1d(in_channels=d_embedding, out_channels=d_ff, kernel_size=1)
        # 输入降维
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_embedding, kernel_size=1)
        # 定义 归一化
        self.layer_norm = nn.LayerNorm(d_embedding)

    def forward(self, inputs):
        # inputs [batch_size, len_q, d_embedding]
        residual = inputs

        output = nn.ReLU()(self.conv1(inputs.transpose(1, 2)))
        # [batch_size, d_ff, len_q]

        output = self.conv2(output).transpose(1, 2)
        # [batch_size, len_q, d_embedding]
        
        output = self.layer_norm(output + residual)
        # [batch_size, len_q, d_embedding]
        return output

In [5]:
def get_pos_enc_table(n_position, embedding_dim):
    # 位置编码表：目的是让模型知道输入序列中单词的位置信息
    # 也可以用自然序列(1,2,3)作为位置编码，但正余弦能更好表达位置信息
    # 维度信息
    # n_position: 输入序列最大长度
    # embedding_dim: 词向量维度

    pos_table = np.zeros((n_position, embedding_dim), dtype=np.float32)
    for pos_i in range(n_position):
        for idx in range(embedding_dim):
            angle = pos_i / np.power(10000, 2 * (idx // 2) / embedding_dim)
            pos_table[pos_i, idx] = angle
    
    pos_table[:, 0::2] = np.sin(pos_table[:, 0::2]) # dim 2i偶数维
    pos_table[:, 1::2] = np.cos(pos_table[:,1::2]) # dim 2i+1奇数维
    # pos_table: [n_position, embedding_dim]
    return torch.FloatTensor(pos_table)

In [6]:
def get_attn_pad_mask(seq_q, seq_k):
    # 填充注意力掩码
    # seq_q: [batch_size, len_q]
    # seq_k: [batch_size, len_k]
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()

    # =0的位置会变成True,其他是False
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1) 
    # [batch_size, 1, len_k]

    pad_aatn_mask = pad_attn_mask.expand(batch_size, len_q, len_k)
    # [batch_size, len_q, len_k]
    return pad_attn_mask

In [7]:
class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention() # 多头注意力
        self.pos_ffn = PoswiseFeedForwardNet() # 逐位前馈网络

    def forward(self, enc_inputs, enc_self_attn_mask):
        # enc_inputs: [batch_size, seq_len, embedding_dim]
        # enc_self_attn_mask: [batch_size, seq_len, seq_len]

        # Q、K、V都是本身
        enc_outputs, attn_weights = self.enc_self_attn(
            enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V

        # enc_outputs: [batch_size, seq_len, embedding_dim]
        # attn_weights: [batch_size, n_heads, seq_len, seq_len]
        enc_outputs = self.pos_ffn(enc_outputs) 
        # enc_outputs: [batch_size, seq_len, embedding_dim]
        return enc_outputs, attn_weights  

In [8]:
class Encoder(nn.Module):
    def __init__(self, corpus, n_layers=6, drop_p=0.1):
        super(Encoder, self).__init__()
        self.src_emb = nn.Embedding(len(corpus.src_vocab), d_embedding)
        self.pos_emb = nn.Embedding.from_pretrained(
            get_pos_enc_table(corpus.src_len+1, d_embedding), freeze=True)
        # freeze=True, 位置编码固定，不用更新
        self.layers = nn.ModuleList([
            EncoderLayer() for _ in range(n_layers)
        ])
        # self.dropout = nn.Dropout(drop_p)
    
    def forward(self, enc_inputs):
        # enc_inputs: [batch_size, seq_len]

        pos_idxs = torch.arange(1, enc_inputs.size(1) + 1).unsqueeze(0).to(enc_inputs)
        # pos_idx [1, seq_len]

        # embedding(词) + embedding(位置)
        enc_outputs = self.src_emb(enc_inputs) + self.pos_emb(pos_idxs)
        # enc_outpus [batch_size, seq_len, embedding_dim]
        
        # 忽略下某些信息
        # enc_outputs = self.dropout(enc_outputs)
        # enc_outpus [batch_size, seq_len, embedding_dim]

        # 获取掩码
        enc_self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs)

        # 多层Encoder Layer
        enc_self_attn_weights = []
        for layer in self.layers:
            enc_outputs, enc_self_attn_weight = layer(
                enc_outputs, enc_self_attn_mask)
            enc_self_attn_weights.append(enc_self_attn_weight)

        # enc_outputs = self.dropout(enc_outputs)
        # enc_outputs: [batch_size, seq_len, embedding_dim],
        # enc_self_attn_mask: list[batch_size, n_head, seq_len, seq_len]
        return  enc_outputs, enc_self_attn_weights

In [9]:
def get_attn_subsequent_mask(seq):
    # seq: [batch_size, seq_len(Q)=seq_len(K)
    attn_shape = [seq.size(0), seq.size(1), seq.size(1)]
    # attn_shape: [batch_size, seq_len, seq_len]

    # triu triangle upper
    subsequent_mask = np.triu(np.ones(attn_shape), k=1)
    # subsequent_mask: [batch_size, seq_len, seq_len]

    subsequent_mask = torch.from_numpy(subsequent_mask).byte()
    # subsequent_mask: [batch_size, seq_len, seq_len]
    return subsequent_mask

In [10]:
class DecoderLayer(nn.Module):
    def __init__(self):
        super(DecoderLayer, self).__init__()

        self.dec_self_attn = MultiHeadAttention()
        self.dec_enc_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, dec_inputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):
        # dec_inputs [batch_size, tgt_len, d_model]
        # enc_outputs [batch_size, src_len, d_model]
        # dec_self_attn_mask [batch_size, tgt_len, tgt_len]
        # dec_enc_attn_mask [batch_size, tgt_len, src_len]


        # 先自注意自己
        dec_outputs, dec_self_attn = self.dec_self_attn(
            dec_inputs, dec_inputs, dec_inputs, dec_self_attn_mask)
        # dec_outputs [batch_size, tgt_len, d_model]
        # dec_self_attn [batch_size, n_heads, tgt_len, tgt_len]

        # 再注意Encoder的隐藏层
        dec_outputs, dec_enc_attn = self.dec_enc_attn(
            dec_outputs, enc_outputs, enc_outputs, dec_enc_attn_mask)
        # dec_outputs [batch_size, tgt_len, d_model]
        # dec_enc_attn [batch_size, h_heads, tgt_len, src_len]

        dec_outpus = self.pos_ffn(dec_outputs)
        return dec_outputs, dec_self_attn, dec_enc_attn

In [11]:
class Decoder(nn.Module):
    def __init__(self, corpus):
        super(Decoder, self).__init__()
        self.tgt_emb = nn.Embedding(len(corpus.tgt_vocab), d_embedding)
        
        self.pos_emb = nn.Embedding.from_pretrained(
            get_pos_enc_table(corpus.tgt_len+1, d_embedding), freeze=True)
        self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)])
    
    def forward(self, dec_inputs, enc_inputs, enc_outputs):
        # dec_inputs: [batch_size, tgt_len]
        # enc_inputs: [batch_size, src_len]
        # enc_outputs: [batch_size, src_len, embedding_dim]

        # 位置索引
        pos_idxs = torch.arange(1, dec_inputs.size(1)+1).unsqueeze(0).to(dec_inputs)
        # pos_idxs: [1, tgt_len]
        dec_outputs = self.tgt_emb(dec_inputs) + self.pos_emb(pos_idxs)
        # dec_outputs: [batch_size, tgt_len, embedding_dim]
        
        # 解码自注意力掩码
        # 位置掩码（长度不够的序列补<pad>）
        dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs)
        # 后续掩码（训练过程中不能看到后续的token）
        dec_self_attn_subsequent_mask = get_attn_subsequent_mask(dec_inputs)
        # 合并 位置掩码，后续掩码
        dec_self_attn_mask = torch.gt((dec_self_attn_pad_mask + dec_self_attn_subsequent_mask), 0)
        # dec_self_attn_mask: [batch_size, tgt_len, tgt_len]

        # 解码-编码注意力掩码
        dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs)
        # dec_self_attn_mask: [batch_size, tgt_len, src_len]

        dec_self_attns, dec_enc_attns = [], []
        for layer in self.layers:
            dec_outputs, dec_self_attn, dec_enc_attn = layer(
                dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)
            # dec_outputs: [batch_size, tgt_len, embedding_dim]
            # dec_self_attn: [batch_size, tgt_len, tgt_len]
            # dec_enc_attn: [batch_size, tgt_len, src_len]
            dec_self_attns.append(dec_self_attn)
            dec_enc_attns.append(dec_enc_attn)
        return dec_outputs, dec_self_attns, dec_enc_attns

In [12]:
class Transformer(nn.Module):
    def __init__(self, corpus):
        super(Transformer, self).__init__()
        self.corpus = corpus
        self.encoder = Encoder(corpus)
        self.decoder = Decoder(corpus)
        # 将解码器输出转换为目标词汇表的概率分布
        self.projection = nn.Linear(d_embedding, len(corpus.tgt_vocab), bias=False)

    def forward(self, enc_inputs, dec_inputs):
        # enc_inputs: [batch_size, src_len]
        # dec_inputs: [batch_size, tgt_len]

        enc_outputs, enc_self_attns = self.encoder(enc_inputs)
        # enc_outputs: [batch_size, src_len, d_embedding]
        # enc_self_attns: list([batch_size, n_heads, src_len, src_len])


        # encoder_input用来和decoder_output做掩码的，即 encoder里面短的序列，decoder不关注超过短序列的信息
        dec_outputs, dec_self_attns, dec_enc_attns = self.decoder(dec_inputs, enc_inputs, enc_outputs)
        # dec_outputs: [batch_size, tgt_len, d_embedding]
        # dec_self_attns: list([batch_size, n_heads, tgt_len, tgt_len])
        # dec_enc_attns: list([batch_size, n_heads, tgt_len, src_len])
        
        # 解码器的输出通过一个全连接层得到最终的输出
        dec_logits = self.projection(dec_outputs)
        return dec_logits, enc_self_attns, dec_self_attns, dec_enc_attns

In [13]:
from collections import Counter

class TranslationCorpus:
    def __init__(self, sentences):
        self.sentences = sentences
        
        # +1是容纳<pad>
        self.src_len = max(len(sentence[0].split()) for sentence in sentences) + 1
        # +2是容纳<sos>和<eos>
        self.tgt_len = max(len(sentence[1].split()) for sentence in sentences) + 3

        self.src_vocab, self.tgt_vocab = self.create_vocab()

        self.src_idx2word = {v: k for k, v in self.src_vocab.items()}
        self.tgt_idx2word = {v: k for k, v in self.tgt_vocab.items()}

    def create_vocab(self):
        src_counter = Counter(word for sentence in self.sentences for word in sentence[0].split())
        tgt_counter = Counter(word for sentence in self.sentences for word in sentence[1].split())
        
        src_vocab = {
            "<pad>": 0,
            "<unknown>": 1
            # **{word: i+1 ,
        }
        src_unknown_idx = 1
        word_max_cnt = 50
        for i, word in enumerate(src_counter):
            # if src_counter[word] >= word_max_cnt:
            #     # src_vocab[word] = src_unknown_idx 
            #     continue
            src_vocab[word] = i + 2
        

        tgt_vocab = {
            "<pad>": 0,
            "<sos>": 1,
            "<eos>": 2,
            "<unkonwn>": 3,
        }
        for i, word in enumerate(tgt_counter):
            # if tgt_counter[word] >= word_max_cnt:
            #     continue
            tgt_vocab[word] = i+4
        return src_vocab, tgt_vocab

    def make_batch(self, batch_size, test_batch=False):
        input_batch, output_batch, target_batch = [], [], []
        batch_size = min(batch_size, len(self.sentences))
        sentence_idxs = torch.randperm(len(self.sentences))[:batch_size]
        for idx in sentence_idxs:
            src_sentence, tgt_sentence = self.sentences[idx]
            src_seq = [self.src_vocab[word] for word in src_sentence.split() if word in self.src_vocab]
            tgt_seq = [self.tgt_vocab['<sos>']] + [self.tgt_vocab[word] for word in tgt_sentence.split() if word in self.tgt_vocab ]  + [self.tgt_vocab['<eos>']]
            # 数据填充
            src_seq += [self.src_vocab['<pad>']] * (self.src_len - len(src_seq))
            tgt_seq += [self.tgt_vocab['<pad>']] * (self.tgt_len - len(tgt_seq))
            input_batch.append(src_seq)
            if test_batch:
                # 验证阶段，输出序列初始值为["sos", "pad", "pad"]
                output_batch.append(
                    [self.tgt_vocab['<sos>']] + ([self.tgt_vocab['<pad>']] * (self.tgt_len-2))
                )
            else:
                output_batch.append(tgt_seq[:-1])
            target_batch.append(tgt_seq[1:])
        return torch.LongTensor(input_batch), torch.LongTensor(output_batch), torch.LongTensor(target_batch)

In [14]:
sentences = [
    ["咖哥 很 喜欢 小冰", "KaGe likes XiaoBing much"],
    ["我 爱 学习 人工智能", "I love studying AI"],
    ["深度学习 改变 世界", "DL changed the world"],
    ["自然语言 处理 很 强大", "NLP is so powerful"],
    ["神经网络 非常 复杂", "Neural Nets are complex"]
]

In [15]:
sentences = []

dataset_prefix = "SPC.en-zh."
source = "zh"
target = "en"
pairs = [source, target]

datas = {
    "zh": [],
    "en": []
}
for k in pairs:
    with open(f"{dataset_prefix}{k}", "r") as f:
        while True:
            line = f.readline()
            if len(line) == 0:
                break
            datas[k] += [line]
for idx in range(len(datas["zh"])):
    cn = datas["zh"][idx].strip()
    en = datas["en"][idx].strip()
    cn_words = cn.split()
    en_words = en.split()
    # if len(cn_words) > 10 or len(cn_words) < 3:
    if len(cn_words) < 3:
        continue
    if len(en_words) > 15 or len(en_words) < 3:
    # if len(en_words) > 20 or len(en_words) < 3:
        continue
    sentences += [[cn, en]]
print(len(sentences))
print("示例语句", sentences[0])

509
示例语句 ['上市 公司 重大 资产 重组 管理 办法', 'Measures for Administration of Material Assets Reorganization of Listed Companies']


In [16]:
corpus = TranslationCorpus(sentences)
print("src词表大小", len(corpus.src_vocab))
print("tgt词表大小", len(corpus.tgt_vocab))

src词表大小 1238
tgt词表大小 1603


In [17]:
import torch
import torch.optim as optim

model = Transformer(corpus)

loss_fn = nn.CrossEntropyLoss() 
learning_rate = 0.0001 
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

epoches = 1000 # 迭代轮次

for epoch in range(epoches):
    optimizer.zero_grad()

    enc_inputs, dec_inputs, target_batch = corpus.make_batch(batch_size)
    outputs, _, _, _ = model(enc_inputs, dec_inputs)
    loss = loss_fn(outputs.view(-1, len(corpus.tgt_vocab)), target_batch.view(-1))
    if (epoch + 1) % 100 == 0:
        print(f"epoch: {epoch+1: 04d} cost={loss:6f}")
    
    loss.backward()
    optimizer.step()

epoch:  100 cost=3.194750
epoch:  200 cost=2.165691
epoch:  300 cost=1.775567
epoch:  400 cost=1.169823
epoch:  500 cost=0.690838
epoch:  600 cost=0.657423
epoch:  700 cost=0.286039
epoch:  800 cost=0.145922
epoch:  900 cost=0.057019
epoch:  1000 cost=0.090823


In [19]:
# 贪婪解码器，每次找概率最大的单词
def greedy_decoder(model, enc_input, start_symbol):
    
    # enc_input: [batch_size, seq_len]

    enc_outputs, enc_self_attns = model.encoder(enc_input)
    # enc_outputs: [batch_size, seq_len, embedding_dim]
    # 解码器输入为全0变量
    dec_input = torch.zeros(1,5).type_as(enc_input.data)
    # dec_input: [1, 5]
    next_symbol = start_symbol
    for i in range(0,5):
        dec_input[0][i] = next_symbol
        dec_outputs, _, _ = model.decoder(dec_input, enc_input, enc_outputs)
        projected = model.projection(dec_outputs)
        prob = projected.squeeze(0).max(1, keepdim=False)[1]
        next_word = prob.data[i]
        next_symbol = next_word.item()
    dec_outputs = dec_input
    return dec_outputs

In [20]:
def list_to_str(vals=[], sep=" "):
    return sep.join(vals)

# 模型一次性生成
def test_single_batch():
    enc_inputs, dec_inputs, target_batch = corpus.make_batch(batch_size=1, test_batch=True)
    predict, _, _, _ = model(enc_inputs, dec_inputs)
    predict = predict.view(-1, len(corpus.tgt_vocab))
    predict = predict.data.max(1, keepdim=True)[1] # 找到每个位置概率最大单词的索引

    translated_sentence = [corpus.tgt_idx2word[idx.item()] for idx in predict.squeeze()]
    input_sentence = [corpus.src_idx2word[idx.item()] for idx in enc_inputs[0]]

    print("input: ", list_to_str(input_sentence))
    print("translate: ", list_to_str(translated_sentence))

# 贪婪逐字生成
def test_greedy_batch():
    enc_inputs, dec_inputs, target_batch = corpus.make_batch(batch_size=1, test_batch=True)
    greedy_dec_predict = greedy_decoder(model, enc_inputs, start_symbol=corpus.tgt_vocab['<sos>'])
    greedy_dec_output_words = [corpus.tgt_idx2word[idx.item()] for idx in greedy_dec_predict.squeeze()]
    input_sentence = [corpus.src_idx2word[idx.item()] for idx in enc_inputs[0]]
    print("input: ", list_to_str(input_sentence))
    print("greedy translate: ", list_to_str(greedy_dec_output_words))

In [21]:
for i in range(5):
    test_single_batch()

input:  （ 二 ） 经营者 报酬 实施 方案 。 <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
translate:  (2) (2) (2) (2) (2) (2) (2) (2) (2) (2) (2) (2) (2) (2) (2) (2) (2)
input:  第三 条 国家 严格 管制 枪支 。 <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 

In [26]:
for i in range(5):
    test_greedy_batch()

input:  第八 节 财务 报告 <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
greedy translate:  <sos> Section 8 Financial Statements
input:  （ 四 ） 调整 负责人 及 有关 管理 人员 。 <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pa