In [1]:
import re
import numpy as np
import paddle
import paddle.nn.functional as F

In [2]:
# Upper bound (exclusive) on the number of tokens kept per sentence:
# English words on one side, Chinese characters on the other.
MAX_LEN=10

# Read the whole tab-separated corpus. The context manager closes the file
# handle, which a bare open(...).read() leaves to the garbage collector.
with open('cmn.txt',encoding='utf-8') as corpus_file:
    lines=corpus_file.read().strip().split('\n')

words_re=re.compile(r'\w+')
pairs=[]
for l in lines:
    fields=l.split('\t')
    # Only the first two tab-separated fields are used; any further fields are
    # ignored, and lines with fewer than two fields are skipped instead of
    # aborting the whole load with a ValueError.
    if len(fields)<2:
        continue
    en_sent,cn_sent=fields[:2]
    # English side: lower-cased runs of word characters.
    # Chinese side: one token per character.
    pairs.append((words_re.findall(en_sent.lower()),list(cn_sent)))

# Keep only short pairs whose English side starts with a personal pronoun.
# The truthiness test on x[0] protects the x[0][0] lookup when the English
# sentence contains no word characters at all.
filtered_pairs=[]
for x in pairs:
    if x[0] and len(x[0])<MAX_LEN and len(x[1])<MAX_LEN and x[0][0] in ('i','you','he','she','they','we'):
        filtered_pairs.append(x)

print(len(filtered_pairs))

for x in filtered_pairs[-5:] :print(x)

6784
(['you', 'should', 'do', 'the', 'honorable', 'thing', 'and', 'resign'], ['你', '應', '該', '光', '榮', '地', '辭', '職', '。'])
(['i', 'am', 'looking', 'forward', 'to', 'hearing', 'from', 'you', 'soon'], ['我', '期', '待', '您', '的', '消', '息', '。'])
(['i', 'don', 't', 'want', 'there', 'to', 'be', 'any', 'misunderstanding'], ['我', '不', '想', '有', '任', '何', '误', '会', '。'])
(['i', 'like', 'cracking', 'sunflower', 'seeds', 'with', 'my', 'teeth'], ['我', '喜', '欢', '嗑', '葵', '花', '籽', '。'])
(['he', 'went', 'to', 'the', 'united', 'states', 'to', 'study', 'medicine'], ['他', '去', '美', '国', '学', '医', '了', '。'])


In [3]:
# Token -> integer id tables for both languages. Ids 0, 1 and 2 are reserved
# for the padding, begin-of-sentence and end-of-sentence markers.
en_vocab={'<pad>':0,'<bos>':1,'<eos>':2}
cn_vocab={'<pad>':0,'<bos>':1,'<eos>':2}

# Ordinary tokens receive the next free id in first-seen order. Ids are
# contiguous from 0, so the next free id always equals the current table size;
# setdefault leaves tokens that are already registered untouched.
for en_tokens,cn_chars in filtered_pairs:
    for token in en_tokens:
        en_vocab.setdefault(token,len(en_vocab))
    for char in cn_chars:
        cn_vocab.setdefault(char,len(cn_vocab))

# Next unused id in each table.
en_idx,cn_idx=len(en_vocab),len(cn_vocab)

In [4]:
padded_en_sents=[]
padded_cn_sents=[]
padded_cn_label_sents=[]

for en,cn in filtered_pairs:
    en_fill=MAX_LEN-len(en)
    cn_fill=MAX_LEN-len(cn)

    # Encoder input: tokens + <eos> + padding, then reversed so the padding
    # comes first and the sentence is read back-to-front (length MAX_LEN+1).
    en_tokens=(en+["<eos>"]+["<pad>"]*en_fill)[::-1]
    # Decoder input: <bos> + characters + <eos> + padding (length MAX_LEN+2).
    cn_tokens=["<bos>",*cn,"<eos>"]+["<pad>"]*cn_fill
    # Decoder target: the decoder input shifted left by one position, with one
    # extra <pad> so it has the same length as the decoder input.
    cn_label_tokens=[*cn,"<eos>"]+["<pad>"]*(cn_fill+1)

    # Replace every token by its vocabulary id.
    padded_en_sents.append([en_vocab[tok] for tok in en_tokens])
    padded_cn_sents.append([cn_vocab[tok] for tok in cn_tokens])
    padded_cn_label_sents.append([cn_vocab[tok] for tok in cn_label_tokens])

train_en_sents=np.array(padded_en_sents)
train_cn_sents=np.array(padded_cn_sents)
train_cn_label_sents=np.array(padded_cn_label_sents)

# Report the shape of each array as a sanity check.
for train_array in (train_en_sents,train_cn_sents,train_cn_label_sents):
    print(train_array.shape)



(6784, 11)
(6784, 12)
(6784, 12)


In [5]:
# Model dimensions.
embedding_size=128
hidden_size=256
num_encoder_lstm_layers=1

# Vocabulary sizes: the number of entries in each token -> id table.
en_vocab_size=len(en_vocab)
cn_vocab_size=len(cn_vocab)

# Training schedule.
epochs=20
batch_size=16


In [6]:
class Encoder(paddle.nn.Layer):
    """Embed source token ids and run the embeddings through an LSTM.

    Layer sizes are taken from the module-level settings ``en_vocab_size``,
    ``embedding_size``, ``hidden_size`` and ``num_encoder_lstm_layers``.
    """

    def __init__(self):
        super().__init__()
        self.emb=paddle.nn.Embedding(en_vocab_size,embedding_size)
        self.lstm=paddle.nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_encoder_lstm_layers,
        )

    def forward(self,x):
        """Return the LSTM output sequence for the token ids ``x``.

        The final hidden and cell states returned by the LSTM are discarded.
        """
        embedded=self.emb(x)
        outputs,_final_states=self.lstm(embedded)
        return outputs

# Encoder-AttentionDecoder 模型配置(只有这里和上一个实践不同)

In [7]:
class AttentionDecoder(paddle.nn.Layer):
    """Single-step LSTM decoder that attends over the encoder outputs.

    Each call consumes one target token per sentence. Every encoder position is
    scored by a two-layer tanh MLP applied to the concatenation of that
    position's encoder output and the previous decoder hidden state; the
    softmax-weighted sum of encoder outputs (the context vector) is concatenated
    with the token embedding and fed to the LSTM.

    Layer sizes are taken from the module-level settings ``cn_vocab_size``,
    ``embedding_size`` and ``hidden_size``.
    """

    def __init__(self):
        super().__init__()
        self.emb=paddle.nn.Embedding(cn_vocab_size,embedding_size)
        # The LSTM input is [token embedding ; context vector].
        self.lstm=paddle.nn.LSTM(input_size=embedding_size+hidden_size,hidden_size=hidden_size)

        # Scoring MLP: [encoder output ; previous hidden] -> hidden_size -> 1.
        self.attention_linear1=paddle.nn.Linear(hidden_size*2,hidden_size)
        self.attention_linear2=paddle.nn.Linear(hidden_size,1)

        # Projects the new hidden state onto the target vocabulary.
        self.outlinear=paddle.nn.Linear(hidden_size,cn_vocab_size)

    def forward(self,x,previous_hidden,previous_cell,encoder_outputs):
        """Run one decoding step.

        Shapes below are the ones used by the callers in this notebook.

        Args:
            x: target token ids for this step, ``[batch, 1]``.
            previous_hidden: previous hidden state, ``[batch, 1, hidden_size]``.
            previous_cell: previous cell state, ``[batch, 1, hidden_size]``.
            encoder_outputs: encoder output sequence,
                ``[batch, src_len, hidden_size]``.

        Returns:
            ``(output, (hidden, cell))`` where ``output`` holds the vocabulary
            logits ``[batch, cn_vocab_size]`` and ``hidden`` / ``cell`` are the
            new states in the same batch-first layout as the inputs.
        """
        x=self.emb(x)

        # Repeat the previous hidden state once per encoder position. The count
        # is read from encoder_outputs rather than hard-coded as MAX_LEN+1, so
        # the layer works for any source length.
        src_len=encoder_outputs.shape[1]
        repeated_hidden=paddle.tile(previous_hidden,repeat_times=[1,src_len,1])
        attention_inputs=paddle.concat((encoder_outputs,repeated_hidden),axis=-1)
        attention_hidden=F.tanh(self.attention_linear1(attention_inputs))
        attention_logits=self.attention_linear2(attention_hidden)

        # Drop only the trailing score axis. An axis-less squeeze would also
        # remove the batch axis when the batch size is 1.
        attention_logits=paddle.squeeze(attention_logits,axis=-1)
        # Normalise the scores over the encoder positions.
        attention_weights=F.softmax(attention_logits,axis=-1)
        attention_weights=paddle.expand_as(paddle.unsqueeze(attention_weights,-1),encoder_outputs)

        # Weighted sum over encoder positions, kept as a length-1 sequence so it
        # can be concatenated with the embedded token.
        context_vector=paddle.multiply(encoder_outputs,attention_weights)
        context_vector=paddle.sum(context_vector,1)
        context_vector=paddle.unsqueeze(context_vector,1)

        lstm_input=paddle.concat((x,context_vector),axis=-1)
        # The LSTM takes its initial states with the batch on axis 1, so swap
        # the first two axes on the way in and again on the way out.
        previous_hidden=paddle.transpose(previous_hidden,[1,0,2])
        previous_cell=paddle.transpose(previous_cell,[1,0,2])

        x,(hidden,cell)=self.lstm(lstm_input,(previous_hidden,previous_cell))

        hidden=paddle.transpose(hidden,[1,0,2])
        cell=paddle.transpose(cell,[1,0,2])

        output=self.outlinear(hidden)
        # Remove only the length-1 step axis so the batch axis survives even
        # when the batch size is 1.
        output=paddle.squeeze(output,axis=1)

        return output,(hidden,cell)



# 模型训练

In [8]:
encoder=Encoder()
decoder=AttentionDecoder()

# A single Adam optimizer updates the parameters of both networks.
opt=paddle.optimizer.Adam(learning_rate=0.001,parameters=encoder.parameters()+decoder.parameters())

num_batches=len(train_en_sents)//batch_size  # a trailing partial batch is dropped
decode_steps=MAX_LEN+2                       # width of the padded decoder arrays

for epoch in range(epochs):
    print("epoch:{}".format(epoch))
    # Shuffle the three arrays with one shared permutation so rows stay aligned.
    perm=np.random.permutation(len(train_en_sents))
    train_en_sents_shuffled=train_en_sents[perm]
    train_cn_sents_shuffled=train_cn_sents[perm]
    train_cn_label_sents_shuffled=train_cn_label_sents[perm]

    for iteration in range(num_batches):
        batch=slice(batch_size*iteration,batch_size*(iteration+1))

        x_data=train_en_sents_shuffled[batch]
        x_cn_data=train_cn_sents_shuffled[batch]
        x_cn_label_data=train_cn_label_sents_shuffled[batch]

        # Encode the source batch once; every decoder step attends over it.
        sent=paddle.to_tensor(x_data)
        en_repr=encoder(sent)

        # The decoder starts from all-zero hidden and cell states.
        hidden=paddle.zeros([batch_size,1,hidden_size])
        cell=paddle.zeros([batch_size,1,hidden_size])
        loss=paddle.zeros([1])

        for i in range(decode_steps):
            # The ground-truth token at position i is the decoder input and
            # the label array supplies the token it should predict.
            cn_word=paddle.to_tensor(x_cn_data[:,i:i+1])
            cn_word_label=paddle.to_tensor(x_cn_label_data[:,i])

            logits,(hidden,cell)=decoder(cn_word,hidden,cell,en_repr)
            loss+=F.cross_entropy(logits,cn_word_label)

        # Average the per-step losses over the decoded length.
        loss=loss/decode_steps
        if iteration%200==0:
            print("iter {}, loss:{}".format(iteration,loss.numpy()))

        loss.backward()
        opt.step()
        opt.clear_grad()

W0806 10:22:56.657261   700 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 10.1
W0806 10:22:56.661391   700 gpu_resources.cc:91] device: 0, cuDNN Version: 7.6.


epoch:0
iter 0, loss:[7.683391]
iter 200, loss:[3.120678]
iter 400, loss:[3.1328294]
epoch:1
iter 0, loss:[2.8534687]
iter 200, loss:[2.5909526]
iter 400, loss:[2.595067]
epoch:2
iter 0, loss:[2.5150201]
iter 200, loss:[2.48274]
iter 400, loss:[2.3937104]
epoch:3
iter 0, loss:[2.6956944]
iter 200, loss:[2.3992543]
iter 400, loss:[2.3842678]
epoch:4
iter 0, loss:[2.3324437]
iter 200, loss:[1.9485701]
iter 400, loss:[1.8910669]
epoch:5
iter 0, loss:[1.8418152]
iter 200, loss:[1.8176267]
iter 400, loss:[1.6164421]
epoch:6
iter 0, loss:[1.7416186]
iter 200, loss:[1.5939775]
iter 400, loss:[1.5162796]
epoch:7
iter 0, loss:[1.286771]
iter 200, loss:[1.4252247]
iter 400, loss:[1.7099051]
epoch:8
iter 0, loss:[1.366365]
iter 200, loss:[1.4557322]
iter 400, loss:[1.3257641]
epoch:9
iter 0, loss:[1.2669723]
iter 200, loss:[1.2047687]
iter 400, loss:[1.0214838]
epoch:10
iter 0, loss:[1.2424362]
iter 200, loss:[1.3565617]
iter 400, loss:[1.0956166]
epoch:11
iter 0, loss:[0.7367716]
iter 200, loss:

# 预测

In [9]:
encoder.eval()
decoder.eval()

num_of_examples_to_evaluate=10

# Sample distinct training rows to translate.
indices=np.random.choice(len(train_en_sents),num_of_examples_to_evaluate,replace=False)
x_data=train_en_sents[indices]
sent=paddle.to_tensor(x_data)

decoded_sent=[]
# Inference needs no gradients; no_grad stops Paddle from recording an
# autograd graph for every decoding step.
with paddle.no_grad():
    en_repr=encoder(sent)

    # Every sentence starts decoding from <bos> with all-zero states.
    word=np.array([[cn_vocab["<bos>"]]]*num_of_examples_to_evaluate)
    word=paddle.to_tensor(word)

    hidden=paddle.zeros([num_of_examples_to_evaluate,1,hidden_size])
    cell=paddle.zeros([num_of_examples_to_evaluate,1,hidden_size])

    # Greedy decoding: the highest-scoring token of each step becomes the
    # decoder input of the next step.
    for i in range(MAX_LEN+2):
        logits,(hidden,cell)=decoder(word,hidden,cell,en_repr)
        word=paddle.argmax(logits,axis=-1)
        decoded_sent.append(word.numpy())
        word=paddle.unsqueeze(word,axis=-1)

# One row per example, one column per decoding step.
results=np.stack(decoded_sent,axis=1)

# Ids were handed out in insertion order starting at 0, so position k of the
# key list is the token with id k. Build the list once, not once per token.
cn_idx_to_word=list(cn_vocab)

for i in range(num_of_examples_to_evaluate):
    en_input=' '.join(filtered_pairs[indices[i]][0])
    ground_truth_translate=''.join(filtered_pairs[indices[i]][1])

    predicted_chars=[]
    for k in results[i]:
        w=cn_idx_to_word[k]
        if w=='<eos>':
            # Anything generated after the end marker is not part of the
            # translation, so stop here.
            break
        if w!='<pad>':
            predicted_chars.append(w)
    model_translate=''.join(predicted_chars)

    print(en_input)
    print("true:",ground_truth_translate)
    print("pred:",model_translate)


i took the elevator to the fourth floor
true: 我搭电梯去了四楼。
pred: 我去了三楼前。
i just wasn t paying attention
true: 我只是没注意。
pred: 我只是没注意。
he raised his hands
true: 他舉起了他的手。
pred: 他舉起了他的手。
you did a good job
true: 你干得很好。
pred: 你做了很多的。
i like you
true: 我喜欢你！
pred: 我喜欢你。
i have almost no money with me
true: 我身上幾乎沒有錢。
pred: 我身上幾乎沒有錢。
i don t think anyone can do this
true: 我认为没人能做到。
pred: 我認為沒看不到它。
we re eating apples
true: 我們在吃蘋果。
pred: 我們在吃蘋果。
she sat on the bench
true: 她坐在長椅上。
pred: 她坐在長椅上。
he can read english easily
true: 他能轻松地读英语。
pred: 他能轻松英语。
