In [1]:
import re
import numpy as np
import paddle
import paddle.nn.functional as F

In [2]:
MAX_LEN=10

# Tokenisation: English is lower-cased and split into \w+ tokens, Chinese is
# split into individual characters.
words_re=re.compile(r'\w+')

# `with` closes the corpus file as soon as it has been read.
with open('cmn.txt',encoding='utf-8') as corpus_file:
    lines=corpus_file.read().strip().split('\n')

pairs=[]
for l in lines:
    # Only the first two tab-separated columns (English, Chinese) are used;
    # any further columns are ignored, so two- and three-column files both load.
    fields=l.split('\t')
    if len(fields)<2:
        continue  # blank or malformed line: nothing to pair up
    en_sent,cn_sent=fields[0],fields[1]
    pairs.append((words_re.findall(en_sent.lower()),list(cn_sent)))

# Keep only short pairs whose English side starts with a personal pronoun.
# The leading `x[0]` truthiness check skips sentences that produced no \w+
# tokens, which would otherwise raise IndexError on x[0][0].
filtered_pairs=[]
for x in pairs:
    if x[0] and len(x[0])<MAX_LEN and len(x[1])<MAX_LEN and x[0][0] in ('i','you','he','she','they','we'):
        filtered_pairs.append(x)

print(len(filtered_pairs))

for x in filtered_pairs[-5:] :print(x)

6784
(['you', 'should', 'do', 'the', 'honorable', 'thing', 'and', 'resign'], ['你', '應', '該', '光', '榮', '地', '辭', '職', '。'])
(['i', 'am', 'looking', 'forward', 'to', 'hearing', 'from', 'you', 'soon'], ['我', '期', '待', '您', '的', '消', '息', '。'])
(['i', 'don', 't', 'want', 'there', 'to', 'be', 'any', 'misunderstanding'], ['我', '不', '想', '有', '任', '何', '误', '会', '。'])
(['i', 'like', 'cracking', 'sunflower', 'seeds', 'with', 'my', 'teeth'], ['我', '喜', '欢', '嗑', '葵', '花', '籽', '。'])
(['he', 'went', 'to', 'the', 'united', 'states', 'to', 'study', 'medicine'], ['他', '去', '美', '国', '学', '医', '了', '。'])


# 构建词表

In [3]:
# Token -> integer id tables for both languages. Ids 0, 1 and 2 are reserved
# for the padding, begin-of-sentence and end-of-sentence markers; every other
# token receives the next consecutive id the first time it is seen.
en_vocab={'<pad>':0,'<bos>':1,'<eos>':2}
cn_vocab={'<pad>':0,'<bos>':1,'<eos>':2}

for en,cn in filtered_pairs:
    for w in en:
        # Ids are dense from 0, so len(vocab) is always the next unused id;
        # setdefault leaves already-known tokens untouched.
        en_vocab.setdefault(w,len(en_vocab))
    for w in cn:
        cn_vocab.setdefault(w,len(cn_vocab))

# Next unused id in each table.
en_idx,cn_idx=len(en_vocab),len(cn_vocab)

In [4]:
# Convert every sentence pair into three fixed-length id sequences:
#   * encoder input  : "<tokens> <eos> <pad>..." reversed
#   * decoder input  : "<bos> <chars> <eos> <pad>..."
#   * decoder labels : "<chars> <eos> <pad>..." with one extra <pad>, i.e. the
#                      decoder input shifted left by one position
padded_en_sents=[]
padded_cn_sents=[]
padded_cn_label_sents=[]

for en,cn in filtered_pairs:
    # Padding needed to bring each side up to MAX_LEN content slots.
    en_padding=["<pad>"]*(MAX_LEN-len(en))
    cn_padding=["<pad>"]*(MAX_LEN-len(cn))

    # The source sentence is fed to the encoder back to front.
    padded_en_sent=(en+["<eos>"]+en_padding)[::-1]
    padded_cn_sent=["<bos>"]+cn+["<eos>"]+cn_padding
    padded_cn_label_sent=cn+["<eos>"]+cn_padding+["<pad>"]

    padded_en_sents.append([en_vocab[token] for token in padded_en_sent])
    padded_cn_sents.append([cn_vocab[token] for token in padded_cn_sent])
    padded_cn_label_sents.append([cn_vocab[token] for token in padded_cn_label_sent])

train_en_sents=np.array(padded_en_sents)
train_cn_sents=np.array(padded_cn_sents)
train_cn_label_sents=np.array(padded_cn_label_sents)

# Sanity check: one row per sentence pair, fixed width per array.
for id_matrix in (train_en_sents,train_cn_sents,train_cn_label_sents):
    print(id_matrix.shape)



(6784, 11)
(6784, 12)
(6784, 12)


# 模型配置

In [5]:
# Optimisation schedule.
epochs=20
batch_size=16

# Network dimensions.
embedding_size=128
hidden_size=256
num_encoder_lstm_layers=1

# Vocabulary sizes (number of distinct ids, special markers included); they
# size the embedding tables and the decoder's output projection.
en_vocab_size=len(en_vocab)
cn_vocab_size=len(cn_vocab)


In [6]:
class Encoder(paddle.nn.Layer):
    """Embeds English token ids and runs the embeddings through an LSTM.

    Layer sizes come from the notebook-level configuration variables
    ``en_vocab_size``, ``embedding_size``, ``hidden_size`` and
    ``num_encoder_lstm_layers``.
    """

    def __init__(self):
        super().__init__()
        self.emb=paddle.nn.Embedding(en_vocab_size,embedding_size)
        self.lstm=paddle.nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_encoder_lstm_layers,
        )

    def forward(self,x):
        """Return the LSTM output sequence for a batch of token ids.

        Args:
            x: integer tensor of English token ids (the notebook passes
                batches of padded, reversed sentences).

        Returns:
            The LSTM's output tensor; the final hidden and cell states that
            the LSTM also returns are discarded.
        """
        embedded=self.emb(x)
        outputs,_final_states=self.lstm(embedded)
        return outputs

In [7]:
class Decoder(paddle.nn.Layer):
    """One-step LSTM decoder conditioned on the summed encoder outputs.

    Each call consumes a single target-side token per batch row and produces
    the logits for the next token plus the updated recurrent state. Layer
    sizes come from the notebook-level ``cn_vocab_size``, ``embedding_size``
    and ``hidden_size``.
    """

    def __init__(self):
        super(Decoder,self).__init__()
        self.emb=paddle.nn.Embedding(cn_vocab_size,embedding_size)
        # The LSTM input is the token embedding concatenated with the context
        # vector, hence embedding_size+hidden_size.
        self.lstm=paddle.nn.LSTM(input_size=embedding_size+hidden_size,hidden_size=hidden_size)
        self.outlinear=paddle.nn.Linear(hidden_size,cn_vocab_size)

    def forward(self,x,previous_hidden,previous_cell,encoder_outputs):
        """Run one decoding step.

        Args:
            x: token ids for the current step; the notebook passes shape
                [batch, 1].
            previous_hidden: hidden state from the previous step; the notebook
                passes shape [batch, 1, hidden_size].
            previous_cell: cell state from the previous step, same layout as
                ``previous_hidden``.
            encoder_outputs: output sequence returned by the encoder.

        Returns:
            ``(logits, (hidden, cell))`` where ``logits`` has the step axis
            removed and ``hidden``/``cell`` use the same batch-first layout
            as the inputs, ready to be fed into the next step.
        """
        x=self.emb(x)

        # Context vector: sum of the encoder outputs over axis 1, with that
        # axis restored as size 1 so it can be concatenated with the embedding.
        context_vector=paddle.sum(encoder_outputs,1)
        context_vector=paddle.unsqueeze(context_vector,1)

        lstm_input=paddle.concat((x,context_vector),axis=-1)

        # Callers keep the state batch-first; swap the first two axes before
        # handing it to the LSTM and swap back afterwards.
        previous_hidden=paddle.transpose(previous_hidden,[1,0,2])
        previous_cell=paddle.transpose(previous_cell,[1,0,2])

        _,(hidden,cell)=self.lstm(lstm_input,(previous_hidden,previous_cell))

        hidden=paddle.transpose(hidden,[1,0,2])
        cell=paddle.transpose(cell,[1,0,2])

        output=self.outlinear(hidden)
        # Squeeze only the singleton step axis. A bare squeeze() would also
        # remove the batch axis when the batch size is 1, returning logits of
        # shape [vocab] instead of [1, vocab].
        output=paddle.squeeze(output,axis=1)
        return output,(hidden,cell)

# 训练

In [8]:
# Fresh models are created every time this cell runs, so re-running the cell
# restarts training from newly initialised weights.
encoder=Encoder()
decoder=Decoder()

# A single Adam optimizer updates the parameters of both networks.
opt=paddle.optimizer.Adam(
    learning_rate=0.001,
    parameters=encoder.parameters()+decoder.parameters(),
)

for epoch in range(epochs):
    print("epoch:{}".format(epoch))

    # One permutation is applied to all three arrays so rows stay aligned.
    perm=np.random.permutation(len(train_en_sents))
    train_en_sents_shuffled=train_en_sents[perm]
    train_cn_sents_shuffled=train_cn_sents[perm]
    train_cn_label_sents_shuffled=train_cn_label_sents[perm]

    # Integer division: a trailing partial batch is not used in this epoch.
    for iteration in range(train_en_sents_shuffled.shape[0]//batch_size):
        batch_rows=slice(batch_size*iteration,batch_size*(iteration+1))

        x_data=train_en_sents_shuffled[batch_rows]
        x_cn_data=train_cn_sents_shuffled[batch_rows]
        x_cn_label_data=train_cn_label_sents_shuffled[batch_rows]

        # Encode the source sentences once per batch.
        sent=paddle.to_tensor(x_data)
        en_repr=encoder(sent)

        # The decoder starts from an all-zero recurrent state.
        hidden=paddle.zeros([batch_size,1,hidden_size])
        cell=paddle.zeros([batch_size,1,hidden_size])
        loss=paddle.zeros([1])

        # Teacher forcing: at step i the decoder is fed the reference token
        # at position i and scored against the label at position i.
        for i in range(MAX_LEN+2):
            cn_word=paddle.to_tensor(x_cn_data[:,i:i+1])
            cn_word_label=paddle.to_tensor(x_cn_label_data[:,i])

            logits,(hidden,cell)=decoder(cn_word,hidden,cell,en_repr)
            step_loss=F.cross_entropy(logits,cn_word_label)
            loss+=step_loss

        # Average the accumulated loss over the decoding steps.
        loss=loss/(MAX_LEN+2)
        if iteration%200==0:
            print("iter {}, loss:{}".format(iteration,loss.numpy()))

        loss.backward()
        opt.step()
        opt.clear_grad()

W0804 16:08:47.193271 10871 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 10.1
W0804 16:08:47.196259 10871 gpu_resources.cc:91] device: 0, cuDNN Version: 7.6.


epoch:0
iter 0, loss:[7.6692123]
iter 200, loss:[3.9639182]
iter 400, loss:[3.2989588]
epoch:1
iter 0, loss:[3.2058902]
iter 200, loss:[3.1991234]
iter 400, loss:[3.1223054]
epoch:2
iter 0, loss:[3.2026477]
iter 200, loss:[3.2073016]
iter 400, loss:[3.010473]
epoch:3
iter 0, loss:[3.2029216]
iter 200, loss:[3.107572]
iter 400, loss:[2.578267]
epoch:4
iter 0, loss:[2.5719724]
iter 200, loss:[2.3629503]
iter 400, loss:[2.3354428]
epoch:5
iter 0, loss:[2.270809]
iter 200, loss:[2.2493029]
iter 400, loss:[1.9096901]
epoch:6
iter 0, loss:[2.0700908]
iter 200, loss:[1.9801536]
iter 400, loss:[1.5809536]
epoch:7
iter 0, loss:[1.4416498]
iter 200, loss:[1.6989276]
iter 400, loss:[1.4019668]
epoch:8
iter 0, loss:[1.4268367]
iter 200, loss:[1.6154541]
iter 400, loss:[1.6757619]
epoch:9
iter 0, loss:[1.031613]
iter 200, loss:[1.287231]
iter 400, loss:[1.106545]
epoch:10
iter 0, loss:[0.8118762]
iter 200, loss:[0.8926243]
iter 400, loss:[1.0500791]
epoch:11
iter 0, loss:[0.86716926]
iter 200, loss

# 预测

In [12]:
encoder.eval()
decoder.eval()

num_of_examples_to_evaluate=10

# Pick distinct training examples to translate.
indices=np.random.choice(len(train_en_sents),num_of_examples_to_evaluate,replace=False)
x_data=train_en_sents[indices]
sent=paddle.to_tensor(x_data)

# Inverse vocabulary, built once instead of re-materialising list(cn_vocab)
# for every decoded token. Mapping by stored id also removes the reliance on
# dict insertion order matching the ids.
cn_id_to_word={idx:token for token,idx in cn_vocab.items()}

decoded_sent=[]
# Inference only: no_grad() avoids recording an autograd graph.
with paddle.no_grad():
    en_repr=encoder(sent)

    # Every sequence starts from <bos> and an all-zero recurrent state.
    word=np.array([[cn_vocab["<bos>"]]]*num_of_examples_to_evaluate)
    word=paddle.to_tensor(word)

    hidden=paddle.zeros([num_of_examples_to_evaluate,1,hidden_size])
    cell=paddle.zeros([num_of_examples_to_evaluate,1,hidden_size])

    # Greedy decoding: the arg-max token of each step is fed back as the
    # next step's input.
    for i in range(MAX_LEN+2):
        logits,(hidden,cell)=decoder(word,hidden,cell,en_repr)
        word=paddle.argmax(logits,axis=-1)
        decoded_sent.append(word.numpy())
        word=paddle.unsqueeze(word,axis=-1)

# One row of decoded ids per example.
results=np.stack(decoded_sent,axis=1)
for i in range(num_of_examples_to_evaluate):
    en_input=' '.join(filtered_pairs[indices[i]][0])
    ground_truth_translate=''.join(filtered_pairs[indices[i]][1])
    modle_translate=""
    for k in results[i]:
        w=cn_id_to_word[int(k)]
        if w=='<eos>':
            # End of the translation: anything decoded after <eos> is not
            # part of the sentence.
            break
        if w!='<pad>':
            modle_translate+=w
    print(en_input)
    print("true:",ground_truth_translate)
    print("pred:",modle_translate)


i was at home
true: 我刚才在家。
pred: 我在家裡。
you got here fast
true: 你來得很快。
pred: 你最快快。
i played soccer yesterday
true: 我昨天踢了足球。
pred: 我昨天踢了足球。
you do have choices
true: 你有选择。
pred: 你有选择。
i seem to have a fever
true: 我好像发烧了。
pred: 我好像发烧了。
he came in person
true: 他亲自来了。
pred: 他亲自来了。
he told the truth
true: 他說了實話。
pred: 他说了實話。
i got mugged
true: 我被抢劫了。
pred: 我被抢劫了。
i left it on the table
true: 我把它留在桌上了。
pred: 我把它放在桌上了。
they say this old house is haunted
true: 據說老房子鬧鬼。
pred: 據說老房子鬧鬼。
