In [1]:
import re
import numpy as np
import paddle
import paddle.nn.functional as F

In [2]:
# Sentences whose token count reaches MAX_LEN (on either side) are dropped.
MAX_LEN=10
# Only English sentences that start with one of these words are kept.
SUBJECT_PREFIXES=('i','you','he','she','they','we')

# Read the corpus through a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open('cmn.txt',encoding='utf-8') as corpus_file:
    lines=corpus_file.read().strip().split('\n')

words_re=re.compile(r'\w+')
pairs=[]
for l in lines:
    fields=l.split('\t')
    # The first two tab-separated fields are the English and Chinese
    # sentences; any further fields are ignored.  Lines with fewer than two
    # fields (e.g. a blank line) are skipped instead of raising ValueError.
    if len(fields)<2:
        continue
    en_sent,cn_sent=fields[0],fields[1]
    # English: lower-cased \w+ tokens.  Chinese: one token per character.
    pairs.append((words_re.findall(en_sent.lower()),list(cn_sent)))

filtered_pairs=[]
for x in pairs:
    en_tokens,cn_tokens=x
    # `en_tokens` may be empty when the sentence has no \w+ match; check that
    # before looking at its first token to avoid an IndexError.
    if (en_tokens
            and len(en_tokens)<MAX_LEN
            and len(cn_tokens)<MAX_LEN
            and en_tokens[0] in SUBJECT_PREFIXES):
        filtered_pairs.append(x)

print(len(filtered_pairs))

# Show the last few retained pairs as a sanity check.
for x in filtered_pairs[-5:]:
    print(x)

6784
(['you', 'should', 'do', 'the', 'honorable', 'thing', 'and', 'resign'], ['你', '應', '該', '光', '榮', '地', '辭', '職', '。'])
(['i', 'am', 'looking', 'forward', 'to', 'hearing', 'from', 'you', 'soon'], ['我', '期', '待', '您', '的', '消', '息', '。'])
(['i', 'don', 't', 'want', 'there', 'to', 'be', 'any', 'misunderstanding'], ['我', '不', '想', '有', '任', '何', '误', '会', '。'])
(['i', 'like', 'cracking', 'sunflower', 'seeds', 'with', 'my', 'teeth'], ['我', '喜', '欢', '嗑', '葵', '花', '籽', '。'])
(['he', 'went', 'to', 'the', 'united', 'states', 'to', 'study', 'medicine'], ['他', '去', '美', '国', '学', '医', '了', '。'])


In [3]:
# Both vocabularies map token -> integer id and reserve the same three ids
# for the special tokens.
en_vocab={'<pad>':0,'<bos>':1,'<eos>':2}
cn_vocab={'<pad>':0,'<bos>':1,'<eos>':2}

for en,cn in filtered_pairs:
    for vocab,tokens in ((en_vocab,en),(cn_vocab,cn)):
        for w in tokens:
            # `len(vocab)` is evaluated before insertion, so an unseen token
            # receives the next free id; a known token keeps its id.
            vocab.setdefault(w,len(vocab))

# Next unused id of each vocabulary (ids are dense, so this equals its size).
en_idx,cn_idx=len(en_vocab),len(cn_vocab)

In [4]:
def _ids(tokens,vocab):
    """Look up the integer id of every token in `tokens`."""
    return [vocab[w] for w in tokens]

padded_en_sents=[]
padded_cn_sents=[]
padded_cn_label_sents=[]

for en,cn in filtered_pairs:
    en_fill=["<pad>"]*(MAX_LEN-len(en))
    cn_fill=["<pad>"]*(MAX_LEN-len(cn))

    # Source: words, <eos>, padding -- then the whole list is reversed, so
    # the padding ends up at the front.
    padded_en_sent=(en+["<eos>"]+en_fill)[::-1]
    # Decoder input starts with <bos>; the label sequence is the same text
    # without <bos> and with one extra <pad> so both have equal length.
    padded_cn_sent=["<bos>"]+cn+["<eos>"]+cn_fill
    padded_cn_label_sent=cn+["<eos>"]+["<pad>"]*(MAX_LEN-len(cn)+1)

    padded_en_sents.append(_ids(padded_en_sent,en_vocab))
    padded_cn_sents.append(_ids(padded_cn_sent,cn_vocab))
    padded_cn_label_sents.append(_ids(padded_cn_label_sent,cn_vocab))

train_en_sents=np.array(padded_en_sents)
train_cn_sents=np.array(padded_cn_sents)
train_cn_label_sents=np.array(padded_cn_label_sents)

# Report the (num_sentences, sequence_length) shape of each array.
for arr in (train_en_sents,train_cn_sents,train_cn_label_sents):
    print(arr.shape)



(6784, 11)
(6784, 12)
(6784, 12)


# 模型配置

In [5]:
from paddle.nn import TransformerEncoderLayer,TransformerEncoder,TransformerDecoderLayer,TransformerDecoder

In [6]:
# Model dimensions.
embedding_size=128
# NOTE(review): `hidden_size` and `num_encoder_lstm_layers` are only referenced
# from commented-out code in the visible cells.
hidden_size=512
num_encoder_lstm_layers=1

# Vocabulary sizes (number of distinct token ids, special tokens included).
en_vocab_size=len(en_vocab)
cn_vocab_size=len(cn_vocab)

# Training schedule.
epochs=20
batch_size=16


In [7]:
class Encoder(paddle.nn.Layer):
    """Embeds source token ids and runs them through a Transformer encoder stack.

    Args:
        en_vocab_size: number of rows of the source embedding table.
        embedding_size: embedding width, also used as the encoder layer width.
        num_layers: how many encoder layers are stacked.
        head_number: second positional argument of each encoder layer.
        middle_units: third positional argument of each encoder layer.

    NOTE(review): only the token embedding is fed to the encoder (no
    positional information is added) and no mask is passed, so padding ids
    are processed like ordinary tokens.
    """
    def __init__(self,en_vocab_size,embedding_size,num_layers=2,head_number=2,middle_units=512):
        super().__init__()
        self.emb=paddle.nn.Embedding(en_vocab_size,embedding_size)
        self.encoder=TransformerEncoder(
            TransformerEncoderLayer(embedding_size,head_number,middle_units),
            num_layers,
        )

    def forward(self,x):
        """Return the encoder output for the integer id tensor `x`."""
        return self.encoder(self.emb(x))
    
class Decoder(paddle.nn.Layer):
    """Embeds target token ids, decodes them against the encoder output and
    projects the result to vocabulary logits.

    Args:
        cn_vocab_size: number of rows of the target embedding table and width
            of the output logits.
        embedding_size: embedding width, also used as the decoder layer width.
        num_layers: how many decoder layers are stacked.
        head_number: second positional argument of each decoder layer.
        middle_units: third positional argument of each decoder layer.

    NOTE(review): the callers in this notebook pass a single target token per
    call (`[batch, 1]`), so the decoder's self-attention never sees earlier
    target tokens; no target mask or positional information is used.
    """
    def __init__(self,cn_vocab_size,embedding_size,num_layers=2,head_number=2,middle_units=512):
        super(Decoder,self).__init__()
        self.emb=paddle.nn.Embedding(cn_vocab_size,embedding_size)

        decoder_layer=TransformerDecoderLayer(embedding_size,head_number,middle_units)
        self.decoder=TransformerDecoder(decoder_layer,num_layers)

        # Maps each decoder output vector to one logit per target token.
        self.outlinear=paddle.nn.Linear(embedding_size,cn_vocab_size)

    def forward(self,x,encoder_outputs):
        """Return vocabulary logits for the target ids `x`.

        Args:
            x: integer id tensor; `[batch, 1]` in the visible callers.
            encoder_outputs: output of `Encoder.forward` for the same batch.

        Returns:
            Logits with the length-1 step axis removed, i.e. `[batch, vocab]`
            for a `[batch, 1]` input.
        """
        x=self.emb(x)
        de_out=self.decoder(x,encoder_outputs)
        output=self.outlinear(de_out)
        # Squeeze only the step axis.  A bare `paddle.squeeze(output)` would
        # also drop the batch axis whenever the batch holds one sentence.
        output=paddle.squeeze(output,axis=1)
        return output

# 训练

In [8]:
encoder=Encoder(en_vocab_size,embedding_size)
decoder=Decoder(cn_vocab_size,embedding_size)

# One Adam optimizer updates the parameters of both networks.
opt=paddle.optimizer.Adam(learning_rate=0.00001,parameters=encoder.parameters()+decoder.parameters())

# Number of decoder steps per sentence: <bos>/<eos> plus up to MAX_LEN slots.
num_decode_steps=MAX_LEN+2

for epoch in  range(epochs):
    print("epoch:{}".format(epoch))
    # Apply the same random permutation to all three arrays so that source,
    # decoder input and label rows stay aligned.
    perm=np.random.permutation(len(train_en_sents))
    train_en_sents_shuffled=train_en_sents[perm]
    train_cn_sents_shuffled=train_cn_sents[perm]
    train_cn_label_sents_shuffled=train_cn_label_sents[perm]

    # Integer division: a trailing partial batch, if any, is not used.
    for iteration in range(train_en_sents_shuffled.shape[0]//batch_size):
        batch_rows=slice(batch_size*iteration,batch_size*(iteration+1))

        x_data=train_en_sents_shuffled[batch_rows]
        sent=paddle.to_tensor(x_data)
        en_repr=encoder(sent)

        x_cn_data=train_cn_sents_shuffled[batch_rows]
        x_cn_label_data=train_cn_label_sents_shuffled[batch_rows]
        # Convert the target arrays once per batch; the per-step inputs below
        # are slices of these tensors rather than fresh conversions.
        cn_input=paddle.to_tensor(x_cn_data)
        cn_label=paddle.to_tensor(x_cn_label_data)

        loss=paddle.zeros([1])

        # Teacher forcing, one target position at a time.
        # NOTE(review): the decoder only receives the current token, and
        # padding positions are included in the loss (no ignore_index).
        for i in range(num_decode_steps):
            cn_word=cn_input[:,i:i+1]
            cn_word_label=cn_label[:,i]

            logits=decoder(cn_word,en_repr)
            step_loss=F.cross_entropy(logits,cn_word_label)
            loss+=step_loss

        # Average the per-step losses.
        loss=loss/num_decode_steps
        if(iteration % 50==0):
            print("iter {}, loss:{}".format(iteration,loss.numpy()))

        loss.backward()
        opt.step()
        opt.clear_grad()

W0806 19:04:14.405814   347 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 8.0, Driver API Version: 11.2, Runtime API Version: 11.2
W0806 19:04:14.408393   347 gpu_resources.cc:91] device: 0, cuDNN Version: 8.2.


epoch:0
iter 0, loss:[7.8213344]
iter 50, loss:[7.129674]
iter 100, loss:[6.9309206]
iter 150, loss:[6.7554655]
iter 200, loss:[6.4796467]
iter 250, loss:[6.3123894]
iter 300, loss:[6.1580744]
iter 350, loss:[6.157483]
iter 400, loss:[5.784957]
epoch:1
iter 0, loss:[5.8144426]
iter 50, loss:[5.5928516]
iter 100, loss:[5.72377]
iter 150, loss:[5.548216]
iter 200, loss:[5.644553]
iter 250, loss:[5.3269114]
iter 300, loss:[5.4419055]
iter 350, loss:[5.394272]
iter 400, loss:[5.4558554]
epoch:2
iter 0, loss:[5.3960285]
iter 50, loss:[5.065187]
iter 100, loss:[5.376636]
iter 150, loss:[5.088723]
iter 200, loss:[5.1011314]
iter 250, loss:[4.7370057]
iter 300, loss:[4.7658596]
iter 350, loss:[4.865528]
iter 400, loss:[4.98389]
epoch:3
iter 0, loss:[4.7842174]
iter 50, loss:[4.7123823]
iter 100, loss:[4.467573]
iter 150, loss:[4.5247555]
iter 200, loss:[4.688441]
iter 250, loss:[4.5418425]
iter 300, loss:[4.683344]
iter 350, loss:[4.620677]
iter 400, loss:[4.378727]
epoch:4
iter 0, loss:[4.192

# 预测

In [9]:
# Switch both networks to evaluation mode before decoding.
encoder.eval()
decoder.eval()

num_of_examples_to_evaluate=10

# Pick distinct training sentences to translate.
indices=np.random.choice(len(train_en_sents),num_of_examples_to_evaluate,replace=False)
x_data=train_en_sents[indices]

# id -> token table, built once.  Ids were assigned in insertion order, so
# the position of a key in `cn_vocab` equals its id.
cn_id_to_word=list(cn_vocab)

decoded_sent=[]
# No gradients are needed for inference; skip building the autograd graph.
with paddle.no_grad():
    sent=paddle.to_tensor(x_data)
    en_repr=encoder(sent)

    # Every sentence starts decoding from <bos>; shape [examples, 1].
    word=paddle.to_tensor(
        np.full((num_of_examples_to_evaluate,1),cn_vocab["<bos>"],dtype="int64"))

    # Greedy decoding: the arg-max token of each step is the next input.
    for i in range(MAX_LEN+2):
        logits=decoder(word,en_repr)
        word=paddle.argmax(logits,axis=-1)
        decoded_sent.append(word.numpy())
        word=paddle.unsqueeze(word,axis=-1)

# One row per example, one column per decoding step.
results=np.stack(decoded_sent,axis=1)
for i in range(num_of_examples_to_evaluate):
    en_input=' '.join(filtered_pairs[indices[i]][0])
    ground_truth_translate=''.join(filtered_pairs[indices[i]][1])
    model_translate=""
    for k in results[i]:
        w=cn_id_to_word[k]
        # The translation ends at the first <eos>; whatever the model emits
        # afterwards is not part of the sentence.
        if w=='<eos>':
            break
        if w!='<pad>':
            model_translate+=w
    print(en_input)
    print("true:",ground_truth_translate)
    print("pred:",model_translate)


i m very lonely
true: 我很寂寞。
pred: 我不是我不是我不是我不是
i hope people are satisfied
true: 我希望人们满意。
pred: 我不是一个。
you shouldn t make fun of tom
true: 你不该取笑汤姆。
pred: 你不不不不不不不不不不不
i did something really stupid
true: 我做了很蠢的事。
pred: 我的。
i m here too
true: 我也在这里。
pred: 我不是我不是我不是我不是
i walked as far as the station
true: 我一直走到火車站。
pred: 我的。
he stuck to his promise
true: 他信守了承诺.
pred: 他的。
she gave it her personal attention
true: 她親自過問了此事。
pred: 她是她是她是她是她是她是
i ll drop you off at the station
true: 我載你到車站。
pred: 我不
you can go
true: 你可以去了。
pred: 你不是你不是你不是你不是
