In [1]:
# 下载数据
import nltk
import sys
train_file = "./data/train.txt"

In [2]:
enc = sys.getdefaultencoding()
def load_data(in_file):
    cn = []
    en = []
    num_examples = 0
    with open(in_file, 'r', encoding = enc) as f:
        for line in f:
            line = line.strip().split("\t")
            # comment the following line if you use nltk
            en.append(["BOS"] + line[0].split() + ["EOS"])
            # uncomment the following line if you installed nltk
#             en.append(["BOS"] + nltk.word_tokenize(line[0]) + ["EOS"]) 
            # split chinese sentence into characters
            cn.append(["BOS"] + [c for c in line[1]] + ["EOS"])
    return en, cn
train_en, train_cn = load_data(train_file)
num_train = len(train_en)

In [3]:
train_en[:5]

[['BOS', 'Anyone', 'can', 'do', 'that.', 'EOS'],
 ['BOS', 'How', 'about', 'another', 'piece', 'of', 'cake?', 'EOS'],
 ['BOS', 'She', 'married', 'him.', 'EOS'],
 ['BOS', 'I', "don't", 'like', 'learning', 'irregular', 'verbs.', 'EOS'],
 ['BOS', "It's", 'a', 'whole', 'new', 'ball', 'game', 'for', 'me.', 'EOS']]

In [4]:
train_cn[:5]

[['BOS', '任', '何', '人', '都', '可', '以', '做', '到', '。', 'EOS'],
 ['BOS', '要', '不', '要', '再', '來', '一', '塊', '蛋', '糕', '？', 'EOS'],
 ['BOS', '她', '嫁', '给', '了', '他', '。', 'EOS'],
 ['BOS', '我', '不', '喜', '欢', '学', '习', '不', '规', '则', '动', '词', '。', 'EOS'],
 ['BOS',
  '這',
  '對',
  '我',
  '來',
  '說',
  '是',
  '個',
  '全',
  '新',
  '的',
  '球',
  '類',
  '遊',
  '戲',
  '。',
  'EOS']]

In [5]:
# 构建Vocabulary
import os
import collections
import pickle

def make_dir(path):
    try:
        os.mkdir(path)
    except OSError:
        pass
    
model_dir = "seq2seq"
make_dir(model_dir)

def build_dict(sentences, max_words=50000):
    word_count = collections.Counter()
    for sentence in sentences:
        for s in sentence:
            word_count[s] += 1
    # 取出现最多的前50000个字，按照出现次数进行排序
    ls = word_count.most_common(max_words)
    total_words = len(ls) + 1
    # 索引和值调换个位置，并且加一个UNK表示Unknown文字
    word_dict = {w[0]: index+1 for (index, w) in enumerate(ls)}
    word_dict["UNK"] = 0
    return word_dict, total_words

vocab_file = os.path.join(model_dir, "vocab.pkl")
en_dict, en_total_words = build_dict(train_en)
cn_dict, cn_total_words = build_dict(train_cn)
    
# Inverse字典，键值位置调换
inv_en_dict = {v: k for k, v in en_dict.items()}
inv_cn_dict = {v: k for k, v in cn_dict.items()}

In [6]:
# 将word转换成index
def encode(en_sentences, cn_sentences, en_dict, cn_dict, sort_by_len=True):
    '''
        Encode the sequences. 
    '''
    length = len(en_sentences)
    out_en_sentences = []
    out_cn_sentences = []

    for i in range(length):
        # [1, 7808, 52, 32, 198, 2]
        en_seq = [en_dict[w] if w in en_dict else 0 for w in en_sentences[i]]
        cn_seq = [cn_dict[w] if w in cn_dict else 0 for w in cn_sentences[i]]
        # [[1, 7808, 52, 32, 198, 2]]
        out_en_sentences.append(en_seq)
        out_cn_sentences.append(cn_seq)

    # sort sentences by english lengths
    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))
    
    # 按照英文的长短进行排序
    if sort_by_len:
        sorted_index = len_argsort(out_en_sentences)
        out_en_sentences = [out_en_sentences[i] for i in sorted_index]
        out_cn_sentences = [out_cn_sentences[i] for i in sorted_index]
    return out_en_sentences, out_cn_sentences

display(train_en[:5])
train_en, train_cn = encode(train_en, train_cn, en_dict, cn_dict)

[['BOS', 'Anyone', 'can', 'do', 'that.', 'EOS'],
 ['BOS', 'How', 'about', 'another', 'piece', 'of', 'cake?', 'EOS'],
 ['BOS', 'She', 'married', 'him.', 'EOS'],
 ['BOS', 'I', "don't", 'like', 'learning', 'irregular', 'verbs.', 'EOS'],
 ['BOS', "It's", 'a', 'whole', 'new', 'ball', 'game', 'for', 'me.', 'EOS']]

In [24]:
display(train_en[:5])
inv_en_dict[5214]

[[1, 5214, 2], [1, 4380, 2], [1, 2674, 2], [1, 8514, 2], [1, 4380, 2]]

'Run.'

In [28]:
display(train_cn[:5])
inv_cn_dict[7],inv_cn_dict[86],inv_cn_dict[441],inv_cn_dict[5],inv_cn_dict[3],

[[1, 7, 86, 441, 5, 3, 2],
 [1, 118, 1374, 219, 2],
 [1, 1012, 2016, 7, 3, 2],
 [1, 238, 238, 219, 2],
 [1, 152, 189, 219, 2]]

('你', '用', '跑', '的', '。')

In [35]:
# 数据转换成batch
batch_size = 128
import numpy as np

# get minibatches of 
# 把一万四千数据分成若干组，每组128个数据
def get_minibatches(n, minibatch_size, shuffle=False):
    idx_list = np.arange(0, n, minibatch_size)
    # display(idx_list)
    if shuffle:
        np.random.shuffle(idx_list)
    minibatches = []
    for idx in idx_list:
        minibatches.append(np.arange(idx, min(idx + minibatch_size, n)))
    return minibatches

# 准备数据，不足的列补零，并且创建mask矩阵，原有数据为1，补零数据为0
def prepare_data(seqs):
    lengths = [len(seq) for seq in seqs]
    n_samples = len(seqs)
    max_len = np.max(lengths)

    x = np.zeros((n_samples, max_len)).astype('int32')
    x_mask = np.zeros((n_samples, max_len)).astype('float32')
    for idx, seq in enumerate(seqs):
        x[idx, :lengths[idx]] = seq
        x_mask[idx, :lengths[idx]] = 1.0
    return x, x_mask

def gen_examples(en_sentences, cn_sentences, batch_size):
    minibatches = get_minibatches(len(en_sentences), batch_size)
    # display(minibatches)
    all_ex = []
    for minibatch in minibatches:
        mb_en_sentences = [en_sentences[t] for t in minibatch]
        # display(mb_en_sentences)
        mb_cn_sentences = [cn_sentences[t] for t in minibatch]
        mb_x, mb_x_mask = prepare_data(mb_en_sentences)
        # display(mb_x, mb_x_mask)
        mb_y, mb_y_mask = prepare_data(mb_cn_sentences)
        all_ex.append((mb_x, mb_x_mask, mb_y, mb_y_mask))
        # display(all_ex)
    return all_ex
train_data = gen_examples(train_en, train_cn, batch_size)

In [10]:
import tensorflow as tf
from tensorflow.contrib import rnn

class Encoder:
    def __init__(self, embedding, hidden_size, num_layers = 1):
        self.embedding = embedding
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.cell = rnn.GRUCell(self.hidden_size)
        
    def __call__(self, inputs, seq_length, state=None):
        out = tf.nn.embedding_lookup(self.embedding, inputs)
        for i in range(self.num_layers):
            out, state = tf.nn.dynamic_rnn(self.cell, out, sequence_length=seq_length, initial_state=state, dtype=tf.float32)
        return out, state

class Decoder:
    def __init__(self, embedding, hidden_size, num_layers=1, max_length=15):
        self.embedding = embedding
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.cell = rnn.GRUCell(hidden_size)
        self.linear = tf.Variable(tf.random_normal(shape=(self.hidden_size, cn_total_words))*0.1)
        
        
    def __call__(self, inputs, state, encoder_state): # context vector
        
        out = tf.nn.embedding_lookup(self.embedding, inputs)
        out = tf.tile(tf.expand_dims(encoder_state, 1), (1, tf.shape(out)[1], 1))

        for i in range(self.num_layers):
#             state = tf.concat([state, encoder_state], 1)
            out, state = tf.nn.dynamic_rnn(self.cell, out, initial_state=state, dtype=tf.float32)
    
        out = tf.tensordot(out, self.linear, axes=[[2], [0]])
        return out, state

class Seq2Seq:
    def __init__(self, hidden_size, num_layers, embed_words_en, embed_words_cn):
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.max_length = 15
        self.grad_clip = 5.0
        
        with tf.device("/cpu:0"):
            with tf.name_scope("place_holder"):
                self.encoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int64, name="encoder_inputs")
                self.encoder_length = tf.placeholder(shape=(None, ), dtype=tf.int64, name="encoder_length")
                self.decoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int64, name="decoder_inputs")
                self.decoder_target = tf.placeholder(shape=(None, None), dtype=tf.int64, name="decoder_target")
                self.decoder_mask = tf.placeholder(shape=(None, None), dtype=tf.float32, name="decoder_mask")

            with tf.name_scope("embedding"):
                self.embedding_en = tf.get_variable(name="embedding_en", dtype=tf.float32, shape=(en_total_words, hidden_size),
                                                    initializer=tf.constant_initializer(embed_words_en))
                self.embedding_cn = tf.get_variable(name="embedding_cn", dtype=tf.float32, shape=(cn_total_words, hidden_size),
                                                    initializer=tf.constant_initializer(embed_words_cn))
            with tf.name_scope("encoder-decoder"):
                self.encoder = Encoder(self.embedding_en, self.hidden_size, self.num_layers)
                self.decoder = Decoder(self.embedding_cn + self.hidden_size, self.hidden_size, self.num_layers)

            with tf.variable_scope("seq2seq-train"):
                encoder_outputs, encoder_state = self.encoder(self.encoder_inputs, self.encoder_length)
                tf.get_variable_scope().reuse_variables()
                # 这里把encoder的state送给decoder的state，encoder的output则不需要用了……
                decoder_state = encoder_state
                word_indices = self.decoder_inputs

                decoder_outputs, decoder_state = self.decoder(word_indices, decoder_state, encoder_state)

                # decoder_outputs.append(decoder_out)
                decoder_outputs = tf.concat(decoder_outputs, 1)

            with tf.name_scope("cost"):
                loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=decoder_outputs, labels=self.decoder_target)

                self.cost = tf.reduce_mean(loss * self.decoder_mask)

                tvars = tf.trainable_variables()
                grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), self.grad_clip)
                optimizer = tf.train.RMSPropOptimizer(learning_rate=0.01)
                self.train_op = optimizer.apply_gradients(zip(grads, tvars))

            with tf.variable_scope("seq2seq-generate"):
                self.generate_outputs = []
                decoder_state = encoder_state
                word_indices = tf.expand_dims(self.decoder_inputs[:, 0], 1)
                for i in range(self.max_length):
                    decoder_out, decoder_state = self.decoder(word_indices, decoder_state, encoder_state)
                    softmax_out = tf.nn.softmax(decoder_out[:, 0, :])
                    word_indices = tf.expand_dims(tf.cast(tf.argmax(softmax_out, -1), dtype=tf.int64), 1)
                    self.generate_outputs.append(word_indices)
                self.generate_outputs = tf.concat(self.generate_outputs, 0)
            
            
    def train(self, sess, encoder_inputs, encoder_length, decoder_inputs, decoder_target, decoder_mask):
        _, cost = sess.run([self.train_op, self.cost], feed_dict={
            self.encoder_inputs: encoder_inputs, 
            self.encoder_length: encoder_length,
            self.decoder_inputs: decoder_inputs,
            self.decoder_target: decoder_target,
            self.decoder_mask: decoder_mask
        })
        return cost
    
    def generate(self, sess, encoder_inputs, encoder_length):
        decoder_inputs = np.asarray([[en_dict["BOS"]]*15], dtype="int64")
        if encoder_inputs.ndim == 1:
            encoder_inputs = encoder_inputs.reshape((1, -1))
            encoder_length = encoder_length.reshape((-1))
        generate = sess.run([self.generate_outputs],
                           feed_dict={self.encoder_inputs: encoder_inputs,
                                      self.decoder_inputs: decoder_inputs,
                                      self.encoder_length: encoder_length})[0]
        return generate
            

In [11]:
tf.reset_default_graph()
# 隐层有50个神经元
hidden_size = 50
num_layers = 1
# 不用预训练的embedding，用2个随机正态分布的embedding
emb_en = np.random.uniform(low=-0.1, high=0.1, size=(en_total_words, hidden_size))
emb_cn = np.random.uniform(low=-0.1, high=0.1, size=(cn_total_words, hidden_size))
model = Seq2Seq(hidden_size, num_layers, emb_en, emb_cn)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
epoch = 0
n_epochs = 30
# print(sess.run(model.decoder_state))
while epoch < n_epochs:
    epoch += 1
    total_loss = 0 
    total_num_ins = 0
    for (encoder_inputs, encoder_length, mb_y, mb_y_mask) in train_data:
        decoder_inputs = mb_y[:, :-1]
        decoder_target = mb_y[:, 1:]
#         print(encoder_length.sum(1).shape)
        loss = model.train(sess, encoder_inputs, encoder_length.sum(1), decoder_inputs, decoder_target, mb_y_mask[:, :-1])
        total_loss += loss
        total_num_ins += mb_y.shape[0]
    print("training loss: {}".format(total_loss / total_num_ins))
    

training loss: 0.038896150617862627
training loss: 0.027942053787576877
training loss: 0.026126280200380523
training loss: 0.02499888086486304
training loss: 0.024169257508578897
training loss: 0.02342231726602129
training loss: 0.02275253400367543
training loss: 0.022218813433877157
training loss: 0.021750806375896086
training loss: 0.02134941043885258
training loss: 0.020977562951802108
training loss: 0.020659926768910375
training loss: 0.0203385693416833
training loss: 0.020073575572564614
training loss: 0.01979738133071598
training loss: 0.019565929461270313
training loss: 0.019342890808146416
training loss: 0.01912959994448197
training loss: 0.01895555056749991
training loss: 0.0187674670671224
training loss: 0.018629708392141214
training loss: 0.018449202379848643
training loss: 0.018309324588104987
training loss: 0.018164931739392928
training loss: 0.018027273817711374
training loss: 0.017919452070804354
training loss: 0.017767461875362208
training loss: 0.01766828853150716
trai

In [16]:
encoder_inputs = [inv_en_dict[c] for c in train_data[11][0][2]]
print(encoder_inputs)
encoder_inputs = [en_dict.get(e, 0) for e in encoder_inputs]
encoder_inputs = np.asarray(encoder_inputs).reshape(1, -1)
encoder_length = np.asarray([encoder_inputs.shape[1]]).reshape(-1)
res = model.generate(sess, encoder_inputs, encoder_length).flatten()

res = [inv_cn_dict[r] for r in res]
print(res)

['BOS', 'Can', 'he', 'speak', 'English?', 'EOS']
['他', '能', '能', '說', '說', '語', '語', '語', '吗', '？', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK']
