In [1]:
import tensorflow as tf
import time
import os
import numpy as np

# Run TensorFlow eagerly: later cells iterate the tf.data dataset in a plain
# Python loop and call .numpy() on tensors directly.
tf.enable_eager_execution()

In [2]:
def preprocess_sentence(w):
    """Normalize one raw sentence before tokenization.

    The text is lower-cased, leading/trailing whitespace is removed, and the
    result is wrapped in the `<s>` / `</s>` boundary markers, each separated
    from the sentence by a single space.
    """
    normalized = w.lower().strip()
    return '<s> {} </s>'.format(normalized)

In [3]:
def create_dataset(src_path, trg_path):
    """Read a line-aligned parallel corpus and return preprocessed pairs.

    Args:
        src_path: Path of the UTF-8 source-language file, one sentence per line.
        trg_path: Path of the UTF-8 target-language file, one sentence per line.

    Returns:
        A list of `[source, target]` pairs with every sentence passed through
        `preprocess_sentence`. Lines are paired with `zip`, so if the files
        differ in length the extra lines of the longer one are dropped.
    """
    # Context managers close both files deterministically instead of leaving
    # the open handles to the garbage collector.
    with open(src_path, encoding='UTF-8') as src_file:
        src_lines = src_file.read().strip().split('\n')
    with open(trg_path, encoding='UTF-8') as trg_file:
        trg_lines = trg_file.read().strip().split('\n')

    return [[preprocess_sentence(src_line), preprocess_sentence(trg_line)]
            for src_line, trg_line in zip(src_lines, trg_lines)]

In [4]:
def create_index(path):
    """Build word<->index lookup tables from a vocabulary file.

    Args:
        path: Path of a UTF-8 vocabulary file with one token per line.

    Returns:
        A `(word2idx, idx2word)` tuple of dicts. Index 0 is reserved for
        `'<pad>'`; the token on the i-th line (0-based) receives index `i + 1`.
    """
    # Close the file as soon as it has been read rather than leaking the handle.
    with open(path, encoding='UTF-8') as vocab_file:
        vocab = vocab_file.read().strip().split('\n')

    word2idx = {'<pad>': 0}
    for index, word in enumerate(vocab, start=1):
        word2idx[word] = index

    # Invert word2idx (instead of enumerating the file again) so both tables
    # stay consistent if the file repeats a token: the last occurrence wins.
    idx2word = {index: word for word, index in word2idx.items()}

    return word2idx, idx2word

In [5]:
def max_length(sentences):
    """Return the length of the longest sequence in `sentences`."""
    return max(map(len, sentences))

def _encode_sentences(sentences, word2idx):
    """Map space-separated sentences to lists of vocabulary ids.

    Tokens missing from `word2idx` are replaced by the id of `'<unk>'`; that
    entry is only looked up when an unknown token is actually encountered.
    """
    return [[word2idx[token] if token in word2idx else word2idx['<unk>']
             for token in sentence.split(' ')]
            for sentence in sentences]


def load_dataset(src_path, trg_path, en_word2idx, zh_word2idx):
    """Load a parallel corpus and convert both sides to id sequences.

    Args:
        src_path: Path of the source-side (English) text file.
        trg_path: Path of the target-side (Chinese) text file.
        en_word2idx: Token -> id mapping applied to the source sentences.
        zh_word2idx: Token -> id mapping applied to the target sentences.

    Returns:
        `(input_tensor, target_tensor, max_length_src, max_length_trg)` where
        the first two are lists of id lists (unpadded, one per sentence) and
        the last two are the lengths of the longest source and target
        sequences.
    """
    pairs = create_dataset(src_path, trg_path)

    # English (source) sentences.
    input_tensor = _encode_sentences((en for en, _ in pairs), en_word2idx)
    # Chinese (target) sentences.
    target_tensor = _encode_sentences((zh for _, zh in pairs), zh_word2idx)

    max_length_src = max_length(input_tensor)
    max_length_trg = max_length(target_tensor)

    return input_tensor, target_tensor, max_length_src, max_length_trg

In [6]:
# Line-aligned Chinese / English text files for the train, dev and test splits.
train_zh_path = 'data/train.zh'
train_en_path = 'data/train.en'
val_zh_path = 'data/dev.zh'
val_en_path = 'data/dev.en'
test_zh_path = 'data/test.zh'
test_en_path = 'data/test.en'

# Vocabulary files, one token per line (see create_index).
vocab_zh_path = 'data/vocab.zh'
vocab_en_path = 'data/vocab.en'

# Build the lookup tables once; all three splits are encoded with the same ones.
en_word2idx, en_idx2word = create_index(vocab_en_path)
zh_word2idx, zh_idx2word = create_index(vocab_zh_path)

# English is passed as the source side and Chinese as the target side.
train_input, train_target, train_input_max_length, train_target_max_length = load_dataset(
    train_en_path, train_zh_path, en_word2idx, zh_word2idx)

val_input, val_target, val_input_max_length, val_target_max_length = load_dataset(
    val_en_path, val_zh_path, en_word2idx, zh_word2idx)

test_input, test_target, test_input_max_length, test_target_max_length = load_dataset(
    test_en_path, test_zh_path, en_word2idx, zh_word2idx)

# Longest source / target sequence per split, counted in tokens including the
# <s> and </s> markers added by preprocess_sentence.
print(train_input_max_length, val_input_max_length, test_input_max_length)
print(train_target_max_length, val_target_max_length, test_target_max_length)

62 62 62
62 55 55


In [7]:
# Post-pad every split to the longest *training* sequence on its side, so the
# train, validation and test arrays share one source width and one target width.
train_input, val_input, test_input = (
    tf.keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=train_input_max_length, padding='post')
    for sequences in (train_input, val_input, test_input))

train_target, val_target, test_target = (
    tf.keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=train_target_max_length, padding='post')
    for sequences in (train_target, val_target, test_target))

# Show the first padded training example on each side.
print(train_input[0])
print(train_target[0])


[    2    10  1042     5     4 14252   945  3533   190    90     6     4
 14252  3533   190    90    93  1502  1342     6  1065    18    12  1571
   363   449  1131  3815   948   108     4  1207     5  2707    82 22811
     5     6   463     9   340    54   654    13     9    13   654  1131
   290     4    29     7    47  1243     8     3     0     0     0     0
     0     0]
[    2  1614     4   594   122   436     4 32697     4    54    87  1140
  1134   278  5390    44  6149   967     5   695     4   430  1767     4
  3201 32698     4 14171 15368     4    83  1552   245    48  6423  6149
     6     3     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]


In [8]:
class Config(object):
    """Hyper-parameters for the attention-based translation model.

    The constructor reads the notebook globals `train_input`, `en_word2idx`
    and `zh_word2idx`, so the data-loading cells must have been run first.
    """
    def __init__(self):
        # Number of examples per training batch.
        self.batch_size = 64
        # Number of full batches per epoch. The original expression,
        # len(train_input // self.batch_size), floor-divided the array
        # element-wise and therefore returned the number of *samples*.
        self.n_batch = len(train_input) // self.batch_size
        # Width of the token embeddings (shared by encoder and decoder).
        self.embedding_dim = 256
        # Vocabulary sizes of the source (English) and target (Chinese) sides.
        self.input_embedding_size = len(en_word2idx)
        self.target_embedding_size = len(zh_word2idx)
        # Number of GRU units in the encoder and the decoder.
        self.units = 1024
        # Number of passes over the training set.
        self.epochs = 10

In [9]:
def gru(units):
    """Create a GRU layer with the settings used by the Encoder and Decoder.

    The layer returns both the full output sequence and the final state, uses
    a sigmoid recurrent activation and a Glorot-uniform recurrent initializer.
    A GPU-only CuDNNGRU variant existed here as commented-out code and is not
    used.
    """
    layer_options = dict(
        return_sequences=True,
        return_state=True,
        recurrent_activation='sigmoid',
        recurrent_initializer='glorot_uniform',
    )
    return tf.keras.layers.GRU(units, **layer_options)

In [10]:
class Encoder(tf.keras.Model):
    """Source-side encoder: an embedding layer followed by a GRU."""

    def __init__(self, config):
        """Create the layers from `config` (batch_size, units, vocabulary size, embedding_dim)."""
        super().__init__()
        self.batch_size = config.batch_size
        self.enc_units = config.units
        self.embedding = tf.keras.layers.Embedding(
            config.input_embedding_size, config.embedding_dim)
        self.gru = gru(self.enc_units)

    def call(self, x, hidden):
        """Embed the token ids `x` and run the GRU starting from `hidden`.

        Returns the GRU's output sequence and its final state.
        """
        embedded = self.embedding(x)
        sequence_output, final_state = self.gru(embedded, initial_state=hidden)
        return sequence_output, final_state

    def initialize_hidden_state(self):
        """Return an all-zero initial state of shape (batch_size, enc_units)."""
        state_shape = (self.batch_size, self.enc_units)
        return tf.zeros(state_shape)

In [11]:
class Decoder(tf.keras.Model):
    """Target-side GRU decoder with additive attention over the encoder outputs."""

    def __init__(self, config):
        """Create the layers from `config` (batch_size, units, vocabulary size, embedding_dim)."""
        super(Decoder, self).__init__()
        self.batch_size = config.batch_size
        self.dec_units = config.units
        self.embedding = tf.keras.layers.Embedding(config.target_embedding_size,
                                                   config.embedding_dim)
        self.gru = gru(self.dec_units)
        # Projection to target-vocabulary logits. Built with tf.keras.layers
        # like every other layer in this file (it used the legacy
        # tf.layers.Dense API before).
        self.fc = tf.keras.layers.Dense(config.target_embedding_size)

        # Attention parameters: score = V(tanh(W1(enc_output) + W2(hidden))).
        self.W1 = tf.keras.layers.Dense(self.dec_units)
        self.W2 = tf.keras.layers.Dense(self.dec_units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, x, hidden, enc_output):
        """Run one decoding step.

        Args:
            x: Ids of the current input token; the training loop passes a
                (batch, 1) tensor.
            hidden: Previous decoder state, used as the attention query.
            enc_output: Encoder output sequence to attend over; presumably
                (batch, source_length, units) — confirm against the caller.

        Returns:
            `(logits, state, attention_weights)`: vocabulary logits from the
            output projection, the GRU's final state, and the softmax
            attention weights over axis 1 of `enc_output`.
        """
        # Insert an axis at position 1 so the state broadcasts against every
        # position of enc_output in the addition below.
        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        score = self.V(tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis)))

        # Normalize the scores across axis 1 (the encoder positions).
        attention_weights = tf.nn.softmax(score, axis=1)

        # Attention-weighted sum of the encoder outputs.
        context_vector = attention_weights * enc_output
        context_vector = tf.reduce_sum(context_vector, axis=1)

        x = self.embedding(x)

        # Feed the context vector alongside the embedded input token.
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # NOTE(review): the GRU is called without initial_state, so `hidden`
        # only influences this step through the attention — confirm intended.
        output, state = self.gru(x)

        # Collapse the leading axes so the projection sees one row per step.
        output = tf.reshape(output, (-1, output.shape[2]))

        x = self.fc(output)

        return x, state, attention_weights

    def initialize_hidden_state(self):
        """Return an all-zero state of shape (batch_size, dec_units)."""
        return tf.zeros((self.batch_size, self.dec_units))
        

In [12]:
# Instantiate the hyper-parameters and both halves of the model. Config()
# reads train_input and the two vocabularies created by the earlier cells.
config = Config()
encoder = Encoder(config)
decoder = Decoder(config)

In [13]:
optimizer = tf.train.AdamOptimizer()

def loss_function(real, pred):
    """Padding-masked sparse cross-entropy for one decoding step.

    Args:
        real: Integer target ids; positions equal to 0 (the '<pad>' id) are
            masked out and contribute zero loss.
        pred: Logits over the target vocabulary, one row per entry of `real`.

    Returns:
        A scalar tensor: the mean of the masked per-example losses. The mean
        is taken over all entries, masked ones included (they count as 0).
    """
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=real, logits=pred)
    # Build the mask with TensorFlow ops instead of NumPy so it stays a
    # tensor of the same dtype as the loss.
    mask = tf.cast(tf.not_equal(real, 0), xentropy.dtype)
    # A single reduction is enough; the result is already a scalar.
    return tf.reduce_mean(xentropy * mask)

In [14]:
# Checkpoint files are written under checkpoint_dir with the prefix "ckpt".
checkpoint_dir = "./training_checkpoints"
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Track the optimizer together with both models so that a single checkpoint
# covers all three objects.
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                encoder=encoder,
                                decoder=decoder)

In [15]:
# Pair each padded source row with its target row and shuffle with a buffer
# as large as the training set.
dataset = tf.data.Dataset.from_tensor_slices(
    (train_input, train_target)).shuffle(len(train_input))

# drop_remainder=True discards a final partial batch, so every batch has
# exactly config.batch_size rows (the models allocate states of that size).
dataset = dataset.batch(config.batch_size, drop_remainder=True)

print(dataset)

<DatasetV1Adapter shapes: ((64, 62), (64, 62)), types: (tf.int32, tf.int32)>


In [16]:
# The loop below iterates the dataset in Python and calls .numpy(), which only
# works eagerly. Fail fast instead of evaluating the flag and discarding it.
if not tf.executing_eagerly():
    raise RuntimeError('eager execution must be enabled before training')

for epoch in range(config.epochs):
    start = time.time()

    hidden = encoder.initialize_hidden_state()
    total_loss = 0
    # Count the batches actually processed so the epoch average does not
    # depend on a separately computed batch count.
    num_batches = 0
    for (batch, (x_input, y_input)) in enumerate(dataset):
        loss = 0

        with tf.GradientTape() as tape:
            enc_output, enc_hidden = encoder(x_input, hidden)
            dec_hidden = enc_hidden
            # The decoder consumes target-language ids, so the start token
            # must come from the Chinese vocabulary (as evaluate() does).
            dec_input = tf.expand_dims([zh_word2idx['<s>']] * config.batch_size, 1)

            # Teacher forcing: the ground-truth token at step t is the
            # decoder input for step t + 1.
            for t in range(1, y_input.shape[1]):
                pred, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
                loss += loss_function(y_input[:, t], pred)
                dec_input = tf.expand_dims(y_input[:, t], 1)

        # Per-step average, used for reporting only; gradients use the sum.
        batch_loss = (loss / int(y_input.shape[1]))
        total_loss += batch_loss
        num_batches += 1

        variables = encoder.variables + decoder.variables
        gradients = tape.gradient(loss, variables)

        optimizer.apply_gradients(zip(gradients, variables))

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))

    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / max(num_batches, 1)))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

    # Checkpoint every second epoch so an interrupted run keeps its progress;
    # the final state is always saved after the loop.
    if (epoch + 1) % 2 == 0 and epoch + 1 < config.epochs:
        checkpoint.save(file_prefix = checkpoint_prefix)

checkpoint.save(file_prefix = checkpoint_prefix)


Instructions for updating:
Colocations handled automatically by placer.


KeyboardInterrupt: 

In [None]:
# Load the most recent checkpoint found in checkpoint_dir into the optimizer,
# encoder and decoder tracked by `checkpoint`.
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

In [None]:
def evaluate(sentence, encoder, decoder, en_word2idx, zh_word2idx, zh_idx2word, 
             train_input_max_length, train_target_max_length, config):
    """Greedily translate one English sentence.

    Args:
        sentence: Raw English input; it is passed through
            `preprocess_sentence` before encoding.
        encoder: Encoder model.
        decoder: Decoder model.
        en_word2idx: Source token -> id mapping.
        zh_word2idx: Target token -> id mapping (provides the '<s>' id).
        zh_idx2word: Target id -> token mapping.
        train_input_max_length: Length the source ids are post-padded to; also
            the number of columns of the attention matrix.
        train_target_max_length: Maximum number of decoding steps; also the
            number of rows of the attention matrix.
        config: Object providing `units`, the encoder state size.

    Returns:
        `(result, sentence, attention_plot)`: the predicted tokens joined by
        spaces (with a trailing space, ending in '</s>' if it was produced),
        the preprocessed input sentence, and the attention weights per
        decoding step as a NumPy array.
    """
    attention_plot = np.zeros((train_target_max_length, train_input_max_length))
    sentence = preprocess_sentence(sentence)

    # Same out-of-vocabulary policy as load_dataset: unknown words map to
    # '<unk>' instead of raising KeyError.
    inputs = [en_word2idx[token] if token in en_word2idx else en_word2idx['<unk>']
              for token in sentence.split(' ')]
    # Pad to the width the model was trained with (this used an undefined
    # name, max_length_inp, before).
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                           maxlen=train_input_max_length,
                                                           padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    # Batch of one: a zero initial state for the encoder.
    hidden = [tf.zeros((1, config.units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([zh_word2idx['<s>']], 0)

    for t in range(train_target_max_length):
        pred, dec_hidden, attention_weights = decoder(dec_input, dec_hidden,
                                                      enc_out)

        # Store the attention weights of this step to plot later on.
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        # Greedy choice: the highest-scoring target id.
        pred_id = tf.argmax(pred[0]).numpy()

        result += zh_idx2word[pred_id] + ' '

        # Stop at the end marker (this compared an undefined name,
        # predicted_id, before).
        if zh_idx2word[pred_id] == '</s>':
            return result, sentence, attention_plot

        # The predicted id is fed back into the model.
        dec_input = tf.expand_dims([pred_id], 0)

    return result, sentence, attention_plot

In [None]:
def plot_attention(attention, sentence, predicted_sentence):
    """Show an attention matrix as a heat map.

    Args:
        attention: 2-D array of attention weights; rows are labelled with the
            predicted tokens and columns with the source tokens.
        sentence: List of source tokens for the x axis.
        predicted_sentence: List of predicted tokens for the y axis.
    """
    # matplotlib is not imported in any visible cell, so `plt` raised
    # NameError; import it locally to keep this function self-contained.
    import matplotlib.pyplot as plt

    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    fontdict = {'fontsize': 14}

    # Pin one tick per token before labelling. Relying on the default tick
    # locations misaligns or drops labels once the sentences get longer.
    ax.set_xticks(list(range(len(sentence))))
    ax.set_yticks(list(range(len(predicted_sentence))))
    ax.set_xticklabels(list(sentence), fontdict=fontdict, rotation=90)
    ax.set_yticklabels(list(predicted_sentence), fontdict=fontdict)

    plt.show()

In [None]:
def translate(sentence, encoder, decoder, en_word2idx, zh_word2idx, zh_idx2word, 
             train_input_max_length, train_target_max_length, config):
    """Translate `sentence`, print the result and plot the attention weights.

    All arguments are forwarded unchanged to `evaluate`. The attention matrix
    is cropped to the number of predicted and source tokens before plotting.
    """
    result, sentence, attention_plot = evaluate(sentence, encoder, decoder, 
                                                en_word2idx, zh_word2idx, 
                                                zh_idx2word, train_input_max_length,
                                                train_target_max_length, config) 

    print('Input: {}'.format(sentence))
    print('Predicted translation: {}'.format(result))

    source_tokens = sentence.split(' ')
    # `result` ends with a space; strip it so no empty token (and no extra,
    # all-zero attention row) reaches the plot.
    predicted_tokens = result.strip().split(' ')

    attention_plot = attention_plot[:len(predicted_tokens), :len(source_tokens)]
    plot_attention(attention_plot, source_tokens, predicted_tokens)

In [None]:
# Smoke test: translate a single English sentence with the restored models and
# show its attention plot.
translate('I am your father', encoder, decoder, en_word2idx, zh_word2idx, zh_idx2word, 
             train_input_max_length, train_target_max_length, config)