<a href="https://colab.research.google.com/github/Dreaming-world/learn_tensorflow_nlp/blob/master/myseq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

# 下载数据集

In [None]:
# 下载文件
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

path_to_file = os.path.dirname(path_to_zip)+"/spa-eng/spa.txt"

# 分离相互翻译的文本

In [None]:
source_text = []
target_text = []
with open(path_to_file, "r") as f:
  line = f.readline()
  while line:
    source_text.append(line.split("\t")[0].strip())
    target_text.append("<start> " + line.split("\t")[1].strip() + " <end>")
    line = f.readline()
print(len(source_text), len(target_text))
print(source_text[5:10])
print(target_text[5:10])

118964 118964
['Run!', 'Run.', 'Who?', 'Fire!', 'Fire!']
['<start> ¡Corre! <end>', '<start> Corred. <end>', '<start> ¿Quién? <end>', '<start> ¡Fuego! <end>', '<start> ¡Incendio! <end>']


In [None]:
use_length = 3000
source_text = source_text[:use_length]
target_text = target_text[:use_length]

# 生成id2char，以及char2id

In [None]:
source_char_id = {}
source_char_id["<padding>"] = 0
for sentence in source_text:
  for char in sentence.split(" "):
    if char not in source_char_id:
      source_char_id[char] = len(source_char_id)
print(source_char_id)
source_id_char = {source_char_id[key]:key for key in source_char_id.keys()}

{'<padding>': 0, 'Go.': 1, 'Hi.': 2, 'Run!': 3, 'Run.': 4, 'Who?': 5, 'Fire!': 6, 'Help!': 7, 'Jump!': 8, 'Jump.': 9, 'Stop!': 10, 'Wait!': 11, 'Wait.': 12, 'Go': 13, 'on.': 14, 'Hello!': 15, 'I': 16, 'ran.': 17, 'try.': 18, 'won!': 19, 'Oh': 20, 'no!': 21, 'Relax.': 22, 'Smile.': 23, 'Attack!': 24, 'Get': 25, 'up.': 26, 'now.': 27, 'Got': 28, 'it!': 29, 'it?': 30, 'He': 31, 'Hop': 32, 'in.': 33, 'Hug': 34, 'me.': 35, 'fell.': 36, 'know.': 37, 'left.': 38, 'lied.': 39, 'lost.': 40, 'quit.': 41, 'work.': 42, "I'm": 43, '19.': 44, 'Listen.': 45, 'No': 46, 'way!': 47, 'Really?': 48, 'Thanks.': 49, 'Try': 50, 'it.': 51, 'We': 52, 'won.': 53, 'Why': 54, 'me?': 55, 'Ask': 56, 'Tom.': 57, 'Awesome!': 58, 'Be': 59, 'calm.': 60, 'cool.': 61, 'fair.': 62, 'kind.': 63, 'nice.': 64, 'Beat': 65, 'Call': 66, 'us.': 67, 'Come': 68, 'on!': 69, 'Drop': 70, 'out!': 71, 'out.': 72, 'away!': 73, 'away.': 74, 'home.': 75, 'slow.': 76, 'Goodbye!': 77, 'Hang': 78, 'came.': 79, 'Help': 80, 'me!': 81, 'Hit': 8

In [None]:
target_char_id = {}
target_char_id["<padding>"] = 0
for sentence in target_text:
  for char in sentence.split(" "):
    if char not in target_char_id:
      target_char_id[char] = len(target_char_id)
print(target_char_id)
target_id_char = {target_char_id[key]:key for key in target_char_id.keys()}

{'<padding>': 0, '<start>': 1, 'Ve.': 2, '<end>': 3, 'Vete.': 4, 'Vaya.': 5, 'Váyase.': 6, 'Hola.': 7, '¡Corre!': 8, 'Corred.': 9, '¿Quién?': 10, '¡Fuego!': 11, '¡Incendio!': 12, '¡Disparad!': 13, '¡Ayuda!': 14, '¡Socorro!': 15, '¡Auxilio!': 16, '¡Salta!': 17, 'Salte.': 18, '¡Parad!': 19, '¡Para!': 20, '¡Pare!': 21, '¡Espera!': 22, 'Esperen.': 23, 'Continúa.': 24, 'Continúe.': 25, 'Corrí.': 26, 'Corría.': 27, 'Lo': 28, 'intento.': 29, '¡He': 30, 'ganado!': 31, '¡Oh,': 32, 'no!': 33, 'Tomátelo': 34, 'con': 35, 'soda.': 36, 'Sonríe.': 37, '¡Al': 38, 'ataque!': 39, '¡Atacad!': 40, 'Levanta.': 41, 'Ve': 42, 'ahora': 43, 'mismo.': 44, '¡Lo': 45, 'tengo!': 46, '¿Lo': 47, 'pillas?': 48, '¿Entendiste?': 49, 'Él': 50, 'corrió.': 51, 'Métete': 52, 'adentro.': 53, 'Abrázame.': 54, 'Me': 55, 'caí.': 56, 'Yo': 57, 'lo': 58, 'sé.': 59, 'Salí.': 60, 'Mentí.': 61, 'Perdí.': 62, 'Dimito.': 63, 'Renuncié.': 64, 'Estoy': 65, 'trabajando.': 66, 'Tengo': 67, 'diecinueve.': 68, 'levantado.': 69, 'Escucha.':

# 将文本转移成向量表示

In [None]:
def convert_text_int(char2id, sentence_list):
  result = []
  for sentence in sentence_list:
    result.append([char2id[char] for char in sentence.split(" ")])
  return result
  

In [None]:
source_sentence_int = convert_text_int(source_char_id, source_text)
target_sentence_int = convert_text_int(target_char_id, target_text)
print(len(source_sentence_int), len(target_sentence_int))
print(source_sentence_int[0], target_sentence_int[0])

3000 3000
[1] [1, 2, 3]


# 统计文本的最大长度

In [None]:
def max_length_sentecne(sentence_list):
  max_length = 0
  for sentence in sentence_list:
    max_length = max(len(sentence), max_length)
  return max_length

In [None]:
source_sentence_max_length = max_length_sentecne(source_sentence_int)
print(source_sentence_max_length)
target_sentence_max_length = max_length_sentecne(target_sentence_int)
print(target_sentence_max_length)

4
9


In [None]:
source_data = tf.keras.preprocessing.sequence.pad_sequences(source_sentence_int, maxlen=source_sentence_max_length)
target_data = tf.keras.preprocessing.sequence.pad_sequences(target_sentence_int, maxlen=target_sentence_max_length)

In [None]:
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(source_data, target_data, test_size = 0.2)
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

2400 2400 600 600


In [None]:
def convert(id2char, tensor):
  for t in tensor:
    if t!=0:
      print ("%d ----> %s" % (t, id2char[t]))

In [None]:
print ("Input Language; index to word mapping")
convert(source_id_char, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(target_id_char, target_tensor_train[0])

Input Language; index to word mapping
16 ----> I
532 ----> felt
1203 ----> woozy.

Target Language; index to word mapping
1 ----> <start>
55 ----> Me
1107 ----> sentía
2510 ----> aturdido.
3 ----> <end>


In [None]:
BUFFER_SIZE = len(source_sentence_int)
BATCH_SIZE = 64
steps_per_epoch = len(source_sentence_int)//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(source_char_id)
vocab_tar_size = len(target_char_id)

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 4]), TensorShape([64, 9]))

In [None]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                    return_sequences=True,
                    return_state=True,
                    recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x,initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))
  

In [None]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
# 样本输入
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 4, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):  
    print(tf.shape(query))
    print(tf.shape(values))

    hidden_with_time_axis = tf.expand_dims(query, 1)
    print(tf.shape(hidden_with_time_axis))

    score = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))
    print(tf.shape(score))

    attention_weights = tf.nn.softmax(score, axis=1)
    context_vector = attention_weights * values

    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [None]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

tf.Tensor([  64 1024], shape=(2,), dtype=int32)
tf.Tensor([  64    4 1024], shape=(3,), dtype=int32)
tf.Tensor([  64    1 1024], shape=(3,), dtype=int32)
tf.Tensor([64  4  1], shape=(3,), dtype=int32)
Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 4, 1)


In [None]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

    # 用于注意力
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    # 编码器输出 （enc_output） 的形状 == （批大小，最大长度，隐藏层大小）
    context_vector, attention_weights = self.attention(hidden, enc_output)

    # x 在通过嵌入层后的形状 == （批大小，1，嵌入维度）
    x = self.embedding(x)

    # x 在拼接 （concatenation） 后的形状 == （批大小，1，嵌入维度 + 隐藏层大小）
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # 将合并后的向量传送到 GRU
    output, state = self.gru(x)

    # 输出的形状 == （批大小 * 1，隐藏层大小）
    output = tf.reshape(output, (-1, output.shape[2]))

    # 输出的形状 == （批大小，vocab）
    x = self.fc(output)

    return x, state, attention_weights

In [None]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((64, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 2646)


In [None]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [None]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [None]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([0] * BATCH_SIZE, 1)

    # 教师强制 - 将目标词作为下一个输入
    for t in range(1, targ.shape[1]):
      # 将编码器输出 （enc_output） 传送至解码器
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
      loss += loss_function(targ[:, t], predictions)

      # 使用教师强制
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [None]:
EPOCHS = 10
for epoch in range(EPOCHS):
  start = time.time()
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0
  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss
    if batch % 100 == 0:
        print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                 batch,
                                 batch_loss.numpy()))
  # 每 2 个周期（epoch），保存（检查点）一次模型
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)
  print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 2.0658
Epoch 1 Loss 1.5670
Time taken for 1 epoch 1.3203003406524658 sec

Epoch 2 Batch 0 Loss 1.6858
Epoch 2 Loss 1.3884
Time taken for 1 epoch 1.6881139278411865 sec

Epoch 3 Batch 0 Loss 1.6572
Epoch 3 Loss 1.2813
Time taken for 1 epoch 1.3335912227630615 sec

Epoch 4 Batch 0 Loss 1.4960
Epoch 4 Loss 1.2042
Time taken for 1 epoch 1.654665231704712 sec

Epoch 5 Batch 0 Loss 1.4455
Epoch 5 Loss 1.1195
Time taken for 1 epoch 1.336137056350708 sec

Epoch 6 Batch 0 Loss 1.3381
Epoch 6 Loss 1.0231
Time taken for 1 epoch 1.669050931930542 sec

Epoch 7 Batch 0 Loss 1.1833
Epoch 7 Loss 0.9219
Time taken for 1 epoch 1.333808183670044 sec

Epoch 8 Batch 0 Loss 1.0573
Epoch 8 Loss 0.8232
Time taken for 1 epoch 1.6611442565917969 sec

Epoch 9 Batch 0 Loss 0.8325
Epoch 9 Loss 0.7212
Time taken for 1 epoch 1.3357813358306885 sec

Epoch 10 Batch 0 Loss 0.7343
Epoch 10 Loss 0.6207
Time taken for 1 epoch 1.6627309322357178 sec



In [None]:
def evaluate(sentence):
    attention_plot = np.zeros((target_sentence_max_length, source_sentence_max_length))

    sentence = preprocess_sentence(sentence)

    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                 maxlen=source_sentence_max_length,
                                 padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                     dec_hidden,
                                     enc_out)

        # 存储注意力权重以便后面制图
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += targ_lang.index_word[predicted_id] + ' '

        if targ_lang.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot

        # 预测的 ID 被输送回模型
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

In [None]:
# 注意力权重制图函数
def plot_attention(attention, sentence, predicted_sentence):
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    fontdict = {'fontsize': 14}

    ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [None]:
def translate(sentence):
    result, sentence, attention_plot = evaluate(sentence)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

    attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
    plot_attention(attention_plot, sentence.split(' '), result.split(' '))

In [None]:
# 恢复检查点目录 （checkpoint_dir） 中最新的检查点
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f0d248cf6d8>

In [None]:
translate(u'hace mucho frio aqui.')

NameError: ignored

In [None]:
tf.expand_dims([[1,2,3],[1,2,3]], 1)

<tf.Tensor: shape=(2, 1, 3), dtype=int32, numpy=
array([[[1, 2, 3]],

       [[1, 2, 3]]], dtype=int32)>