<a href="https://colab.research.google.com/github/SunshyPiKaChew/seq2seq_attention/blob/master/seq_seq_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow==2.0.0-beta1
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

In [None]:
! /opt/bin/nvidia-smi

In [None]:
# Report the TensorFlow version, the interpreter version, and the version of
# every library imported at the top of the notebook.
print(tf.__version__, sys.version_info, sep='\n')
for module in (mpl, np, pd, sklearn, tf, keras):
  print(module.__name__, module.__version__)

In [None]:
# 1. preprocessing data
# 2. build model
# 2.1 encoder
# 2.2 attention
# 2.3 decoder
# 3. evaluation
# 3.1 given sentence. return translate results
# 3.2 visualize results (attention)

In [None]:
# unicode2ascii: strip the accent marks from Spanish text
import unicodedata
def unicode_to_ascii(s):
  """Return `s` with its combining accent marks removed.

  The string is first decomposed with Unicode NFD normalization, which
  splits an accented character into its base character followed by the
  combining mark(s). Every character whose Unicode category is 'Mn'
  (nonspacing mark) is then dropped. Characters that are not marks, such as
  '¿', are kept unchanged, so the result is not guaranteed to be pure ASCII.
  """
  decomposed = unicodedata.normalize('NFD', s)
  kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept)
# Sample English / Spanish sentences used to try out the text helpers.
en_sentence = 'Then what?'
sp_sentence = '¿Entonces qué?'

# Print the accent-stripped form of each sample, one per line.
print(*map(unicode_to_ascii, (en_sentence, sp_sentence)), sep='\n')

In [None]:
# String preprocessing
import re
def preprocess_sentence(s):
  """Normalize one sentence and wrap it in '<start>' / '<end>' markers.

  Steps:
    1. Lower-case, strip surrounding whitespace and remove accent marks
       (via `unicode_to_ascii`).
    2. Put a space on both sides of each of the punctuation marks ? . ! , ¿
       so that they become separate tokens.
    3. Replace every run of characters other than a-z, A-Z and those
       punctuation marks with a single space.
    4. Strip leading/trailing spaces and add the start/end markers.

  Args:
    s: The raw sentence (str).

  Returns:
    The normalized sentence as '<start> ... <end>'; its words and
    punctuation marks are separated by exactly one space.
  """
  s = unicode_to_ascii(s.lower().strip())
  # The bracketed class matches one punctuation mark; the group reference
  # re-inserts it with a space on either side.
  s = re.sub(r"([?.!,¿])", r" \1 ", s)
  # Collapse each run of disallowed characters (digits, quotes, symbols and
  # whitespace, including the spaces added above) into ONE space. Doing this
  # as the last substitution guarantees no repeated spaces remain; the
  # previous order (collapse spaces first, then replace characters one by
  # one) could leave several consecutive spaces in the result.
  s = re.sub(r"[^a-zA-Z?.!,¿]+", " ", s)
  s = s.strip()
  # Add the special start / end tokens.
  return '<start> ' + s + ' <end>'

# Show the preprocessed form of both sample sentences, one per line.
print('\n'.join([preprocess_sentence(en_sentence),
                 preprocess_sentence(sp_sentence)]))


In [None]:
# Mount Google Drive at /content/drive so files stored there can be read
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Path of the English/Spanish sentence-pair file on the mounted drive.
en_spa_file_path = '/content/drive/MyDrive/Colab/chapter_10/data_spa_en/spa.txt'

In [None]:
def parse_data(filename):
  """Load a tab-separated parallel corpus and preprocess both languages.

  Args:
    filename: Path of a UTF-8 text file holding one sentence pair per line:
      the English sentence, a tab, then the Spanish sentence. Any further
      tab-separated columns on a line are ignored.

  Returns:
    A `zip` iterator that yields two tuples: first all preprocessed English
    sentences, then all preprocessed Spanish sentences (same order as the
    file).

  Raises:
    ValueError: If a line contains fewer than two tab-separated fields.
  """
  # Read inside a context manager so the file handle is always closed.
  with open(filename, encoding='UTF-8') as corpus_file:
    # Split the content into lines on the newline character.
    lines = corpus_file.read().strip().split('\n')
  # Split every line on tabs and keep only the first two columns
  # (English, Spanish); extra columns would otherwise break the unpacking.
  sentence_pairs = [line.split('\t')[:2] for line in lines]
  preprocessed_sentence_pairs = [
    (preprocess_sentence(en), preprocess_sentence(sp)) for en, sp in sentence_pairs]
  # Transpose the list of pairs into (all English, all Spanish).
  return zip(*preprocessed_sentence_pairs)

# Parse the corpus and show the last sentence of each language as a sanity check.
en_dataset, sp_dataset = parse_data(en_spa_file_path)
print(en_dataset[-1], sp_dataset[-1], sep='\n')


In [None]:
# Convert the text data into sequences of integer ids
def tokenizer(lang):
  """Fit a Keras tokenizer on `lang` and encode it as padded id sequences.

  Args:
    lang: The sentences (strings) of one language.

  Returns:
    A pair `(tensor, lang_tokenizer)`: the id sequences of `lang` after
    `pad_sequences(..., padding='post')`, and the fitted tokenizer.
  """
  # No vocabulary limit, no character filtering, split on single spaces.
  lang_tokenizer = keras.preprocessing.text.Tokenizer(
      num_words=None,
      filters='',
      split=' ')
  lang_tokenizer.fit_on_texts(lang)
  id_sequences = lang_tokenizer.texts_to_sequences(lang)
  # Pad after the tokens ('post') so that all sequences share one length.
  padded = keras.preprocessing.sequence.pad_sequences(id_sequences, padding='post')
  return padded, lang_tokenizer

# Tokenize the first 30000 sentence pairs only: Spanish is the model input,
# English is the model output.
input_tensor, input_tokenizer = tokenizer(sp_dataset[:30000])
output_tensor, output_tokenizer = tokenizer(en_dataset[:30000])

def max_length(tensor):
  """Return the length of the longest element of `tensor`.

  Raises:
    ValueError: If `tensor` is empty.
  """
  return max(map(len, tensor))

# Longest (padded) sequence length on the input and on the output side.
max_length_input, max_length_output = (
    max_length(input_tensor), max_length(output_tensor))
print(max_length_input, max_length_output)

In [None]:
# Split the dataset with scikit-learn's helper
from sklearn.model_selection import train_test_split
# Hold out 20% of the pairs for evaluation (test_size=0.2).
input_train, input_eval, output_train, output_eval = train_test_split(
    input_tensor, output_tensor, test_size=0.2)
# Sizes of the four resulting arrays (shown as the cell output).
tuple(map(len, (input_train, input_eval, output_train, output_eval)))

In [None]:
def convert(example, tokenizer):
  """Print 'id --> word' for every non-zero id in `example`.

  Args:
    example: Iterable of integer token ids; ids equal to 0 are skipped.
    tokenizer: Object whose `index_word` mapping translates an id to a word.
  """
  nonzero_ids = (token_id for token_id in example if token_id != 0)
  for token_id in nonzero_ids:
    word = tokenizer.index_word[token_id]
    print('%d --> %s' % (token_id, word))

# Show the id -> word mapping of the first training pair: the input sentence,
# a blank separator line, then the matching output sentence.
convert(input_train[0],input_tokenizer)
print()
convert(output_train[0],output_tokenizer)

In [None]:
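# Example: if the training set has 50000 samples and batch_size is 50, the
# parameters are updated once per 50 samples, so one epoch draws 1000 batches.
# Only then has every sample been drawn once, and the parameters have been
# updated 1000 times.

# That is what happens within a single epoch. By the same logic, setting 2000
# epochs repeats this process 2000 times, i.e. every training sample is drawn
# 2000 times.

# Build the Dataset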
def make_dataset(input_tensor, output_tensor, batch_size, epochs, shuffle,
                 shuffle_buffer_size=30000):
  """Build a batched `tf.data.Dataset` of (input, output) pairs.

  Args:
    input_tensor: Input sequences; sliced along the first axis.
    output_tensor: Output sequences, aligned with `input_tensor`.
    batch_size: Number of pairs per batch.
    epochs: Number of times the data is repeated.
    shuffle: If true, shuffle the pairs before repeating.
    shuffle_buffer_size: Buffer size passed to `Dataset.shuffle`. Defaults to
      30000, the value that used to be hard-coded here.

  Returns:
    The dataset after `repeat(epochs)` and
    `batch(batch_size, drop_remainder=True)`, so every batch holds exactly
    `batch_size` pairs.
  """
  dataset = tf.data.Dataset.from_tensor_slices((input_tensor, output_tensor))
  if shuffle:
    dataset = dataset.shuffle(shuffle_buffer_size)
  # Repeat first, then batch; an incomplete trailing batch is dropped.
  dataset = dataset.repeat(epochs).batch(batch_size, drop_remainder=True)
  return dataset

batch_size = 64
epochs = 20

# Only the training data is shuffled.
train_dataset = make_dataset(input_train, output_train, batch_size, epochs, shuffle=True)
eval_dataset = make_dataset(input_eval, output_eval, batch_size, epochs, shuffle=False)

# Inspect one training batch: shapes first, then the tensors themselves.
# Original note: 64 is the batch size; 16 and 11 were the padded input and
# output lengths on the author's data.
# `x` and `y` stay bound after the loop; `x` is reused by the encoder check
# later in the notebook.
for x, y in train_dataset.take(1):
  print(x.shape, y.shape, x, y, sep='\n')

In [None]:
# Hyperparameter definitions

# Each word is encoded as an embedding vector of length 256
embedding_units = 256
# Size of the recurrent layers; passed to the Encoder as `encoding_units`
# (the original note says it is used for both the encoder and the decoder)
units = 1024
# Input vocabulary size
# (+1 presumably accounts for id 0, which padding uses — confirm)
input_vocab_size = len(input_tokenizer.word_index) + 1
# Output vocabulary size
output_vocab_size = len(output_tokenizer.word_index) + 1

In [None]:
# Model defined with the Keras subclassing API
class Encoder(tf.keras.Model):
  """Encoder: an embedding layer followed by a single GRU layer."""

  def __init__(self, vocab_size, embedding_units, encoding_units, batch_size):
    """Create the embedding and GRU layers.

    Args:
      vocab_size: Size of the input vocabulary (embedding input dimension).
      embedding_units: Length of each embedding vector.
      encoding_units: Number of GRU units.
      batch_size: Batch size used when building the initial hidden state.
    """
    super().__init__()
    self.batch_size = batch_size
    self.encoding_units = encoding_units
    # Maps token ids to dense vectors of length `embedding_units`.
    self.embedding = keras.layers.Embedding(vocab_size, embedding_units)
    # return_sequences=True -> the output of every time step is returned;
    # return_state=True     -> the final state is returned as well.
    self.gru = keras.layers.GRU(
        self.encoding_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    """Encode a batch of token ids.

    Args:
      x: Batch of token-id sequences.
      hidden: Initial state handed to the GRU.

    Returns:
      `(output, state)`: the GRU output of every step and the final state.
    """
    embedded = self.embedding(x)
    output, state = self.gru(embedded, initial_state=hidden)
    return output, state

  def initialize_hidden_state(self):
    """Return an all-zero state of shape (batch_size, encoding_units)."""
    state_shape = (self.batch_size, self.encoding_units)
    return tf.zeros(state_shape)

# Smoke-test the encoder on the sample batch `x` taken from train_dataset.
encoder = Encoder(input_vocab_size, embedding_units, units, batch_size)
sample_hidden = encoder.initialize_hidden_state()
# Invoke the model object rather than `.call()` directly so that Keras'
# `__call__` wrapper runs (it handles building and bookkeeping before
# delegating to `call`). This also matches how the attention model is invoked
# later in the notebook.
sample_output, sample_hidden = encoder(x, sample_hidden)
print('output_shape')
print(sample_output.shape)
print('decoder_hidden_shape')
print(sample_hidden.shape)


In [None]:
class BahdanauAttention(keras.Model):
  """Additive attention: score = V(tanh(W1(encoder_outputs) + W2(hidden)))."""

  def __init__(self, units):
    """Create the three dense layers.

    Args:
      units: Output width of the W1 and W2 projections; V maps to one score.
    """
    super().__init__()
    self.W1 = keras.layers.Dense(units)
    self.W2 = keras.layers.Dense(units)
    self.V = keras.layers.Dense(1)

  def call(self, decoder_hidden, encoder_outputs):
    """Compute the context vector and the attention weights.

    Shapes below follow the original author's notes and are expectations,
    not checks:
      decoder_hidden:  (batch_size, units)          e.g. (64, 1024)
      encoder_outputs: (batch_size, length, units)  e.g. (64, 16, 1024)

    Returns:
      `(context_vector, attention_weights)`: the weighted sum of
      `encoder_outputs` over axis 1 — expected (batch_size, units) — and the
      softmax weights over axis 1 — expected (batch_size, length, 1).
    """
    # Insert an axis at position 1 -> expected (batch_size, 1, units), so the
    # hidden state broadcasts against every encoder step when added.
    hidden_with_time_axis = tf.expand_dims(decoder_hidden, 1)

    # Project both operands to the attention width. tanh keeps the shape;
    # V then reduces the last axis to a single score per step.
    projected_outputs = self.W1(encoder_outputs)
    projected_hidden = self.W2(hidden_with_time_axis)
    score = self.V(tf.nn.tanh(projected_outputs + projected_hidden))

    # Attention is a distribution over the steps (axis 1), so the softmax is
    # taken along that axis.
    attention_weights = tf.nn.softmax(score, axis=1)

    # The weights broadcast across the last axis of encoder_outputs; summing
    # over axis 1 collapses the steps into one context vector per example.
    weighted_outputs = attention_weights * encoder_outputs
    context_vector = tf.reduce_sum(weighted_outputs, axis=1)

    return context_vector, attention_weights

# Smoke-test the attention layer with a small projection width (units = 10),
# feeding it the encoder's final state and per-step outputs from the sample batch.
attention_model = BahdanauAttention(units = 10)
attention_results, attention_weights = attention_model(sample_hidden, sample_output)

# Print the shapes of the context vector and of the attention weights.
print("attention_results.shape:", attention_results.shape)
print("attention_weights.shape:", attention_weights.shape)