# 091. Neural machine translation with attention

- Google Tutorial
- Korean → English translation 을 seq2seq attention model 로 구현  
- 아래의 matrix 는 model 이 문장을 번역할 때 input sequence 의 어느 부분에  attention 했는지 보여준다

In [0]:
# Use a Korean font in Google Colab
import matplotlib as mpl
import matplotlib.pyplot as plt
 
# Notebook magic: set the inline backend's figure format to 'retina'
%config InlineBackend.figure_format = 'retina'
 
# Shell escape: install the fonts-nanum package
!apt -qq -y install fonts-nanum
 
import matplotlib.font_manager as fm
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
# Make NanumBarunGothic the default font family for plots
plt.rc('font', family='NanumBarunGothic') 
# NOTE(review): _rebuild is a private matplotlib helper — confirm it exists in the installed version
mpl.font_manager._rebuild()

In [None]:
# # Use a Korean font on Windows
# from matplotlib import font_manager
# import matplotlib
# font_path = "C:/Windows/Fonts/H2GTRM.TTF"                       # font file path
# font_name = font_manager.FontProperties(fname=font_path).get_name()  # look up the font's family name
# matplotlib.rc('font', family=font_name)                                 # set the default font
# matplotlib.rcParams['axes.unicode_minus'] = False               # keep the minus sign from breaking when a Korean font is used

In [0]:
import tensorflow as tf
tf.__version__

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time
import pandas as pd

In [0]:
# Download the tab-separated corpus; it has no header row, so columns are
# numbered 0, 1, 2.
df = pd.read_csv("https://github.com/ironmanciti/NLP_DeepLearning/raw/master/kor-eng/kor.txt", delimiter='\t', header=None)
# Drop the last 10 rows.
df = df.iloc[:-10]
print(len(df))
# Discard the third column (index 2); only the first two columns are exported.
df.pop(2)
# header=False: the column names are the integers 0 and 1, so the default
# header would write a bogus first line "0\t1" that the line-based reader
# would then treat as a sentence pair.
df.to_csv("kor_clean.txt", sep="\t", index=False, header=False)

# Step1. prepare the dataset

1.  Teacher Forcing 용 data 생성
    - target_texts_inputs  : 1 만큼 offset 된 target language sentence $\rightarrow$ `<sos>...`
    - target_texts  : target language sentence  $\rightarrow$ `.....<eos>`


2. 특수문자 제거  
3. word index 및 reverse word index 작성 (word $\rightarrow$ id, id $\rightarrow$word)
4. 각 sentence 를 maximum 길이로 sequence padding



   

In [0]:
# Local tab-separated file that the dataset helpers read
path_to_file = "kor_clean.txt"

In [0]:
# Hangul is kept in its precomposed (NFC) syllable form so that the 가-힣
# range used in preprocess_sentence keeps matching it.
def unicode_to_ascii(s):
  """Return `s` NFC-normalized with leftover combining marks removed.

  Despite the name, the result is not restricted to ASCII: NFC composes a base
  character and its accent into a single code point wherever a precomposed
  form exists, so only nonspacing marks (Unicode category ``Mn``) that could
  not be composed are dropped.
  """
  return ''.join(c for c in unicodedata.normalize('NFC', s)
                 if unicodedata.category(c) != 'Mn')


def preprocess_sentence(w):
  """Clean one sentence and wrap it in ``<start>`` / ``<end>`` tokens.

  The sentence is lower-cased, stripped and normalized, punctuation is split
  off into separate tokens, and every character other than a-z, A-Z, Hangul
  syllables and ``? . ! , ¿`` is replaced by a space.

  Args:
    w: Raw sentence (English or Korean).

  Returns:
    The cleaned sentence as ``'<start> ' + cleaned + ' <end>'``.
  """
  w = unicode_to_ascii(w.lower().strip())

  # Put a space around each punctuation mark so it becomes its own token,
  # e.g. "he is a boy." => "he is a boy ."  (\1 is the captured mark).
  w = re.sub(r"([?.!,¿])", r" \1 ", w)
  # Collapse runs of spaces and double quotes into a single space.
  w = re.sub(r'[" "]+', " ", w)

  # Replace every run of disallowed characters with one space.  The previous
  # pattern "[^a-zA-Z[가-힣]?.!,¿]+" closed the character class at the first
  # "]" and therefore practically never matched, leaving digits, apostrophes
  # and other symbols in the text.
  w = re.sub(r"[^a-zA-Z가-힣?.!,¿]+", " ", w)

  w = w.strip()

  # Add the start and end tokens the decoder relies on.
  return '<start> ' + w + ' <end>'

In [0]:
# Sanity-check the preprocessing on one English and one Korean sentence
en_sentence = u"How's the business going?"
ko_sentence = u"사업 어떻게 되어 가?"
print(preprocess_sentence(en_sentence))
print(preprocess_sentence(ko_sentence))

In [0]:
def create_dataset(path, num_examples):
    """Read a tab-separated corpus and return its columns as cleaned sentences.

    Every line is split on tabs and each field is passed through
    preprocess_sentence. Per the original note the columns are
    (English, Korean).

    Args:
        path: Path of a UTF-8 text file with one tab-separated pair per line.
        num_examples: Number of leading lines to use; None uses every line.

    Returns:
        A zip iterator yielding one tuple of sentences per column, in file
        column order.
    """
    # The context manager closes the handle, which was previously left open
    with io.open(path, encoding='UTF-8') as f:
        lines = f.read().strip().split('\n')
    word_pairs = [[preprocess_sentence(w) for w in line.split('\t')]
                  for line in lines[:num_examples]]
    # Transpose the list of pairs into per-column tuples
    return zip(*word_pairs)

In [0]:
# Peek at a few raw lines of the corpus file
lines = io.open(path_to_file, encoding='UTF-8').read().strip().split('\n')
lines[110:115]

In [0]:
# The same lines with each tab-separated field preprocessed
[[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[110:115]]

In [0]:
# Build the dataset from every line (num_examples=None) and show the last pair
en, ko = create_dataset(path_to_file, None)
print(en[-1])
print(ko[-1])

In [0]:
def max_length(tensor):
    """Return the length of the longest item in `tensor`."""
    longest = max(map(len, tensor))
    return longest

In [0]:
def tokenize(lang):
    """Fit a tokenizer on `lang` and encode it as post-padded id sequences.

    Args:
        lang: Iterable of preprocessed sentences.

    Returns:
        (tensor, tokenizer): the padded id array and the fitted Keras
        Tokenizer.
    """
    # filters='' turns off the tokenizer's character filtering; words not seen
    # during fitting are encoded as the '<OOV>' token
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>')
    tokenizer.fit_on_texts(lang)

    # Encode to ids, then pad at the end of each sequence
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(lang), padding='post')
    return padded, tokenizer

In [0]:
def load_dataset(path, num_examples=None):
  """Load the corpus at `path` and tokenize both languages.

  Args:
    path: Corpus file passed to create_dataset.
    num_examples: Number of leading lines to use; None uses every line.

  Returns:
    (input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer).
  """
  # The first column is the target language, the second the input language
  columns = create_dataset(path, num_examples)
  target_sentences, input_sentences = columns

  # Tokenize the input side first, then the target side
  (inp_ids, inp_tok), (targ_ids, targ_tok) = (tokenize(input_sentences),
                                              tokenize(target_sentences))
  return inp_ids, targ_ids, inp_tok, targ_tok

In [0]:
# Run the loading pipeline over the entire dataset
input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer = load_dataset(path_to_file)

# Longest sequence length of the target and input tensors
max_length_targ, max_length_inp = max_length(target_tensor), max_length(input_tensor)
max_length_targ, max_length_inp

In [0]:
# Split into training and validation sets, 80-20
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val \
                                        = train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show length
print(len(input_tensor_train), len(target_tensor_train))
print(len(input_tensor_val), len(target_tensor_val))

In [0]:
# Decode the first training example back into words, skipping id 0 (padding)
print ("Input Language; index to word mapping")
print([inp_lang_tokenizer.index_word[i] for i in input_tensor_train[0] if i != 0])
print ()
print ("Target Language; index to word mapping")
print([targ_lang_tokenizer.index_word[i] for i in target_tensor_train[0] if i != 0])

### tf.data dataset 생성

In [0]:
# Number of entries in the target-language tokenizer's word index
len(targ_lang_tokenizer.word_index)

In [0]:
# Shuffle buffer spans the whole training set
BUFFER_SIZE = len(input_tensor_train)   
print(BUFFER_SIZE)

BATCH_SIZE = 64
# Number of full batches per epoch
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE    
print((steps_per_epoch))

embedding_dim = 256
units = 1024

# +1 on top of the tokenizer's word count — presumably to leave room for id 0,
# which this notebook treats as padding
vocab_inp_size = len(inp_lang_tokenizer.word_index) + 1       
print(vocab_inp_size)

vocab_tar_size = len(targ_lang_tokenizer.word_index) + 1      
print(vocab_tar_size)

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
# drop_remainder=True discards a final short batch so every batch has BATCH_SIZE rows
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [0]:
# Pull one batch from the dataset to inspect its shapes
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

## encoder, decoder model 작성

# Step 2. Encoder model 작성

In [0]:
class Encoder(tf.keras.Model):
  """Embedding + GRU encoder that returns all step outputs and the final state."""

  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    """Create the embedding and GRU layers.

    Args:
      vocab_size: Size of the input vocabulary for the embedding table.
      embedding_dim: Width of each embedding vector.
      enc_units: Number of GRU units.
      batch_sz: Batch size used when building the zero initial state.
    """
    super().__init__()
    self.batch_sz, self.enc_units = batch_sz, enc_units

    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    # return_sequences / return_state: emit the output of every time step
    # together with the final hidden state
    gru_options = dict(return_sequences=True,
                       return_state=True,
                       recurrent_initializer='glorot_uniform')
    self.gru = tf.keras.layers.GRU(self.enc_units, **gru_options)

  def call(self, x, hidden):
    """Encode token ids `x`, starting the GRU from `hidden`.

    Returns:
      (output, state): the per-step GRU outputs and the final GRU state.
    """
    embedded = self.embedding(x)
    sequence_output, final_state = self.gru(embedded, initial_state=hidden)
    return sequence_output, final_state

  def initialize_hidden_state(self):
    """Return an all-zero hidden state of shape (batch_sz, enc_units)."""
    state_shape = (self.batch_sz, self.enc_units)
    return tf.zeros(state_shape)

In [0]:
# Instantiate the encoder with the input vocabulary size
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
# Run the example batch through the encoder from a zero state and print the shapes
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, max input sequence length, units) {}'
                       .format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

In [0]:
class BahdanauAttention(tf.keras.layers.Layer):
  """Additive attention: score = V(tanh(W1(values) + W2(query)))."""

  def __init__(self, units):
    """Create the two `units`-wide projections and the 1-unit scoring layer."""
    super().__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    """Attend over `values` using `query`.

    Args:
      query: Decoder hidden state — assumed (batch_size, hidden_size).
      values: Encoder outputs — assumed (batch_size, max_length, hidden_size).

    Returns:
      (context_vector, attention_weights): the weighted sum of `values` over
      axis 1, and the softmax weights with a trailing axis of size 1.
    """
    # Insert an axis at position 1 so the query broadcasts against `values`
    query_with_time_axis = tf.expand_dims(query, 1)

    projected_values = self.W1(values)
    projected_query = self.W2(query_with_time_axis)

    # The last axis is 1 because self.V is a single-unit Dense layer
    score = self.V(tf.nn.tanh(projected_values + projected_query))

    # Normalize the scores across axis 1
    attention_weights = tf.nn.softmax(score, axis=1)

    # Context vector (the attention value): weight the values, then sum over axis 1
    context_vector = tf.reduce_sum(attention_weights * values, axis=1)

    return context_vector, attention_weights

In [0]:
# Smoke-test the attention layer (10 units) on the sample encoder outputs
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, hidden_units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, input_lang_max_sequence_length, 1) {}"
                  .format(attention_weights.shape))

In [0]:
class Decoder(tf.keras.Model):
  """Single-step GRU decoder with Bahdanau attention over the encoder outputs."""

  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    """Create the embedding, GRU, output projection and attention layers.

    Args:
      vocab_size: Target-language vocabulary size (also the logits width).
      embedding_dim: Width of each embedding vector.
      dec_units: Number of GRU units; also the attention layer's width.
      batch_sz: Batch size (stored on the instance).
    """
    super().__init__()
    self.batch_sz, self.dec_units = batch_sz, dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    gru_options = dict(return_sequences=True,
                       return_state=True,
                       recurrent_initializer='glorot_uniform')
    self.gru = tf.keras.layers.GRU(self.dec_units, **gru_options)
    # Projects each GRU output onto the target vocabulary
    self.fc = tf.keras.layers.Dense(vocab_size)

    # used for attention
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    """Run one decoding step.

    Args:
      x: Current input token ids — assumed (batch_size, 1).
      hidden: Previous decoder state, used as the attention query.
      enc_output: Encoder outputs attended over.

    Returns:
      (logits, state, attention_weights).
    """
    context_vector, attention_weights = self.attention(hidden, enc_output)

    # After embedding: assumed (batch_size, 1, embedding_dim)
    embedded = self.embedding(x)

    # Join the context vector (given a time axis) with the embedded input
    # along the feature axis
    gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)

    # NOTE(review): the GRU is called without initial_state, so `hidden` only
    # reaches the model through the attention query — confirm this is intended
    gru_output, state = self.gru(gru_input)

    # Merge the batch and time axes: (batch_size * 1, hidden_size)
    flattened = tf.reshape(gru_output, (-1, gru_output.shape[2]))

    # Vocabulary logits, one row per batch entry
    logits = self.fc(flattened)

    return logits, state, attention_weights

In [0]:
# Instantiate the decoder with the target vocabulary size
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

# Smoke test: feed a random (BATCH_SIZE, 1) input together with the sample encoder state and outputs
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, target_lang_vocab size) {}'
                       .format(sample_decoder_output.shape))

## Define the optimizer and the loss function

- tf.keras.losses.SparseCategoricalCrossentropy 의 reduction 이 NONE 이면, return shape 은 [batch_size, d0, .. dN-1] 이 된다. otherwise, scalar. 

In [0]:
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per element so padding can be masked out afterwards
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
                    from_logits=True, reduction='none')

def loss_function(real, pred):
  """Cross-entropy for one decoding step with padded positions zeroed out.

  Args:
    real: True token ids for this time step; 0 marks padding.
    pred: Logits predicted for this time step.

  Returns:
    Scalar mean of the masked per-element losses.
  """
  # Per-element loss (loss_object is built with reduction='none')
  per_element = loss_object(real, pred)

  # True only where `real` holds an actual word rather than 0 padding,
  # cast to the loss dtype so it can be multiplied in
  is_word = tf.math.logical_not(tf.math.equal(real, 0))
  weights = tf.cast(is_word, dtype=per_element.dtype)

  # Zero the padded positions; they still count in the mean's denominator
  return tf.reduce_mean(per_element * weights)

## Checkpoints (Object-based saving)

In [0]:
# Checkpoints are written under ./training_checkpoints with the file prefix "ckpt"
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Track the optimizer and both models so they are saved and restored together
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

## Training

1. *input* 을 *encoder* 에 통과시켜 *encoder output* 과 *encoder hidden state* 를 출력
2. *encoder output*, *encoder hidden state*, *decoder input* (*start token*) 을 decoder 로 공급
3. decoder 는 *prediction* 과 *decoder hidden state* 를 반환
4. *decoder hidden state* 는 다시 model 로 되돌려지고, *prediction* 은 loss 계산에 사용
5. decoder 에 공급할 next input 을 *teacher forcing* 으로 결정
6. *Teacher forcing* 은 *target word* 를 *next input*  으로  decoder 에 공급하는 기술이다.
7. final step 은 gradient 를 계산하고 optimizer 에 적용하여  backpropagate 하는 것이다.

In [0]:
@tf.function
def train_step(inp, targ, enc_hidden):
  """Run one teacher-forced training step on a batch and update both models.

  Uses the module-level encoder, decoder, optimizer, targ_lang_tokenizer and
  BATCH_SIZE.

  Args:
    inp: Batch of input token ids.
    targ: Batch of target token ids; axis 1 is the time axis.
    enc_hidden: Initial encoder hidden state.

  Returns:
    The summed per-step loss divided by the target sequence length.
  """
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)
    
    # The decoder's initial hidden state is the encoder's last hidden state
    dec_hidden = enc_hidden   

    # dec_input shape (batch_size, 1): a column of <start> ids
    dec_input = tf.expand_dims([targ_lang_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):    # starts at 1: position 0 is not predicted
      # passing enc_output to the decoder
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:, t], predictions)  # true ids at step t vs. predicted logits

      # Teacher forcing: feed the true value, not the prediction,
      # as the next step's dec_input
      dec_input = tf.expand_dims(targ[:, t], 1)

  # Reported loss is normalized by the target sequence length
  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  # Gradients are taken from the un-normalized summed loss
  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [0]:
EPOCHS = 30

for epoch in range(EPOCHS):
  start = time.time()

  # Each epoch starts from a zero encoder state and a fresh loss accumulator
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):

    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    # Log progress every 10 batches
    if batch % 10 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  # The first message reports the epoch's mean loss ("평균" means "average")
  print('Epoch {} 평균 Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

## Translate

* evaluate 함수는 *teacher forcing* 을 하지 않는 것만 다르고  training loop 와 유사. 각 time step 에서의 decoder input 은  이전 step 의  prediction, hidden state, encoder output 이다.
* model 이 *end token* 을 predict 하면 prediction 종료.
* *attention weights* 를 각 time step 별로 저장.

Note: encoder output 은 one input 에 단 한번 계산된다.

In [0]:
def evaluate(sentence):
  """Greedily translate one sentence and record the attention weights.

  Uses the module-level encoder, decoder, tokenizers, units, max_length_inp
  and max_length_targ.

  Args:
    sentence: Raw input-language sentence.

  Returns:
    (result, sentence, attention_plot): the predicted tokens, each followed by
    a space; the preprocessed input sentence; and a
    (max_length_targ, max_length_inp) array whose row t holds the attention
    weights of decoding step t.
  """
  attention_plot = np.zeros((max_length_targ, max_length_inp))

  sentence = preprocess_sentence(sentence)

  # Words missing from the input vocabulary fall back to id 1
  inputs = [inp_lang_tokenizer.word_index.get(i, 1) for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  # Zero initial encoder state for a batch of one
  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang_tokenizer.word_index['<start>']], 0)

  for t in range(max_length_targ):
    predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)

    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()

    predicted_id = tf.argmax(predictions[0]).numpy()

    # Not every id has a vocabulary entry (e.g. 0, the padding id). The
    # fallback must be a string: the previous integer fallback raised a
    # TypeError when concatenated with ' '.
    predicted_word = targ_lang_tokenizer.index_word.get(predicted_id, '<OOV>')
    result += predicted_word + ' '

    # Stop as soon as the model emits the end token
    if predicted_word == '<end>':
      return result, sentence, attention_plot

    # the predicted ID is fed back into the model - no teacher-forcing
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence, attention_plot

In [0]:
import seaborn as sns

# function for plotting the attention weights
def plot_attention(attention, sentence, predicted_sentence):
  """Draw the attention matrix as an annotated heatmap.

  Args:
    attention: 2-D array of attention weights to plot.
    sentence: List of input tokens, used for the x-axis labels.
    predicted_sentence: List of predicted tokens, used for the y-axis labels.
  """
  fig = plt.figure(figsize=(8,6))
  ax = fig.add_subplot(1, 1, 1)
  # No ax= is passed; presumably seaborn draws on the current axes, i.e. `ax`
  sns.heatmap(attention, annot=True, fmt=".2f")

  fontdict = {'fontsize': 14}

  # NOTE(review): labels are set before the locators and padded with a leading
  # '' — presumably to shift them by one tick; confirm they line up with the
  # heatmap cells
  ax.set_xticklabels([''] + sentence, rotation=45)
  ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict, rotation=45)

  # One major tick per unit on both axes
  ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
  ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

  plt.show()

In [0]:
def translate(sentence):
  """Translate `sentence`, print the result and plot its attention map."""
  result, sentence, attention_plot = evaluate(sentence)

  print('Input: %s' % (sentence))
  print('Predicted translation: {}'.format(result))

  # Split once and reuse the tokens for both cropping and labelling
  source_tokens = sentence.split(' ')
  predicted_tokens = result.split(' ')

  # Crop the attention matrix to the rows and columns actually in use
  cropped_attention = attention_plot[:len(predicted_tokens), :len(source_tokens)]
  plot_attention(cropped_attention, source_tokens, predicted_tokens)

## Restore the latest checkpoint and test

In [0]:
# Restore the optimizer, encoder and decoder from the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

In [0]:
# Translate a sample sentence and plot its attention map
translate(u'이 근처에 휴대폰 충전할 수 있는 곳이 있나요?')

In [0]:
# Translate a second, longer sample sentence
translate(u'8시간 동안 식사한 후에 톰은 더이상 운전을 할 수 없었다')

In [0]:
# Translate a third sample sentence
translate(u'옷장을 뒤져서 기부할 것들을 찾아봐야겠어')

In [0]:
# Example the author marks as translated incorrectly
translate(u'이건 내 강아지야.')