In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import time
import os
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei']  # needed so that Chinese labels render correctly in figures
# Companion setting to the CJK font above; presumably avoids a missing-glyph
# box for the minus sign on axis ticks — confirm against the matplotlib rcParams docs.
plt.rcParams['axes.unicode_minus']=False

from params import Params as pm
from utils_v2 import en2idx, idx2en, de2idx, idx2de, dump2record, build_dataset, LRSchedule, masking, create_masks, plot_attention_weights
from bleu import bleu_metrics

In [None]:
# Display the installed TensorFlow version string as the cell output.
tf.__version__

---

In [None]:
def get_data(corpus_file):
    """Read a UTF-8 text corpus and return it as a list of lines.

    Args:
        corpus_file: Path of the text file to read.

    Returns:
        list[str]: The lines of the file with their line terminators
        removed (the result of ``str.splitlines`` on the whole content).
    """
    # The context manager closes the handle deterministically, even when the
    # read fails, instead of leaving it to the garbage collector.
    with open(corpus_file, 'r', encoding='utf-8') as corpus:
        return corpus.read().splitlines()

In [None]:
# Load the source/target corpora as lists of lines. The files configured as
# the *test* split in params are the ones used as the validation set here.
src_train, src_val = get_data(pm.src_train), get_data(pm.src_test)
tgt_train, tgt_val = get_data(pm.tgt_train), get_data(pm.tgt_test)

In [None]:
# Hand each (source, target) pair of line lists to dump2record together with
# the record path from params.
# NOTE(review): presumably this serializes the pairs to a record file that
# build_dataset(mode='file') reads back — confirm in utils_v2.
dump2record(pm.train_record, src_train, tgt_train)
dump2record(pm.test_record, src_val, tgt_val)

In [None]:
# Alternative (disabled): build the datasets directly from the in-memory line lists.
# train_dataset = build_dataset(mode='array', corpus=[src_train, tgt_train], is_training=True)
# val_dataset = build_dataset(mode='array', corpus=[src_val, tgt_val], is_training=True)

# Active path: build both datasets from the record files written earlier.
# NOTE(review): val_dataset is also built with is_training=True. If that flag
# enables shuffling/repeating in build_dataset, evaluation order (or even
# termination of the evaluation loop) is affected — confirm in utils_v2.
train_dataset = build_dataset(mode='file', filename=pm.train_record, is_training=True)
val_dataset = build_dataset(mode='file', filename=pm.test_record, is_training=True)

In [None]:
# Sanity check: display the first element yielded by the training dataset.
next(iter(train_dataset))

---

In [None]:
from modules import positional_encoding, scaled_dot_product_attention, multihead_attention, pointwise_feedforward, EncoderBlock, DecoderBlock, Encoder, Decoder, Transformer

# Positional encoding
$$\Large{PE_{(pos, 2i)} = \sin(pos / 10000^{2i / d_{model}})} $$
$$\Large{PE_{(pos, 2i+1)} = \cos(pos / 10000^{2i / d_{model}})} $$

In [None]:
# Build a sample positional encoding and print its shape.
# NOTE(review): the arguments are presumably (number of positions, d_model,
# plot flag) — confirm against modules.positional_encoding.
pos_encoding = positional_encoding(50, 512, True)
print(pos_encoding.shape)

# Masking

In [None]:
# Toy batch of three id sequences; the zeros presumably stand for padding
# (id 0 is the value masked out by loss_function later in this notebook).
x = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
# Display the mask produced for the 'padding' task.
masking(x, task='padding')

In [None]:
# Display the mask produced for the 'look_ahead' task on the same toy batch.
masking(x, task='look_ahead')

# Scaled dot product attention

![](https://www.tensorflow.org/images/tutorials/transformer/scaled_attention.png)
$$\Large{Attention(Q, K, V) = softmax_k(\frac{QK^T}{\sqrt{d_k}}) V} $$

In [None]:
def print_out(q, k, v):
    """Run scaled dot-product attention on (q, k, v) and print both results.

    ``None`` is passed as the fourth argument of
    ``scaled_dot_product_attention`` (presumably the mask — confirm in
    modules). The attention weights are printed first, then the output.

    Args:
        q: Query tensor.
        k: Key tensor.
        v: Value tensor.
    """
    attn_output, attn_weights = scaled_dot_product_attention(q, k, v, None)
    report = (
        ('Attention weights are:', attn_weights),
        ('Output is:', attn_output),
    )
    for heading, tensor in report:
        print(heading)
        print(tensor)

In [None]:
# Print floats without scientific notation so the attention weights are readable.
np.set_printoptions(suppress=True)

# Four keys of dimension 3; the last two keys are identical.
temp_k = tf.constant([[10,0,0],
                      [0,10,0],
                      [0,0,10],
                      [0,0,10]], dtype=tf.float32)

# One 2-dimensional value per key.
temp_v = tf.constant([[   1,0],
                      [  10,0],
                      [ 100,5],
                      [1000,6]], dtype=tf.float32)

# This query equals the second key, so the second value is the expected output.
temp_q = tf.constant([[0, 10, 0]], dtype=tf.float32)
print_out(temp_q, temp_k, temp_v)

In [None]:
# This query equals the two identical keys (third and fourth), so their
# values are expected to be weighted equally.
temp_q = tf.constant([[0, 0, 10]], dtype=tf.float32)
print_out(temp_q, temp_k, temp_v)

In [None]:
# Three queries processed together: one row of attention weights/output per query.
temp_q = tf.constant([[0, 0, 10], [0, 10, 0], [10, 10, 0]], dtype=tf.float32)
print_out(temp_q, temp_k, temp_v)

# Multi-head attention

![](https://www.tensorflow.org/images/tutorials/transformer/multi_head_attention.png)

- **Tips: Dimension-level split**

In [None]:
# Shape check for multi-head attention: self-attention over one random
# tensor of shape (1, 50, 512), which is passed as every input.
temp_mha = multihead_attention(d_model=512, num_heads=8)
y = tf.random.uniform((1, 50, 512))
# NOTE(review): the first positional argument is presumably the value tensor — confirm in modules.
out, attn = temp_mha(y, k=y, q=y, mask=None)
out.shape, attn.shape

# Pointwise feed forward network

In [None]:
# Shape check for the point-wise feed-forward network on a random (64, 50, 512) input.
# NOTE(review): the arguments are presumably (d_model, dff) — confirm in modules.
sample_ffn = pointwise_feedforward(512, 2048)
sample_ffn(tf.random.uniform((64, 50, 512))).shape

# Whole model (Encoder & Decoder)
![](https://www.tensorflow.org/images/tutorials/transformer/transformer.png)

## Encoder

In [None]:
# Shape check for a single encoder block on a random (64, 43, 512) input.
# The block returns two values; only the first (the block output) is kept.
# NOTE(review): the trailing call arguments are presumably (training, mask) — confirm in modules.
sample_encoder_layer = EncoderBlock(512, 8, 2048)
sample_encoder_layer_output, _ = sample_encoder_layer(tf.random.uniform((64, 43, 512)), False, None)
sample_encoder_layer_output.shape

## Decoder

In [None]:
# Shape check for a single decoder block. It consumes a random (64, 50, 512)
# input together with the encoder block output computed in the Encoder section.
sample_decoder_layer = DecoderBlock(512, 8, 2048)

# The block returns three values; only the first (the block output) is kept.
# NOTE(review): the trailing arguments are presumably
# (training, look_ahead_mask, padding_mask) — confirm in modules.
sample_decoder_layer_output, _, _ = sample_decoder_layer(
    tf.random.uniform((64, 50, 512)), sample_encoder_layer_output, 
    False, None, None)

sample_decoder_layer_output.shape

## Packed Encoder & Decoder

In [None]:
# Shape check for the stacked encoder (2 blocks).
sample_encoder = Encoder(num_blocks=2, d_model=512, num_heads=8, dff=2048, input_vocab_size=8500, plot_pos_embedding=False)
# attn_dict is passed in and returned by the encoder, and reused by the decoder cell below.
attn_dict = {}
# NOTE(review): tf.random.uniform yields floats here rather than integer token
# ids; that appears sufficient for a shape check, but confirm how Encoder
# handles non-integer input.
sample_encoder_output, attn_dict = sample_encoder(tf.random.uniform((64, 62)), training=False, padding_mask=None, attn_dict=attn_dict)
sample_encoder_output.shape

In [None]:
# Shape check for the stacked decoder (2 blocks), fed with the encoder output
# and the attn_dict produced by the encoder cell above.
sample_decoder = Decoder(num_blocks=2, d_model=512, num_heads=8, dff=2048, target_vocab_size=8000, plot_pos_embedding=False)
output, attn_dict = sample_decoder(tf.random.uniform((64, 26)), 
                                   enc_output=sample_encoder_output, 
                                   training=False, look_ahead_mask=None, 
                                   padding_mask=None, attn_dict=attn_dict)
# 'decoder_layer2_block' is the attn_dict key inspected here; presumably it
# belongs to the second (last) decoder block — confirm the key scheme in modules.
output.shape, attn_dict['decoder_layer2_block'].shape

# Transformer

In [None]:
# Shape check for the full encoder-decoder model with all masks disabled.
sample_transformer = Transformer(num_blocks=2, d_model=512, num_heads=8, dff=2048, input_vocab_size=8500, target_vocab_size=8000, plot_pos_embedding=False)

# Random stand-ins for a batch of 64 source (length 62) and target (length 26) sequences.
temp_input = tf.random.uniform((64, 62))
temp_target = tf.random.uniform((64, 26))

# The model returns two values; the second one is discarded here.
fn_out, _ = sample_transformer(temp_input, 
                               temp_target, 
                               training=False, 
                               enc_padding_mask=None, 
                               look_ahead_mask=None,
                               dec_padding_mask=None)

fn_out.shape

# Training

In [None]:
# Model hyper-parameters, all taken from the project's Params object.
num_layers = pm.num_block
d_model = pm.d_model
dff = pm.dff
num_heads = pm.num_heads

# Vocabulary sizes are the sizes of the token -> id dictionaries from utils_v2.
input_vocab_size = len(en2idx)
target_vocab_size = len(de2idx)
dropout_rate = pm.dropout_rate

EPOCHS = pm.num_epochs

- Learning rate schedule
$$\Large{lrate = d_{model}^{-0.5} * \min(step{\_}num^{-0.5}, step{\_}num * warmup{\_}steps^{-1.5})}$$

In [None]:
# Adam driven by the custom learning-rate schedule; the Adam betas and epsilon
# come from params.
learning_rate = LRSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=pm.beta_1, beta_2=pm.beta_2, epsilon=pm.epsilon)

In [None]:
# Separate schedule instance used only for plotting, so the one attached to
# the optimizer is left untouched.
temp_learning_rate_schedule = LRSchedule(d_model)

# Plot the learning rate for training steps 0..39999.
plt.figure(figsize=(12, 8))
plt.plot(temp_learning_rate_schedule(tf.range(40000, dtype=tf.float32)))
plt.ylabel("Learning Rate")
plt.xlabel("Train Step")

- loss mask

In [None]:
# reduction='none' keeps one loss value per target position so that padding
# positions can be masked out before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    """Masked sparse cross-entropy, averaged over non-padding target tokens.

    Args:
        real: Integer target ids; positions equal to 0 are treated as padding
            and do not contribute to the loss.
        pred: Logits for each target position (``from_logits=True``).

    Returns:
        Scalar tensor: the summed loss of the non-padding positions divided by
        the number of non-padding positions (0 if every position is padding).
    """
    loss_ = loss_object(real, pred)

    mask = tf.cast(tf.math.not_equal(real, 0), dtype=loss_.dtype)
    loss_ *= mask

    # Normalise by the number of real tokens rather than by every position:
    # a plain reduce_mean also divides by the padded positions, which makes
    # the loss scale depend on how much padding a batch happens to contain.
    # divide_no_nan returns 0 for a batch that consists of padding only.
    return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))

In [None]:
# Running metrics updated by train_step and reported by the training loop.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

In [None]:
# The model that is trained, checkpointed, evaluated and used for translation below.
transformer = Transformer(num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pm.plot_pos_embedding, dropout_rate)

In [None]:
checkpoint_path = pm.ckpt_path

# Track both the model and the optimizer, keeping at most the 5 newest checkpoints.
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# Resume from the most recent checkpoint when one exists in checkpoint_path.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print ('Latest checkpoint restored!!')

- Teacher forcing

In [None]:
@tf.function
def train_step(inp, tar):
    """Run one teacher-forced optimisation step and update the running metrics.

    Uses the notebook-level ``transformer``, ``optimizer``, ``train_loss`` and
    ``train_accuracy`` objects.

    Args:
        inp: Batch of source id sequences.
        tar: Batch of target id sequences; id 0 is treated as padding.
    """
    # Teacher forcing: the decoder is fed the target without its last token
    # and has to predict the target shifted left by one position.
    decoder_in, decoder_target = tar[:, :-1], tar[:, 1:]

    enc_mask, look_ahead_mask, dec_mask = create_masks(inp, decoder_in)

    # Weight 1.0 for real tokens and 0.0 for padding, so that padding is
    # excluded from the accuracy metric.
    non_padding = tf.cast(tf.not_equal(decoder_target, 0), tf.float32)

    with tf.GradientTape() as tape:
        logits, _ = transformer(inp, decoder_in, True,
                                enc_mask, look_ahead_mask, dec_mask)
        step_loss = loss_function(decoder_target, logits)

    variables = transformer.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(step_loss, variables), variables))

    train_loss(step_loss)
    train_accuracy(decoder_target, logits, sample_weight=non_padding)

In [None]:
# Main training loop: one pass over train_dataset per epoch.
for epoch in range(EPOCHS):
    start = time.time()

    # The metrics are running aggregates; clear them so that the numbers
    # reported below describe the current epoch only.
    train_loss.reset_states()
    train_accuracy.reset_states()

    for (batch, (inp, tar)) in enumerate(train_dataset):
        train_step(inp, tar)

        # Progress report every 500 batches.
        if batch % 500 == 0:
            print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
                epoch + 1, batch, train_loss.result(), train_accuracy.result()))

    # Checkpoint every 5 epochs, and always after the final epoch so that the
    # last epochs are not lost when EPOCHS is not a multiple of 5.
    if (epoch + 1) % 5 == 0 or epoch + 1 == EPOCHS:
        ckpt_save_path = ckpt_manager.save()
        print ('Saving checkpoint for epoch {} at {}'.format(epoch + 1, ckpt_save_path))

    print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, train_loss.result(), train_accuracy.result()))
    print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

---

In [None]:
def evaluate(inp_sentence):
    """Greedily decode a whole batch of encoded source sentences.

    Decoding always runs for ``pm.maxlen`` steps; nothing is cut at the end
    token here (``cut_by_end`` does that afterwards).

    Args:
        inp_sentence: Batch of source id sequences (first axis is the batch).

    Returns:
        tuple: ``(output, attention_weights)`` where ``output`` holds the start
        id followed by the ``pm.maxlen`` predicted ids for every row, and
        ``attention_weights`` is the second value returned by the model on the
        last decoding step.
    """
    batch_size = tf.shape(inp_sentence)[0]
    # Every row starts from the start-of-sentence id (2).
    decoded = tf.tile(tf.expand_dims([2], 0), [batch_size, 1])

    for _ in range(pm.maxlen):
        enc_mask, look_ahead_mask, dec_mask = create_masks(inp_sentence, decoded)
        logits, attention_weights = transformer(
            inp_sentence, decoded, False, enc_mask, look_ahead_mask, dec_mask)

        # Only the prediction for the last position is new; take its argmax
        # and append it to the sequences decoded so far.
        next_id = tf.cast(tf.argmax(logits[:, -1:, :], axis=-1), tf.int32)
        decoded = tf.concat([decoded, next_id], axis=-1)

    return decoded, attention_weights

In [None]:
def cut_by_end(samples):
    """Zero out everything from the first end token onwards in each row.

    Args:
        samples: Integer tensor of token ids with one decoded sequence per row.

    Returns:
        Tensor with the shape and dtype of ``samples`` in which, for every
        row, the first occurrence of the end id (3) and all positions after it
        are replaced by 0. Rows without an end id are returned unchanged.
    """
    # Vectorised over the batch: no per-row Python loop, no float64 NumPy
    # buffer, and an empty batch is handled (the loop-based version left
    # ``dtype`` unbound in that case).
    is_end = tf.cast(tf.equal(samples, 3), tf.int32)
    # The running count of end tokens along a row becomes positive exactly at
    # the first end token and stays positive afterwards.
    at_or_after_end = tf.cumsum(is_end, axis=-1) > 0
    return tf.where(at_or_after_end, tf.zeros_like(samples), samples)

In [None]:
# Decode the validation set, log every (source, target, prediction) triple and
# write the mean of the per-batch BLEU scores at the end of the log.
eval_log = os.path.join(pm.eval_log_path, '{}_eval.tsv'.format(pm.project_name))
# exist_ok avoids the race between checking for the directory and creating it.
os.makedirs(pm.eval_log_path, exist_ok=True)

start = time.time()
count, scores = 0, 0
# The context manager closes the log even if decoding or scoring raises.
with open(eval_log, 'w', encoding='utf-8') as eval_file:
    for (batch, (inp, tar)) in enumerate(val_dataset):
        prediction, attention_weights = evaluate(inp)
        prediction = cut_by_end(prediction)

        preds, tars = [], []
        for source, real_tar, pred in zip(inp, tar, prediction):
            # Ids 0, 2 and 3 and ids outside the vocabulary are dropped.
            # Source tokens are joined with spaces, target-side tokens without.
            s = " ".join([idx2en.get(i, 1) for i in source.numpy() if i < len(idx2en) and i not in [0, 2, 3]])
            t = "".join([idx2de.get(i, 1) for i in real_tar.numpy() if i < len(idx2de) and i not in [0, 2, 3]])
            p = "".join([idx2de.get(i, 1) for i in pred.numpy() if i < len(idx2de) and i not in [0, 2, 3]])

            preds.append(p)
            tars.append([t])  # one reference list per prediction

            eval_file.write('-Source : {}\n-Target : {}\n-Pred : {}\n\n'.format(s, t, p))
            # Flush per sample so the log can be followed while evaluation runs.
            eval_file.flush()

        scores += bleu_metrics(tars, preds, False, 3, True)
        count += 1

    # An empty validation set yields count == 0; report 0.0 instead of
    # failing with ZeroDivisionError.
    eval_file.write('-BLEU Score : {:.4f}'.format(scores / count if count else 0.0))

print("MSG : Done for evalutation ... Totolly {:.2f} sec.".format(time.time() - start))

In [None]:
def predict(inp_sentence):
    """Greedily translate one whitespace-tokenised source sentence.

    Args:
        inp_sentence: Source sentence as a string; it is split on whitespace
            and every token is looked up in ``en2idx`` (unknown tokens map to
            id 1).

    Returns:
        tuple: ``(output, attention_weights)`` where ``output`` is the 1-D
        tensor of decoded ids starting with the start id (2) and excluding the
        end id (3), and ``attention_weights`` is the second value returned by
        the model on the last decoding step.
    """
    # Wrap the source ids in the start (2) and end (3) ids and add a batch axis.
    source_ids = [2] + [en2idx.get(word, 1) for word in inp_sentence.split()] + [3]
    encoder_input = tf.expand_dims(source_ids, 0)

    # The decoder starts from the start id alone.
    decoded = tf.expand_dims([2], 0)

    for _ in range(pm.maxlen):
        enc_mask, look_ahead_mask, dec_mask = create_masks(encoder_input, decoded)
        logits, attention_weights = transformer(
            encoder_input, decoded, False, enc_mask, look_ahead_mask, dec_mask)

        # Greedy choice for the newest position only.
        next_id = tf.cast(tf.argmax(logits[:, -1:, :], axis=-1), tf.int32)

        # Stop as soon as the end id is produced; it is not appended.
        if tf.equal(next_id, 3):
            break

        decoded = tf.concat([decoded, next_id], axis=-1)

    return tf.squeeze(decoded, axis=0), attention_weights

In [None]:
def translate(sentence, plot=''):
    """Translate a sentence, print the result and optionally plot attention.

    Args:
        sentence: Whitespace-tokenised source sentence.
        plot: Passed on to ``plot_attention_weights`` when non-empty
            (presumably the name of the attention block to draw — confirm in
            utils_v2); the default empty string skips plotting.
    """
    result, attention_weights = predict(sentence)

    # Map ids back to tokens, skipping ids 0, 2 and 3 and anything outside
    # the vocabulary.
    predicted_sentence = []
    for token_id in result.numpy():
        if token_id in [0, 2, 3] or not token_id < len(idx2de):
            continue
        predicted_sentence.append(idx2de.get(token_id, 1))

    print('Input: {}'.format(sentence))
    print('Predicted translation: {}'.format(" ".join(predicted_sentence)))

    if not plot:
        return
    plot_attention_weights(attention_weights, sentence, result, plot)

In [None]:
# Translate one sample sentence and plot the attention of the named block.
# NOTE(review): 'decoder_layer4_block' presumably requires the model to have
# at least 4 decoder blocks (pm.num_block) — confirm the key scheme in modules.
translate("明 天 就 要 上 班 了", plot='decoder_layer4_block')
print("Real translation: 還好我沒工作QQ")