In [1]:
conda install tensorflow_datasets

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... failed with initial frozen solve. Retrying with flexible solve.

Note: you may need to restart the kernel to use updated packages.



PackagesNotFoundError: The following packages are not available from current channels:

  - tensorflow_datasets

Current channels:

  - https://repo.anaconda.com/pkgs/main/win-64
  - https://repo.anaconda.com/pkgs/main/noarch
  - https://repo.anaconda.com/pkgs/r/win-64
  - https://repo.anaconda.com/pkgs/r/noarch
  - https://repo.anaconda.com/pkgs/msys2/win-64
  - https://repo.anaconda.com/pkgs/msys2/noarch

To search for alternate channels that may provide the conda package you're
looking for, navigate to

    https://anaconda.org

and use the search bar at the top of the page.




In [None]:
# https://www.tensorflow.org/tutorials/text/transformer

import tensorflow_datasets as tfds
import tensorflow as tf
import numpy as np

In [None]:
# Load the 'ted_hrlr_translate/pt_to_en' dataset as supervised pairs, together
# with its metadata.
examples, metadata = tfds.load(
    'ted_hrlr_translate/pt_to_en',
    with_info=True,
    as_supervised=True,
)
examples_train = examples['train']
examples_valid = examples['validation']

# Show the dataset objects of both splits.
for split in (examples_train, examples_valid):
    print(split)

In [None]:
# `tfds.features.text` was moved to `tfds.deprecated.text` and later removed
# from tensorflow_datasets, so use whichever namespace the installed version
# provides.
_tfds_text = (tfds.deprecated.text if hasattr(tfds, 'deprecated')
              else tfds.features.text)

# Build one subword vocabulary per language from the training split.
tokenizer_en = _tfds_text.SubwordTextEncoder.build_from_corpus(
    (en.numpy() for pt, en in examples_train), target_vocab_size=2**13)

tokenizer_pt = _tfds_text.SubwordTextEncoder.build_from_corpus(
    (pt.numpy() for pt, en in examples_train), target_vocab_size=2**13)

In [None]:
# Round-trip a sample sentence through the English tokenizer.
str_sample = 'Transformer is awesome.'

# Sentence -> subword ids.
str_tokenized = tokenizer_en.encode(str_sample)
msg_tokenized = 'Tokenized string is {}'.format(str_tokenized)
print(msg_tokenized)

# Subword ids -> sentence.
str_original = tokenizer_en.decode(str_tokenized)
msg_original = 'The original string: {}'.format(str_original)
print(msg_original)

# Encoding followed by decoding must give back the input unchanged.
assert str_original == str_sample

In [None]:
# Show the text that each individual subword id decodes to.
for ts in str_tokenized:
    piece = tokenizer_en.decode([ts])
    print('{} ----> {}'.format(ts, piece))

In [None]:
# Input pipeline settings.
size_buffer = 20_000  # shuffle buffer size
size_batch = 64       # examples per padded batch
length_max = 40       # maximum tokenized length kept; also caps decoding steps

In [None]:
def encode(lang1, lang2):
    """Tokenize a sentence pair and wrap each side in start/end markers.

    ``lang1`` is encoded with ``tokenizer_pt`` and ``lang2`` with
    ``tokenizer_en``. For each tokenizer, the ids ``vocab_size`` and
    ``vocab_size + 1`` are placed before and after the encoded sentence.

    Args:
        lang1: object exposing ``.numpy()`` that yields the first sentence.
        lang2: object exposing ``.numpy()`` that yields the second sentence.

    Returns:
        A ``(lang1_ids, lang2_ids)`` tuple of lists.
    """
    pt_start = [tokenizer_pt.vocab_size]
    pt_body = tokenizer_pt.encode(lang1.numpy())
    pt_end = [tokenizer_pt.vocab_size+1]

    en_start = [tokenizer_en.vocab_size]
    en_body = tokenizer_en.encode(lang2.numpy())
    en_end = [tokenizer_en.vocab_size+1]

    return pt_start + pt_body + pt_end, en_start + en_body + en_end

def tf_encode(pt, en):
    """Run ``encode`` through ``tf.py_function`` and return two int64 tensors.

    Each result is declared as a 1-D tensor of unknown length via
    ``set_shape([None])``.
    """
    encoded = tf.py_function(encode, [pt, en], [tf.int64, tf.int64])
    result_pt, result_en = encoded

    for result in (result_pt, result_en):
        result.set_shape([None])

    return result_pt, result_en

def filter_max_length(x, y, max_length=length_max):
    """Return a boolean tensor that is True when neither input is too long.

    Both ``tf.size(x)`` and ``tf.size(y)`` must be at most ``max_length``.
    """
    x_fits = tf.size(x) <= max_length
    y_fits = tf.size(y) <= max_length
    return tf.logical_and(x_fits, y_fits)

In [None]:
# Training pipeline: tokenize, drop over-long pairs, cache, shuffle, pad into
# batches and prefetch.
dataset_train = (
    examples_train
    .map(tf_encode)
    .filter(filter_max_length)
    .cache()
    .shuffle(size_buffer)
    .padded_batch(size_batch)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Validation pipeline: same tokenizing, filtering and batching, no shuffling.
dataset_valid = (
    examples_valid
    .map(tf_encode)
    .filter(filter_max_length)
    .padded_batch(size_batch)
)

In [None]:
def get_angles(pos, i, d_model):
    """Return ``pos * 1 / 10000 ** (2 * (i // 2) / d_model)``.

    Args:
        pos: position value(s).
        i: feature index value(s); consecutive even/odd indices share a rate
            because of the ``i // 2``.
        d_model: divisor of the exponent, converted with ``np.float32``.
    """
    exponents = (2 * (i//2)) / np.float32(d_model)
    angle_rates = 1 / np.power(10000, exponents)
    return pos * angle_rates

def positional_encoding(position, d_model):
    """Build a float32 positional-encoding tensor of shape (1, position, d_model).

    Angles come from ``get_angles``; even feature columns are replaced by
    their sine and odd feature columns by their cosine.
    """
    rows = np.arange(position)[:, np.newaxis]
    cols = np.arange(d_model)[np.newaxis, :]
    angle_rads = get_angles(rows, cols, d_model)

    even = slice(0, None, 2)
    odd = slice(1, None, 2)
    angle_rads[:, even] = np.sin(angle_rads[:, even])
    angle_rads[:, odd] = np.cos(angle_rads[:, odd])

    # Add a leading axis of size 1 before converting to a tensor.
    return tf.cast(angle_rads[np.newaxis, ...], dtype=tf.float32)

def create_padding_mask(seq):
    """Mark the entries of ``seq`` that are equal to 0.

    Returns a float32 tensor holding 1.0 where ``seq == 0`` and 0.0
    elsewhere, with two size-1 axes inserted after the first axis.
    """
    is_zero = tf.math.equal(seq, 0)
    mask = tf.cast(is_zero, tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    """Return a (size, size) mask: 1 minus the lower-triangular band of ones."""
    lower_band = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return 1 - lower_band

In [None]:
def scaled_dot_product_attention(q, k, v, mask):
    """Compute ``softmax(q @ k^T / sqrt(dk)) @ v``.

    ``dk`` is the size of the last axis of ``k``. When ``mask`` is not
    None, ``mask * -1e9`` is added to the scaled logits before the softmax.

    Returns:
        ``(output, attention_weights)``.
    """
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(dk)

    if mask is not None:
        # Add a very large negative number wherever the mask is non-zero.
        scaled_attention_logits = scaled_attention_logits + (mask * -1e9)

    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    return tf.matmul(attention_weights, v), attention_weights

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention built on ``scaled_dot_product_attention``.

    The ``d_model`` features are split evenly over ``num_heads`` heads, so
    ``d_model`` must be divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()

        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % num_heads == 0

        # Number of features handled by each head.
        self.depth = d_model // num_heads

        # Projections applied to the queries, keys and values.
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        # Projection applied to the re-merged heads.
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, size_batch):
        """Reshape ``x`` to (size_batch, -1, num_heads, depth), then swap axes 1 and 2."""
        per_head = tf.reshape(x, (size_batch, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        """Attend from ``q`` over ``k``/``v``; returns ``(outputs, weights_attention)``."""
        batch_size = tf.shape(q)[0]

        # Project each input, then split the result into heads.
        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        attention_scaled, weights_attention = scaled_dot_product_attention(
            q, k, v, mask)

        # Swap the head axis back and merge the heads into d_model features.
        heads_merged = tf.reshape(
            tf.transpose(attention_scaled, perm=[0, 2, 1, 3]),
            (batch_size, -1, self.d_model))

        return self.dense(heads_merged), weights_attention

In [None]:
def point_wise_feed_forward_network(d_model, dff):
    """Return a two-layer network: Dense(dff, relu) followed by Dense(d_model)."""
    hidden = tf.keras.layers.Dense(dff, activation=tf.nn.relu)
    projection = tf.keras.layers.Dense(d_model)
    return tf.keras.Sequential([hidden, projection])

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """Self-attention followed by a feed-forward network.

    Each of the two sub-blocks applies dropout to its output, adds the
    sub-block input back and layer-normalizes the sum.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Run both sub-blocks on ``x``; ``mask`` is forwarded to the attention."""
        # Self-attention sub-block (attention weights are discarded).
        attended, _ = self.mha(x, x, x, mask)
        attended = self.dropout1(attended, training=training)
        out1 = self.layernorm1(x + attended)

        # Feed-forward sub-block.
        transformed = self.dropout2(self.ffn(out1), training=training)
        return self.layernorm2(out1 + transformed)

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
    """Self-attention, attention over the encoder output, then feed-forward.

    Each of the three sub-blocks applies dropout to its output, adds the
    sub-block input back and layer-normalizes the sum.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, outputs_enc, training,
             look_ahead_mask, padding_mask):
        """Return ``(out3, attn_weights_block1, attn_weights_block2)``."""
        # Block 1: attention of x over itself, using look_ahead_mask.
        self_attended, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
        self_attended = self.dropout1(self_attended, training=training)
        out1 = self.layernorm1(self_attended + x)

        # Block 2: queries from out1, keys and values from the encoder output.
        enc_attended, attn_weights_block2 = self.mha2(
            outputs_enc, outputs_enc, out1, padding_mask)
        enc_attended = self.dropout2(enc_attended, training=training)
        out2 = self.layernorm2(enc_attended + out1)

        # Block 3: feed-forward network.
        transformed = self.dropout3(self.ffn(out2), training=training)
        out3 = self.layernorm3(transformed + out2)

        return out3, attn_weights_block1, attn_weights_block2

In [None]:
class Encoder(tf.keras.layers.Layer):
    """Embedding plus positional encoding, followed by ``num_layers`` EncoderLayers."""

    def __init__(self, num_layers, d_model, num_heads, dff, size_vocab_input,
                 maximum_position_encoding, rate_dropout=0.1):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(size_vocab_input, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding,
                                                self.d_model)

        self.layers = [EncoderLayer(d_model, num_heads, dff, rate_dropout)
                       for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate_dropout)

    def call(self, x, training, mask):
        """Embed ``x`` and pass it through every encoder layer in order."""
        seq_len = tf.shape(x)[1]

        # Scale the embeddings by sqrt(d_model), then add the positional
        # encoding of the first seq_len positions.
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = self.embedding(x) * scale
        x = x + self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for enc_layer in self.layers:
            x = enc_layer(x, training, mask)

        return x

In [None]:
class Decoder(tf.keras.layers.Layer):
    """Embedding plus positional encoding, followed by ``num_layers`` DecoderLayers."""

    def __init__(self, num_layers, d_model, num_heads, dff,
                 size_vocab_target, maximum_position_encoding, rate_dropout=0.1
    ):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(size_vocab_target, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.layers = [DecoderLayer(d_model, num_heads, dff, rate_dropout)
                       for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate_dropout)

    def call(
        self,
        x, outputs_enc, training,
        look_ahead_mask, padding_mask
    ):
        """Return ``(x, weights_attention)``.

        ``weights_attention`` maps ``'decoder_layer{n}_block1'`` and
        ``'decoder_layer{n}_block2'`` (``n`` starting at 1) to the attention
        weights returned by each decoder layer.
        """
        seq_len = tf.shape(x)[1]
        weights_attention = {}

        # Scale the embeddings by sqrt(d_model), then add the positional
        # encoding of the first seq_len positions.
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = self.embedding(x) * scale
        x = x + self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for number, dec_layer in enumerate(self.layers, start=1):
            x, block1, block2 = dec_layer(
                x, outputs_enc, training, look_ahead_mask, padding_mask)

            weights_attention['decoder_layer{}_block1'.format(number)] = block1
            weights_attention['decoder_layer{}_block2'.format(number)] = block2

        return x, weights_attention

In [None]:
class Transformer(tf.keras.Model):
    """Encoder, decoder and a final Dense layer of ``size_vocab_target`` units."""

    def __init__(self,
        num_layers, d_model, num_heads, dff, size_vocab_input,
        size_vocab_target, pe_input, pe_target, rate_dropout=0.1
    ):
        super().__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                               size_vocab_input, pe_input, rate_dropout)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                               size_vocab_target, pe_target, rate_dropout)
        self.layer_dense = tf.keras.layers.Dense(size_vocab_target)

    def call(
        self,
        inp, tar, training,
        enc_padding_mask,
        look_ahead_mask,
        dec_padding_mask
    ):
        """Return ``(outputs, weights_attention)``.

        ``outputs`` is the decoder output passed through the final Dense
        layer; ``weights_attention`` is returned by the decoder unchanged.
        """
        outputs_enc = self.encoder(inp, training, enc_padding_mask)

        outputs_dec, weights_attention = self.decoder(
            tar, outputs_enc, training, look_ahead_mask, dec_padding_mask)

        return self.layer_dense(outputs_dec), weights_attention

In [None]:
# Model dimensions.
num_layers = 4      # encoder layers and decoder layers
d_model = 128       # embedding / model width
dff = 512           # width of the first Dense layer in each feed-forward network
num_heads = 8       # attention heads per MultiHeadAttention
rate_dropout = 0.1

# Training length.
num_epochs = 5

# Two ids beyond each tokenizer's vocabulary are used as start/end markers.
size_vocab_input = tokenizer_pt.vocab_size + 2
size_vocab_target = tokenizer_en.vocab_size + 2

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with a warm-up phase.

    The rate at a given step is::

        d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

    Args:
        d_model: model width; stored as a float32 tensor.
        warmup_steps: value used in the ``step * warmup_steps ** -1.5`` term.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for ``step``."""
        # The optimizer may pass an integer step counter; rsqrt and the float
        # multiplication below require a floating-point value.
        step = tf.cast(step, tf.float32)

        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [None]:
# Adam optimizer whose learning rate follows CustomSchedule.
rate_learning = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(
    learning_rate=rate_learning, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [None]:
# Cross-entropy on logits with reduction disabled, so the result can be
# masked before it is averaged.
obj_loss = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction='none',
    from_logits=True,
)

def loss(real, pred):
    """Average ``obj_loss`` over the positions where ``real`` is not 0.

    Positions where ``real == 0`` get weight 0; the masked sum is divided by
    the number of non-zero positions.
    """
    loss_ = obj_loss(real, pred)
    mask = tf.cast(tf.math.not_equal(real, 0), dtype=loss_.dtype)
    return tf.reduce_sum(loss_ * mask) / tf.reduce_sum(mask)

# Metrics accumulated over the training steps.
loss_train = tf.keras.metrics.Mean(name='loss_train')
accuracy_train = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy_train')

In [None]:
# The vocabulary sizes are also passed as the maximum positions of the
# encoder and decoder positional encodings.
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    size_vocab_input=size_vocab_input,
    size_vocab_target=size_vocab_target,
    pe_input=size_vocab_input,
    pe_target=size_vocab_target,
    rate_dropout=rate_dropout,
)

In [None]:
def create_masks(inp, tar):
    """Build the masks for one ``(inp, tar)`` pair.

    Returns:
        ``(enc_padding_mask, combined_mask, dec_padding_mask)`` where the two
        padding masks are both computed from ``inp`` and ``combined_mask`` is
        the element-wise maximum of the padding mask of ``tar`` and a
        look-ahead mask sized by axis 1 of ``tar``.
    """
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)

    tar_len = tf.shape(tar)[1]
    combined_mask = tf.maximum(create_padding_mask(tar),
                               create_look_ahead_mask(tar_len))

    return enc_padding_mask, combined_mask, dec_padding_mask

In [None]:
# Track the model and the optimizer in one checkpoint; keep at most 5 on disk.
path_checkpoint = "./checkpoints/train"
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, path_checkpoint, max_to_keep=5)

# Restore the most recent checkpoint when one exists.
latest_checkpoint = ckpt_manager.latest_checkpoint
if latest_checkpoint:
    ckpt.restore(latest_checkpoint)
    print('Latest checkpoint restored.')

In [None]:
# Two int64 inputs of rank 2 whose dimensions are both left unspecified.
train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64) for _ in range(2)
]

@tf.function(input_signature=train_step_signature)
def step_train(inp, tar):
    """Run one optimization step and update the training metrics.

    ``tar`` is split into the decoder input (all but the last column) and the
    expected output (all but the first column). Gradients of the masked loss
    are applied to ``transformer.trainable_weights``; ``loss_train`` and
    ``accuracy_train`` are updated as a side effect.

    Args:
        inp: int64 tensor of rank 2 fed to the encoder.
        tar: int64 tensor of rank 2 holding the target ids.
    """
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    with tf.GradientTape() as tape:
        preds, _ = transformer(
            inp,
            tar_inp,
            True,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask
        )
        loss_ = loss(tar_real, preds)

    gradients = tape.gradient(loss_, transformer.trainable_weights)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_weights))

    loss_train(loss_)
    # Give padded positions (id 0) zero weight so the accuracy is measured on
    # the same positions that the loss is computed on.
    weights_real = tf.cast(tf.math.not_equal(tar_real, 0), tf.float32)
    accuracy_train(tar_real, preds, sample_weight=weights_real)

In [None]:
for ind_epoch in range(num_epochs):
    epoch_number = ind_epoch + 1

    # Metrics are accumulated per epoch, so clear them first.
    for metric in (loss_train, accuracy_train):
        metric.reset_states()

    for ind_batch, (inp, tar) in enumerate(dataset_train):
        step_train(inp, tar)

        # Report progress on every 50th batch only.
        if ind_batch % 50 != 0:
            continue
        print('EPOCH {} BATCH {} loss_train {:.4f} accuracy_train {:.4f}'.format(
            epoch_number, ind_batch, loss_train.result(), accuracy_train.result()))

    # Save a checkpoint on every 5th epoch.
    if epoch_number % 5 == 0:
        path_ckpt_save = ckpt_manager.save()
        print('Saving checkpoint for epoch {} at {}'.format(
            epoch_number, path_ckpt_save))

    print('EPOCH {} loss_train {:.4f} accuracy_train {:.4f}'.format(
        epoch_number, loss_train.result(), accuracy_train.result()))

In [None]:
def evaluate(inp_sentence):
    """Greedily decode a translation of ``inp_sentence``.

    The sentence is encoded with ``tokenizer_pt`` and wrapped in its
    start/end ids. Decoding starts from the ``tokenizer_en`` start id and
    appends the arg-max prediction of the last position at each step, for at
    most ``length_max`` steps, stopping early when the ``tokenizer_en`` end id
    is predicted (the end id itself is not appended).

    Returns:
        ``(output, attention_weights)``: the decoded ids with the batch axis
        squeezed out, and the attention weights of the last model call.
    """
    pt_start = tokenizer_pt.vocab_size
    pt_end = tokenizer_pt.vocab_size + 1
    en_start = tokenizer_en.vocab_size
    en_end = tokenizer_en.vocab_size + 1

    token_ids = [pt_start] + tokenizer_pt.encode(inp_sentence) + [pt_end]
    encoder_input = tf.expand_dims(token_ids, 0)

    # The decoder starts from the start id alone.
    output = tf.expand_dims([en_start], 0)

    for _ in range(length_max):
        masks = create_masks(encoder_input, output)
        enc_padding_mask, combined_mask, dec_padding_mask = masks

        preds, attention_weights = transformer(
            encoder_input, output, False,
            enc_padding_mask, combined_mask, dec_padding_mask)

        # Only the prediction for the last position is used.
        last_step = preds[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(last_step, axis=-1), tf.int32)

        if predicted_id == en_end:
            break

        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0), attention_weights

In [None]:
def translate(sentence):
    """Translate ``sentence`` with ``evaluate`` and print input and prediction.

    Ids that are not below ``tokenizer_en.vocab_size`` are dropped before the
    result is decoded.
    """
    result, attention_weights = evaluate(sentence)

    vocab_limit = tokenizer_en.vocab_size
    kept_ids = [token for token in result if token < vocab_limit]
    predicted_sentence = tokenizer_en.decode(kept_ids)

    for template, value in (('Input: {}', sentence),
                            ('Predicted translation: {}', predicted_sentence)):
        print(template.format(value))