In [81]:
import numpy as np
import math
import re
import time
import zipfile
import random
import tensorflow as tf
from keras import layers
import tensorflow_datasets as tfds

## base de dados

In [82]:
with open('./tradutor/europarl-v7.pt-en.en', mode='r', encoding='utf-8') as f:
    europarl_en = f.read()

with open('./tradutor/europarl-v7.pt-en.pt', mode='r', encoding='utf-8') as f:
    europarl_pt = f.read()

corpus_en = europarl_en
corpus_pt = europarl_pt

In [83]:
corpus_en = re.sub(r"\.(?=[0-9]|[a-z]|[A-Z])", ".$$$", corpus_en)
corpus_en = re.sub(r".\$\$\$", '', corpus_en)
corpus_en = re.sub(r" +", ' ', corpus_en)
corpus_en = corpus_en.split('\n')
print(len(corpus_en))

corpus_pt = re.sub(r"\.(?=[0-9]|[a-z]|[A-Z])", ".$$$", corpus_pt)
corpus_pt = re.sub(r".\$\$\$", '', corpus_pt)
corpus_pt = re.sub(r" +", ' ', corpus_pt)
corpus_pt = corpus_pt.split('\n')
print(len(corpus_pt))

1960408
1960408


## tokenizer 

In [84]:
#### tokenizer ####
build_tokenize_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(corpus_en, target_vocab_size=2**13)
build_tokenize_pt = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(corpus_pt, target_vocab_size=2**13)

build_tokenize_en.save_to_file("./vocabs/vocab_tokenizer_en")
build_tokenize_pt.save_to_file("./vocabs/vocab_tokenizer_pt")


In [85]:
print("carregando o vocab")

tokenize_en = tfds.deprecated.text.SubwordTextEncoder.load_from_file('./vocabs/vocab_tokenizer_en')
tokenize_pt = tfds.deprecated.text.SubwordTextEncoder.load_from_file('./vocabs/vocab_tokenizer_pt')

print("vocab carregado")

carregando o vocab
vocab carregado


In [86]:
vocab_size_en = tokenize_en.vocab_size + 2
inputs_en = [[vocab_size_en - 2] + tokenize_en.encode(sentence) + [vocab_size_en - 1] for sentence in corpus_en]
print(inputs_en[random.randint(0, len(inputs_en) - 1)])

vocab_size_pt = tokenize_pt.vocab_size + 2
outputs_pt = [[vocab_size_pt - 2] + tokenize_pt.encode(sentence) + [vocab_size_pt - 1] for sentence in corpus_pt]
print(outputs_pt[random.randint(0, len(outputs_pt) - 1)])

[8191, 133, 9, 19, 226, 87, 60, 26, 14, 23, 1450, 7981, 8192]
[8116, 30, 46, 1263, 3283, 7905, 163, 7905, 5142, 1, 1143, 1, 6, 1263, 687, 464, 7906, 8117]


## etapa de melhora do processamento

In [87]:
"""
o Objetivo é tirar frases com mais de 15 palavras

mas para o treinamento total melhor é por mais
"""

max_length = 40
idx_to_remove = [count for count, sent in enumerate(inputs_en) if len(sent) > max_length]
for idx in reversed(idx_to_remove):
    del inputs_en[idx]
    del outputs_pt[idx]

idx_to_remove = [count for count, sent in enumerate(outputs_pt) if len(sent) > max_length]
for idx in reversed(idx_to_remove):
    del inputs_en[idx]
    del outputs_pt[idx]

In [88]:
#### tamanho total
print("tamanhos das entradas e saidas")
print(len(inputs_en))
print(len(outputs_pt))

tamanhos das entradas e saidas
1239460
1239460


## padding 

In [89]:
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs_en, value=0, padding='post', maxlen=max_length)
outputs= tf.keras.preprocessing.sequence.pad_sequences(outputs_pt, value=0, padding='post', maxlen=max_length)

print(inputs_en[random.randint(0, len(inputs_en) - 1)])
print(outputs_pt[random.randint(0, len(outputs_pt) - 1)])

[8191, 61, 1983, 107, 118, 753, 7981, 8192]
[8116, 6809, 515, 134, 145, 14, 247, 2481, 66, 214, 947, 24, 4762, 2489, 7906, 8117]


In [90]:
batch_size = 64
buffer_size = 20000

In [91]:
dataset = tf.data.Dataset.from_tensor_slices((inputs, outputs))
dataset = dataset.cache()
dataset = dataset.shuffle(buffer_size).batch(batch_size)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

# contruindo o modelo

## Positional encoding:

$PE_{(pos,2i)} =\sin(pos/10000^{2i/dmodel})$
<br>
$PE_{(pos,2i+1)} =\cos(pos/10000^{2i/dmodel})$

In [92]:
class PositionalEncoding(layers.Layer):
    def __init__(self):
        super(PositionalEncoding, self).__init__()

    def get_angles(self, pos, i, d_model):
        angles = 1 / np.power(10000., (2*(i // 2)) / np.float32(d_model))
        return pos * angles # (seq_lenght, d_model)

    def call(self, inputs):
        seq_lenght = inputs.shape.as_list()[-2]
        d_model = inputs.shape.as_list()[-1]
        angles = self.get_angles(np.arange(seq_lenght)[:, np.newaxis],
                                 np.arange(d_model)[np.newaxis, :], d_model)
        angles[:, 0::2] = np.sin(angles[:, 0::2])
        angles[:, 1::2] = np.cos(angles[:, 1::2])
        pos_encoding = angles[np.newaxis, ...]
        return inputs + tf.cast(pos_encoding, tf.float32)

scaled

In [93]:
def scaled_dot_product_attention(queries, keys, values, mask):
    product = tf.matmul(queries, keys, transpose_b=True)
    keys_dim = tf.cast(tf.shape(keys)[-1], tf.float32)
    scaled_product = product / tf.math.sqrt(keys_dim)

    if mask is not None:
        scaled_product += (mask * -1e9) # 0.0000000001

    attention = tf.matmul(tf.nn.softmax(scaled_product, axis=-1), values)
    return attention

MultiHeadAttention

In [94]:
class MultiHeadAttention(layers.Layer):

    def __init__(self, nb_proj):
        super(MultiHeadAttention, self).__init__()
        self.nb_proj = nb_proj

    def build(self, input_shape):
        self.d_model = input_shape[-1]
        assert self.d_model % self.nb_proj == 0

        self.d_proj = self.d_model // self.nb_proj

        self.query_lin = layers.Dense(units = self.d_model)
        self.key_lin = layers.Dense(units = self.d_model)
        self.value_lin = layers.Dense(units = self.d_model)

        self.final_lin = layers.Dense(units = self.d_model)

    def split_proj(self, inputs, batch_size): # inputs: (batch_size, seq_lenght, d_model)
        shape = (batch_size, -1, self.nb_proj, self.d_proj)
        splited_inputs = tf.reshape(inputs, shape = shape) # (batch_size, seq_lenght, nb_proj, d_proj)
        return tf.transpose(splited_inputs, perm=[0, 2, 1, 3]) # (batch_size, nb_proj, seq_lenght, d_proj)

    def call(self, queries, keys, values, mask):
        batch_size = tf.shape(queries)[0]

        queries = self.query_lin(queries)
        keys = self.key_lin(keys)
        values = self.value_lin(values)

        queries = self.split_proj(queries, batch_size)
        keys = self.split_proj(keys, batch_size)
        values = self.split_proj(values, batch_size)

        attention = scaled_dot_product_attention(queries, keys, values, mask)

        attention = tf.transpose(attention, perm=[0, 2, 1, 3])

        concat_attention = tf.reshape(attention, shape=(batch_size, -1, self.d_model))

        outputs = self.final_lin(concat_attention)

        return outputs

Encoder

In [95]:
class EncoderLayer(layers.Layer):

    def __init__(self, FFN_units, nb_proj, dropout_rate):
        super(EncoderLayer, self).__init__()
        self.FFN_units = FFN_units
        self.nb_proj = nb_proj
        self.dropout_rate = dropout_rate

    def build(self, input_shape):
        self.d_model = input_shape[-1]

        self.multi_head_attention = MultiHeadAttention(self.nb_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout_rate)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6) # 0.0000001

        self.dense_1 = layers.Dense(units=self.FFN_units, activation='relu')
        self.dense_2 = layers.Dense(units=self.d_model, activation='relu')
        self.dropout_2 = layers.Dropout(rate=self.dropout_rate)

        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, mask, training):
        attention = self.multi_head_attention(inputs, inputs, inputs, mask)
        attention = self.dropout_1(attention, training = training)
        attention = self.norm_1(attention + inputs)

        outputs = self.dense_1(attention)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_2(outputs, training=training)
        outputs = self.norm_2(outputs + attention)

        return outputs


In [96]:
class Encoder(layers.Layer):
    def __init__(self,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout_rate,
                 vocab_size,
                 d_model,
                 name="encoder"):
        super(Encoder, self).__init__(name=name)
        self.nb_layers = nb_layers
        self.d_model = d_model

        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout_rate)
        self.enc_layers = [EncoderLayer(FFN_units, nb_proj, dropout_rate) for _ in range(nb_layers)]


    def call(self, inputs, mask, training):
        outputs = self.embedding(inputs)
        outputs *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs, training)

        for i in range(self.nb_layers):
            outputs = self.enc_layers[i](outputs, mask, training)

        return outputs

decoder

In [97]:
class DecoderLayer(layers.Layer):

    def __init__(self, FFN_units, nb_proj, dropout_rate):
        super(DecoderLayer, self).__init__()
        self.FFN_units = FFN_units
        self.nb_proj = nb_proj
        self.dropout_rate = dropout_rate

    def build(self, input_shape):
        self.d_model = input_shape[-1]

        self.multi_head_attention_1 = MultiHeadAttention(self.nb_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout_rate)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

        self.multi_head_attention_2 = MultiHeadAttention(self.nb_proj)
        self.dropout_2 = layers.Dropout(rate=self.dropout_rate)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

        self.dense_1 = layers.Dense(units = self.FFN_units, activation='relu')
        self.dense_2 = layers.Dense(units = self.d_model, activation='relu')
        self.dropout_3 = layers.Dropout(rate=self.dropout_rate)
        self.norm_3 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, enc_outputs, mask_1, mask_2, training):
        attention = self.multi_head_attention_1(inputs, inputs, inputs, mask_1)
        attention = self.dropout_1(attention, training)
        attention = self.norm_1(attention + inputs)

        attention_2 = self.multi_head_attention_2(attention, enc_outputs, enc_outputs, mask_2)
        attention_2 = self.dropout_2(attention_2, training)
        attention_2 = self.norm_2(attention_2 + attention)

        outputs = self.dense_1(attention_2)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_3(outputs, training)
        outputs = self.norm_3(outputs + attention_2)

        return outputs

In [98]:
class Decoder(layers.Layer):

    def __init__(self,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout_rate,
                 vocab_size,
                 d_model,
                 name="decoder"):
        super(Decoder, self).__init__(name=name)
        self.d_model = d_model
        self.nb_layers = nb_layers

        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout_rate)

        self.dec_layers = [DecoderLayer(FFN_units, nb_proj, dropout_rate) for i in range(nb_layers)]

    def call(self, inputs, enc_outputs, mask_1, mask_2, training):
        outputs = self.embedding(inputs)
        outputs *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs, training)

        for i in range(self.nb_layers):
            outputs = self.dec_layers[i](outputs, enc_outputs, mask_1, mask_2, training)

        return outputs

transforme

In [99]:
class Transformer(tf.keras.Model):

    def __init__(self,
                 vocab_size_enc,
                 vocab_size_dec,
                 d_model,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout_rate,
                 name="transformer"):
        super(Transformer, self).__init__(name=name)

        self.encoder = Encoder(nb_layers, FFN_units, nb_proj, dropout_rate,
                               vocab_size_enc, d_model)
        self.decoder = Decoder(nb_layers, FFN_units, nb_proj, dropout_rate,
                               vocab_size_dec, d_model)
        self.last_linear = layers.Dense(units=vocab_size_dec, name='lin_output')

    def create_padding_mask(self, seq): # (batch_size, seq_length) -> (batch_size, nb_proj, seq_lenght, d_proj)
        mask = tf.cast(tf.math.equal(seq, 0), tf.float32)
        return mask[:, tf.newaxis, tf.newaxis, :]

    def create_look_ahead_mask(self, seq):
        seq_len = tf.shape(seq)[1]
        look_ahed_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        return look_ahed_mask

    def call(self, enc_inputs, dec_inputs, training):
        enc_mask = self.create_padding_mask(enc_inputs)
        dec_mask_1 = tf.maximum(self.create_padding_mask(dec_inputs), self.create_look_ahead_mask(dec_inputs))
        dec_mask_2 = self.create_padding_mask(enc_inputs)

        enc_outputs = self.encoder(enc_inputs, enc_mask, training)
        dec_outputs = self.decoder(dec_inputs, enc_outputs, dec_mask_1, dec_mask_2, training)

        outputs = self.last_linear(dec_outputs)

        return outputs

restart daqui

In [100]:
tf.keras.backend.clear_session()

d_model = 512 # 512
nb_layers = 6 # 6
ffn_units = 1024 # 2048
nb_proj = 8 # 8
dropout_rate = 0.1 # 0.1

In [101]:
transformer = Transformer(vocab_size_enc=vocab_size_en,
                          vocab_size_dec=vocab_size_pt,
                          d_model=d_model,
                          nb_layers=nb_layers,
                          FFN_units=ffn_units,
                          nb_proj=nb_proj,
                          dropout_rate=dropout_rate)

In [102]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                            reduction='none')

In [103]:
def loss_function(target, pred):
    mask = tf.math.logical_not(tf.math.equal(target, 0))
    loss_ = loss_object(target, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [104]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

customSchedule

In [105]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):

    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [106]:
learing_rate = CustomSchedule(d_model=d_model)

In [107]:
optimizer = tf.keras.optimizers.Adam(learing_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [108]:
checkpoint = './tradutor_chek/'
ckpt = tf.train.Checkpoint(transformer=transformer, optmizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint, max_to_keep=5)
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('checkpoint restalrado')

In [109]:
epochs = 40

In [None]:
for epoch in range(epochs):
    print('Start or epoch {}'.format(epoch + 1))
    start = time.time()

    train_loss.reset_states()
    train_accuracy.reset_states()

    for (batch, (enc_inputs, targets)) in enumerate(dataset):
        dec_inputs = targets[:, :-1]
        dec_outputs_real = targets[:, 1:]
        with tf.GradientTape() as tape:
            predictions = transformer(enc_inputs, dec_inputs, True)
            loss = loss_function(dec_outputs_real, predictions)

        gradients = tape.gradient(loss, transformer.trainable_variables)
        optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

        train_loss(loss)
        train_accuracy(dec_outputs_real, predictions)

        if batch % 50 == 0:
            print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch+1, batch, train_loss.result(), train_accuracy.result()))

    ckpt_save_path = ckpt_manager.save()
    print('Saving checkpoint for epoch {} at {}'.format(epoch + 1, ckpt_save_path))
    print('Time taken for 1 epoch {} secs\n'.format(time.time() - start)) 

Start or epoch 1
Epoch 1 Batch 0 Loss 5.3673 Accuracy 0.0000
Epoch 1 Batch 50 Loss 5.2041 Accuracy 0.0137
Epoch 1 Batch 100 Loss 5.0733 Accuracy 0.0219
Epoch 1 Batch 150 Loss 4.9641 Accuracy 0.0259
Epoch 1 Batch 200 Loss 4.8503 Accuracy 0.0307
Epoch 1 Batch 250 Loss 4.7242 Accuracy 0.0364
Epoch 1 Batch 300 Loss 4.6118 Accuracy 0.0417
Epoch 1 Batch 350 Loss 4.5139 Accuracy 0.0458
Epoch 1 Batch 400 Loss 4.4456 Accuracy 0.0491
Epoch 1 Batch 450 Loss 4.3876 Accuracy 0.0520
Epoch 1 Batch 500 Loss 4.3320 Accuracy 0.0551
Epoch 1 Batch 550 Loss 4.2755 Accuracy 0.0583
Epoch 1 Batch 600 Loss 4.2259 Accuracy 0.0616
Epoch 1 Batch 650 Loss 4.1789 Accuracy 0.0649
Epoch 1 Batch 700 Loss 4.1324 Accuracy 0.0680
Epoch 1 Batch 750 Loss 4.0911 Accuracy 0.0710
Epoch 1 Batch 800 Loss 4.0478 Accuracy 0.0738
Epoch 1 Batch 850 Loss 4.0064 Accuracy 0.0763
Epoch 1 Batch 900 Loss 3.9663 Accuracy 0.0787
Epoch 1 Batch 950 Loss 3.9249 Accuracy 0.0810
Epoch 1 Batch 1000 Loss 3.8888 Accuracy 0.0833
Epoch 1 Batch 1050 

KeyboardInterrupt: 

avaliação

In [111]:
text = 'you are smart'

In [112]:
def evaluete(inp_sentence):
    inp_sentence = [vocab_size_en - 2] + tokenize_en.encode(inp_sentence) + [vocab_size_en-1]
    enc_input = tf.expand_dims(inp_sentence, axis=0)
    output = tf.expand_dims([vocab_size_pt - 2], axis=0)
    
    for _ in range(max_length):
        predicoes = transformer(enc_input, output, False)
        predict = predicoes[:, -1:, :]
        predict_id = tf.cast(tf.argmax(predict, axis=-1), tf.int32)
        
        if predict_id == vocab_size_pt -1:
            return tf.squeeze(output, axis=0)
        
        output = tf.concat([output, predict_id], axis=1)
        
    return tf.squeeze(output, axis=0)

In [113]:
def tradutor(sentence):
    output = evaluete(sentence).numpy()
    predict_sentence = tokenize_pt.decode([i for i in output if i < vocab_size_pt - 2])
    print(f'Input: {sentence}')
    print(f'Predição: {predict_sentence}')

In [114]:
tradutor('this is a powerful tool')

Input: this is a powerful tool
Predição: Também é um instrumento poderoso.
