In [2]:
import numpy as np
import pandas as pd
import math
from google.colab import drive
%tensorflow_version 2.x
import tensorflow as tf
import time

TensorFlow 2.x selected.


In [0]:
from tensorflow.keras import layers
import tensorflow_datasets as tfds

In [4]:
# Mount Google Drive at /content/drive; the corpus files and checkpoints used
# below are read from / written to paths under "/content/drive/My Drive".
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Read the English and Polish sides of the parallel corpus as whole strings.
with open("/content/drive/My Drive/Transformers/europarl-v7.pl-en.en", mode="r", encoding="utf-8") as en_file:
    europarl_en = en_file.read()

with open("/content/drive/My Drive/Transformers/europarl-v7.pl-en.pl", mode="r", encoding="utf-8") as pl_file:
    europarl_pl = pl_file.read()

# One list entry per line of each file; entry i of both lists is treated as a
# sentence pair further down (assumes the two files are line-aligned).
corpus_en, corpus_pl = (raw_text.split("\n") for raw_text in (europarl_en, europarl_pl))


In [0]:
# Target size of each subword vocabulary. The previous value was the float
# 1e13 (ten trillion), which is not a meaningful vocabulary size and looks like
# a typo for 2**13 (8192); an integer power of two is used instead.
# NOTE(review): changing the vocabulary changes the embedding/output sizes, so
# checkpoints saved with a differently sized vocabulary will not restore.
target_vocab_size = 2**13

# Build one subword tokenizer per language from its own side of the corpus.
tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    corpus_en, target_vocab_size=target_vocab_size)
tokenizer_pl = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    corpus_pl, target_vocab_size=target_vocab_size)


In [0]:
# Reserve two extra ids on top of each tokenizer's vocabulary; the cells below
# use (size - 2) as the start-of-sentence id and (size - 1) as the end id.
vocab_size_en, vocab_size_pl = (
    tokenizer.vocab_size + 2 for tokenizer in (tokenizer_en, tokenizer_pl)
)

In [0]:
# Start / end marker ids for each language (the two ids reserved above the
# tokenizer vocabulary).
start_id_en, end_id_en = vocab_size_en - 2, vocab_size_en - 1
start_id_pl, end_id_pl = vocab_size_pl - 2, vocab_size_pl - 1

# Encode every sentence to subword ids and wrap it in its start/end markers.
inputs = [[start_id_en, *tokenizer_en.encode(line), end_id_en] for line in corpus_en]
outputs = [[start_id_pl, *tokenizer_pl.encode(line), end_id_pl] for line in corpus_pl]

In [0]:
max_length = 20

# Keep only the sentence pairs in which BOTH sides have at most `max_length`
# ids (start/end markers included). This is a single linear pass; deleting
# entries one at a time from the middle of a list costs O(n) per deletion and
# becomes quadratic on a corpus-sized list.
# zip() pairs entries by position, so `inputs` and `outputs` must be aligned.
kept_pairs = [
    (src_ids, tgt_ids)
    for src_ids, tgt_ids in zip(inputs, outputs)
    if len(src_ids) <= max_length and len(tgt_ids) <= max_length
]
inputs = [src_ids for src_ids, _ in kept_pairs]
outputs = [tgt_ids for _, tgt_ids in kept_pairs]



In [0]:
# Right-pad every id sequence with 0 up to max_length so both sides become
# rectangular arrays; 0 is the id later treated as padding by the masks/loss.
pad_options = dict(value=0, padding="post", maxlen=max_length)
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, **pad_options)
outputs = tf.keras.preprocessing.sequence.pad_sequences(outputs, **pad_options)

In [0]:
batch_size = 64
buffer_size = 20000

# Input pipeline: slice the padded (source, target) arrays into examples,
# cache them, shuffle with a `buffer_size` buffer, batch, and prefetch.
dataset = (
    tf.data.Dataset.from_tensor_slices((inputs, outputs))
    .cache()
    .shuffle(buffer_size)
    .batch(batch_size)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [0]:
class PositionalEncoding(layers.Layer):
    """Layer that adds a fixed sinusoidal position signal to its input.

    The sequence length and model width are read from the last two static
    dimensions of the input, so those dimensions must be known when the layer
    is called.
    """

    def __init__(self):
        super().__init__()

    def get_angles(self, pos, i, d_model):
        """Return pos / 10000^(2*(i//2)/d_model), broadcast over pos and i."""
        exponent = (2*(i//2))/np.float32(d_model)
        inverse_freq = 1/np.power(10000., exponent)
        return pos*inverse_freq

    def call(self, inputs):
        """Return `inputs` plus the position table (broadcast over axis 0)."""
        seq_length, d_model = inputs.shape.as_list()[-2:]

        positions = np.arange(seq_length)[:, np.newaxis]   # column vector
        dimensions = np.arange(d_model)[np.newaxis, :]     # row vector
        table = self.get_angles(positions, dimensions, d_model)

        # Even columns carry the sine, odd columns the cosine of the angle.
        table[:, 0::2] = np.sin(table[:, 0::2])
        table[:, 1::2] = np.cos(table[:, 1::2])

        # Leading axis of size 1 so the table broadcasts across the batch.
        return inputs + tf.cast(table[np.newaxis, ...], tf.float32)


In [0]:
def scaled_dot_product_attention(queries, keys, values, mask):
    """Compute softmax(Q K^T / sqrt(d_k)) V.

    Args:
        queries, keys, values: tensors whose last two axes are multiplied as
            matrices (keys are transposed on those axes).
        mask: optional tensor broadcastable to the score matrix; positions
            where it is 1 receive a -1e9 offset before the softmax, which
            effectively removes them. Pass None for no masking.

    Returns:
        The attention-weighted combination of `values`.
    """
    depth = tf.cast(tf.shape(keys)[-1], tf.float32)
    scores = tf.matmul(queries, keys, transpose_b=True) / tf.math.sqrt(depth)

    if mask is not None:
        scores += (mask * -1e9)

    weights = tf.nn.softmax(scores, axis=-1)
    return tf.matmul(weights, values)


In [0]:
class MultiHeadAttention(layers.Layer):
    """Multi-head attention built on `scaled_dot_product_attention`.

    The model width is taken from the last dimension of the shape passed to
    `build` and must be divisible by `n_heads`.
    """

    def __init__(self, n_heads):
        super().__init__()
        self.n_heads = n_heads

    def build(self, input_shape):
        """Create the projection layers once the model width is known."""
        self.d_model = input_shape[-1]
        assert self.d_model % self.n_heads == 0

        # Width handled by each individual head.
        self.d_heads = self.d_model // self.n_heads

        # Linear projections for queries, keys and values.
        self.dense_q = layers.Dense(units=self.d_model)
        self.dense_k = layers.Dense(units=self.d_model)
        self.dense_v = layers.Dense(units=self.d_model)

        # Projection applied to the re-joined heads.
        self.output_ffn = layers.Dense(units=self.d_model)

    def split_heads(self, inputs, batch_size):
        """Reshape to (batch, -1, n_heads, d_heads) and move heads to axis 1."""
        per_head = tf.reshape(
            inputs,
            shape=(batch_size, -1, self.n_heads, self.d_heads))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, queries, keys, values, mask):
        """Project, split into heads, attend, merge heads and project again."""
        batch_size = tf.shape(queries)[0]

        q_proj = self.dense_q(queries)
        k_proj = self.dense_k(keys)
        v_proj = self.dense_v(values)

        q_heads = self.split_heads(q_proj, batch_size)
        k_heads = self.split_heads(k_proj, batch_size)
        v_heads = self.split_heads(v_proj, batch_size)

        head_outputs = scaled_dot_product_attention(q_heads, k_heads, v_heads, mask)

        # Undo the head transpose, then fold the heads back into d_model.
        head_outputs = tf.transpose(head_outputs, perm=[0, 2, 1, 3])
        merged = tf.reshape(head_outputs, shape=(batch_size, -1, self.d_model))

        return self.output_ffn(merged)
    

In [0]:
class EncoderLayer(layers.Layer):
    """One encoder block: self-attention followed by a two-layer feed-forward.

    Each sub-block applies dropout, adds its input back (residual connection)
    and then layer-normalizes the sum.
    """

    def __init__(self, ffn_units, n_heads, dropout):
        super().__init__()
        self.ffn_units = ffn_units
        self.n_heads = n_heads
        self.dropout = dropout

    def build(self, input_shape):
        """Create the sub-layers; model width is the input's last dimension."""
        self.d_model = input_shape[-1]

        # Self-attention sub-block.
        self.multi_head_attention = MultiHeadAttention(self.n_heads)
        self.dropout_1 = layers.Dropout(rate=self.dropout)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

        # Feed-forward sub-block: expand to ffn_units with ReLU, project back.
        self.dense_1 = layers.Dense(units=self.ffn_units, activation="relu")
        self.dense_2 = layers.Dense(units=self.d_model)
        self.dropout_2 = layers.Dropout(rate=self.dropout)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, mask, training):
        """Run the block; `training` toggles the dropout layers."""
        attended = self.multi_head_attention(inputs, inputs, inputs, mask)
        attended = self.dropout_1(attended, training=training)
        attended = self.norm_1(attended+inputs)

        transformed = self.dense_2(self.dense_1(attended))
        transformed = self.dropout_2(transformed, training=training)
        return self.norm_2(transformed+attended)




In [0]:
class Encoder(layers.Layer):
    """Stack of `n_layers` EncoderLayer blocks on top of a token embedding.

    Args:
        n_layers: number of EncoderLayer blocks.
        ffn_units: hidden width of each block's feed-forward part.
        n_heads: number of attention heads per block.
        dropout: dropout rate used after the embedding and inside each block.
        vocab_size: size of the input vocabulary (embedding rows).
        d_model: embedding / model width.
        name: Keras layer name.
    """

    def __init__(self,
                 n_layers,
                 ffn_units,
                 n_heads,
                 dropout,
                 vocab_size,
                 d_model,
                 name="encoder"):
        super().__init__(name=name)
        self.n_layers = n_layers
        self.d_model = d_model

        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout)
        self.enc_layers = [EncoderLayer(ffn_units, n_heads, dropout) for _ in range(self.n_layers)]

    def call(self, inputs, mask, training):
        """Embed `inputs`, add positions, and run every encoder block.

        Args:
            inputs: token ids fed to the embedding layer.
            mask: attention mask forwarded unchanged to each block.
            training: whether dropout is active.

        Returns:
            The output of the last encoder block.
        """
        outputs = self.embedding(inputs)
        # Scale embeddings by sqrt(d_model) before adding the position signal.
        outputs *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs, training=training)

        # Iterate over this instance's own layers. The loop bound used to be a
        # notebook-level `n_layers` variable, which ties the layer to global
        # state and breaks when that variable is missing or differs.
        for enc_layer in self.enc_layers:
            outputs = enc_layer(outputs, mask, training)

        return outputs




In [0]:
class DecoderLayer(layers.Layer):
    """One decoder block: self-attention, encoder attention, feed-forward.

    Every sub-block applies dropout, adds its input back (residual connection)
    and layer-normalizes the sum.
    """

    def __init__(self, ffn_units, n_heads, dropout):
        super().__init__()
        self.ffn_units = ffn_units
        self.n_heads = n_heads
        self.dropout = dropout

    def build(self, input_shape):
        """Create the sub-layers; model width is the input's last dimension."""
        self.d_model = input_shape[-1]

        # Attention over the decoder's own inputs.
        self.multi_head_attention_1 = MultiHeadAttention(self.n_heads)
        self.dropout_1 = layers.Dropout(rate=self.dropout)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

        # Attention whose keys/values are the encoder outputs.
        self.multi_head_attention_2 = MultiHeadAttention(self.n_heads)
        self.dropout_2 = layers.Dropout(rate=self.dropout)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

        # Feed-forward: expand to ffn_units with ReLU, project back to d_model.
        self.dense_1 = layers.Dense(units=self.ffn_units, activation="relu")
        self.dense_2 = layers.Dense(units=self.d_model)
        self.dropout_3 = layers.Dropout(rate=self.dropout)
        self.norm_3 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, enc_outputs, mask_1, mask_2, training):
        """Run the block.

        `mask_1` is applied in the self-attention step and `mask_2` in the
        attention over `enc_outputs`; `training` toggles dropout.
        """
        self_attended = self.multi_head_attention_1(inputs, inputs, inputs, mask_1)
        self_attended = self.dropout_1(self_attended, training=training)
        self_attended = self.norm_1(self_attended+inputs)

        # Queries come from the decoder, keys and values from the encoder.
        cross_attended = self.multi_head_attention_2(self_attended, enc_outputs, enc_outputs, mask_2)
        cross_attended = self.dropout_2(cross_attended, training=training)
        cross_attended = self.norm_2(cross_attended+self_attended)

        transformed = self.dense_2(self.dense_1(cross_attended))
        transformed = self.dropout_3(transformed, training=training)
        return self.norm_3(transformed+cross_attended)


In [0]:
class Decoder(layers.Layer):
    """Stack of `n_layers` DecoderLayer blocks on top of a token embedding.

    Args:
        n_layers: number of DecoderLayer blocks.
        ffn_units: hidden width of each block's feed-forward part.
        n_heads: number of attention heads per block.
        dropout: dropout rate used after the embedding and inside each block.
        vocab_size: size of the target vocabulary (embedding rows).
        d_model: embedding / model width.
        name: Keras layer name.
    """

    def __init__(self,
                 n_layers,
                 ffn_units,
                 n_heads,
                 dropout,
                 vocab_size,
                 d_model,
                 name="decoder"):
        super().__init__(name=name)
        self.d_model = d_model
        self.n_layers = n_layers

        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout)
        self.dec_layers = [DecoderLayer(ffn_units, n_heads, dropout) for _ in range(n_layers)]

    def call(self, inputs, enc_outputs, mask_1, mask_2, training):
        """Embed `inputs`, add positions, and run every decoder block.

        Args:
            inputs: target-side token ids fed to the embedding layer.
            enc_outputs: encoder outputs attended to by each block.
            mask_1: mask forwarded to each block's self-attention.
            mask_2: mask forwarded to each block's encoder attention.
            training: whether dropout is active.

        Returns:
            The output of the last decoder block.
        """
        outputs = self.embedding(inputs)
        # Scale embeddings by sqrt(d_model) before adding the position signal.
        outputs *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs, training=training)

        # Iterate over this instance's own layers. The loop bound used to be a
        # notebook-level `n_layers` variable, which ties the layer to global
        # state and breaks when that variable is missing or differs.
        for dec_layer in self.dec_layers:
            outputs = dec_layer(outputs, enc_outputs, mask_1, mask_2, training)

        return outputs



In [0]:
class Transformer(tf.keras.models.Model):
    """Encoder-decoder model that maps source ids and target ids to logits.

    The final dense layer has `vocab_size_dec` units and no activation, so the
    model returns unnormalized scores over the target vocabulary.
    """

    def __init__(self,
                 vocab_size_enc,
                 vocab_size_dec,
                 d_model,
                 n_layers,
                 ffn_units,
                 n_heads,
                 dropout,
                 name="transformer"):
        super().__init__(name=name)

        # Settings shared by both stacks, in the order their constructors take.
        stack_args = (n_layers, ffn_units, n_heads, dropout)
        self.encoder = Encoder(*stack_args, vocab_size_enc, d_model)
        self.decoder = Decoder(*stack_args, vocab_size_dec, d_model)

        self.final_ffn = layers.Dense(units=vocab_size_dec)

    def create_padding_mask(self, seq):
        """Return 1.0 where `seq` equals 0 (padding), with two axes inserted.

        For a 2-D `seq` the result has shape (batch, 1, 1, seq_len) so that it
        broadcasts against attention scores.
        """
        is_padding = tf.math.equal(seq, 0)
        return tf.cast(is_padding, tf.float32)[:, tf.newaxis, tf.newaxis, :]

    def create_lookahead_mask(self, seq):
        """Return a (seq_len, seq_len) matrix with 1 strictly above the diagonal.

        A 1 at [i, j] marks position j as lying after position i.
        """
        seq_len = tf.shape(seq)[1]
        lower_triangle = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        return 1 - lower_triangle

    def call(self, enc_inputs, dec_inputs, training):
        """Build the masks, run encoder and decoder, and project to logits."""
        source_padding = self.create_padding_mask(enc_inputs)

        # Decoder self-attention hides both padded and later positions.
        target_padding = self.create_padding_mask(dec_inputs)
        lookahead = self.create_lookahead_mask(dec_inputs)
        self_attention_mask = tf.maximum(target_padding, lookahead)

        # Attention over the encoder outputs hides padded source positions.
        cross_attention_mask = self.create_padding_mask(enc_inputs)

        encoded = self.encoder(enc_inputs, source_padding, training)
        decoded = self.decoder(dec_inputs,
                               encoded,
                               self_attention_mask,
                               cross_attention_mask,
                               training)

        return self.final_ffn(decoded)



In [0]:
# Reset Keras' global state before the model is (re)built in this cell.
tf.keras.backend.clear_session()

# Model hyperparameters.
d_model = 128      # embedding / model width
n_layers = 4       # encoder and decoder blocks per stack
ffn_units = 512    # hidden width of each feed-forward part
n_heads = 8        # attention heads per block
dropout = 0.1      # dropout rate

transformer_config = dict(
    vocab_size_enc=vocab_size_en,
    vocab_size_dec=vocab_size_pl,
    d_model=d_model,
    n_layers=n_layers,
    ffn_units=ffn_units,
    n_heads=n_heads,
    dropout=dropout,
)
transformer = Transformer(**transformer_config)


In [0]:
# Per-position cross-entropy on raw logits; reduction is done manually in
# loss_function so padded positions can be excluded.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")


def loss_function(target, pred):
    """Mean cross-entropy over the non-padding positions of `target`.

    Args:
        target: integer ids; positions equal to 0 are treated as padding.
        pred: logits with one extra trailing axis over the vocabulary.

    Returns:
        Scalar loss: the summed per-position loss of the real tokens divided
        by the number of real tokens (0 if the batch contains none).
    """
    loss_ = loss_object(target, pred)

    mask = tf.cast(tf.math.not_equal(target, 0), dtype=loss_.dtype)
    loss_ *= mask

    # Normalize by the number of real tokens rather than by every position:
    # a plain mean also counts padded positions in the denominator, so the
    # loss scale would depend on how much padding a batch contains.
    return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))

# Running metrics displayed by the training loop.
train_loss = tf.keras.metrics.Mean(name="train_loss")
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name="train_accuracy")



In [0]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Warm-up then inverse-square-root decay learning-rate schedule.

    lr(step) = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)

    Args:
        d_model: model width used for the overall scale.
        warmup_steps: number of steps over which the rate grows linearly.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()
        self.d_model = d_model
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for optimizer step `step`."""
        # Cast once up front: the step may arrive as an integer tensor, and
        # multiplying that directly by a Python float raises a dtype error.
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)

        return tf.math.rsqrt(tf.cast(self.d_model, tf.float32)) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        return {"d_model": self.d_model, "warmup_steps": self.warmup_steps}


In [0]:
# Learning-rate schedule scaled by the model width.
learning_rate = CustomSchedule(d_model)

# Adam driven by the schedule above, with non-default betas and epsilon.
adam_options = dict(beta_1=0.9, beta_2=0.98, epsilon=1e-9)
optimizer = tf.keras.optimizers.Adam(learning_rate, **adam_options)



In [0]:
checkpoint_path = "/content/drive/My Drive/Transformers/ckpt/"

# Track model and optimizer state together; keep at most the 5 newest
# checkpoints in checkpoint_path.
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# Resume from the most recent checkpoint when one exists.
latest_checkpoint = ckpt_manager.latest_checkpoint
if latest_checkpoint:
    ckpt.restore(latest_checkpoint)
    print("Checkpoint restored!")


In [0]:
# Custom training loop: one pass over `dataset` per epoch, with a checkpoint
# saved at the end of every epoch.
epochs = 4
for epoch in range(epochs):
    print("Start of epoch {}".format(epoch+1))
    start = time.time()

    # Metrics are reported per epoch, so clear what the previous one gathered.
    train_loss.reset_states()
    train_accuracy.reset_states()

    for (batch, (enc_inputs, targets)) in enumerate(dataset):
        # Teacher forcing: the decoder sees the target without its last
        # position and is asked to predict the target shifted left by one.
        dec_inputs = targets[:, :-1]
        dec_outputs_real = targets[:, 1:]
        with tf.GradientTape() as tape:
            predictions = transformer(enc_inputs, dec_inputs, True)
            loss = loss_function(dec_outputs_real, predictions)

        gradients = tape.gradient(loss, transformer.trainable_variables)
        optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

        train_loss(loss)
        # Weight the accuracy by a non-padding mask (id 0 is padding) so that
        # padded positions, which the loss ignores, do not skew the metric.
        non_padding = tf.cast(tf.math.not_equal(dec_outputs_real, 0), tf.float32)
        train_accuracy(dec_outputs_real, predictions, sample_weight=non_padding)

        if batch % 50 == 0:
            print("Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}".format(epoch+1, batch, train_loss.result(), train_accuracy.result()))

    # NOTE(review): presumably re-mounts Drive so the checkpoint directory is
    # still reachable after a long epoch — confirm this is needed.
    drive.mount("/content/drive")
    ckpt_save_path = ckpt_manager.save()
    print("Saving checkpoint for epoch {} at {}".format(epoch+1, ckpt_save_path))
    print("Time taken for 1 epoch {}".format(time.time()-start))


In [0]:
# Shell escape: run nvidia-smi to show the GPU attached to this runtime.
!nvidia-smi

In [0]:
# Save the current model weights to Drive under the "transformer_trained"
# prefix (separate from the per-epoch checkpoints written by ckpt_manager).
transformer.save_weights("/content/drive/My Drive/Transformers/transformer_trained")

In [0]:
# Display the name of the GPU device TensorFlow can use; per the TensorFlow
# docs an empty string means no GPU is available.
tf.test.gpu_device_name()

In [0]:
def evaluate(inp_sentence):
    """Greedily decode a translation for one English sentence.

    The sentence is encoded with `tokenizer_en` and wrapped in the English
    start/end ids. Decoding starts from the Polish start id and repeatedly
    appends the highest-scoring next id until the Polish end id is predicted
    or `max_length` steps have been taken.

    Args:
        inp_sentence: the source sentence as a string.

    Returns:
        1-D integer tensor of generated ids, beginning with the Polish start
        id; the end id is not included.
    """
    inp_sentence = [vocab_size_en - 2] + tokenizer_en.encode(inp_sentence) + [vocab_size_en - 1]
    encoder_inp = tf.expand_dims(inp_sentence, axis=0)

    output = tf.expand_dims([vocab_size_pl - 2], axis=0)
    for _ in range(max_length):
        predictions = transformer(encoder_inp, output, False)
        # Only the scores for the last generated position are needed.
        prediction = predictions[:, -1:, :]

        # Cast to int32 so the id has the same dtype as `output`; a float id
        # cannot be concatenated onto the integer sequence below.
        predicted_id = tf.cast(tf.argmax(prediction, axis=-1), tf.int32)

        if predicted_id == vocab_size_pl-1:
            return tf.squeeze(output, axis=0)

        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0)



In [0]:
def translate(sentence):
    """Translate `sentence` with the trained model and print the result.

    Ids at or above `vocab_size_pl - 2` (the start/end markers) are dropped
    before the remaining ids are decoded back to text.

    Args:
        sentence: the English source sentence as a string.
    """
    output = evaluate(sentence).numpy()

    # Decode with the Polish tokenizer built in this notebook; the previously
    # referenced name `encoder_pl` is not defined anywhere in it.
    predicted_sentence = tokenizer_pl.decode([i for i in output if i < vocab_size_pl-2])

    print("Input sentence: {}".format(sentence))
    print("Translated sentence: {}".format(predicted_sentence))

In [0]:
# Smoke test: translate one example sentence and print the result.
translate("This is a problem we have to solve")