In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd "drive/MyDrive/Transformers/"

/content/drive/MyDrive/Transformers


In [3]:
# !wget "https://www.statmt.org/europarl/v7/it-en.tgz"
# !tar -xvzf it-en.tgz

In [4]:
import numpy as np
import pandas as pd
import math
import re
import time

In [5]:
import tensorflow as tf

from tensorflow.keras.models import Model
from tensorflow.keras import layers
import tensorflow_datasets as tfds

In [6]:
with open("europarl-v7.it-en.en",
          mode='r',
          encoding='utf-8') as my_file:
          europal_en = my_file.read()

with open("europarl-v7.it-en.it",
          mode='r',
          encoding='utf-8') as my_file:
          europal_it = my_file.read()

with open("nonbreaking_prefix.en.txt",
          mode='r',
          encoding='utf-8') as my_file:
          non_breaking_prefix_en = my_file.read()

with open("nonbreaking_prefix.it.txt",
          mode='r',
          encoding='utf-8') as my_file:
          non_breaking_prefix_it = my_file.read()

In [7]:
non_breaking_prefix_en = non_breaking_prefix_en.split("\n")
non_breaking_prefix_en = [" " + pref + "." for pref in non_breaking_prefix_en if pref != ""]
non_breaking_prefix_it = non_breaking_prefix_it.split("\n")
non_breaking_prefix_it = [" " + pref + "." for pref in non_breaking_prefix_it if pref != ""]

In [8]:
corpus_en = europal_en
for prefix in non_breaking_prefix_en:
    corpus_en = corpus_en.replace(prefix,prefix + "###")
corpus_en = re.sub(r"\.(?=[0-9]|[a-z]|[A-Z])",".###",corpus_en)
corpus_en = re.sub(r'\.###',"",corpus_en)
corpus_en = re.sub(r"  +"," ",corpus_en)
corpus_en = corpus_en.split("\n")

corpus_it = europal_it
for prefix in non_breaking_prefix_it:
    corpus_it = corpus_it.replace(prefix,prefix + "###")
corpus_it = re.sub(r"\.(?=[0-9]|[a-z]|[A-Z])",".###",corpus_it)
corpus_it = re.sub(r'\.###',"",corpus_it)
corpus_it = re.sub(r"  +"," ",corpus_it)
corpus_it = corpus_it.split("\n")   

In [9]:
tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    corpus_en, target_vocab_size=2**13)
tokenizer_it = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    corpus_it, target_vocab_size=2**13)

In [10]:
VOCAB_SIZE_EN = tokenizer_en.vocab_size + 2
VOCAB_SIZE_IT = tokenizer_it.vocab_size + 2

In [11]:
inputs = [[VOCAB_SIZE_EN-2] + tokenizer_en.encode(sentence) + [VOCAB_SIZE_EN-1] 
          for sentence in corpus_en]

outputs = [[VOCAB_SIZE_IT-2] + tokenizer_it.encode(sentence) + [VOCAB_SIZE_IT-1] 
          for sentence in corpus_it]          

In [12]:
MAX_LENGTH = 20
idx_to_remove = [count for count,sentence in enumerate(inputs)
                if len(sentence) > MAX_LENGTH] 
for idx in reversed(idx_to_remove):
    del inputs[idx]
    del outputs[idx]

idx_to_remove = [count for count,sentence in enumerate(outputs)
                if len(sentence) > MAX_LENGTH] 
for idx in reversed(idx_to_remove):
    del inputs[idx]
    del outputs[idx]

In [13]:
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                       value=0,
                                                       padding='post',
                                                       maxlen=MAX_LENGTH)

outputs = tf.keras.preprocessing.sequence.pad_sequences(outputs,
                                                       value=0,
                                                       padding='post',
                                                       maxlen=MAX_LENGTH)

In [14]:
BATCH_SIZE = 64
BUFFER_SIZE = 20000

datasets = tf.data.Dataset.from_tensor_slices((inputs,outputs))

datasets = datasets.cache()
datasets = datasets.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
datasets = datasets.prefetch(tf.data.experimental.AUTOTUNE)

In [15]:
class PositionalEncoding(layers.Layer):
    def __init__(self):
        super(PositionalEncoding,self).__init__()

    def get_angles(self,pos,i,d_model): # pos:(seq_length,1) i:(1,d_model) 
        angles = 1 / np.power(10000.0,(2*(i//2))/np.float32(d_model))
        return pos * angles # (seq_length,d_model)
        
    def call(self,inputs):
        seq_length = inputs.shape.as_list()[-2]
        d_model = inputs.shape.as_list()[-1]
        angles = self.get_angles(np.arange(seq_length)[:,np.newaxis],
                                 np.arange(d_model)[np.newaxis,:],
                                 d_model)
        angles[:,0::2] = np.sin(angles[:,0::2])
        angles[:,1::2] = np.cos(angles[:,1::2])
        pos_encoding = angles[np.newaxis,...]
        return inputs + tf.cast(pos_encoding,tf.float32)

In [16]:
def scaled_dor_product_attention(queris,keys,values,mask):
    product = tf.matmul(queris,keys,transpose_b=True)

    keys_dim = tf.cast(tf.shape(keys)[-1],tf.float32)
    scaled_product = product / tf.math.sqrt(keys_dim)
    if mask is not None:
        scaled_product += (mask * -1e9)

    attention = tf.matmul(tf.nn.softmax(scaled_product,axis=-1),values)
    return attention

In [17]:
class MultiHeadAttention(layers.Layer):
    def __init__(self,nb_proj):
        super(MultiHeadAttention,self).__init__()
        self.nb_proj = nb_proj
        

    def build(self,input_shape):
        self.d_model = input_shape[-1]
        assert (self.d_model % self.nb_proj) == 0
        self.d_proj = self.d_model // self.nb_proj

        self.query_lin = layers.Dense(self.d_model)
        self.keys_lin = layers.Dense(self.d_model)
        self.values_lin = layers.Dense(self.d_model)

        self.final_lin = layers.Dense(self.d_model)

    def split_proj(self,inputs,batch_size): # inputs: (batch_size,seq_length,d_model)
        shape = (batch_size,
                 -1,
                 self.nb_proj,
                 self.d_proj)

        splited_inputs = tf.reshape(inputs,shape=shape) # (batch_size,seq_length,nb_proj,d_proj)
        return tf.transpose(splited_inputs,[0,2,1,3]) # (batch_size,nb_proj,seq_length,d_proj)

    def call(self,queries,keys,values,mask):
        batch_size = tf.shape(queries)[0]

        queries = self.query_lin(queries)
        keys = self.keys_lin(keys)
        values = self.values_lin(values)

        queries = self.split_proj(queries,batch_size)
        keys = self.split_proj(keys,batch_size)
        values = self.split_proj(values,batch_size)

        attention = scaled_dor_product_attention(queries,keys,values,mask)
        attention = tf.transpose(attention,[0,2,1,3])
        shape = (batch_size,-1,self.d_model)
        concat_attention = tf.reshape(attention,shape=shape)

        outputs = self.final_lin(concat_attention)

        return outputs


In [18]:
class EncoderLayer(layers.Layer):
    def __init__(self,FFN_units,nb_proj,dropout):
        super(EncoderLayer,self).__init__()
        self.FFN_units = FFN_units
        self.nb_proj = nb_proj
        self.dropout = dropout

    def build(self,input_shape):
        self.d_model = input_shape[-1]

        self.multi_head_attention = MultiHeadAttention(self.nb_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

        self.dense_1 = layers.Dense(units=self.FFN_units,activation='relu')
        self.dense_2 = layers.Dense(units=self.d_model,activation='relu')
        self.dropout_2 = layers.Dropout(rate=self.dropout)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, mask, training):
        attention = self.multi_head_attention(inputs,inputs,inputs,mask)
        attention = self.dropout_1(attention,training=training)
        attention = self.norm_1(attention+inputs)

        outputs = self.dense_1(attention)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_2(outputs,training=training)
        outputs = self.norm_2(outputs+attention)

        return outputs

In [19]:
class Encoder(layers.Layer):
    def __init__(self,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout,
                 vocab_size,
                 d_model,
                 name="encoder"):
        super(Encoder,self).__init__(name=name)
        self.nb_layers = nb_layers
        self.d_model = d_model

        self.embedding = layers.Embedding(vocab_size,d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout)
        self.enc_layers = [
                           EncoderLayer(FFN_units,nb_proj,dropout) 
                           for _ in range(self.nb_layers)
                        ]

    def call(self,inputs,mask,training):
        outputs = self.embedding(inputs)
        outputs *= tf.math.sqrt(tf.cast(self.d_model,tf.float32))
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs,training)
        
        for i in range(self.nb_layers):
            outputs = self.enc_layers[i](outputs,mask,training)
        
        return outputs

In [20]:
class DecoderLayer(layers.Layer):
    def __init__(self,FFN_units,nb_proj,dropout):
        super(DecoderLayer,self).__init__()
        self.FFN_units = FFN_units
        self.nb_proj = nb_proj
        self.dropout = dropout
    
    def build(self,input_shape):
        self.d_model = input_shape[-1]
        self.multi_head_attention_1 = MultiHeadAttention(self.nb_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

        self.multi_head_attention_2 = MultiHeadAttention(self.nb_proj)
        self.dropout_2 = layers.Dropout(rate=self.dropout)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

        self.dense_1 = layers.Dense(units=self.FFN_units,activation='relu')
        self.dense_2 = layers.Dense(units=self.d_model,activation='relu')
        self.dropout_3 = layers.Dropout(rate=self.dropout)    
        self.norm_3 = layers.LayerNormalization(epsilon=1e-6)

    def call(self,inputs,enc_outputs,mask_1,mask_2,training):
        attention = self.multi_head_attention_1(inputs,inputs,inputs,mask_1)
        attention = self.dropout_1(attention,training)
        attention = self.norm_1(attention+inputs)
        
        attention_2 = self.multi_head_attention_2(attention,
                                                  enc_outputs,
                                                  enc_outputs,
                                                  mask_2) 
        attention_2 = self.dropout_2(attention_2,training)
        attention_2 = self.norm_2(attention_2+attention)

        outputs = self.dense_1(attention_2)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_3(outputs,training)
        outputs = self.norm_3(outputs + attention_2)

        return outputs

In [21]:
class Decoder(layers.Layer):
    def __init__(self,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout,
                 vocab_size,
                 d_model,
                 name='decoder'):
        super(Decoder,self).__init__(name=name)
        self.d_model = d_model
        self.nb_layers = nb_layers
        self.embedding = layers.Embedding(vocab_size,d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout)

        self.dec_layers = [
                           DecoderLayer(FFN_units,nb_proj,dropout)
                           for _ in range(nb_layers)]
        

    def call(self,inputs,enc_outputs,mask_1,mask_2,training):
        outputs = self.embedding(inputs)
        outputs *=  tf.math.sqrt(tf.cast(self.d_model,tf.float32))
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs,training)
        
        for i in range(self.nb_layers):
            outputs = self.dec_layers[i](outputs,
                                         enc_outputs,
                                         mask_1,
                                         mask_2,
                                         training)      
        return outputs

In [22]:
class Transformer(tf.keras.Model):
    def __init__(self,
                 vocab_size_enc,
                 vocab_size_dec,
                 d_model,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout,
                 name="transformer"):
        super(Transformer,self).__init__(name=name)
        self.encoder =  Encoder(nb_layers,
                                FFN_units,
                                nb_proj,
                                dropout,
                                vocab_size_enc,
                                d_model)
        self.decoder = Decoder(nb_layers,
                               FFN_units,
                               nb_proj,
                               dropout,
                               vocab_size_dec,
                               d_model)
        self.last_linear = layers.Dense(units=vocab_size_dec)
    
    def create_padding_mask(self,seq): # (batch_size,seq_length)
        mask = tf.cast(tf.math.equal(seq,0),tf.float32)
        return mask[:,tf.newaxis,tf.newaxis,:]

    def create_look_ahead_mask(self,seq):
        seq_len = tf.shape(seq)[1]
        look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len,seq_len)),-1,0)
        return look_ahead_mask
    
    def call(self,enc_inputs,dec_inputs,training):
        enc_mask = self.create_look_ahead_mask(enc_inputs)
        dec_mask_1 = tf.maximum(
            self.create_padding_mask(dec_inputs),
            self.create_look_ahead_mask(dec_inputs)
        )
        dec_mask_2 = self.create_padding_mask(enc_inputs)

        enc_outputs = self.encoder(enc_inputs,enc_mask,training)
        dec_outputs = self.decoder(dec_inputs,
                                   enc_outputs,
                                   dec_mask_1,
                                   dec_mask_2,
                                   training)
        
        outputs = self.last_linear(dec_outputs)

        return outputs

In [23]:
tf.keras.backend.clear_session()

# HYPER-PARAMETERS
D_MODEL = 128
NB_LAYERS = 4
FFN_UNITS = 512
NB_PROJ = 8
DROPOUT_RATE = 0.1

transformer = Transformer(vocab_size_enc = VOCAB_SIZE_EN,
                          vocab_size_dec = VOCAB_SIZE_IT,
                          d_model = D_MODEL,
                          nb_layers = NB_LAYERS,
                          FFN_units = FFN_UNITS,
                          nb_proj = NB_PROJ,
                          dropout = DROPOUT_RATE)

In [24]:
def custom_sparse_categorical_accuracy(y_true, y_pred):
    return K.cast(K.equal(K.max(y_true, axis=-1),
                          K.cast(K.argmax(y_pred, axis=-1), K.floatx())),
                  K.floatx())
    
def sparse_cross_entropy(y_true, y_pred):
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true,
                                                          logits=y_pred)
    return loss

In [25]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                            reduction='none')

def loss_function(target,prediction):
    mask = tf.math.logical_not(tf.math.equal(target,0))

    loss_ = loss_object(target,prediction)
    
    mask = tf.cast(mask,dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

In [26]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self,d_model,warmup_steps=4000):
        super(CustomSchedule,self).__init__()
        self.d_model = tf.cast(d_model,tf.float32)
        self.warmup_steps = warmup_steps
    
    def __call__(self,step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1,arg2)
    
learning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(learning_rate,
                                     beta_1=0.9,
                                     beta_2=0.98,
                                     epsilon=1e-9)

In [27]:
checkpoint_path = './Checkpoints'

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt,checkpoint_path,max_to_keep=5)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest Checkpoint restored")

In [28]:
EPOCHS = 5
for epoch in range(EPOCHS):
    print(f"Start of epoch {epoch+1}")
    start = time.time()
    train_loss.reset_states()
    train_accuracy.reset_states()
    for (batch,(enc_inputs,targets)) in enumerate(datasets):
        dec_inputs = targets[:,:-1]
        dec_outputs_real = targets[:,1:]
        with tf.GradientTape() as tape:
            predictions = transformer(enc_inputs,dec_inputs,True)
            loss = loss_function(dec_outputs_real,predictions)
        
        gradients = tape.gradient(loss,transformer.trainable_variables)
        optimizer.apply_gradients(zip(gradients,transformer.trainable_variables))

        train_loss(loss)
        train_accuracy(dec_outputs_real,predictions)

        if batch % 50 == 0:
            print("Epoch {} Batch {} Loss {:.4} Accuracy {:0.4f}".format(
                epoch+1,batch,train_loss.result(),train_accuracy.result()))
    


    ckpt_save_path = ckpt_manager.save()
    print("Saving checkpoints for epoch {} at {}".format(epoch+1,ckpt_save_path))
    end = time.time()
    print("Time taken for 1 epoch: {} secs".format(end-start))

Start of epoch 1
Epoch 1 Batch 0 Loss 5.845 Accuracy 0.0000
Epoch 1 Batch 50 Loss 6.109 Accuracy 0.0119
Epoch 1 Batch 100 Loss 6.041 Accuracy 0.0321
Epoch 1 Batch 150 Loss 5.964 Accuracy 0.0389
Epoch 1 Batch 200 Loss 5.893 Accuracy 0.0423
Epoch 1 Batch 250 Loss 5.794 Accuracy 0.0444
Epoch 1 Batch 300 Loss 5.689 Accuracy 0.0457
Epoch 1 Batch 350 Loss 5.579 Accuracy 0.0469
Epoch 1 Batch 400 Loss 5.468 Accuracy 0.0507
Epoch 1 Batch 450 Loss 5.359 Accuracy 0.0552
Epoch 1 Batch 500 Loss 5.26 Accuracy 0.0591
Epoch 1 Batch 550 Loss 5.174 Accuracy 0.0627
Epoch 1 Batch 600 Loss 5.093 Accuracy 0.0664
Epoch 1 Batch 650 Loss 5.017 Accuracy 0.0702
Epoch 1 Batch 700 Loss 4.947 Accuracy 0.0744
Epoch 1 Batch 750 Loss 4.882 Accuracy 0.0785
Epoch 1 Batch 800 Loss 4.817 Accuracy 0.0825
Epoch 1 Batch 850 Loss 4.754 Accuracy 0.0865
Epoch 1 Batch 900 Loss 4.696 Accuracy 0.0903
Epoch 1 Batch 950 Loss 4.636 Accuracy 0.0941
Epoch 1 Batch 1000 Loss 4.58 Accuracy 0.0977
Epoch 1 Batch 1050 Loss 4.527 Accuracy 0.1

In [29]:
def evaluate(input_sentence):
    input_sentence = [VOCAB_SIZE_EN-2] + tokenizer_en.encode(input_sentence) + [VOCAB_SIZE_EN-1]
    enc_input = tf.expand_dims(input_sentence,axis=0)
    output = tf.expand_dims([VOCAB_SIZE_IT-2],axis=0)
    
    for _ in range(MAX_LENGTH):
        my_predictions = transformer(enc_input,output,False) # (1,seq_length,vocab_size_it)
        prediction = my_predictions[:,-1:,:]
        prediction_id = tf.cast(tf.argmax(prediction,axis=-1) , tf.int32)
        if prediction_id == VOCAB_SIZE_IT-1:
            return tf.squeeze(output,axis=0)
        output = tf.concat([output,prediction_id],axis=-1)
    return tf.squeeze(output,axis=0)

In [30]:
def translate(sentence):
    output = evaluate(sentence).numpy()
    predicted_sentence = tokenizer_it.decode(
        [i for i in output if i < VOCAB_SIZE_IT-2])
    print("Input: {}".format(sentence))
    print("Predicted translation: {}".format(predicted_sentence))

In [36]:
translate("Working from home is really hard")

Input: Working from home is really hard
Predicted translation: Lavori a casa si trova davvero molto difficile


In [31]:
# function ConnectButton(){
#     console.log("Connect pushed"); 
#     document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click() 
# }
# undefined
# setInterval(ConnectButton,60000);