In [1]:
!pip install -q -U 'tensorflow-text==2.8.*'

[K     |████████████████████████████████| 4.9 MB 26.0 MB/s 
[K     |████████████████████████████████| 498.0 MB 12 kB/s 
[K     |████████████████████████████████| 1.4 MB 62.2 MB/s 
[K     |████████████████████████████████| 462 kB 72.7 MB/s 
[K     |████████████████████████████████| 5.8 MB 59.5 MB/s 
[?25h

In [2]:
!pip install -q tf-models-official

[K     |████████████████████████████████| 2.4 MB 33.5 MB/s 
[K     |████████████████████████████████| 662 kB 71.9 MB/s 
[K     |████████████████████████████████| 238 kB 62.0 MB/s 
[K     |████████████████████████████████| 2.3 MB 61.6 MB/s 
[K     |████████████████████████████████| 5.8 MB 49.5 MB/s 
[K     |████████████████████████████████| 588.3 MB 20 kB/s 
[K     |████████████████████████████████| 352 kB 25.0 MB/s 
[K     |████████████████████████████████| 1.3 MB 57.1 MB/s 
[K     |████████████████████████████████| 43 kB 2.3 MB/s 
[K     |████████████████████████████████| 118 kB 69.6 MB/s 
[K     |████████████████████████████████| 38.2 MB 66.8 MB/s 
[K     |████████████████████████████████| 1.1 MB 65.2 MB/s 
[K     |████████████████████████████████| 6.0 MB 65.1 MB/s 
[K     |████████████████████████████████| 439 kB 74.1 MB/s 
[K     |████████████████████████████████| 1.7 MB 64.1 MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
import re
import time
import unicodedata
import datetime
import pathlib

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from keras import backend as K
from keras import layers

import tensorflow_hub as hub
import tensorflow_models as tfm

import tensorflow_text as text
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

In [5]:
tf.get_logger().setLevel('ERROR')
tf.config.run_functions_eagerly(True)

### Variabili Globali

In [6]:
# PARAMETRI GLOBALI
root_folder = 'drive/MyDrive/BERT/'

# DATI
data_folder_name = 'data'
train_filename = 'ita.txt'

DATA_PATH = os.path.abspath(os.path.join(root_folder, data_folder_name))
train_filenamepath = os.path.abspath(os.path.join(DATA_PATH, train_filename))

# PATH LOG Tensorboard
PATH_LOG = 'logs/fit/transformer_no_bert_v5'
PATH_LOG = os.path.abspath(os.path.join(root_folder, PATH_LOG))
log_dir =  os.path.abspath(os.path.join(PATH_LOG, datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))) 

# PATH WEIGHTS Tensorboard
PATH_WEIGHTS = 'weights/transformer_nobert_v5'
PATH_WEIGHTS = os.path.abspath(os.path.join(root_folder, PATH_WEIGHTS))

# MODELLO TOKENIZER
model_name = 'tokenizer_en_it_model'
tokenizer_folder_name = 'tokenizer'

TOKEN_PATH = os.path.abspath(os.path.join(root_folder, tokenizer_folder_name))
tokenizer_filenamepath = os.path.abspath(os.path.join(TOKEN_PATH, model_name))

In [7]:
# parametri per il modello
INPUT_COLUMN = 'input'
TARGET_COLUMN = 'target'
TARGET_FOR_INPUT = 'target_for_input'
NUM_SAMPLES = 350000 # portato da 10.000 a 100.000
TRAIN = 18016
VALIDATION = 6016
TEST = 100

MAX_VOCAB_SIZE = 20000 # portato da 20.0000 a 200.000
EMBEDDING_DIM = 64  # --> 256  Densa non lineare relu --> 64  Densa non lineare relu (oppure Conv1D kernel=1)
HIDDEN_DIM = 1024 # numero di celle nei layer ricorrenti nascosti

BATCH_SIZE = 32
BUFFER_SIZE = 2000
EPOCHS = 70
MAX_SEQ_LENGTH = 128

NUM_LAYERS = 1 # Numero di layer di Encoder e Decoder del Transformer
NUM_HEADS = 8 # Numero di meccanismi di multi-head attention
FF_DIM = 16 # Numero di celle dei Layer Feed Forward
DROPUOT = 0.5

# LEARNING_RATE=0.01

# IMPOSTO IL DEBUG A TRUE 
debug = True
training = True

### Caricamento Dati

In [8]:
# Caricamento dataset: frasi in inglese, frasi in italiano
df = pd.read_csv(
    train_filenamepath,
    sep="\t",
    header=None,
    names=[INPUT_COLUMN, TARGET_COLUMN],
    usecols=[0,1],
    nrows=NUM_SAMPLES
)

df = df[-(TRAIN+VALIDATION+TEST):].reset_index(drop=True)

# Mischio il dataset in modo che sia più uniforme tra train e test
df = df.iloc[np.random.permutation(df.index)].reset_index(drop=True)

print(df.iloc[-4:], '\n')

# Preprocessing dei dati di Input
input_data = df[INPUT_COLUMN].tolist()

# Preprocessing dei dati Target con aggiunta del token di fine frase
target_data = df[TARGET_COLUMN].tolist()


train_input_data = input_data[:TRAIN]
train_target_data = target_data[:TRAIN]

validation_input_data = input_data[TRAIN:TRAIN+VALIDATION]
validation_target_data = target_data[TRAIN:TRAIN+VALIDATION]

test_input_data = input_data[TRAIN+VALIDATION:]
test_target_data = target_data[TRAIN+VALIDATION:]

print('-----------TRAIN SET--------------')
print(train_input_data[-4:])
print(train_target_data[-4:])
print('-----------VALIDATION SET---------------')
print(validation_input_data[-4:])
print(validation_target_data[-4:])
print('-----------TEST SET---------------')
print(test_input_data[-4:])
print(test_target_data[-4:])

                                                   input  \
20796         How did you two meet? "It's a long story."   
20797  If you tell the truth, you don't have to remem...   
20798        Tom is young, rich, spoiled and egocentric.   
20799  Tom decided to give up skateboarding after his...   

                                                  target  
20796  Come vi siete conosciute voi due? "È una stori...  
20797       Se dice la verità, non deve ricordare nulla.  
20798      Tom è giovane, ricco, viziato ed egocentrico.  
20799  Tom decise di rinunciare ad andare in skateboa...   

-----------TRAIN SET--------------
['He is, without question, the best man for the job.', 'I want to know how you got past the guards.', 'Tom saw something on the floor by the sofa.', 'It took a long time to accustom myself to the noise.']
["Lui è, senza dubbio, l'uomo migliore per il lavoro.", 'Voglio sapere come hai superato le guardie.', 'Tom vide qualcosa sul pavimento accanto al divano.', 'Mi c

### Analisi Dati

In [9]:
print(f'Esempi nel Dataset di Train                            : {len(train_input_data)}')
print(f'Frase più corta in inglese nel Dataset di Train        : {min(train_input_data, key = len)}')
print(f'Frase più corta in italiano nel Dataset di Train       : {min(train_target_data, key = len)}')
print(f'Frase più lunga in inglese nel Dataset di Train        : {max(train_input_data, key = len)}')
print(f'Frase più lunga in italiano nel Dataset di Train       : {max(train_target_data, key = len)}')
print('---------------------------------------------------------------------------------------')
print(f'Esempi nel Dataset di Validation                       : {len(validation_input_data)}')
print(f'Frase più corta in inglese nel Dataset di Validation   : {min(validation_input_data, key = len)}')
print(f'Frase più corta in italiano nel Dataset di Validation  : {min(validation_target_data, key = len)}')
print(f'Frase più lunga in inglese nel Dataset di Validation   : {max(validation_input_data, key = len)}')
print(f'Frase più lunga in italiano nel Dataset di Validation  : {max(validation_target_data, key = len)}')
print('---------------------------------------------------------------------------------------')
print(f'Esempi nel Dataset di Test                             : {len(test_input_data)}')
print(f'Frase più corta in inglese nel Dataset di Test         : {min(test_input_data, key = len)}')
print(f'Frase più corta in italiano nel Dataset di Test        : {min(test_target_data, key = len)}')
print(f'Frase più lunga in inglese nel Dataset di Test         : {max(test_input_data, key = len)}')
print(f'Frase più lunga in italiano nel Dataset di Test        : {max(test_target_data, key = len)}')

Esempi nel Dataset di Train                       : 16000
Frase più corta in inglese nel Dataset di Train   : I'm not very good at this. "Neither am I."
Frase più corta in italiano nel Dataset di Train  : Dove vai in vacanza?
Frase più lunga in inglese nel Dataset di Train   : The shoes were made of some soft stuff that looked like leather.
Frase più lunga in italiano nel Dataset di Train  : I lavoratori del settore dei trasporti organizzarono uno sciopero per protestare contro i tagli di paga.
---------------------------------------------------------------------------------------
Esempi nel Dataset di Test                        : 4800
Frase più corta in inglese nel Dataset di Train   : How old is she? "She is twelve years old."
Frase più corta in italiano nel Dataset di Train  : Deve aiutare Tom, lo sa.
Frase più lunga in inglese nel Dataset di Train   : I want to go to Australia once again before my passport expires.
Frase più lunga in italiano nel Dataset di Train  : I lavoratori d

### Tokenizer

Carico il modello di tokenizer creato utilizzzando il set di dati a disposizione

In [10]:
dataset = tf.data.Dataset.from_tensor_slices((input_data, target_data))
dataset = dataset.shuffle(len(input_data)).batch(BATCH_SIZE, drop_remainder=True)

In [11]:
bert_tokenizer_params=dict(lower_case=True)
reserved_tokens=["[PAD]", "[UNK]", "[START]", "[END]"]

bert_vocab_args = dict(
    # The target vocabulary size
    vocab_size = MAX_VOCAB_SIZE,
    # Reserved tokens that must be included in the vocabulary
    reserved_tokens=reserved_tokens,
    # Arguments for `text.BertTokenizer`
    bert_tokenizer_params=bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    learn_params={},
)

In [12]:
train_en = dataset.map(lambda en, it: en)
train_it = dataset.map(lambda en, it: it)



In [13]:
%%time
en_vocab = bert_vocab.bert_vocab_from_dataset(
    train_en.batch(10000).prefetch(2),
    **bert_vocab_args
)

CPU times: user 18.8 s, sys: 66.1 ms, total: 18.8 s
Wall time: 23.4 s


In [14]:
%%time
it_vocab = bert_vocab.bert_vocab_from_dataset(
    train_it.batch(10000).prefetch(2),
    **bert_vocab_args
)

CPU times: user 22.1 s, sys: 84.2 ms, total: 22.2 s
Wall time: 22.3 s


In [15]:
def write_vocab_file(filepath, vocab):
  with open(filepath, 'w') as f:
    for token in vocab:
      print(token, file=f)

In [16]:
# VOCABOLARIO
vocab_folder = 'vocab'
en_vocab_finalname = 'en_vocab_1.txt'
it_vocab_finalname = 'it_vocab_1.txt'

VOCAB_PATH = os.path.abspath(os.path.join(root_folder, vocab_folder))
en_vocab_filenamepath = os.path.abspath(os.path.join(VOCAB_PATH, en_vocab_finalname))
it_vocab_filenamepath = os.path.abspath(os.path.join(VOCAB_PATH, it_vocab_finalname))

In [17]:
write_vocab_file(en_vocab_filenamepath, en_vocab)
write_vocab_file(it_vocab_filenamepath, it_vocab)

In [18]:
en_tokenizer = text.BertTokenizer(en_vocab_filenamepath, **bert_tokenizer_params)
it_tokenizer = text.BertTokenizer(it_vocab_filenamepath, **bert_tokenizer_params)

In [19]:
START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
END = tf.argmax(tf.constant(reserved_tokens) == "[END]")

def add_start_end(ragged):
  count = ragged.bounding_shape()[0]
  starts = tf.fill([count,1], START)
  ends = tf.fill([count,1], END)
  x = tf.concat([starts, ragged, ends], axis=1)
  # x = keras.preprocessing.sequence.pad_sequences(x.numpy(), maxlen=MAX_SEQ_LENGTH, padding='post')
  return x

In [20]:
def cleanup_text(reserved_tokens, token_txt):
  # Drop the reserved tokens, except for "[UNK]".
  bad_tokens = [re.escape(tok) for tok in reserved_tokens if tok != "[UNK]"]
  bad_token_re = "|".join(bad_tokens)

  bad_cells = tf.strings.regex_full_match(token_txt, bad_token_re)
  result = tf.ragged.boolean_mask(token_txt, ~bad_cells)

  # Join them into strings.
  result = tf.strings.reduce_join(result, separator=' ', axis=-1)

  return result

In [21]:
class CustomTokenizer(tf.Module):
  def __init__(self, reserved_tokens, vocab_path):
    self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
    self._reserved_tokens = reserved_tokens
    self._vocab_path = tf.saved_model.Asset(vocab_path)

    vocab = pathlib.Path(vocab_path).read_text().splitlines()
    self.vocab = tf.Variable(vocab)

    ## Create the signatures for export:   

    # Include a tokenize signature for a batch of strings. 
    self.tokenize.get_concrete_function(
        tf.TensorSpec(shape=[None], dtype=tf.string))
    
    # Include `detokenize` and `lookup` signatures for:
    #   * `Tensors` with shapes [tokens] and [batch, tokens]
    #   * `RaggedTensors` with shape [batch, tokens]
    self.detokenize.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.detokenize.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    self.lookup.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    # These `get_*` methods take no arguments
    self.get_vocab_size.get_concrete_function()
    self.get_vocab_path.get_concrete_function()
    self.get_reserved_tokens.get_concrete_function()
    
  @tf.function
  def tokenize(self, strings):
    enc = self.tokenizer.tokenize(strings)
    # Merge the `word` and `word-piece` axes.
    enc = enc.merge_dims(-2,-1)
    enc = add_start_end(enc)
    return enc

  @tf.function
  def detokenize(self, tokenized):
    words = self.tokenizer.detokenize(tokenized)
    return cleanup_text(self._reserved_tokens, words)

  @tf.function
  def lookup(self, token_ids):
    return tf.gather(self.vocab, token_ids)

  @tf.function
  def get_vocab_size(self):
    return tf.shape(self.vocab)[0]

  @tf.function
  def get_vocab_path(self):
    return self._vocab_path

  @tf.function
  def get_reserved_tokens(self):
    return tf.constant(self._reserved_tokens)

In [22]:
tokenizers = tf.Module()
tokenizers.en = CustomTokenizer(reserved_tokens, en_vocab_filenamepath)
tokenizers.it = CustomTokenizer(reserved_tokens, it_vocab_filenamepath)

In [23]:
print(f'Vocabolario Inglese  : {tokenizers.en.get_vocab_size()}')
print(f'Vocabolario Italiano : {tokenizers.it.get_vocab_size()}')

Vocabolario Inglese  : 2119
Vocabolario Italiano : 2604


In [24]:
# tokenizers = tf.saved_model.load(tokenizer_filenamepath)

In [25]:
print(input_data[-1:])
print(tokenizers.en.tokenize(input_data[-1:]))
print(keras.preprocessing.sequence.pad_sequences(tokenizers.en.tokenize(input_data[-1:]).numpy(), maxlen=MAX_SEQ_LENGTH, padding='post'))
print(tokenizers.en.detokenize(tokenizers.en.tokenize(input_data[-1:])))
print('------------------------------------------------------------------')
print(target_data[-1:])
print(tokenizers.it.tokenize(target_data[-1:]))
print(keras.preprocessing.sequence.pad_sequences(tokenizers.it.tokenize(target_data[-1:]).numpy(), maxlen=MAX_SEQ_LENGTH, padding='post'))
print(tokenizers.it.detokenize(tokenizers.it.tokenize(target_data[-1:])))

['Tom decided to give up skateboarding after his accident.']
<tf.RaggedTensor [[2, 54, 343, 52, 259, 139, 43, 1585, 801, 1054, 286, 1094, 121, 221, 72,
  317, 11, 3]]>
[[   2   54  343   52  259  139   43 1585  801 1054  286 1094  121  221
    72  317   11    3    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]]
tf.Tensor([b'tom decided to give up skateboarding after his accident .'], shape=(1,), dtype=string)
------------------------------------------------------------------
['Tom decise di rinunciare ad andare in skateboard dopo il suo incidente.']
<tf.RaggedTensor [[2, 56, 28, 244, 1492, 53, 42, 2383, 760, 2396, 145, 95, 61, 43, 1865,
  104, 2594, 224, 301, 332, 682, 217, 57, 83, 348, 11, 3]]>
[[   2   56   28  244 1492   53   42 2383  760 2396  145   95   61   43
  1865  104 2594  224  301  332  682

In [26]:
print([min(train_input_data, key = len)])
print(tokenizers.en.tokenize([min(train_input_data, key = len)]))
print(keras.preprocessing.sequence.pad_sequences(tokenizers.en.tokenize([min(train_input_data, key = len)]).numpy(), maxlen=MAX_SEQ_LENGTH, padding='post'))
print(tokenizers.en.detokenize(tokenizers.en.tokenize([min(train_input_data, key = len)])))
print('------------------------------------------------------------------')
print([min(train_target_data, key = len)])
print(tokenizers.en.tokenize([min(train_target_data, key = len)]))
print(keras.preprocessing.sequence.pad_sequences(tokenizers.en.tokenize([min(train_target_data, key = len)]).numpy(), maxlen=MAX_SEQ_LENGTH, padding='post'))
print(tokenizers.en.detokenize(tokenizers.en.tokenize([min(train_target_data, key = len)])))

['I\'m not very good at this. "Neither am I."']
<tf.RaggedTensor [[2, 33, 8, 37, 80, 116, 154, 78, 64, 11, 5, 1403, 246, 33, 11, 5, 3]]>
[[   2   33    8   37   80  116  154   78   64   11    5 1403  246   33
    11    5    3    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]]
tf.Tensor([b'i \' m not very good at this . " neither am i . "'], shape=(1,), dtype=string)
------------------------------------------------------------------
['Dove vai in vacanza?']
<tf.RaggedTensor [[2, 67, 1060, 46, 275, 591, 55, 46, 275, 2115, 422, 1998, 275, 24, 3]]>
[[   2   67 1060   46  275  591   55   46  275 2115  422 1998  275   24
     3    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0

In [27]:
print([max(train_input_data, key = len)])
print(tokenizers.en.tokenize([max(train_input_data, key = len)]))
print(keras.preprocessing.sequence.pad_sequences(tokenizers.en.tokenize([max(train_input_data, key = len)]).numpy(), maxlen=MAX_SEQ_LENGTH, padding='post'))
print(tokenizers.en.detokenize(tokenizers.en.tokenize([max(train_input_data, key = len)])))
print('------------------------------------------------------------------')
print([max(train_target_data, key = len)])
print(tokenizers.en.tokenize([max(train_target_data, key = len)]))
print(keras.preprocessing.sequence.pad_sequences(tokenizers.en.tokenize([max(train_target_data, key = len)]).numpy(), maxlen=MAX_SEQ_LENGTH, padding='post'))
print(tokenizers.en.detokenize(tokenizers.en.tokenize([max(train_target_data, key = len)])))

['The shoes were made of some soft stuff that looked like leather.']
<tf.RaggedTensor [[2, 51, 505, 107, 256, 56, 164, 115, 1817, 1359, 58, 471, 86, 36, 202,
  714, 1508, 11, 3]]>
[[   2   51  505  107  256   56  164  115 1817 1359   58  471   86   36
   202  714 1508   11    3    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]]
tf.Tensor([b'the shoes were made of some soft stuff that looked like leather .'], shape=(1,), dtype=string)
------------------------------------------------------------------
['I lavoratori del settore dei trasporti organizzarono uno sciopero per protestare contro i tagli di paga.']
<tf.RaggedTensor [[2, 33, 36, 275, 2118, 429, 1673, 591, 28, 922, 1169, 203, 929, 28, 202,
  591, 44, 342, 570, 1247, 591, 159, 703, 422, 591, 1998, 1998, 578, 492,
  286, 45, 248, 286, 43, 2115, 591, 197

### Creazione dataset
Utilizzo della libreria tf.data per la gestione del dataset da utilizzare.
Verranno creati batch di esempi che verranno utilizzati durante l'addestramento.

In [28]:
def prepare_batch(en, it):
  zero = tf.zeros([BATCH_SIZE, MAX_SEQ_LENGTH], tf.int64)
  en = tokenizers.en.tokenize(en) # Output is ragged.
  en = tf.concat([en, zero], 1)
  en = en[:, :MAX_SEQ_LENGTH]     # Trim to MAX_TOKENS.
  en = en.to_tensor()             # Convert to 0-padded dense Tensor

  it = tokenizers.it.tokenize(it)
  it_inputs = it[:, :-1].to_tensor()  # Drop the [END] tokens
  it_labels = it[:, 1:].to_tensor()   # Drop the [START] tokens
  
  it_inputs = tf.concat([it_inputs, zero], 1)
  it_inputs = it_inputs[:, :(MAX_SEQ_LENGTH)]

  it_labels = tf.concat([it_labels, zero], 1)
  it_labels = it_labels[:, :(MAX_SEQ_LENGTH)]

  return (en, it_inputs), it_labels

In [29]:
def make_batches(ds):
  return (
      ds
      .shuffle(BUFFER_SIZE)
      .batch(BATCH_SIZE)
      .map(prepare_batch, tf.data.AUTOTUNE)
      .prefetch(buffer_size=tf.data.AUTOTUNE))

In [30]:
# Definizione del dataset
# [from_tensor_slices] permette di recuperare batch
# di esempi dai dataset di riferimento
train_dataset = tf.data.Dataset.from_tensor_slices((train_input_data, train_target_data))
validation_dataset = tf.data.Dataset.from_tensor_slices((validation_input_data, validation_target_data))

# impostazione del recupero di esempi presi in maniera
# casuale in gruppi di [BATCH_SIZE] tra quelli disponibili
train_dataset = make_batches(train_dataset)
validation_dataset = make_batches(validation_dataset)

In [31]:
# Recupero un batch di esempi per la verifica delle classi custom che andrò a creare
for (en_input, it_input), it_target in train_dataset.take(1):
  print(f'Shape en input           : {en_input.shape}')
  print(f'Example en input         : {en_input[0]}')  
  print('-------------------------------------------------------')
  print(f'Shape it input           : {it_input.shape}')
  print(f'Example it input         : {it_input[0]}')  
  print(f'Shape it input           : {it_target.shape}')
  print(f'Example it target        : {it_target[0]}')  

Shape en input           : (32, 64)
Example en input         : [  2  33   8  37 181  54  99  73 239  52 667 361  64 409  11   3   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0]
-------------------------------------------------------
Shape it input           : (32, 64)
Example it input         : [   2   65   63  282   54   56  236   61  245   53 2277   74  170   11
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]
Shape it input           : (32, 64)
Example it target        : [  65   63  282   54   56  236   61  245   53 2277   74  170   11    3
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0 

### Token and Position Embedding

Implementazione del blocco Embedding per l'utilizzo di vettori posizionali insieme ai vettori di token di parole tramite estensione della classe Layer di Keras

In [32]:
class TokenAndPositionEmbedding(layers.Layer):
  def __init__(self, maxlen, vocab_size, embed_dim):
    super(TokenAndPositionEmbedding, self).__init__()
    self.maxlen = maxlen
    self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
    self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

  def call(self, x, debug=False):
    x = keras.preprocessing.sequence.pad_sequences(x, maxlen=self.maxlen, padding='post')
    maxlen = tf.shape(x)[-1]

    if debug:
      print('********** DEBUG TOKEN AND POSITION EMBEDDING ***********')
      print(f'Sequence Max len                          : {maxlen}')
      print(f'Sequence Shape                            : {tf.shape(x)}')

    positions = tf.range(start=0, limit=maxlen, delta=1)
    positions = self.pos_emb(positions)
    x = self.token_emb(x)
    output = x + positions

    if debug:
      print(f'Shape TokenAndPositionEmbedding           : {output.shape}')
      print('*********************************************************')

    return output

In [33]:
token_position_en = TokenAndPositionEmbedding(MAX_SEQ_LENGTH, tokenizers.en.get_vocab_size(), EMBEDDING_DIM)
token_position_it = TokenAndPositionEmbedding(MAX_SEQ_LENGTH, tokenizers.it.get_vocab_size(), EMBEDDING_DIM)

inputs_encoder = token_position_en(en_input, debug)
inputs_decoder = token_position_it(it_input, debug)

********** DEBUG TOKEN AND POSITION EMBEDDING ***********
Sequence Max len                          : 64
Sequence Shape                            : [32 64]
Shape TokenAndPositionEmbedding           : (32, 64, 64)
*********************************************************
********** DEBUG TOKEN AND POSITION EMBEDDING ***********
Sequence Max len                          : 64
Sequence Shape                            : [32 64]
Shape TokenAndPositionEmbedding           : (32, 64, 64)
*********************************************************


### Encoder

Implmentazione di un blocco di EncoderTransformer tramite estensione della classe Layer di Keras

In [34]:
class Encoder(layers.Layer):
  def __init__(self, max_len, embed_dim, num_heads, ff_dim, rate=0.5, name='ENC'):
    super(Encoder, self).__init__()
    self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    self.ffn = keras.Sequential(
      [layers.Dense(ff_dim, activation='relu'), layers.Dense(embed_dim),]
    )
    self.layernorm1 = layers.LayerNormalization()
    self.layernorm2 = layers.LayerNormalization()
    self.dropout1 = layers.Dropout(rate)
    self.dropout2 = layers.Dropout(rate)
    self._name = name

  def call(self, inputs, training=False, debug=False):
    attn_output = self.att(query=inputs,
                           value=inputs, 
                           key=inputs)
    
    attn_output = self.dropout1(attn_output)
    out1 = self.layernorm1(inputs + attn_output)

    ffn_output = self.ffn(out1)
    ffn_output = self.dropout2(ffn_output, training=training)

    output = self.layernorm2(out1 + ffn_output)

    if debug:
      print('********************* DEBUG ENCODER *********************')
      print(f'Shape Input Layer Encoder       : {inputs.shape}')
      print(f'Shape Output Layer Encoder      : {output.shape}')
      print('*********************************************************')

    return output

In [35]:
encoder = Encoder(MAX_SEQ_LENGTH, 
                  EMBEDDING_DIM, 
                  NUM_HEADS, 
                  FF_DIM, 
                  DROPUOT)

outputs_encoder = encoder(inputs=inputs_encoder,
                          training=training, 
                          debug=debug)

********************* DEBUG ENCODER *********************
Shape Input Layer Encoder       : (32, 64, 64)
Shape Output Layer Encoder      : (32, 64, 64)
*********************************************************


### Decoder

Implementazione di un blocco di DecoderTransformer tramite estensione della classe Layer di Keras

In [36]:
class Decoder(layers.Layer):
  def __init__(self, max_len, embed_dim, num_heads, ff_dim, rate=0.5, name='DEC'):
    super(Decoder, self).__init__()
    self.att1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    self.att2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    self.ffn = keras.Sequential(
      [layers.Dense(ff_dim, activation='relu'), layers.Dense(embed_dim),]
    )
    self.layernorm1 = layers.LayerNormalization()
    self.layernorm2 = layers.LayerNormalization()
    self.layernorm3 = layers.LayerNormalization()
    self.dropout1 = layers.Dropout(rate)
    self.dropout2 = layers.Dropout(rate)
    self.dropout3 = layers.Dropout(rate)
    self._name = name

  def call(self, inputs, encoder_output, training=False, debug=False):
    attn_output1 = self.att1(query=inputs,
                             value=inputs, 
                             key=inputs, 
                             use_causal_mask=True)
    
    attn_output1 = self.dropout1(attn_output1)
    out1 = self.layernorm1(inputs + attn_output1)

    attn_output2 = self.att2(key=encoder_output, 
                             value=encoder_output, 
                             query=out1)
    
    attn_output2 = self.dropout2(attn_output2, training=training)
    out2 = self.layernorm2(out1 + attn_output2)

    ffn_output = self.ffn(out2)
    ffn_output = self.dropout3(ffn_output, training=training)

    output = self.layernorm3(out2 + ffn_output)

    if debug:
      print('******************* DEBUG DECODER ***********************')
      print(f'Input Shape                       : {inputs.shape}')
      print(f'Shape Outputs Decoder             : {output.shape}')
      print('*********************************************************')

    return output

In [37]:
decoder = Decoder(MAX_SEQ_LENGTH, 
                  EMBEDDING_DIM, 
                  NUM_HEADS, 
                  FF_DIM, 
                  DROPUOT)

outputs_decoder = decoder(inputs=inputs_decoder, 
                          encoder_output=outputs_encoder,  
                          training=training,
                          debug=debug)

******************* DEBUG DECODER ***********************
Input Shape                       : (32, 64, 64)
Shape Outputs Decoder             : (32, 64, 64)
*********************************************************


### Transformer

Implementazione del blocco Transformer tramite estensione della classe Layer di Keras

In [38]:
class TransformerBlock(keras.Model):
  def __init__(self, 
               num_layers, 
               embed_dim, 
               num_heads, 
               ff_dim, 
               max_len,
               input_vocab_size,
               target_vocab_size,
               rate=0.5):
    
    super(TransformerBlock, self).__init__()

    self.num_layers = num_layers

    self.token_pos_enc = TokenAndPositionEmbedding(max_len, input_vocab_size, embed_dim)
    self.token_pos_dec = TokenAndPositionEmbedding(max_len, target_vocab_size, embed_dim)

    self.encoder = [Encoder(max_len, embed_dim, num_heads, ff_dim, rate) for _ in range(num_layers)]
    self.decoder = [Decoder(max_len, embed_dim, num_heads, ff_dim, rate) for _ in range(num_layers)]

    self.dropout = layers.Dropout(rate)
    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inputs, training=False, debug=False):
    inputs_encoder, inputs_decoder  = inputs

    inputs_encoder = self.token_pos_enc(inputs_encoder, debug)
    inputs_decoder = self.token_pos_dec(inputs_decoder, debug)

    if debug:
      print(f'---------------- DEBUG TRANSFORMER BLOCK ----------------')
      print(f'inputs_encoder       : {inputs_encoder.shape}')
      print(f'inputs_decoder       : {inputs_decoder.shape}')      

    encoder_output = inputs_encoder
    transformer_output = inputs_decoder

    for i in range(self.num_layers):
      encoder_output = self.encoder[i](inputs=encoder_output, 
                                       training=training, 
                                       debug=debug) 
      
    for i in range(self.num_layers):
      transformer_output = self.decoder[i](inputs=transformer_output, 
                                           encoder_output=encoder_output, 
                                           training=training,
                                           debug=debug)

    transformer_output = self.dropout(transformer_output)
    logits = self.final_layer(transformer_output)

    if debug:
      print(f'Output Shape       : {logits.shape}')
      print(f'Output Transformer : {logits[0, :1, :12]}')    
      print(f'---------------------------------------------------------')

    return logits

In [39]:
transformer = TransformerBlock(NUM_LAYERS, 
                               EMBEDDING_DIM, 
                               NUM_HEADS, 
                               FF_DIM,
                               MAX_SEQ_LENGTH,
                               tokenizers.en.get_vocab_size(),
                               tokenizers.it.get_vocab_size(),
                               DROPUOT)

transformer_output = transformer((en_input, it_input), 
                                 training=training,
                                 debug=debug)

********** DEBUG TOKEN AND POSITION EMBEDDING ***********
Sequence Max len                          : 64
Sequence Shape                            : [32 64]
Shape TokenAndPositionEmbedding           : (32, 64, 64)
*********************************************************
********** DEBUG TOKEN AND POSITION EMBEDDING ***********
Sequence Max len                          : 64
Sequence Shape                            : [32 64]
Shape TokenAndPositionEmbedding           : (32, 64, 64)
*********************************************************
---------------- DEBUG TRANSFORMER BLOCK ----------------
inputs_encoder       : (32, 64, 64)
inputs_decoder       : (32, 64, 64)
********************* DEBUG ENCODER *********************
Shape Input Layer Encoder       : (32, 64, 64)
Shape Output Layer Encoder      : (32, 64, 64)
*********************************************************
******************* DEBUG DECODER ***********************
Input Shape                       : (32, 64, 64)
Shape Out

In [40]:
transformer.summary()

Model: "transformer_block"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 token_and_position_embeddin  multiple                 139712    
 g_2 (TokenAndPositionEmbedd                                     
 ing)                                                            
                                                                 
 token_and_position_embeddin  multiple                 170752    
 g_3 (TokenAndPositionEmbedd                                     
 ing)                                                            
                                                                 
 ENC (Encoder)               multiple                  135056    
                                                                 
 DEC (Decoder)               multiple                  267856    
                                                                 
 dropout_13 (Dropout)        multiple            

### Addestramento

In [42]:
import json

learning_rate = [3e-4]
beta_1 = [0.9, 0.75, 0.66]
beta_2 = [0.98, 0.99, 0.999]

for lr in learning_rate:
  for b1 in beta_1:
    for b2 in beta_2:
      transformer = TransformerBlock(NUM_LAYERS, 
                                EMBEDDING_DIM, 
                                NUM_HEADS, 
                                FF_DIM,
                                MAX_SEQ_LENGTH,
                                tokenizers.en.get_vocab_size(),
                                tokenizers.it.get_vocab_size(),
                                DROPUOT)

      print('Parametri Addestramento AdamW : lr=' + str(lr) + ' b1=' + str(b1) + ', b2=' + str(b2))   

      transformer.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                          optimizer=tf.keras.optimizers.experimental.AdamW(learning_rate=lr, beta_1=b1, beta_2=b2, epsilon=1e-9),
                          metrics=[keras.metrics.SparseCategoricalAccuracy()])

      start = datetime.datetime.now()
      history = transformer.fit(train_dataset,
                                initial_epoch=0,
                                epochs=3,
                                shuffle=True,
                                validation_data=validation_dataset)

      end = datetime.datetime.now()
      print(f'Tempo necessario per l\'addestramento: {end - start}')


Parametri Addestramento AdamW : lr=0.0003 b1=0.9, b2=0.98
Epoch 1/3
Epoch 2/3
Epoch 3/3
Tempo necessario per l'addestramento: 0:09:46.619769
Parametri Addestramento AdamW : lr=0.0003 b1=0.9, b2=0.99
Epoch 1/3
Epoch 2/3
Epoch 3/3
Tempo necessario per l'addestramento: 0:10:06.412189
Parametri Addestramento AdamW : lr=0.0003 b1=0.9, b2=0.999
Epoch 1/3
Epoch 2/3
Epoch 3/3
Tempo necessario per l'addestramento: 0:09:44.376987
Parametri Addestramento AdamW : lr=0.0003 b1=0.75, b2=0.98
Epoch 1/3
Epoch 2/3
Epoch 3/3
Tempo necessario per l'addestramento: 0:09:11.600603
Parametri Addestramento AdamW : lr=0.0003 b1=0.75, b2=0.99
Epoch 1/3
Epoch 2/3
Epoch 3/3
Tempo necessario per l'addestramento: 0:09:40.396950
Parametri Addestramento AdamW : lr=0.0003 b1=0.75, b2=0.999
Epoch 1/3
Epoch 2/3
Epoch 3/3
Tempo necessario per l'addestramento: 0:09:10.445906
Parametri Addestramento AdamW : lr=0.0003 b1=0.66, b2=0.98
Epoch 1/3
Epoch 2/3
Epoch 3/3
Tempo necessario per l'addestramento: 0:09:44.796433
Paramet

In [43]:
import json

learning_rate = [1e-4]
beta_1 = [0.9, 0.75, 0.66]
beta_2 = [0.98, 0.99, 0.999]

for lr in learning_rate:
  for b1 in beta_1:
    for b2 in beta_2:
      # learning_rate = CustomSchedule(EMBEDDING_DIM)
      
      transformer = TransformerBlock(NUM_LAYERS, 
                                EMBEDDING_DIM, 
                                NUM_HEADS, 
                                FF_DIM,
                                MAX_SEQ_LENGTH,
                                tokenizers.en.get_vocab_size(),
                                tokenizers.it.get_vocab_size(),
                                DROPUOT)

      print('Parametri Addestramento AdamW : lr=' + str(lr) + ' b1=' + str(b1) + ', b2=' + str(b2))   

      transformer.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                          optimizer=tf.keras.optimizers.experimental.AdamW(learning_rate=lr, beta_1=b1, beta_2=b2),
                          metrics=[keras.metrics.SparseCategoricalAccuracy()])

      start = datetime.datetime.now()
      history = transformer.fit(train_dataset,
                                initial_epoch=0,
                                epochs=3,
                                shuffle=True,
                                validation_data=validation_dataset)

      end = datetime.datetime.now()
      print(f'Tempo necessario per l\'addestramento: {end - start}')


Parametri Addestramento AdamW : lr=0.0001 b1=0.9, b2=0.98
Epoch 1/3
Epoch 2/3
Epoch 3/3
Tempo necessario per l'addestramento: 0:09:11.382883
Parametri Addestramento AdamW : lr=0.0001 b1=0.9, b2=0.99
Epoch 1/3
Epoch 2/3
Epoch 3/3
Tempo necessario per l'addestramento: 0:09:35.958554
Parametri Addestramento AdamW : lr=0.0001 b1=0.9, b2=0.999
Epoch 1/3
Epoch 2/3
Epoch 3/3
Tempo necessario per l'addestramento: 0:09:37.914336
Parametri Addestramento AdamW : lr=0.0001 b1=0.75, b2=0.98
Epoch 1/3
Epoch 2/3
Epoch 3/3
Tempo necessario per l'addestramento: 0:09:25.278238
Parametri Addestramento AdamW : lr=0.0001 b1=0.75, b2=0.99
Epoch 1/3
Epoch 2/3
Epoch 3/3
Tempo necessario per l'addestramento: 0:09:41.806849
Parametri Addestramento AdamW : lr=0.0001 b1=0.75, b2=0.999
Epoch 1/3
Epoch 2/3
Epoch 3/3
Tempo necessario per l'addestramento: 0:09:19.196525
Parametri Addestramento AdamW : lr=0.0001 b1=0.66, b2=0.98
Epoch 1/3
Epoch 2/3
Epoch 3/3
Tempo necessario per l'addestramento: 0:09:44.090651
Paramet