In [None]:
!pip install -q -U "tensorflow-text==2.8.*"

[K     |████████████████████████████████| 4.9 MB 7.5 MB/s 
[K     |████████████████████████████████| 497.9 MB 4.4 kB/s 
[K     |████████████████████████████████| 5.8 MB 28.3 MB/s 
[K     |████████████████████████████████| 1.4 MB 53.4 MB/s 
[K     |████████████████████████████████| 462 kB 57.2 MB/s 
[?25h

In [None]:
!pip install -q tensorflow_datasets

In [None]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import collections
import csv
import os
import pathlib
import re
import string
import sys
import tempfile
import time
import unicodedata

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_text as text
from tensorflow import keras

from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

In [None]:
# GLOBAL PARAMETERS
# Project root, relative to the current working directory.
root_folder = 'drive/MyDrive/BERT/'

# DATA
data_folder_name = 'data'
train_filename = 'ita.txt'

# Absolute data folder and the training file inside it. Joining a file name
# onto an already-absolute folder needs no second `abspath`.
DATA_PATH = os.path.abspath(os.path.join(root_folder, data_folder_name))
train_filenamepath = os.path.join(DATA_PATH, train_filename)

# VOCABULARY
vocab_folder = 'vocab'
en_vocab_finalname = 'en_vocab.txt'
it_vocab_finalname = 'it_vocab.txt'

# Absolute vocabulary folder and the two per-language vocabulary files.
VOCAB_PATH = os.path.abspath(os.path.join(root_folder, vocab_folder))
en_vocab_filenamepath = os.path.join(VOCAB_PATH, en_vocab_finalname)
it_vocab_filenamepath = os.path.join(VOCAB_PATH, it_vocab_finalname)

# TOKENIZER MODEL
model_name = 'tokenizer_en_it_model'
tokenizer_folder_name = 'tokenizer'

# Absolute tokenizer folder and the export location of the tokenizer model.
TOKEN_PATH = os.path.abspath(os.path.join(root_folder, tokenizer_folder_name))
tokenizer_filenamepath = os.path.join(TOKEN_PATH, model_name)

In [None]:
# Parameters for the data pipeline and the model.
INPUT_COLUMN, TARGET_COLUMN = 'input', 'target'  # DataFrame column names
NUM_SAMPLES = 1_000_000   # upper bound on the number of rows read from the corpus
MAX_VOCAB_SIZE = 20_000   # target size of each learned vocabulary
BATCH_SIZE = 32           # sentence pairs per dataset batch
MAX_SEQ_LENGTH = 16       # maximum sequence length (not applied by the tokenizer code here — TODO confirm where it is consumed)

In [None]:
def unicode_to_ascii(s):
    """Strip diacritics from ``s``.

    The string is decomposed with Unicode NFD normalization and every
    combining mark (category ``Mn``) is dropped, so accented letters are
    reduced to their base letter. Other characters are returned unchanged.
    """
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')


def preprocess_sentence(w):
    """Normalize a sentence for tokenization.

    Steps: lower-case and trim the text, strip diacritics, surround each of
    ``? . ! ,`` with spaces, and replace every run of characters outside
    ``a-z A-Z ? . ! ,`` with a single space.

    Example: ``"Ciao, come và?"`` -> ``"ciao , come va ?"``.

    Args:
        w: the raw sentence (``str``).

    Returns:
        The normalized sentence, without leading or trailing whitespace.
    """
    w = unicode_to_ascii(w.lower().strip())

    # Isolate sentence punctuation as separate, space-delimited tokens.
    w = re.sub(r"([?.!,])", r" \1 ", w)

    # Replace every run of unwanted characters with one space. Spaces are
    # themselves "unwanted" here, so this also collapses repeated spaces and
    # no separate whitespace-squeezing pass is needed afterwards.
    w = re.sub(r"[^a-zA-Z?.!,]+", " ", w)

    # Padding the punctuation leaves a space after a sentence-final mark
    # (and possibly before a leading one); remove it.
    return w.strip()

In [None]:
# Load the tab-separated corpus: column 0 holds the English sentence and
# column 1 its Italian translation; any further column is ignored.
df = pd.read_csv(
    train_filenamepath,
    sep="\t",
    header=None,
    names=[INPUT_COLUMN, TARGET_COLUMN],
    usecols=[0,1],
    nrows=NUM_SAMPLES,
    # The file is plain TSV, not quoted CSV: treat '"' as an ordinary
    # character so sentences containing quotes are read verbatim and a
    # stray quote can never swallow the following rows.
    quoting=csv.QUOTE_NONE,
)

print(df.iloc[42:52], '\n')

# Raw English sentences (no preprocessing is applied at this point).
input_data = df[INPUT_COLUMN].tolist()

# Raw Italian sentences (no preprocessing and no end-of-sentence token is
# added at this point).
target_data = df[TARGET_COLUMN].tolist()

     input          target
42  Do it.      Lo faccia.
43  Do it.      La faccia.
44  Do it.         Fatelo.
45  Do it.         Fatela.
46  Go on.     Vai avanti.
47  Go on.       Continua.
48  Go on.       Continui.
49  Go on.     Continuate.
50  Go on.    Vada avanti.
51  Go on.  Andate avanti. 



In [None]:
# Pair every English sentence with its Italian translation, one pair per
# dataset element.
sentence_pairs = tf.data.Dataset.from_tensor_slices((input_data, target_data))

# Shuffle with a buffer as large as the corpus, then group the pairs into
# batches of BATCH_SIZE, discarding a final batch that is not full.
shuffled_pairs = sentence_pairs.shuffle(len(input_data))
dataset = shuffled_pairs.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# Options forwarded to `text.BertTokenizer`: lower-case the text.
bert_tokenizer_params = {"lower_case": True}
# Special tokens that must be part of every generated vocabulary.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

bert_vocab_args = {
    # The target vocabulary size
    "vocab_size": MAX_VOCAB_SIZE,
    # Reserved tokens that must be included in the vocabulary
    "reserved_tokens": reserved_tokens,
    # Arguments for `text.BertTokenizer`
    "bert_tokenizer_params": bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    "learn_params": {},
}

In [None]:
# Split the paired dataset into one single-language stream per side.
train_en = dataset.map(lambda en_batch, _it_batch: en_batch)
train_it = dataset.map(lambda _en_batch, it_batch: it_batch)

In [None]:
%%time
# Learn the English vocabulary, feeding the learner groups of 10000 dataset
# elements with up to 2 groups prefetched.
en_vocab_batches = train_en.batch(10000).prefetch(2)
en_vocab = bert_vocab.bert_vocab_from_dataset(en_vocab_batches, **bert_vocab_args)

CPU times: user 57.1 s, sys: 494 ms, total: 57.6 s
Wall time: 1min 5s


In [None]:
%%time
# Learn the Italian vocabulary, feeding the learner groups of 10000 dataset
# elements with up to 2 groups prefetched.
it_vocab_batches = train_it.batch(10000).prefetch(2)
it_vocab = bert_vocab.bert_vocab_from_dataset(it_vocab_batches, **bert_vocab_args)

CPU times: user 1min 55s, sys: 570 ms, total: 1min 55s
Wall time: 1min 56s


In [None]:
# Show the same four windows (head, two mid-range slices, tail) of both
# vocabularies, with a separator line between the two languages.
vocab_windows = (slice(None, 10), slice(100, 110), slice(150, 160), slice(-10, None))
titled_vocabs = (('VOCABOLARIO INGLESE', en_vocab), ('VOCABOLARIO ITALIANO', it_vocab))

for position, (title, vocab) in enumerate(titled_vocabs):
  if position > 0:
    print('----------------------------------------------')
  print(title)
  for window in vocab_windows:
    print(vocab[window])

VOCABOLARIO INGLESE
['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '$', '%', "'", ',']
['and', 'how', 'will', 'there', 'has', 'about', 'now', 'isn', 'all', 'going']
['out', 'by', 'when', 'said', 'lot', 'work', 'let', 'told', 'something', 'car']
['##-', '##.', '##/', '##:', '##;', '##?', '##j', '##°', '##’', '##€']
----------------------------------------------
VOCABOLARIO ITALIANO
['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '$', '%', "'", ',']
['ancora', 'sia', 'cosi', 'del', 'penso', 'casa', 'hai', 'questa', 'detto', 'siete']
['sempre', 'oggi', 'dove', 'puo', 'parlare', 'tempo', 'adesso', 'ne', 'bene', 'delle']
['##/', '##:', '##;', '##?', '##b', '##j', '##q', '##°', '##’', '##€']


In [None]:
def write_vocab_file(filepath, vocab):
  """Write ``vocab`` to ``filepath``, one token per line.

  The file is always written as UTF-8 so that non-ASCII tokens are stored
  the same way regardless of the platform's default encoding. The parent
  folder is created when it does not exist yet; an existing file is
  overwritten.

  Args:
    filepath: destination path of the vocabulary file.
    vocab: iterable of tokens; each one is written on its own line.
  """
  parent_dir = os.path.dirname(filepath)
  if parent_dir:  # empty when `filepath` is a bare file name
    os.makedirs(parent_dir, exist_ok=True)

  with open(filepath, 'w', encoding='utf-8') as f:
    for token in vocab:
      print(token, file=f)

In [None]:
# Persist both learned vocabularies, English first and then Italian.
for vocab_file, vocab_tokens in (
    (en_vocab_filenamepath, en_vocab),
    (it_vocab_filenamepath, it_vocab),
):
  write_vocab_file(vocab_file, vocab_tokens)

In [None]:
# Build one BERT tokenizer per language from the vocabulary files on disk.
en_tokenizer, it_tokenizer = (
    text.BertTokenizer(vocab_file, **bert_tokenizer_params)
    for vocab_file in (en_vocab_filenamepath, it_vocab_filenamepath)
)

In [None]:
# `dataset` is already batched, so `batch(1)` only adds a leading axis of
# size 1; take that single element as the working example.
en_examples, it_examples = next(iter(dataset.batch(1).take(1)))

# Print the first five sentences of each language.
for examples in (en_examples, it_examples):
  for ex in examples:
    print(ex[:5].numpy())

[b'She will return within an hour.' b'I am decorating the classroom.'
 b'Your question is illogical.' b"Tom isn't going to hurt you."
 b'You said it would never happen.']
[b"Lei torner\xc3\xa0 tra un'ora." b"Io sto decorando l'aula."
 b'La vostra domanda \xc3\xa8 illogica.'
 b'Tom non le far\xc3\xa0 del male.'
 b'Ha detto che non sarebbe mai successo.']


In [None]:
# Tokenize, then collapse the trailing (word, word-piece) axes into a single
# token axis so each sentence becomes one flat list of ids.
en_token_batch = en_tokenizer.tokenize(en_examples).merge_dims(-2, -1)

# Show the ids of the first five sentences of every outer element.
for sentence_ids in en_token_batch.to_list():
  print(sentence_ids[:5])

[[88, 102, 862, 1615, 119, 654, 11], [34, 174, 5632, 58, 1571, 11], [79, 370, 59, 1092, 5010, 11], [56, 107, 8, 45, 109, 57, 371, 55, 11], [55, 153, 61, 132, 129, 354, 11]]


In [None]:
# Tokenize, then collapse the trailing (word, word-piece) axes into a single
# token axis so each sentence becomes one flat list of ids.
it_token_batch = it_tokenizer.tokenize(it_examples).merge_dims(-2, -1)

# Show the ids of the first five sentences of every outer element.
for sentence_ids in it_token_batch.to_list():
  print(sentence_ids[:5])

[[71, 1535, 493, 62, 8, 121, 11], [60, 141, 7834, 1166, 3968, 37, 8, 2804, 11], [59, 221, 349, 30, 61, 520, 5654, 11], [55, 56, 74, 459, 103, 306, 11], [63, 108, 58, 56, 326, 94, 263, 11]]


In [None]:
# Map the English ids back to words and join each sentence with spaces.
en_words = tf.strings.reduce_join(
    en_tokenizer.detokenize(en_token_batch), separator=' ', axis=-1)
print(en_words[0][:5].numpy())

[b'she will return within an hour .' b'i am decorating the classroom .'
 b'your question is illogical .' b"tom isn ' t going to hurt you ."
 b'you said it would never happen .']


In [None]:
# Map the Italian ids back to words and join each sentence with spaces.
it_words = tf.strings.reduce_join(
    it_tokenizer.detokenize(it_token_batch), separator=' ', axis=-1)
print(it_words[0][:5].numpy())

[b"lei tornera tra un ' ora ." b"io sto decorando l ' aula ."
 b'la vostra domanda e illogica .' b'tom non le fara del male .'
 b'ha detto che non sarebbe mai successo .']


In [None]:
# Ids of the sentence-boundary markers: their positions in `reserved_tokens`.
reserved_token_tensor = tf.constant(reserved_tokens)
START = tf.argmax(reserved_token_tensor == "[START]")
END = tf.argmax(reserved_token_tensor == "[END]")

def add_start_end(ragged):
  """Surround every row of ``ragged`` with the START and END ids.

  Args:
    ragged: expected to be a rank-2 `tf.RaggedTensor` of int64 token ids,
      one row per sentence.

  Returns:
    A tensor with the same rows, each prefixed with START and suffixed with
    END. No padding or truncation is applied.
  """
  # One-column tensors holding a marker for every row of the input.
  column_shape = [ragged.bounding_shape()[0], 1]
  return tf.concat(
      [tf.fill(column_shape, START), ragged, tf.fill(column_shape, END)],
      axis=1)

In [None]:
# Wrap the first five tokenized English sentences with the boundary ids and
# show the ids of the second one.
en_marked = add_start_end(en_token_batch[0][:5])
print(en_marked[1])

# Map the ids back to words and join each sentence into a single string.
en_words = tf.strings.reduce_join(
    en_tokenizer.detokenize(en_marked), separator=' ', axis=-1)

print(en_words[1].numpy())

tf.Tensor([   2   34  174 5632   58 1571   11    3], shape=(8,), dtype=int64)
b'[START] i am decorating the classroom . [END]'


In [None]:
# Wrap the first five tokenized Italian sentences with the boundary ids and
# show the ids of the second one.
it_marked = add_start_end(it_token_batch[0][:5])
print(it_marked[1])

# Map the ids back to words and join each sentence into a single string.
it_words = tf.strings.reduce_join(
    it_tokenizer.detokenize(it_marked), separator=' ', axis=-1)

print(it_words[1].numpy())

tf.Tensor([   2   60  141 7834 1166 3968   37    8 2804   11    3], shape=(11,), dtype=int64)
b"[START] io sto decorando l ' aula . [END]"


In [None]:
def cleanup_text(reserved_tokens, token_txt):
  """Join detokenized words into sentences, dropping reserved tokens.

  Every cell of ``token_txt`` that is exactly one of ``reserved_tokens`` is
  removed, except for "[UNK]", which is kept. The remaining words of each
  row are joined with single spaces along the last axis.

  Args:
    reserved_tokens: list of special token strings.
    token_txt: string tensor of words, as returned by the tokenizer's
      `detokenize`.

  Returns:
    A string tensor with one joined sentence per row.
  """
  # Alternation of the (escaped) reserved tokens that must be removed.
  droppable_pattern = "|".join(
      re.escape(token) for token in reserved_tokens if token != "[UNK]")

  # Keep only the cells that are not exactly one of the droppable tokens.
  is_droppable = tf.strings.regex_full_match(token_txt, droppable_pattern)
  kept_words = tf.ragged.boolean_mask(token_txt, ~is_droppable)

  return tf.strings.reduce_join(kept_words, separator=' ', axis=-1)

In [None]:
# Convert the English token ids back to words (one list of words per sentence).
words = en_tokenizer.detokenize(en_token_batch)
# Display the first five sentences of the first outer element.
words[0][:5]

<tf.RaggedTensor [[b'she', b'will', b'return', b'within', b'an', b'hour', b'.'],
 [b'i', b'am', b'decorating', b'the', b'classroom', b'.'],
 [b'your', b'question', b'is', b'illogical', b'.'],
 [b'tom', b'isn', b"'", b't', b'going', b'to', b'hurt', b'you', b'.'],
 [b'you', b'said', b'it', b'would', b'never', b'happen', b'.']]>

In [None]:
# Store the cleaned sentences under their own name instead of overwriting
# `words`: the cell stays idempotent and can be re-run without failing on an
# already-converted numpy array.
clean_words = cleanup_text(reserved_tokens, words).numpy()
clean_words[0][:5]

array([b'she will return within an hour .',
       b'i am decorating the classroom .',
       b'your question is illogical .',
       b"tom isn ' t going to hurt you .",
       b'you said it would never happen .'], dtype=object)

In [None]:
# Add the boundary ids to the first five English sentences and convert the
# ragged rows into a dense tensor.
en_input = add_start_end(en_token_batch[0][:5]).to_tensor()
print(f'Shape en_input  : {en_input.shape}')

Shape en_input  : (5, 11)


### Classe Tokenizer Custom

In [None]:
class CustomTokenizer(tf.Module):
  """Exportable BERT tokenizer for a single language.

  Wraps a `text.BertTokenizer` built from a vocabulary file and exposes
  `tokenize`, `detokenize`, `lookup` and a few getters as `tf.function`s.
  Concrete functions are traced in `__init__` so that these signatures are
  part of the module when it is exported with `tf.saved_model.save`.
  """

  def __init__(self, reserved_tokens, vocab_path):
    """Build the tokenizer and trace the functions to export.

    Args:
      reserved_tokens: list of special token strings; `detokenize` removes
        all of them except "[UNK]" from its output.
      vocab_path: path of a text file holding one vocabulary token per line.
    """
    # Initialise the tf.Module state (name and name scope) first.
    super().__init__()

    self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
    self._reserved_tokens = reserved_tokens
    self._vocab_path = tf.saved_model.Asset(vocab_path)

    # Decode the vocabulary explicitly as UTF-8: it contains non-ASCII
    # tokens, and the platform's default encoding is not guaranteed to be
    # UTF-8.
    vocab = pathlib.Path(vocab_path).read_text(encoding='utf-8').splitlines()
    self.vocab = tf.Variable(vocab)

    ## Create the signatures for export:

    # Include a tokenize signature for a batch of strings.
    self.tokenize.get_concrete_function(
        tf.TensorSpec(shape=[None], dtype=tf.string))

    # Include `detokenize` and `lookup` signatures for:
    #   * `Tensors` with shape [batch, tokens]
    #   * `RaggedTensors` with shape [batch, tokens]
    self.detokenize.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.detokenize.get_concrete_function(
        tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    self.lookup.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
        tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    # These `get_*` methods take no arguments
    self.get_vocab_size.get_concrete_function()
    self.get_vocab_path.get_concrete_function()
    self.get_reserved_tokens.get_concrete_function()

  @tf.function
  def tokenize(self, strings):
    """Convert a batch of strings into token ids.

    Args:
      strings: string tensor of sentences (traced for shape `[None]`).

    Returns:
      The token ids of every sentence, with the word and word-piece axes
      merged and the START / END ids added around each row by
      `add_start_end`.
    """
    enc = self.tokenizer.tokenize(strings)
    # Merge the `word` and `word-piece` axes.
    enc = enc.merge_dims(-2,-1)
    enc = add_start_end(enc)
    return enc

  @tf.function
  def detokenize(self, tokenized):
    """Convert token ids back into text.

    Args:
      tokenized: int64 token ids, dense or ragged, shaped [batch, tokens].

    Returns:
      One space-joined string per row, with the reserved tokens other than
      "[UNK]" removed by `cleanup_text`.
    """
    words = self.tokenizer.detokenize(tokenized)
    return cleanup_text(self._reserved_tokens, words)

  @tf.function
  def lookup(self, token_ids):
    """Return the vocabulary entry stored at each id in ``token_ids``."""
    return tf.gather(self.vocab, token_ids)

  @tf.function
  def get_vocab_size(self):
    """Return the number of entries in the vocabulary."""
    return tf.shape(self.vocab)[0]

  @tf.function
  def get_vocab_path(self):
    """Return the vocabulary file asset tracked by this module."""
    return self._vocab_path

  @tf.function
  def get_reserved_tokens(self):
    """Return the reserved tokens as a string tensor."""
    return tf.constant(self._reserved_tokens)

In [None]:
# Collect one CustomTokenizer per language as attributes (`en`, `it`) of a
# container module, then export the whole module as a SavedModel.
tokenizers = tf.Module()
for language_code, vocab_file in (
    ('en', en_vocab_filenamepath),
    ('it', it_vocab_filenamepath),
):
  setattr(tokenizers, language_code, CustomTokenizer(reserved_tokens, vocab_file))

tf.saved_model.save(tokenizers, tokenizer_filenamepath)