In [1]:
# Install TensorFlow Text, pinned to the 2.8 series (-q: quiet output, -U: upgrade if present).
# NOTE(review): `%pip` is the form recommended for notebooks, since it targets the kernel's environment.
!pip install -q -U "tensorflow-text==2.8.*"

[K     |████████████████████████████████| 4.9 MB 6.8 MB/s 
[K     |████████████████████████████████| 497.9 MB 4.2 kB/s 
[K     |████████████████████████████████| 5.8 MB 41.6 MB/s 
[K     |████████████████████████████████| 462 kB 70.8 MB/s 
[K     |████████████████████████████████| 1.4 MB 45.9 MB/s 
[?25h

In [2]:
# Install TensorFlow Datasets (quiet output).
# NOTE(review): no version pin is given here — consider pinning for reproducibility.
!pip install -q tensorflow_datasets

In [3]:
# Mount Google Drive inside the Colab runtime; the data, vocabulary and
# tokenizer folders configured below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import collections
import os
import pathlib
import re
import string
import sys
import tempfile
import time
import pandas as pd
import unicodedata

import numpy as np
import matplotlib.pyplot as plt

import tensorflow_datasets as tfds
import tensorflow_text as text
import tensorflow as tf
from tensorflow import keras

from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

In [5]:
# GLOBAL PARAMETERS
# NOTE(review): this is a relative path. os.path.abspath below resolves it
# against the current working directory, so it only points into the mounted
# Drive when the working directory is /content — confirm before changing cwd.
root_folder = 'drive/MyDrive/BERT/'

# DATA
data_folder_name = 'data'
train_filename = 'ita.txt'

# Absolute path of the data folder and of the training file inside it.
DATA_PATH = os.path.abspath(os.path.join(root_folder, data_folder_name))
train_filenamepath = os.path.abspath(os.path.join(DATA_PATH, train_filename))

# VOCABULARY
vocab_folder = 'vocab'
it_vocab_finalname = 'it_vocab_transformer.txt'

# Absolute path of the vocabulary folder and of the Italian vocabulary file.
VOCAB_PATH = os.path.abspath(os.path.join(root_folder, vocab_folder))
it_vocab_filenamepath = os.path.abspath(os.path.join(VOCAB_PATH, it_vocab_finalname))

# TOKENIZER MODEL
model_name = 'tokenizer_it_transformer_model'
tokenizer_folder_name = 'tokenizer'

# Absolute path of the tokenizer folder and of the exported tokenizer model.
TOKEN_PATH = os.path.abspath(os.path.join(root_folder, tokenizer_folder_name))
tokenizer_filenamepath = os.path.abspath(os.path.join(TOKEN_PATH, model_name))

In [6]:
# Model parameters
INPUT_COLUMN = 'input'    # name assigned to the first column of the data file
TARGET_COLUMN = 'target'  # name assigned to the second column of the data file
NUM_SAMPLES = 1000000     # maximum number of rows read from the data file
MAX_VOCAB_SIZE = 20000    # target size of the learned vocabulary
BATCH_SIZE = 32           # number of sentences per dataset batch
MAX_SEQ_LENGTH = 16       # NOTE(review): not used by any active code in this section

In [7]:
# Load the dataset: a tab-separated file with English sentences in the first
# column and Italian sentences in the second. Only those two columns are
# read, and at most NUM_SAMPLES rows.
df = pd.read_csv(
    train_filenamepath,
    sep="\t",
    header=None,
    names=[INPUT_COLUMN, TARGET_COLUMN],
    usecols=[0,1],
    nrows=NUM_SAMPLES
)

# Print a small slice of rows as a sanity check.
print(df.iloc[42:52], '\n')

# Input sentences as a plain Python list (no preprocessing is applied here).
input_data = df[INPUT_COLUMN].tolist()

# Target sentences as a plain Python list. No end-of-sentence token is added
# at this point; the list holds the raw sentences.
target_data = df[TARGET_COLUMN].tolist()

     input          target
42  Do it.      Lo faccia.
43  Do it.      La faccia.
44  Do it.         Fatelo.
45  Do it.         Fatela.
46  Go on.     Vai avanti.
47  Go on.       Continua.
48  Go on.       Continui.
49  Go on.     Continuate.
50  Go on.    Vada avanti.
51  Go on.  Andate avanti. 



In [8]:
# Dataset definition.
# Built from the target (Italian) sentences only; from_tensor_slices yields
# one sentence per element.
dataset = tf.data.Dataset.from_tensor_slices(target_data)

# Shuffle with a buffer as large as the data (a full shuffle), then group the
# sentences into batches of BATCH_SIZE. drop_remainder=True discards a final
# batch that would be smaller than BATCH_SIZE. No seed is set, so the order
# differs between runs.
dataset = dataset.shuffle(len(target_data)).batch(BATCH_SIZE, drop_remainder=True)

In [9]:
# Options forwarded to `text.BertTokenizer` (lower-casing enabled).
bert_tokenizer_params = {"lower_case": True}
# Special tokens that must be present in the vocabulary.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

# Keyword arguments for the vocabulary builder.
bert_vocab_args = {
    # Target size of the learned vocabulary.
    "vocab_size": MAX_VOCAB_SIZE,
    # Reserved tokens that must be included in the vocabulary.
    "reserved_tokens": reserved_tokens,
    # Arguments for `text.BertTokenizer`.
    "bert_tokenizer_params": bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`.
    "learn_params": {},
}

In [10]:
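# NOTE(review): the two lines below assume a dataset of (en, it) pairs. They do
# not apply here, because `dataset` is built from the Italian sentences only.
# train_en = dataset.map(lambda en, it: en)
# train_it = dataset.map(lambda en, it: it)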

In [11]:
%%time
# Learn the WordPiece vocabulary from the Italian sentences.
# NOTE(review): `dataset` is already batched by BATCH_SIZE, so batch(10000)
# here groups batches of batches rather than 10000 single sentences —
# confirm this is intended.
it_vocab = bert_vocab.bert_vocab_from_dataset(
    dataset.batch(10000).prefetch(2),
    **bert_vocab_args
)

CPU times: user 1min 59s, sys: 518 ms, total: 2min
Wall time: 2min 7s


In [12]:
# Show a few windows of the learned vocabulary: the head (reserved tokens and
# punctuation), two windows of whole words, and the tail.
print('VOCABOLARIO ITALIANO')
for vocab_window in (slice(None, 10), slice(100, 110), slice(150, 160), slice(-10, None)):
  print(it_vocab[vocab_window])

VOCABOLARIO ITALIANO
['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '$', '%', "'", ',']
['ancora', 'sia', 'cosi', 'del', 'penso', 'casa', 'hai', 'questa', 'detto', 'siete']
['sempre', 'oggi', 'dove', 'puo', 'parlare', 'tempo', 'adesso', 'ne', 'bene', 'delle']
['##/', '##:', '##;', '##?', '##b', '##j', '##q', '##°', '##’', '##€']


In [13]:
def write_vocab_file(filepath, vocab):
  """Write a vocabulary to a text file, one token per line.

  Args:
    filepath: Destination path. An existing file is overwritten; a missing
      parent directory is created.
    vocab: Iterable of token strings, written in iteration order.
  """
  parent_dir = os.path.dirname(filepath)
  if parent_dir:
    # Create the destination folder on first use instead of failing.
    os.makedirs(parent_dir, exist_ok=True)

  # The vocabulary contains non-ASCII tokens, so the encoding is fixed to
  # UTF-8 rather than left to the platform default.
  with open(filepath, 'w', encoding='utf-8') as f:
    for token in vocab:
      print(token, file=f)

In [14]:
# Persist the learned Italian vocabulary to disk, one token per line.
write_vocab_file(it_vocab_filenamepath, it_vocab)

In [15]:
# Build a BERT tokenizer from the saved vocabulary file, using the same
# tokenizer options that were passed to the vocabulary builder.
it_tokenizer = text.BertTokenizer(it_vocab_filenamepath, **bert_tokenizer_params)

In [16]:
# Take a single element and print its first five sentences.
# `dataset` is already batched, so batch(1) adds a leading axis of size 1 and
# iterating over `it_examples` yields one batch of sentences.
# `it_examples` stays bound after the loop and is reused by the next cell.
for it_examples in dataset.batch(1).take(1):
  for ex in it_examples:
    print(ex[:5].numpy())

[b'Sarei potuto morire.' b'Non hai davvero bisogno di farlo, vero?'
 b'Io ho superato ogni singolo esame.' b'Hai subito un discreto trauma.'
 b'Io sapevo che Tom era un insegnante di francese a Boston, quindi non credetti a Mary quando mi disse che era un tassista a Chicago.']


In [17]:
# Tokenize the examples, then merge the two innermost ragged axes
# (word, word-piece) into a single token axis.
it_token_batch = it_tokenizer.tokenize(it_examples).merge_dims(-2, -1)

# Print the first five entries of each top-level element.
for ex in it_token_batch.to_list():
  print(ex[:5])

[[556, 887, 778, 11], [56, 106, 160, 147, 57, 149, 9, 95, 25], [60, 66, 2046, 197, 6544, 563, 11], [106, 1342, 62, 2407, 7226, 11], [60, 347, 58, 55, 80, 62, 316, 57, 131, 26, 117, 9, 598, 56, 5929, 26, 72, 138, 68, 240, 58, 80, 62, 2932, 26, 1464, 11]]


In [18]:
# Map the token ids back to words and join each sentence's words with spaces.
it_words = tf.strings.reduce_join(
    it_tokenizer.detokenize(it_token_batch),
    separator=' ',
    axis=-1,
)
print(it_words[0][:5].numpy())

[b'sarei potuto morire .' b'non hai davvero bisogno di farlo , vero ?'
 b'io ho superato ogni singolo esame .' b'hai subito un discreto trauma .'
 b'io sapevo che tom era un insegnante di francese a boston , quindi non credetti a mary quando mi disse che era un tassista a chicago .']


In [19]:
# Positions of the "[START]" and "[END]" markers within `reserved_tokens`,
# obtained as the argmax of an equality mask over the token list.
_reserved_tokens_tensor = tf.constant(reserved_tokens)
START = tf.argmax(_reserved_tokens_tensor == "[START]")
END = tf.argmax(_reserved_tokens_tensor == "[END]")

def add_start_end(ragged):
  """Wrap every row of `ragged` with the START and END token ids.

  Args:
    ragged: Ragged tensor of token ids with rows along axis 0
      (assumed 2-D `[rows, tokens]` — TODO confirm against callers).

  Returns:
    The input with one START id prepended and one END id appended to each
    row. No padding or truncation is applied.
  """
  n_rows = ragged.bounding_shape()[0]
  # One-column tensors holding the marker id for every row.
  start_col = tf.fill([n_rows, 1], START)
  end_col = tf.fill([n_rows, 1], END)
  return tf.concat([start_col, ragged, end_col], axis=1)

In [20]:
# Add the start/end markers to the first five tokenized sentences.
tokens_with_markers = add_start_end(it_token_batch[0][:5])
print(tokens_with_markers[1])

# Convert the ids back to words and join each sentence with spaces.
it_words = tf.strings.reduce_join(
    it_tokenizer.detokenize(tokens_with_markers),
    separator=' ',
    axis=-1,
)

print(it_words[1].numpy())

tf.Tensor([  2  56 106 160 147  57 149   9  95  25   3], shape=(11,), dtype=int64)
b'[START] non hai davvero bisogno di farlo , vero ? [END]'


In [21]:
def cleanup_text(reserved_tokens, token_txt):
  """Remove reserved tokens from detokenized text and join the rest.

  Args:
    reserved_tokens: List of reserved token strings.
    token_txt: String tensor of tokens, one token per cell
      (presumably ragged `[batch, tokens]` — verify against callers).

  Returns:
    String tensor in which the remaining tokens are joined with a single
    space along the last axis.
  """
  # Every reserved token is dropped except "[UNK]", which is kept.
  droppable = (tok for tok in reserved_tokens if tok != "[UNK]")
  drop_pattern = "|".join(map(re.escape, droppable))

  # Keep only the cells that are not exactly one of the droppable tokens.
  is_reserved = tf.strings.regex_full_match(token_txt, drop_pattern)
  kept_tokens = tf.ragged.boolean_mask(token_txt, ~is_reserved)

  return tf.strings.reduce_join(kept_tokens, separator=' ', axis=-1)

### Classe Tokenizer Custom

In [22]:
class CustomTokenizer(tf.Module):
  """Exportable wrapper around `text.BertTokenizer`.

  Bundles the tokenizer, its vocabulary and the reserved tokens in a
  `tf.Module` whose public methods are `tf.function`s.
  """

  def __init__(self, reserved_tokens, vocab_path):
    """Build the tokenizer and trace the functions to export.

    Args:
      reserved_tokens: List of reserved token strings.
      vocab_path: Path to a vocabulary text file with one token per line.
    """
    self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
    self._reserved_tokens = reserved_tokens
    self._vocab_path = tf.saved_model.Asset(vocab_path)

    # The vocabulary contains non-ASCII tokens, so it is decoded explicitly
    # as UTF-8 instead of relying on the platform's default encoding.
    vocab = pathlib.Path(vocab_path).read_text(encoding='utf-8').splitlines()
    self.vocab = tf.Variable(vocab)

    ## Create the signatures for export:

    # Include a tokenize signature for a batch of strings.
    self.tokenize.get_concrete_function(
        tf.TensorSpec(shape=[None], dtype=tf.string))

    # Include `detokenize` and `lookup` signatures for:
    #   * `Tensors` with shapes [tokens] and [batch, tokens]
    #   * `RaggedTensors` with shape [batch, tokens]
    self.detokenize.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.detokenize.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    self.lookup.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    # These `get_*` methods take no arguments
    self.get_vocab_size.get_concrete_function()
    self.get_vocab_path.get_concrete_function()
    self.get_reserved_tokens.get_concrete_function()

  @tf.function
  def tokenize(self, strings):
    """Tokenize a batch of strings into token ids with start/end markers.

    Args:
      strings: String tensor holding the sentences to tokenize.

    Returns:
      Ragged tensor of token ids, with the word and word-piece axes merged
      and the markers added by `add_start_end`.
    """
    enc = self.tokenizer.tokenize(strings)
    # Merge the `word` and `word-piece` axes.
    enc = enc.merge_dims(-2,-1)
    enc = add_start_end(enc)
    return enc

  @tf.function
  def detokenize(self, tokenized):
    """Convert token ids back to text.

    Args:
      tokenized: Tensor or ragged tensor of int64 token ids.

    Returns:
      The words joined into strings by `cleanup_text`, which drops the
      reserved tokens other than "[UNK]".
    """
    words = self.tokenizer.detokenize(tokenized)
    return cleanup_text(self._reserved_tokens, words)

  @tf.function
  def lookup(self, token_ids):
    """Return the vocabulary entry for each id in `token_ids`."""
    return tf.gather(self.vocab, token_ids)

  @tf.function
  def get_vocab_size(self):
    """Return the number of entries in the vocabulary."""
    return tf.shape(self.vocab)[0]

  @tf.function
  def get_vocab_path(self):
    """Return the vocabulary file asset tracked by this module."""
    return self._vocab_path

  @tf.function
  def get_reserved_tokens(self):
    """Return the reserved tokens as a constant string tensor."""
    return tf.constant(self._reserved_tokens)

In [23]:
# Container module: the Italian tokenizer is attached as its `it` attribute.
tokenizers = tf.Module()
tokenizers.it = CustomTokenizer(reserved_tokens, it_vocab_filenamepath)

# Export the container, including the attached tokenizer, as a SavedModel
# at the configured tokenizer path.
tf.saved_model.save(tokenizers, tokenizer_filenamepath)