# References

- https://www.tensorflow.org/text/guide/bert_preprocessing_guide
- https://stackoverflow.com/questions/58507400/how-to-use-tf-lookup-tables-with-tensorflow-2-0-keras-and-mlflow
- https://www.tensorflow.org/text/guide/subwords_tokenizer

In [1]:
from pathlib import Path

import pandas as pd

import tensorflow as tf
import tensorflow_models as tfm
import tensorflow_text as tft
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset

import numpy as np

2023-01-07 09:53:58.526676: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-07 09:53:58.652355: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-01-07 09:53:58.652376: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-01-07 09:53:59.478668: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

## Load the dataset

In [2]:
# Tab-separated text file with no header row, stored under the user's home directory.
path_to_file = Path.home().joinpath("tensorflow_datasets", "anki", "ita-eng", "ita.txt")
df = pd.read_csv(path_to_file, sep="\t", header=None)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 358373 entries, 0 to 358372
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   0       358373 non-null  object
 1   1       358373 non-null  object
 2   2       358373 non-null  object
dtypes: object(3)
memory usage: 8.2+ MB


In [3]:
# Only column 0 of the first 1000 rows is turned into a dataset.
first_sentences = df[0].head(1000).to_numpy()
ds = tf.data.Dataset.from_tensor_slices(first_sentences)

2023-01-07 09:54:03.014286: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/calcifer/git/marco/learn-deep-learning/.env/lib/python3.10/site-packages/cv2/../../lib64:
2023-01-07 09:54:03.014307: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-01-07 09:54:03.014324: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (calcifer-Inspiron-7370): /proc/driver/nvidia/version does not exist
2023-01-07 09:54:03.014550: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, reb

In [4]:
# Pull a single element out of the dataset to inspect its raw form.
first_example = next(ds.take(1).as_numpy_iterator())
print("Example:", first_example)

Example: b'Hi.'


## Subword vectorization

In [20]:
# Passed as `vocab_size` when the vocabulary is built.
VOCAB_SIZE = 10_000
# Passed as `reserved_tokens` when the vocabulary is built.
RESERVED_TOKENS = [
    "[START]",
    "[END]",
    "[PAD]",
    "[UNK]",
    "[CLS]",
    "[SEP]",
    "[MASK]",
]
# Passed as `bert_tokenizer_params` when the vocabulary is built.
BERT_TOKENIZER_PARAMS = {
    "lower_case": True,
    "keep_whitespace": None,
    "normalization_form": None,
    "preserve_unused_token": None,
}
# The vocabulary is saved next to wherever the notebook is executed from.
VOCAB_FILE = Path.cwd().joinpath("vocab.txt")

SAMPLE_SENTENCE = "hi this is me"

### Create the vocabulary

In [14]:
# Learn a WordPiece vocabulary from the sentences in `ds`.
vocab = bert_vocab_from_dataset.bert_vocab_from_dataset(
    dataset=ds,
    vocab_size=VOCAB_SIZE,
    reserved_tokens=RESERVED_TOKENS,
    bert_tokenizer_params=BERT_TOKENIZER_PARAMS,
    learn_params=None,
)
print(f"{type(vocab)=}")

# Seeded generator so the displayed sample is identical on every Restart & Run All.
rng = np.random.default_rng(42)
# Sample without replacement (no duplicate entries), capped at the vocabulary size.
samples_idx = rng.choice(len(vocab), size=min(10, len(vocab)), replace=False).tolist()
print(
    "Vocab samples:",
    *["\t" + vocab[idx] for idx in samples_idx],
    sep="\n",
)

type(vocab)=<class 'list'>
Vocab samples:
	##t
	stood
	##or
	t
	help
	!
	ate
	##a
	stay
	##i


In [15]:
# Tokens are written one per line, with no trailing newline after the last one.
vocab_text = "\n".join(vocab)
with VOCAB_FILE.open("w", encoding="utf-8") as vocab_fp:
    vocab_fp.write(vocab_text)

### Option 1: Create a tokenizer to be used in a custom module

In [16]:
# Map every vocabulary token to its position in `vocab`.
vocab_initializer = tf.lookup.KeyValueTensorInitializer(
    keys=vocab,
    key_dtype=tf.string,
    values=tf.range(len(vocab), dtype=tf.int64),
    value_dtype=tf.int64,
)
# A single extra bucket receives every token that is not in the vocabulary.
lookup_table = tf.lookup.StaticVocabularyTable(vocab_initializer, num_oov_buckets=1)

# Build the tokenizer with the same parameters the vocabulary was learned with
# (notably lower_case=True); otherwise cased input would not match the
# lower-cased vocabulary entries.
tokenizer = tft.BertTokenizer(lookup_table, **BERT_TOKENIZER_PARAMS)

In [23]:
tokens, offsets_start, offsets_end = tokenizer.tokenize_with_offsets(SAMPLE_SENTENCE)
print(f"{tokens.shape=}")
print(f"{tokens=}")
print(f"{offsets_start=}")
print(f"{offsets_end=}")

print()
# Walk the nested token ids and look each one up in the vocabulary list.
decoded_pieces = [
    f"{vocab[token_id]:>8}"
    for batch_row in tokens.numpy()
    for word_ids in batch_row
    for token_id in word_ids
]
print("Output: ", *decoded_pieces)

# Repackage offsets for display: pair each start with its end, then drop the leading axis.
offsets = tf.squeeze(tf.stack([offsets_start, offsets_end], axis=3), axis=0)
offset_pairs = [
    f"{str((start, end)):>8}"
    for word_offsets in offsets.numpy()  # iterate over each group of offset pairs
    for start, end in word_offsets       # unpack start and end offset
    if start + end > 0                   # skip empty tokens
]
print("Offsets:", *offset_pairs)

tokens.shape=TensorShape([1, None, None])
tokens=<tf.RaggedTensor [[[21, 132], [119], [22, 57], [47]]]>
offsets_start=<tf.RaggedTensor [[[0, 1], [3], [8, 9], [11]]]>
offsets_end=<tf.RaggedTensor [[[1, 2], [7], [9, 10], [13]]]>

Output:         h      ##i     this        i      ##s       me
Offsets:   (0, 1)   (1, 2)   (3, 7)   (8, 9)  (9, 10) (11, 13)


### Option 2: Use a premade layer

If the tokenizer fails to initialize, restart the kernel and re-run the notebook from the top.

In [27]:
# Premade Keras layer that reads the vocabulary straight from the saved file.
tokenizer = tfm.nlp.layers.BertTokenizer(
    vocab_file=VOCAB_FILE.as_posix(),
    # Take the casing from the parameters the vocabulary was built with, so the
    # two cannot drift apart.
    lower_case=BERT_TOKENIZER_PARAMS["lower_case"],
    tokenize_with_offsets=True,
)

In [28]:
# Batch holding the single sample sentence; the bare last expression displays the result.
sample_batch = [tf.constant(SAMPLE_SENTENCE)]
tokenizer(sample_batch)

(<tf.RaggedTensor [[[21, 132], [119], [22, 57], [47]]]>,
 <tf.RaggedTensor [[[0, 1], [3], [8, 9], [11]]]>,
 <tf.RaggedTensor [[[1, 2], [7], [9, 10], [13]]]>)