In [4]:
import pathlib 

import tensorflow as tf 
import tensorflow_datasets as tfds

from tensorflow_text.tools.wordpiece_vocab \
    import bert_vocab_from_dataset as bert_vocab

from tokenizer import SubwordBertTokenizer

In [5]:
# Working directory of the running kernel.
pwd = pathlib.Path.cwd()

# Raise the TensorFlow logger threshold so only errors are reported.
tf.get_logger().setLevel('ERROR')

In [6]:
# Load the TED Russian->English corpus as (ru, en) pairs, along with dataset info.
examples, metadata = tfds.load(
    'ted_hrlr_translate/ru_to_en',
    with_info=True,
    as_supervised=True,
)

# Keep handles on the two splits used in this notebook.
train_examples = examples['train']
val_examples = examples['validation']

[1mDownloading and preparing dataset ted_hrlr_translate/ru_to_en/1.0.0 (download: 124.94 MiB, generated: Unknown size, total: 124.94 MiB) to /root/tensorflow_datasets/ted_hrlr_translate/ru_to_en/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]






0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/ted_hrlr_translate/ru_to_en/1.0.0.incompleteNG91JZ/ted_hrlr_translate-train.tfrecord


  0%|          | 0/208106 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/ted_hrlr_translate/ru_to_en/1.0.0.incompleteNG91JZ/ted_hrlr_translate-validation.tfrecord


  0%|          | 0/4805 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/ted_hrlr_translate/ru_to_en/1.0.0.incompleteNG91JZ/ted_hrlr_translate-test.tfrecord


  0%|          | 0/5476 [00:00<?, ? examples/s]

[1mDataset ted_hrlr_translate downloaded and prepared to /root/tensorflow_datasets/ted_hrlr_translate/ru_to_en/1.0.0. Subsequent calls will reuse this data.[0m


In [7]:
# Show the first training pair as decoded text to sanity-check the data.
for ru, en in train_examples.take(1):
    for label, sentence in (('Russian: ', ru), ('English: ', en)):
        print(label, sentence.numpy().decode('utf-8'))

Russian:  к : успех , перемены возможны только с оружием в руках .
English:  c : success , the change is only coming through the barrel of the gun .


In [8]:
# Project the paired dataset onto each language; every element is a (ru, en) tuple.
train_ru = train_examples.map(lambda ru_text, _en_text: ru_text)
train_en = train_examples.map(lambda _ru_text, en_text: en_text)

In [9]:
# Options forwarded to the BERT tokenizer while the vocabulary is learned.
bert_tokenizer_params = {'lower_case': True}

# Special tokens reserved in every generated vocabulary.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

# Keyword arguments shared by the Russian and English vocabulary builds.
bert_vocab_args = {
    'vocab_size': 8000,
    'reserved_tokens': reserved_tokens,
    'bert_tokenizer_params': bert_tokenizer_params,
    'learn_params': {},
}

In [10]:
def write_vocab_file(filepath, vocab):
  """Write a vocabulary to a text file, one token per line.

  Args:
    filepath: Destination path. An existing file is overwritten.
    vocab: Iterable of tokens; each one is written with print(), so it is
      followed by a newline.
  """
  # Encode explicitly as UTF-8: the vocabularies contain non-ASCII tokens
  # (Cyrillic letters, typographic quotes, symbols), and the platform default
  # encoding is not guaranteed to be able to represent them.
  with open(filepath, 'w', encoding='utf-8') as f:
    for token in vocab:
      print(token, file=f)

In [11]:
%%time 
ru_vocab = bert_vocab.bert_vocab_from_dataset(
    train_ru.batch(1000).prefetch(2), 
    **bert_vocab_args
)

CPU times: user 14min 33s, sys: 15.5 s, total: 14min 49s
Wall time: 14min 35s


In [12]:
# Spot-check the Russian vocabulary: the head, two interior windows, and the tail.
for window in (slice(None, 10), slice(100, 110), slice(1000, 1010), slice(-10, None)):
    print(ru_vocab[window])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '#', '$', '%', '&', "'"]
['ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я', 'і', '՛']
['трудно', 'хотела', 'далеко', 'качестве', 'мою', '##3', '##де', '##ила', 'планеты', 'большие']
['##’', '##“', '##”', '##„', '##•', '##′', '##⁄', '##∇', '##♪', '##♫']


In [13]:
# Persist the Russian vocabulary next to the notebook, one token per line.
write_vocab_file(filepath='ru_vocab.txt', vocab=ru_vocab)

In [14]:
%%time
en_vocab = bert_vocab.bert_vocab_from_dataset(
    train_en.batch(1000).prefetch(2),
    **bert_vocab_args
)

CPU times: user 3min 30s, sys: 11.5 s, total: 3min 42s
Wall time: 3min 34s


In [15]:
# Spot-check the English vocabulary: the head, two interior windows, and the tail.
for window in (slice(None, 10), slice(100, 110), slice(1000, 1010), slice(-10, None)):
    print(en_vocab[window])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '#', '$', '%', '&', "'"]
['##s', 'have', 'but', 'what', 'on', 'do', 'with', 'can', 'there', 'about']
['revolution', '200', 'basic', 'potential', 'english', 'led', 'message', 'perfect', '##ce', 'nine']
['##–', '##—', '##‘', '##’', '##“', '##”', '##•', '##∇', '##♪', '##♫']


In [16]:
# Persist the English vocabulary next to the notebook, one token per line.
write_vocab_file(filepath='en_vocab.txt', vocab=en_vocab)

In [17]:
# Bundle one tokenizer per language on a single tf.Module so both can be exported together.
tokenizers = tf.Module()
for lang_code, vocab_file in (('ru', 'ru_vocab.txt'), ('en', 'en_vocab.txt')):
    setattr(tokenizers, lang_code, SubwordBertTokenizer(reserved_tokens, vocab_file))

In [18]:
# Export directory for the combined ru/en tokenizer module (relative to the notebook).
model_name = '../models/ru_en_bert_converter'
tf.saved_model.save(obj=tokenizers, export_dir=model_name)