In [162]:
import collections
import os
import pathlib
import re
import string
import sys
import tempfile
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow_datasets as tfds
import tensorflow_text as text
import tensorflow as tf

In [163]:
tf.get_logger().setLevel('ERROR')
pwd = pathlib.Path.cwd()

In [164]:
import os
import pandas as pd
import yaml
from dataset import DataLoader
from dataset import DataPreprocessing
from dataset import TokenizerWrap

In [165]:
def config_loader(filepath):
    with open (filepath, 'r') as file_descriptor:
        config = yaml.safe_load(file_descriptor)
    return config

In [166]:
config = config_loader('config/en_de.yaml')

In [167]:
lang_src, lang_dest = config['language']['src'],config['language']['dest'] 

In [168]:
path = 'training'
os.listdir(path)

['europarl-v7.es-en.en',
 'europarl-v7.es-en.es',
 'europarl-v7.de-en.en',
 'europarl-v7.cs-en.en',
 'questions_easy.csv',
 'europarl-v7.fr-en.fr',
 'europarl-v7.fr-en.en',
 'europarl-v7.de-en.de',
 'europarl-v7.cs-en.cs']

In [169]:
data_loader = DataLoader(config)

In [170]:
src, dest = data_loader.loadData(path)

Loaded dataset at directory:
training/europarl-v7.de-en.en


In [171]:
src_train = tf.convert_to_tensor(src[:50000])
dest_train = tf.convert_to_tensor(dest[:50000])

In [172]:
src_val = tf.convert_to_tensor(src[50001:60001])
dest_val = tf.convert_to_tensor(dest[50001:60001])

In [173]:
src_train.shape

TensorShape([50000])

In [174]:
src_val.shape

TensorShape([10000])

In [175]:
from sklearn.model_selection import train_test_split

In [176]:
examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True,
                               as_supervised=True)
train_examples, val_examples = examples['train'], examples['validation']

In [177]:
len(train_examples)

51785

In [178]:
len(examples['validation'])

1193

In [179]:
type(train_examples)

tensorflow.python.data.ops.dataset_ops.PrefetchDataset

In [180]:
type(src_train)

tensorflow.python.framework.ops.EagerTensor

In [181]:
data_preprocessing = DataPreprocessing(config)

In [182]:
src_set = data_preprocessing.createDataFrameSrcDest(src[:50000],dest[:50000])

In [183]:
test_set = data_preprocessing.createDataFrameSrcDest(src[50001:60001],dest[50001:60001])

In [184]:
train_set.head()

Unnamed: 0,en,de
0,Resumption of the session,Wiederaufnahme der Sitzungsperiode
1,I declare resumed the session of the European ...,"Ich erkläre die am Freitag, dem 17. Dezember u..."
2,"Although, as you will have seen, the dreaded '...","Wie Sie feststellen konnten, ist der gefürchte..."
3,You have requested a debate on this subject in...,Im Parlament besteht der Wunsch nach einer Aus...
4,"In the meantime, I should like to observe a mi...",Heute möchte ich Sie bitten - das ist auch der...


In [185]:
train_set = train_set.astype(str)

In [186]:
test_set = test_set.astype(str)

In [187]:
test_set

Unnamed: 0,en,de
0,"Furthermore, most of the amendments, which I a...","Die Mehrzahl der Änderungsanträge, denen ich n..."
1,That is an issue which I have referred to before.,Darauf bin ich bereits früher eingegangen.
2,I know it is a controversial issue and I look ...,"Ich weiß, daß in diesem Punkt Uneinigkeit herr..."
3,"In substance, therefore, the Commission and Pa...",Sowohl Kommission als auch Parlament haben ein...
4,I should like especially to thank Mrs Roth-Beh...,Vor allem möchte ich Frau Roth-Behrendt und He...
...,...,...
9995,It falls within the competence of the Presiden...,Dies fällt in den Zuständigkeitsbereich des Pr...
9996,First Part,Teil I
9997,Question No 29 by (H-0452/00):,Anfrage Nr. 29 von (H-0452/00):
9998,Subject: Political intervention in Georgia's p...,Betrifft: Politische Einschaltung der EU in di...


In [209]:
train_set

Unnamed: 0,en,de
0,Resumption of the session,Wiederaufnahme der Sitzungsperiode
1,I declare resumed the session of the European ...,"Ich erkläre die am Freitag, dem 17. Dezember u..."
2,"Although, as you will have seen, the dreaded '...","Wie Sie feststellen konnten, ist der gefürchte..."
3,You have requested a debate on this subject in...,Im Parlament besteht der Wunsch nach einer Aus...
4,"In the meantime, I should like to observe a mi...",Heute möchte ich Sie bitten - das ist auch der...
...,...,...
49995,"Contrary to the Commission' s proposal, the re...",Im Bericht werden entgegen dem Vorschlag der K...
49996,"This in itself is acceptable, but we must bear...","Das ist an sich akzeptabel, dennoch darf nicht..."
49997,"Hopefully, better tests than the current ones ...","Bleibt zu hoffen, daß die Tests auch bald bess..."
49998,"Mr President, I am pleased to outline our posi...","Herr Präsident, ich möchte Ihnen im folgenden ..."


In [214]:
train_set['en']

0                                Resumption of the session
1        I declare resumed the session of the European ...
2        Although, as you will have seen, the dreaded '...
3        You have requested a debate on this subject in...
4        In the meantime, I should like to observe a mi...
                               ...                        
49995    Contrary to the Commission' s proposal, the re...
49996    This in itself is acceptable, but we must bear...
49997    Hopefully, better tests than the current ones ...
49998    Mr President, I am pleased to outline our posi...
49999    I wish to begin by thanking you for the excell...
Name: en, Length: 50000, dtype: object

In [272]:
test_set['de']

0       Die Mehrzahl der Änderungsanträge, denen ich n...
1              Darauf bin ich bereits früher eingegangen.
2       Ich weiß, daß in diesem Punkt Uneinigkeit herr...
3       Sowohl Kommission als auch Parlament haben ein...
4       Vor allem möchte ich Frau Roth-Behrendt und He...
                              ...                        
9995    Dies fällt in den Zuständigkeitsbereich des Pr...
9996                                               Teil I
9997                      Anfrage Nr. 29 von (H-0452/00):
9998    Betrifft: Politische Einschaltung der EU in di...
9999    Die Abspaltung Süd-Ossetiens und Abchasiens so...
Name: de, Length: 10000, dtype: object

In [267]:
src_set = tf.data.Dataset.from_tensor_slices(train_set['de'])
dest_set = tf.data.Dataset.from_tensor_slices(train_set['en'])
training_set = tf.data.Dataset.zip((src_set,dest_set))

In [273]:
src_set = tf.data.Dataset.from_tensor_slices(test_set['de'])
dest_set = tf.data.Dataset.from_tensor_slices(test_set['en'])
testing_set = tf.data.Dataset.zip((src_set,dest_set))

In [194]:
for en, de in train_examples.take(5):
    print("English: ", en.numpy().decode('utf-8'))
    print()
    print("German:   ", de.numpy().decode('utf-8'))
    print()

English:  Resumption of the session

German:    Wiederaufnahme der Sitzungsperiode

English:  I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.

German:    Ich erkläre die am Freitag, dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen, wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe, daß Sie schöne Ferien hatten.

English:  Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.

German:    Wie Sie feststellen konnten, ist der gefürchtete "Millenium-Bug " nicht eingetreten. Doch sind Bürger einiger unserer Mitgliedstaaten Opfer von schrecklichen Naturkatastrophen geworden.

English:  You have requested a debate on this subject in the cour

In [197]:
type(train_examples)

tensorflow.python.data.ops.dataset_ops.TensorSliceDataset

In [289]:
train_en = training_set.map(lambda de, en: en)
train_de = testing_set.map(lambda de, en: de)

In [287]:
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab


In [288]:
bert_tokenizer_params=dict(lower_case=True)
reserved_tokens=["[PAD]", "[UNK]", "[START]", "[END]"]

bert_vocab_args = dict(
    # The target vocabulary size
    vocab_size = 8000,
    # Reserved tokens that must be included in the vocabulary
    reserved_tokens=reserved_tokens,
    # Arguments for `text.BertTokenizer`
    bert_tokenizer_params=bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    learn_params={},
)

In [290]:
%%time
de_vocab = bert_vocab.bert_vocab_from_dataset(
    train_de.batch(1000).prefetch(2),
    **bert_vocab_args
)

CPU times: user 2min 3s, sys: 518 ms, total: 2min 3s
Wall time: 2min 3s


In [292]:
print(de_vocab[:10])
print(de_vocab[100:110])
print(de_vocab[1000:1010])
print(de_vocab[-10:])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '%', "'", '(', ')']
['dieser', '##r', 'europaischen', '##t', 'bei', 'so', 'mochte', 'aber', 'wenn', 'bericht']
['somit', 'sorge', 'tat', 'usa', 'vier', 'wettbewerbsfahigkeit', 'wirtschaftliche', '##ausschuss', '##eren', '##fen']
['##-', '##.', '##/', '##:', '##;', '##?', '##[', '##]', '##j', '##q']


In [293]:
def write_vocab_file(filepath, vocab):
    with open(filepath, 'w') as f:
        for token in vocab:
              print(token, file=f)

In [295]:
write_vocab_file('de_vocab.txt', de_vocab)

In [296]:
%%time
en_vocab = bert_vocab.bert_vocab_from_dataset(
    train_en.batch(1000).prefetch(2),
    **bert_vocab_args
)

CPU times: user 1min 7s, sys: 1.4 s, total: 1min 9s
Wall time: 1min 6s


In [297]:
print(en_vocab[:10])
print(en_vocab[100:110])
print(en_vocab[1000:1010])
print(en_vocab[-10:])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '$', '%', '&', "'"]
['president', 'there', 'you', 'parliament', 'been', 'union', 'these', 'can', 'or', 'if']
['date', 'fishing', 'milk', 'possibility', 'scientific', 'setting', '##ment', 'applies', 'b5', 'reached']
['##[', '##]', '##j', '##q', '##£', '##·', '##æ', '##ø', '##μ', '##⁄']


In [298]:
write_vocab_file('en_vocab.txt', en_vocab)

In [300]:
de_tokenizer = text.BertTokenizer('de_vocab.txt', **bert_tokenizer_params)
en_tokenizer = text.BertTokenizer('en_vocab.txt', **bert_tokenizer_params)

In [302]:
for pt_examples, en_examples in training_set.batch(3).take(1):
      for ex in en_examples:
        print(ex.numpy())

b'Resumption of the session'
b'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.'
b"Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful."


In [303]:
# Tokenize the examples -> (batch, word, word-piece)
token_batch = en_tokenizer.tokenize(en_examples)
# Merge the word and word-piece axes -> (batch, tokens)
token_batch = token_batch.merge_dims(-2,-1)

for ex in token_batch.to_list():
      print(ex)

[5212, 65, 64, 1010]
[40, 2724, 2744, 64, 1010, 65, 64, 85, 103, 4190, 74, 1704, 2105, 1241, 285, 13, 67, 40, 89, 113, 381, 301, 66, 412, 102, 32, 1413, 136, 222, 68, 64, 262, 70, 102, 4604, 32, 47, 1727, 6021, 37, 1394, 1100, 532, 15]
[433, 13, 79, 102, 83, 81, 691, 13, 64, 35, 1489, 5730, 9, 3317, 33, 6849, 9, 1772, 66, 1890, 1261, 13, 252, 64, 139, 68, 32, 273, 65, 131, 2300, 32, 1625, 65, 1036, 1547, 70, 1712, 175, 5131, 15]


In [304]:
# Lookup each token id in the vocabulary.
txt_tokens = tf.gather(en_vocab, token_batch)
# Join with spaces.
tf.strings.reduce_join(txt_tokens, separator=' ', axis=-1)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'resumption of the session',
       b'i declare resumed the session of the european parliament adjourned on friday 17 december 1999 , and i would like once again to wish you a happy new year in the hope that you enjoyed a p ##le ##asant f ##est ##ive period .',
       b"although , as you will have seen , the d ##re ##aded ' millennium b ##ug ' failed to material ##ise , still the people in a number of countries suffered a series of natural disasters that truly were dreadful ."],
      dtype=object)>

In [305]:
words = en_tokenizer.detokenize(token_batch)
tf.strings.reduce_join(words, separator=' ', axis=-1)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'resumption of the session',
       b'i declare resumed the session of the european parliament adjourned on friday 17 december 1999 , and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period .',
       b"although , as you will have seen , the dreaded ' millennium bug ' failed to materialise , still the people in a number of countries suffered a series of natural disasters that truly were dreadful ."],
      dtype=object)>

In [306]:
START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
END = tf.argmax(tf.constant(reserved_tokens) == "[END]")

def add_start_end(ragged):
  count = ragged.bounding_shape()[0]
  starts = tf.fill([count,1], START)
  ends = tf.fill([count,1], END)
  return tf.concat([starts, ragged, ends], axis=1)

In [307]:
words = en_tokenizer.detokenize(add_start_end(token_batch))
tf.strings.reduce_join(words, separator=' ', axis=-1)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'[START] resumption of the session [END]',
       b'[START] i declare resumed the session of the european parliament adjourned on friday 17 december 1999 , and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period . [END]',
       b"[START] although , as you will have seen , the dreaded ' millennium bug ' failed to materialise , still the people in a number of countries suffered a series of natural disasters that truly were dreadful . [END]"],
      dtype=object)>

In [308]:
def cleanup_text(reserved_tokens, token_txt):
  # Drop the reserved tokens, except for "[UNK]".
  bad_tokens = [re.escape(tok) for tok in reserved_tokens if tok != "[UNK]"]
  bad_token_re = "|".join(bad_tokens)

  bad_cells = tf.strings.regex_full_match(token_txt, bad_token_re)
  result = tf.ragged.boolean_mask(token_txt, ~bad_cells)

  # Join them into strings.
  result = tf.strings.reduce_join(result, separator=' ', axis=-1)

  return result

In [309]:
en_examples.numpy()

array([b'Resumption of the session',
       b'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.',
       b"Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful."],
      dtype=object)

In [310]:
token_batch = en_tokenizer.tokenize(en_examples).merge_dims(-2,-1)
words = en_tokenizer.detokenize(token_batch)
words

<tf.RaggedTensor [[b'resumption', b'of', b'the', b'session'],
 [b'i', b'declare', b'resumed', b'the', b'session', b'of', b'the',
  b'european', b'parliament', b'adjourned', b'on', b'friday', b'17',
  b'december', b'1999', b',', b'and', b'i', b'would', b'like', b'once',
  b'again', b'to', b'wish', b'you', b'a', b'happy', b'new', b'year', b'in',
  b'the', b'hope', b'that', b'you', b'enjoyed', b'a', b'pleasant',
  b'festive', b'period', b'.']                                             ,
 [b'although', b',', b'as', b'you', b'will', b'have', b'seen', b',', b'the',
  b'dreaded', b"'", b'millennium', b'bug', b"'", b'failed', b'to',
  b'materialise', b',', b'still', b'the', b'people', b'in', b'a', b'number',
  b'of', b'countries', b'suffered', b'a', b'series', b'of', b'natural',
  b'disasters', b'that', b'truly', b'were', b'dreadful', b'.']              ]>

In [311]:
cleanup_text(reserved_tokens, words).numpy()

array([b'resumption of the session',
       b'i declare resumed the session of the european parliament adjourned on friday 17 december 1999 , and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period .',
       b"although , as you will have seen , the dreaded ' millennium bug ' failed to materialise , still the people in a number of countries suffered a series of natural disasters that truly were dreadful ."],
      dtype=object)

In [312]:
class CustomTokenizer(tf.Module):
  def __init__(self, reserved_tokens, vocab_path):
    self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
    self._reserved_tokens = reserved_tokens
    self._vocab_path = tf.saved_model.Asset(vocab_path)

    vocab = pathlib.Path(vocab_path).read_text().splitlines()
    self.vocab = tf.Variable(vocab)

    ## Create the signatures for export:   

    # Include a tokenize signature for a batch of strings. 
    self.tokenize.get_concrete_function(
        tf.TensorSpec(shape=[None], dtype=tf.string))

    # Include `detokenize` and `lookup` signatures for:
    #   * `Tensors` with shapes [tokens] and [batch, tokens]
    #   * `RaggedTensors` with shape [batch, tokens]
    self.detokenize.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.detokenize.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    self.lookup.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    # These `get_*` methods take no arguments
    self.get_vocab_size.get_concrete_function()
    self.get_vocab_path.get_concrete_function()
    self.get_reserved_tokens.get_concrete_function()

  @tf.function
  def tokenize(self, strings):
    enc = self.tokenizer.tokenize(strings)
    # Merge the `word` and `word-piece` axes.
    enc = enc.merge_dims(-2,-1)
    enc = add_start_end(enc)
    return enc

  @tf.function
  def detokenize(self, tokenized):
    words = self.tokenizer.detokenize(tokenized)
    return cleanup_text(self._reserved_tokens, words)

  @tf.function
  def lookup(self, token_ids):
    return tf.gather(self.vocab, token_ids)

  @tf.function
  def get_vocab_size(self):
    return tf.shape(self.vocab)[0]

  @tf.function
  def get_vocab_path(self):
    return self._vocab_path

  @tf.function
  def get_reserved_tokens(self):
    return tf.constant(self._reserved_tokens)

In [314]:
tokenizers = tf.Module()
tokenizers.pt = CustomTokenizer(reserved_tokens, 'de_vocab.txt')
tokenizers.en = CustomTokenizer(reserved_tokens, 'en_vocab.txt')

In [315]:
model_name = 'translator_en_de_converter'
tf.saved_model.save(tokenizers, model_name)

In [316]:
reloaded_tokenizers = tf.saved_model.load(model_name)
reloaded_tokenizers.en.get_vocab_size().numpy()

7178

In [320]:
tokens = reloaded_tokenizers.en.tokenize(['This is a test'])
tokens.numpy()

array([[   2,   72,   69,   32, 2425,    3]])

In [321]:
text_tokens = reloaded_tokenizers.en.lookup(tokens)
text_tokens

<tf.RaggedTensor [[b'[START]', b'this', b'is', b'a', b'test', b'[END]']]>

In [322]:
round_trip = reloaded_tokenizers.en.detokenize(tokens)

print(round_trip.numpy()[0].decode('utf-8'))

this is a test


In [335]:
!zip -r 'translator_en_de_converter'.zip 'translator_en_de_converter'

  adding: translator_en_de_converter/ (stored 0%)
  adding: translator_en_de_converter/saved_model.pb (deflated 91%)
  adding: translator_en_de_converter/assets/ (stored 0%)
  adding: translator_en_de_converter/assets/en_vocab.txt (deflated 56%)
  adding: translator_en_de_converter/assets/de_vocab.txt (deflated 59%)
  adding: translator_en_de_converter/variables/ (stored 0%)
  adding: translator_en_de_converter/variables/variables.index (deflated 33%)
  adding: translator_en_de_converter/variables/variables.data-00000-of-00001 (deflated 52%)


In [336]:
!du -h *.zip

184K	ted_hrlr_translate_pt_en_converter.zip
148K	translator_en_de_converter.zip


In [334]:
!apt-get install zip

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  zip
0 upgraded, 1 newly installed, 0 to remove and 5 not upgraded.
Need to get 167 kB of archives.
After this operation, 638 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/main amd64 zip amd64 3.0-11build1 [167 kB]
Fetched 167 kB in 1s (334 kB/s)
debconf: delaying package configuration, since apt-utils is not installed
Selecting previously unselected package zip.
(Reading database ... 20632 files and directories currently installed.)
Preparing to unpack .../zip_3.0-11build1_amd64.deb ...
Unpacking zip (3.0-11build1) ...
Setting up zip (3.0-11build1) ...
