In [None]:
import os
import json
import tensorflow as tf
tf.get_logger().setLevel('ERROR')

from utils.process_text import init_tokenizer

import ipywidgets as widgets
from IPython.display import display


# Path to the JSON file with the tokenizer settings; it is loaded with
# json.load and handed to init_tokenizer further down in the notebook.
TOKEN_CONFIG = "configs/tokenizer_default_config.json"

In [None]:
from nltk.tokenize import regexp_tokenize


# Regular expression used by regexp_tokenize to split the input text into phrases.
# A phrase:
#   - starts with any single character other than a space;
#   - continues with one or more word characters, digits, spaces, commas,
#     colons, hyphens, double quotes or apostrophes (ʼ, ', `);
#   - may end with a run of terminators (. ! ? ;);
#   - must be followed by a space, the end of the text or a newline
#     (checked with a lookahead, so that character is not consumed).
phrase_pattern = r"[^\ ][\w\d\ \,\:\-\ʼ\"\'\`]+[\.\!\?;]*(?=\ |$|\n)"

# EUMT

> English-Ukrainian bidirectional neural machine translator, based on [fastText](https://fasttext.cc/docs/en/support.html) word embeddings (*sisg-* model [1]) and the default Transformer architecture [2] of the [OpenNMT framework](https://opennmt.net/).

The current version is trained on one of the [OPUS datasets](https://opus.nlpl.eu/) [3]: the [QED dataset](https://opus.nlpl.eu/QED-v2.0a.php) [4].

In [None]:
# Load the SavedModel exported under models_serve/QED_en; it is the model used
# when the source language is English (is_uk is False).
transformer_en = tf.saved_model.load("models_serve/QED_en/export/255000")
# NOTE(review): loading the QED_uk export is commented out and transformer_uk is
# aliased to the English model, so both translation directions currently run the
# same model. Presumably a placeholder until the Ukrainian export is available;
# confirm before relying on Ukrainian input.
# transformer_uk = tf.saved_model.load("models_serve/QED_uk/export/30000")
transformer_uk = transformer_en

In [None]:
def preprocess(tokenizer, data):
    """
    Tokenize a list of strings into a padded batch.

    Every string is lower-cased and tokenized with
    ``tokenizer._tokenize_string``. Token lists shorter than the longest one
    are right-padded with empty strings so the batch is rectangular.

    Args:
        tokenizer: object exposing ``_tokenize_string(text)``; presumably
            returns a list of string tokens (confirm against init_tokenizer).
        data: iterable of strings to tokenize; must contain at least one item.

    Returns:
        dict with two entries:
            "tokens": ``tf.string`` constant built from the padded token lists;
            "length": ``tf.int32`` constant with the unpadded token counts.

    Raises:
        ValueError: if ``data`` is empty, since no batch can be built.
    """

    # Copy each token list so padding never mutates what the tokenizer returned.
    all_tokens = [list(tokenizer._tokenize_string(text.lower())) for text in data]
    if not all_tokens:
        raise ValueError(
            "preprocess() needs at least one phrase to tokenize, got an empty input"
        )

    lengths = [len(tokens) for tokens in all_tokens]
    max_length = max(lengths)

    # Right-pad with empty strings up to the longest phrase in the batch.
    padded_tokens = [
        tokens + [""] * (max_length - length)
        for tokens, length in zip(all_tokens, lengths)
    ]

    inputs = {
        "tokens": tf.constant(padded_tokens, dtype=tf.string),
        "length": tf.constant(lengths, dtype=tf.int32),
    }

    return inputs

def postprocess(tokenizer, outputs):
    """
    Detokenize the model output and merge it into a single string.

    ``outputs["tokens"]`` and ``outputs["length"]`` are converted with
    ``.numpy()`` and iterated together. For every pair, the entry at index 0 is
    cut to its reported length, detokenized with
    ``tokenizer._detokenize_string`` and stripped of ``<unk>`` markers. The
    resulting phrases are joined with single spaces.

    Args:
        tokenizer: object exposing ``_detokenize_string(tokens)``.
        outputs: mapping with "tokens" and "length" entries that provide
            ``.numpy()`` — presumably the dict returned by the model's serving
            signature (confirm against the caller).

    Returns:
        str: the detokenized phrases joined by spaces.
    """

    token_batches = outputs["tokens"].numpy()
    length_batches = outputs["length"].numpy()

    translation = []
    for hypotheses, hypothesis_lengths in zip(token_batches, length_batches):
        # Only index 0 is kept — presumably the best hypothesis; TODO confirm.
        tokens = hypotheses[0][: hypothesis_lengths[0]].tolist()
        # String tensors come back from .numpy() as bytes; decode them so the
        # string-level detokenizer always receives text. str tokens pass through.
        tokens = [
            token.decode("utf-8") if isinstance(token, bytes) else token
            for token in tokens
        ]
        translation.append(tokenizer._detokenize_string(tokens).replace("<unk>", ""))

    return " ".join(translation)

In [None]:
# Translation direction flag: while False, the source label is labels[0]
# ("English") and the target label is labels[1] ("Ukrainian").
is_uk = False

# Initialize tokenizer:
# The config is JSON, so it is read as UTF-8 explicitly instead of relying on
# the platform's default text encoding.
with open(TOKEN_CONFIG, "r", encoding="utf-8") as f:
    tokenizer_config = json.load(f)

tokenizer = init_tokenizer(tokenizer_config)

# Display names of the two languages, indexed with is_uk (False -> 0, True -> 1).
labels = ["English", "Ukrainian"]

In [None]:
# Action buttons, stacked vertically:
switch_language = widgets.Button(description="Switch language")
translate = widgets.Button(description="Translate")

buttons = widgets.VBox(children=[switch_language, translate])

# Source pane: language label above an editable text area.
source_label = widgets.Label(value=labels[is_uk])
source_text = widgets.Textarea()
source = widgets.VBox(children=[source_label, source_text])

# Target pane: language label above an output area.
target_label = widgets.Label(value=labels[not is_uk])
target_text = widgets.Output()
target = widgets.VBox(children=[target_label, target_text])

# Place source, buttons and target side by side and render the dashboard.
dashboard = widgets.HBox(children=[source, buttons, target])

display(dashboard)

In [None]:
def on_click_translate(change):
    """
    Translate the text from the source box and show it in the target output.

    The text is split into phrases with ``phrase_pattern``, tokenized into a
    batch, run through the model selected by ``is_uk`` and finally detokenized
    and printed into ``target_text``.

    Args:
        change: argument passed by the button click callback (unused).
    """

    # Get the input text and split it into phrases:
    source_doc = regexp_tokenize(
        source_text.value, pattern=phrase_pattern
    )

    # Refresh output so a previous translation never lingers:
    target_text.clear_output()

    # Nothing to translate (e.g. an empty text box): preprocess() cannot build
    # a batch from zero phrases, so stop here with the output cleared.
    if not source_doc:
        return

    # Tokenize each phrase:
    inputs = preprocess(tokenizer, source_doc)

    # Translate phrases with the model matching the current source language:
    transformer = transformer_uk if is_uk else transformer_en
    translation = transformer.signatures["serving_default"](**inputs)

    # Detokenize and merge translated phrases:
    with target_text:
        print(postprocess(tokenizer, translation))


translate.on_click(on_click_translate)

In [None]:
def on_click_switch(change):
    """
    Swap source and target languages.

    Flips the module-level ``is_uk`` flag, updates both language labels to
    match it, and clears the source text box and the target output.

    Args:
        change: argument passed by the button click callback (unused).
    """

    global is_uk
    is_uk = not is_uk

    source_label.value = labels[is_uk]
    target_label.value = labels[not is_uk]

    source_text.value = ""
    target_text.clear_output()


# Register the handler. The second parameter of Button.on_click is ``remove``;
# passing ``is_uk`` there would unregister the callback whenever this cell is
# re-run while ``is_uk`` is True, so only the callback is passed.
switch_language.on_click(on_click_switch)

## References

1. Bojanowski, P., Grave, E., Joulin, A., & Mikolov, T. (2017). [Enriching word vectors with subword information](https://www.mitpressjournals.org/doi/pdfplus/10.1162/tacl_a_00051). Transactions of the Association for Computational Linguistics, 5, 135-146.
2. Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., ... & Polosukhin, I. (2017). [Attention is all you need](https://arxiv.org/pdf/1706.03762.pdf). arXiv preprint arXiv:1706.03762.
3. Jörg Tiedemann, 2012, [Parallel Data, Tools and Interfaces in OPUS](http://www.lrec-conf.org/proceedings/lrec2012/pdf/463_Paper.pdf). In *Proceedings of the 8th International Conference on Language Resources and Evaluation (LREC'2012)*.
4. A. Abdelali, F. Guzman, H. Sajjad and S. Vogel, "[The AMARA Corpus: Building parallel language resources for the educational domain](https://www.aclweb.org/anthology/L14-1675/)", The Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC'14). Reykjavik, Iceland, 2014. pp. 1856-1862. ISBN 978-2-9517408-8-4.