In [1]:
import os
import json
import tensorflow as tf
tf.get_logger().setLevel('ERROR')

from utils.process_text import init_tokenizer

import ipywidgets as widgets
from IPython.display import display


# Relative path of the JSON file whose contents are passed to init_tokenizer.
TOKEN_CONFIG = "configs/tokenizer_default_config.json"

## EUMT

> English-Ukrainian bidirectional neural machine translator, based on [fastText](https://fasttext.cc/docs/en/support.html) word embeddings and the [default Transformer architecture](https://arxiv.org/pdf/1706.03762.pdf) of the [OpenNMT framework](https://opennmt.net/).

In [2]:
# Load the exported SavedModels and keep only their "serving_default"
# signatures, which are the callables invoked to translate.
# transformer_en is called when the source language is English (is_uk False);
# transformer_uk is called when the source language is Ukrainian (is_uk True).
transformer_en = tf.saved_model.load("models_serve/QED2_en/export/20000").signatures["serving_default"]
transformer_uk = tf.saved_model.load("models_serve/QED_uk/export/30000").signatures["serving_default"]

In [3]:
def preprocess(tokenizer, data):
    """Tokenize a batch of texts and pack them into padded model inputs.

    Every text is lower-cased, tokenized with ``tokenizer._tokenize_string``
    and right-padded with empty strings up to the longest sequence in the
    batch.

    Args:
        tokenizer: Object exposing ``_tokenize_string(str)``. Presumably it
            returns a fresh list of string tokens (the result is padded in
            place) -- TODO confirm against ``init_tokenizer``.
        data: Iterable of strings. It must yield at least one text.

    Returns:
        dict with two entries: ``"tokens"``, a ``tf.string`` constant built
        from the padded token rows, and ``"length"``, a ``tf.int32`` constant
        holding the unpadded token count of each row.

    Raises:
        ValueError: If ``data`` is empty (there is no longest row to pad to).
    """
    token_rows = [tokenizer._tokenize_string(text.lower()) for text in data]
    row_lengths = [len(row) for row in token_rows]
    widest = max(row_lengths)

    # Pad the shorter rows in place so that every row has the same width.
    for row, row_length in zip(token_rows, row_lengths):
        missing = widest - row_length
        if missing > 0:
            row += [""] * missing

    return {
        "tokens": tf.constant(token_rows, dtype=tf.string),
        "length": tf.constant(row_lengths, dtype=tf.int32),
    }

def postprocess(tokenizer, outputs):
    """Turn model outputs back into a list of translated strings.

    Args:
        tokenizer: Object exposing ``_detokenize_string(list)``.
        outputs: Mapping with ``"tokens"`` and ``"length"`` entries, each
            providing ``.numpy()``. For every batch item only index 0 of the
            second axis is read (presumably the best hypothesis -- confirm
            against the exported model).

    Returns:
        list[str]: One detokenized string per batch item, with every
        ``"<unk>"`` marker removed.
    """
    token_batch = outputs["tokens"].numpy()
    length_batch = outputs["length"].numpy()

    def _decode(hypotheses, hypothesis_lengths):
        # Cut the first hypothesis down to its reported length before detokenizing.
        kept = hypotheses[0][: hypothesis_lengths[0]].tolist()
        return tokenizer._detokenize_string(kept).replace("<unk>", "")

    return [
        _decode(hypotheses, hypothesis_lengths)
        for hypotheses, hypothesis_lengths in zip(token_batch, length_batch)
    ]

In [4]:
# Translation direction flag. It is used as an index into `labels`:
# False -> labels[0] is the source language, True -> labels[1] is the source.
# Toggled by on_click_switch.
is_uk = False

# Initialize tokenizer:
# JSON text is UTF-8 by specification, so the encoding is pinned here instead
# of depending on the platform's default locale encoding.
with open(TOKEN_CONFIG, "r", encoding="utf-8") as f:
    tokenizer_config = json.load(f)

tokenizer = init_tokenizer(tokenizer_config)

# Display names of the two languages; index 0 is shown as the source while
# is_uk is False, index 1 while it is True.
labels = ["English", "Ukrainian"]

In [5]:
# Text fields:
# The source column is an editable text area; the target column is an Output
# widget that the translate handler prints into. Each column is headed by the
# name of its language, picked from `labels` with the boolean `is_uk`.
source_label = widgets.Label(value=labels[is_uk])
source_text = widgets.Textarea()
source = widgets.VBox(children=[source_label, source_text])

target_label = widgets.Label(value=labels[not is_uk])
target_text = widgets.Output()
target = widgets.VBox(children=[target_label, target_text])

# Buttons:
# Stacked vertically and placed between the two text columns.
switch_language = widgets.Button(description="Switch language")
translate = widgets.Button(description="Translate")
buttons = widgets.VBox(children=[switch_language, translate])

# Final layout: source column | buttons | target column.
dashboard = widgets.HBox(children=[source, buttons, target])

display(dashboard)

HBox(children=(VBox(children=(Label(value='English'), Textarea(value=''))), VBox(children=(Button(description=…

In [6]:
def on_click_translate(change):
    """Translate the text in the source box and print it into the target area.

    The model is chosen from the module-level ``is_uk`` flag:
    ``transformer_uk`` when it is true, ``transformer_en`` otherwise. The
    previous content of the target area is cleared before the new translation
    is printed.

    Blank or whitespace-only input is not sent to the model; the target area
    is simply cleared.

    Args:
        change: Value passed by the button's click event; unused.
    """
    source_doc = source_text.value

    # Nothing to translate: avoid tokenizing and running the model on an
    # empty sequence, and make sure no stale translation stays on screen.
    if not source_doc.strip():
        target_text.clear_output()
        return

    inputs = preprocess(tokenizer, [source_doc])
    translation = transformer_uk(**inputs) if is_uk else transformer_en(**inputs)
    target_text.clear_output()
    # Anything printed inside the Output context is rendered in the widget.
    with target_text:
        print(". ".join(postprocess(tokenizer, translation)))

# Run on_click_translate whenever the "Translate" button is clicked.
translate.on_click(on_click_translate)

In [7]:
def on_click_switch(change):
    """Flip the translation direction and reset both text areas.

    Toggles the module-level ``is_uk`` flag, swaps the two language captions
    to match it, empties the source text area and clears the target output.

    Args:
        change: Value passed by the button's click event; unused.
    """
    global is_uk
    is_uk = not is_uk

    # A bool indexes the two-element `labels` list as 0 or 1.
    source_label.value, target_label.value = labels[is_uk], labels[not is_uk]

    # Text left over from the previous direction no longer applies.
    source_text.value = ""
    target_text.clear_output()

# Run on_click_switch whenever the "Switch language" button is clicked.
# Only the callback is passed: the second positional parameter of
# Button.on_click is `remove`, so passing is_uk there would unregister the
# handler if this cell were re-run while is_uk is True.
switch_language.on_click(on_click_switch)