## Load embedding data

In [27]:
def tokenize_data(datafiles, tokenizer, save_filepath, append=True):
    """Tokenize and lowercase every line of ``datafiles`` into one output file.

    Each input line is passed to ``tokenizer._tokenize_string``; the returned
    tokens are lowercased, joined with single spaces and written as one line
    of ``save_filepath``. The number of lines read is printed once per input
    file.

    Args:
        datafiles: Iterable of paths to UTF-8 text files, one sentence per line.
        tokenizer: Object exposing ``_tokenize_string(text)`` that returns an
            iterable of string tokens.
        save_filepath: Path of the output file.
        append: When True (the default, matching the previous behavior) the
            output is appended to an existing ``save_filepath``, so calling
            the function twice duplicates the content. Pass False to
            overwrite the file instead.
    """
    # Explicit UTF-8: the corpora contain Cyrillic text and the platform
    # default encoding is not guaranteed to handle it.
    with open(save_filepath, "a" if append else "w", encoding="utf-8") as out_file:
        for filename in datafiles:
            line_count = 0
            with open(filename, "r", encoding="utf-8") as in_file:
                # Stream line by line instead of loading the whole corpus.
                for line in in_file:
                    # Drop the line terminator so it never reaches the
                    # tokenizer or ends up inside a token.
                    tokens = tokenizer._tokenize_string(line.rstrip("\n"))
                    out_file.write(u"{}\n".format(" ".join(token.lower() for token in tokens)))
                    line_count += 1
            print(line_count)

In [10]:
# Raw monolingual corpora, one list per language, that get tokenized into the
# embedding training files.
# The English OpenSubtitles corpus ("data/OpenSubtitles/en.txt") is disabled.
en_embed_files = ["data/JW300/en.txt"]
uk_embed_files = ["data/JW300/uk.txt", "data/OpenSubtitles/uk.txt"]

In [3]:
# data_en = []
# data_uk = []

# for filename in en_embed_files:
#     with open(filename, "r") as f:
#         data_en += f.readlines()

# for filename in uk_embed_files:
#     with open(filename, "r") as f:
#         data_uk += f.readlines()

## Tokenize embedding data

In [4]:
from opennmt.tokenizers.opennmt_tokenizer import OpenNMTTokenizer

# Tokenizer settings; the "params" entry holds the keyword arguments that are
# unpacked into the tokenizer constructor.
token_config = dict(
    type="OpenNMTTokenizer",
    params=dict(
        mode="aggressive",
        joiner_annotate=True,
        segment_numbers=True,
        segment_alphabet_change=True,
    ),
)

In [5]:
# Instantiate the tokenizer from the "params" section of token_config.
tokenizer_params = token_config["params"]
tokenizer = OpenNMTTokenizer(**tokenizer_params)

In [45]:
# import re

# re_alpha = r"\w+[’'`-]?\w{0,}"

# data_en_preproc = [" ".join([word.lower() for word in tokenizer._tokenize_string(line)]) for line in data_en]
# data_uk_preproc = [" ".join([word.lower() for word in tokenizer._tokenize_string(line)]) for line in data_uk]

In [47]:
# with open("data/en_train_embeddings.txt", "w") as f:
#     for line in data_en_preproc:
#         f.write(u"{}\n".format(line))

In [48]:
# with open("data/uk_train_embeddings.txt", "w") as f:
#     for line in data_uk_preproc:
#         f.write(u"{}\n".format(line))

In [11]:
# Tokenize the Ukrainian corpora into the embedding training file.
tokenize_data(
    datafiles=uk_embed_files,
    tokenizer=tokenizer,
    save_filepath="data/uk_train_embeddings.txt",
)

In [12]:
# Tokenize the English corpora into the embedding training file.
tokenize_data(
    datafiles=en_embed_files,
    tokenizer=tokenizer,
    save_filepath="data/en_train_embeddings.txt",
)

In [1]:
import pandas as pd

In [2]:
# Load the Tatoeba sentence pairs: the first TSV column is read as the index
# and later discarded, the remaining three columns are named and the middle
# "code" column is dropped.
# NOTE(review): pandas' default quote handling is used here; confirm the TSV
# has no unbalanced double quotes that could merge rows.
df = (
    pd.read_csv("data/tatoeba_en-uk.tsv", sep='\t', header=None, index_col=0)
    .set_axis(["en", 'code', 'uk'], axis=1)
    .drop(columns="code")
    .reset_index(drop=True)
)

In [3]:
# Bare expression so the notebook renders a preview of the sentence pairs.
df

Unnamed: 0,en,uk
0,Let's try something.,Давайте щось спробуємо!
1,I have to go to sleep.,Маю піти спати.
2,Muiriel is 20 now.,Мюріел зараз двадцять.
3,"The password is ""Muiriel"".","Пароль - ""Muiriel""."
4,I will be back soon.,Я скоро повернуся.
...,...,...
199611,You pervert!,Збоченець!
199612,You pervert!,Збоченка!
199613,I'm in the post office.,Я на пошті.
199614,"Even though John could fight, Adrion was still...","Хоча Джон і вмів битися, із нас трьох найсильн..."


In [4]:
def _whitespace_token_count(sentence):
    """Return how many whitespace-separated chunks ``sentence`` contains."""
    return len(sentence.split())


# Rough sentence lengths (whitespace split only) for both languages.
df["token_len_en"] = df["en"].apply(_whitespace_token_count)
df["token_len_uk"] = df["uk"].apply(_whitespace_token_count)

In [5]:
# Summary statistics of the numeric columns (the two token-length columns).
df.describe()

Unnamed: 0,token_len_en,token_len_uk
count,199616.0,199616.0
mean,5.535623,4.789336
std,2.329145,2.142607
min,1.0,1.0
25%,4.0,3.0
50%,5.0,4.0
75%,7.0,6.0
max,141.0,97.0


In [6]:
from nltk.tokenize import word_tokenize

In [7]:
def _lowercased_word_tokens(sentence):
    """Split ``sentence`` with NLTK's ``word_tokenize`` and lowercase each token."""
    return [token.lower() for token in word_tokenize(sentence)]


# NOTE(review): these columns are tokenized with NLTK, whereas the embedding
# corpora are tokenized by `tokenize_data` with the OpenNMT tokenizer -
# confirm the two token inventories are meant to differ.
df["en_tokenized"] = df["en"].apply(_lowercased_word_tokens)
df["uk_tokenized"] = df["uk"].apply(_lowercased_word_tokens)

In [8]:
# Bare expression so the notebook renders the frame with the added columns.
df

Unnamed: 0,en,uk,token_len_en,token_len_uk,en_tokenized,uk_tokenized
0,Let's try something.,Давайте щось спробуємо!,3,3,"[let, 's, try, something, .]","[давайте, щось, спробуємо, !]"
1,I have to go to sleep.,Маю піти спати.,6,3,"[i, have, to, go, to, sleep, .]","[маю, піти, спати, .]"
2,Muiriel is 20 now.,Мюріел зараз двадцять.,4,3,"[muiriel, is, 20, now, .]","[мюріел, зараз, двадцять, .]"
3,"The password is ""Muiriel"".","Пароль - ""Muiriel"".",4,3,"[the, password, is, ``, muiriel, '', .]","[пароль, -, ``, muiriel, '', .]"
4,I will be back soon.,Я скоро повернуся.,5,3,"[i, will, be, back, soon, .]","[я, скоро, повернуся, .]"
...,...,...,...,...,...,...
199611,You pervert!,Збоченець!,2,1,"[you, pervert, !]","[збоченець, !]"
199612,You pervert!,Збоченка!,2,1,"[you, pervert, !]","[збоченка, !]"
199613,I'm in the post office.,Я на пошті.,5,3,"[i, 'm, in, the, post, office, .]","[я, на, пошті, .]"
199614,"Even though John could fight, Adrion was still...","Хоча Джон і вмів битися, із нас трьох найсильн...",16,13,"[even, though, john, could, fight, ,, adrion, ...","[хоча, джон, і, вмів, битися, ,, із, нас, трьо..."


In [9]:
# df.en.to_csv("data/en_data_test.txt")
# df.uk.to_csv("data/uk_data_test.txt")

In [10]:
# Write the tokenized sentence pairs as two files with one sentence per line;
# line i of the English file corresponds to line i of the Ukrainian file.
# `with` guarantees both files are closed even if a write fails, and UTF-8 is
# requested explicitly because the Ukrainian side is Cyrillic.
with open("data/tatoeba_en_train.txt", "w", encoding="utf-8") as f_train_en, \
        open("data/tatoeba_uk_train.txt", "w", encoding="utf-8") as f_train_uk:
    # Iterate over the two columns directly instead of building a Series per
    # row with iterrows().
    for en_tokens, uk_tokens in zip(df["en_tokenized"], df["uk_tokenized"]):
        f_train_en.write(" ".join(en_tokens) + "\n")
        f_train_uk.write(" ".join(uk_tokens) + "\n")

## Learn / apply fastText embeddings

In [15]:
import fasttext
from models.fasttext_embeddings import train_embeddings, load_embeddings

In [16]:
# Embedding dimensionality: passed as `dim` when training the embeddings,
# embedded in the exported file names, and reused as the Transformer's
# `num_units`.
d = 200

In [17]:
# embeddings_en = load_embeddings('models/jw300_en_embed_256.bin')
# embeddings_uk = load_embeddings('models/jw300_uk_embed_256.bin')

In [18]:
# Train one embedding model per language on the tokenized corpora.
# NOTE(review): train_embeddings is a project helper; the second argument is
# presumably the path the trained model is saved to - confirm in
# models/fasttext_embeddings.
embeddings_en = train_embeddings('data/en_train_embeddings.txt', f'models/en_embed_{d}.bin', dim=d)
embeddings_uk = train_embeddings('data/uk_train_embeddings.txt', f'models/uk_embed_{d}.bin', dim=d)

In [19]:
# Sanity check: nearest neighbours of "hello" in the English model.
embeddings_en.get_nearest_neighbors("hello")

[(0.6899593472480774, 'antonello'),
 (0.6833013296127319, 'tello'),
 (0.6821622848510742, 'dello'),
 (0.6261999607086182, '￭¡￭'),
 (0.6207203269004822, 'bello'),
 (0.6008277535438538, 'quijano'),
 (0.5916381478309631, 'mommy'),
 (0.5866191387176514, 'cello'),
 (0.5746167898178101, 'huh'),
 (0.5734177827835083, 'marcello')]

In [20]:
# Sanity check: nearest neighbours of "привіт" in the Ukrainian model.
embeddings_uk.get_nearest_neighbors("привіт")

[(0.8191489577293396, 'привітик'),
 (0.7963200807571411, 'привітай'),
 (0.7717908620834351, 'привіти'),
 (0.7443921566009521, 'привітайся'),
 (0.7399115562438965, 'привіту'),
 (0.7350025773048401, 'привітує'),
 (0.7289767861366272, 'привітань'),
 (0.7125568985939026, 'привітом'),
 (0.668448269367218, 'привітався'),
 (0.6648420691490173, 'привітав')]

In [21]:
# Sanity check: nearest neighbours of "france" in the English model.
embeddings_en.get_nearest_neighbors('france')

[(0.7763105034828186, 'franc'),
 (0.7759435176849365, 'netherlands'),
 (0.7759305834770203, 'francesca'),
 (0.7719064950942993, 'belgium'),
 (0.7451868057250977, 'spain'),
 (0.732179582118988, 'italy'),
 (0.7302939295768738, 'franck'),
 (0.7294098734855652, 'germany'),
 (0.7277642488479614, 'frances'),
 (0.7251033186912537, 'francois')]

In [22]:
# Sanity check: nearest neighbours of "україна" in the Ukrainian model.
embeddings_uk.get_nearest_neighbors('україна')

[(0.8032983541488647, 'україни'),
 (0.7992186546325684, 'українці'),
 (0.782039225101471, 'україні'),
 (0.7620049118995667, 'україною'),
 (0.7545892000198364, 'україну'),
 (0.7529142498970032, 'українська'),
 (0.751456081867218, 'країна'),
 (0.7451082468032837, 'українців'),
 (0.7433254718780518, 'українець'),
 (0.6802698373794556, 'чехія')]

In [23]:
# Export the English vocabulary (one word per line) and a text embedding file
# made of a "<word count> <d>" header followed by one "<word> <values...>"
# line per word.
en_words = embeddings_en.get_words()
# NOTE(review): this exports the model's *output* matrix, while the
# nearest-neighbour checks in this notebook go through the model's word
# vectors - confirm the output matrix is the intended source.
en_embeds = embeddings_en.get_output_matrix()

# `with` closes both files even if a write fails; UTF-8 is requested
# explicitly so the result does not depend on the platform default encoding.
with open(f"data/vocab_en_{d}.txt", "w", encoding="utf-8") as f_vocab_en, \
        open(f"data/embed_en_{d}.txt", "w", encoding="utf-8") as f_embed_en:
    f_embed_en.write(f"{len(en_words)} {d}\n")
    for word, embed_vector in zip(en_words, en_embeds):
        f_embed_en.write(" ".join([word] + embed_vector.astype('str').tolist()) + "\n")
        f_vocab_en.write(word + "\n")

In [24]:
# Export the Ukrainian vocabulary (one word per line) and a text embedding
# file made of a "<word count> <d>" header followed by one
# "<word> <values...>" line per word.
uk_words = embeddings_uk.get_words()
# NOTE(review): this exports the model's *output* matrix, while the
# nearest-neighbour checks in this notebook go through the model's word
# vectors - confirm the output matrix is the intended source.
uk_embeds = embeddings_uk.get_output_matrix()

# `with` closes both files even if a write fails; UTF-8 is requested
# explicitly because the vocabulary is Cyrillic.
with open(f"data/vocab_uk_{d}.txt", "w", encoding="utf-8") as f_vocab_uk, \
        open(f"data/embed_uk_{d}.txt", "w", encoding="utf-8") as f_embed_uk:
    f_embed_uk.write(f"{len(uk_words)} {d}\n")
    for word, embed_vector in zip(uk_words, uk_embeds):
        f_embed_uk.write(" ".join([word] + embed_vector.astype('str').tolist()) + "\n")
        f_vocab_uk.write(word + "\n")

## Load translation data

In [26]:
# Parallel corpora: each prefix has an ".en-uk.en" and an ".en-uk.uk" file.
# Deriving both lists from the same prefixes keeps them in the same order.
_parallel_corpus_prefixes = [
    "data/CC/CCAligned",
    "data/MultiCC/MultiCCAligned",
    "data/WikiMatrix/WikiMatrix",
    "data/XLEnt/XLEnt",
]

en_translate_files = [prefix + ".en-uk.en" for prefix in _parallel_corpus_prefixes]
uk_translate_files = [prefix + ".en-uk.uk" for prefix in _parallel_corpus_prefixes]

## Tokenize translation data

In [1]:
# Tokenize the English side of the parallel corpora into a single file.
tokenize_data(
    datafiles=en_translate_files,
    tokenizer=tokenizer,
    save_filepath="data/en_translate.txt",
)

NameError: name 'tokenize_data' is not defined

In [None]:
# Tokenize the Ukrainian side of the parallel corpora into a single file.
tokenize_data(
    datafiles=uk_translate_files,
    tokenizer=tokenizer,
    save_filepath="data/uk_translate.txt",
)

In [16]:
import opennmt

def _pretrained_embedding(path):
    """Return the embedding options shared by the source and target sides."""
    return {
        "path": path,
        "with_header": True,
        "case_insensitive": True,
        "trainable": False,
    }


# Run configuration: checkpoint directory plus the data section (embeddings,
# vocabularies and training files). File names depend on the dimension `d`.
config = {
    "model_dir": "data/models/fastTextTransformer/checkpoints/",
    "data": {
        "source_embedding": _pretrained_embedding(f"data/embed_en_{d}.txt"),
        "target_embedding": _pretrained_embedding(f"data/embed_uk_{d}.txt"),
        "source_vocabulary": f"data/vocab_en_{d}.txt",
        "target_vocabulary": f"data/vocab_uk_{d}.txt",
        "train_features_file": "data/tatoeba_en_train.txt",
        "train_labels_file": "data/tatoeba_uk_train.txt",
    },
}

# model = opennmt.models.TransformerBase()
# runner = opennmt.Runner(model, config, auto_config=True)
# runner.train(num_devices=2, with_eval=True)

In [90]:
# Load the exported embeddings back for inspection. The paths carry the
# `_{d}` suffix used by the export cells and by `config`; the un-suffixed
# "data/embed_en.txt" / "data/vocab_en.txt" names are not written anywhere in
# this notebook.
loaded_en_embeds = opennmt.inputters.load_pretrained_embeddings(
    f"data/embed_en_{d}.txt", f"data/vocab_en_{d}.txt"
)
loaded_uk_embeds = opennmt.inputters.load_pretrained_embeddings(
    f"data/embed_uk_{d}.txt", f"data/vocab_uk_{d}.txt"
)

In [2]:
class fastTextTransformer(opennmt.models.Transformer):
    """Transformer whose width follows the notebook-level dimension ``d``.

    The source and target inputters are supplied by the caller; every other
    hyper-parameter is fixed here.
    """

    def __init__(self, source_embeddings, target_embeddings):
        # `d` is read from the notebook namespace when the model is built.
        hyper_params = dict(
            num_layers=6,
            num_units=d,
            num_heads=8,
            ffn_inner_dim=4 * d,
            dropout=0.1,
            attention_dropout=0.1,
            ffn_dropout=0.1,
            # share_embeddings=opennmt.models.EmbeddingsSharingLevel.ALL,
        )
        super().__init__(
            source_inputter=source_embeddings,
            target_inputter=target_embeddings,
            **hyper_params
        )

# One argument-less WordEmbedder per side.
# NOTE(review): presumably the embedders pick up the pretrained embeddings
# from the data section of `config` - confirm against OpenNMT-tf.
source_inputter = opennmt.inputters.WordEmbedder()
target_inputter = opennmt.inputters.WordEmbedder()
model = fastTextTransformer(source_inputter, target_inputter)

NameError: name 'opennmt' is not defined

In [23]:
# Wrap the model and `config` in a Runner, then start training on a single
# device with evaluation disabled.
runner = opennmt.Runner(model, config, auto_config=True)
runner.train(num_devices=1, with_eval=False)

INFO:tensorflow:Using OpenNMT-tf version 2.17.1
INFO:tensorflow:Using model:
(model): fastTextTransformer(
  (examples_inputter): SequenceToSequenceInputter(
    (features_inputter): WordEmbedder()
    (labels_inputter): WordEmbedder()
    (inputters): ListWrapper(
      (0): WordEmbedder()
      (1): WordEmbedder()
    )
  )
  (encoder): SelfAttentionEncoder(
    (position_encoder): SinusoidalPositionEncoder(
      (reducer): SumReducer()
    )
    (layer_norm): LayerNorm()
    (layers): ListWrapper(
      (0): SelfAttentionEncoderLayer(
        (self_attention): TransformerLayerWrapper(
          (layer): MultiHeadAttention(
            (linear_queries): Dense(256)
            (linear_keys): Dense(256)
            (linear_values): Dense(256)
            (linear_output): Dense(256)
          )
          (input_layer_norm): LayerNorm()
        )
        (ffn): TransformerLayerWrapper(
          (layer): FeedForwardNetwork(
            (inner): Dense(1024)
            (outer): Dense(256

In [1]:
# Bare expression so the notebook displays the model object.
model

NameError: name 'model' is not defined