# Prerequisites

In [18]:
# Language codes; the per-language dictionaries below are keyed by these.
langs = ["EN", "DE", "FR"]

# Every "/*address to ...*/" value is a placeholder to be replaced with a real path.

# Vocabulary files, passed to the models as `vocab_filename`.
vocab_addr = {
    "EN": "/*address to EN vocab file*/",
    "FR": "/*address to FR vocab file*/",
    "DE": "/*address to DE vocab file*/",
    "Multi": "/*address to multlilingual vocab file*/",
}

# Raw monolingual corpora fed to the data-generation scripts.
corpus_addr = {
    "EN": "/content/en.txt",
    "FR": "/*address to FR corpus file*/",
    "DE": "/*address to DE corpus file*/",
}

# Output locations of the word-by-word translation step; "Multi" receives the
# merged inputs.txt / outputs.txt written later in the notebook.
word_by_word_output_addr = {
    "EN": "./en-wbw",
    "FR": "/*address to FR word-by-word translation files*/",
    "DE": "/*address to DE word-by-word translation files*/",
    "Multi": "/*address to  multilingual word-by-word translation improved files*/",
}

# Output files of the denoising data-generation step.
denosing_output_addr = {
    "EN": "./en-d.txt",
    "FR": "/*address to FR denoising file*/",
    "DE": "/*address to DE denoising file*/",
}

# Model directories used as `--model_dir` by the training commands.
output_model_addr = {
    "EN": "./en-m",
    "FR": "/*address to FR denosing model output dir*/",
    "DE": "/*address to DE denosing model output dir*/",
    "Multi": "/*address to multlilingual model output dir*/",
}

# Output directories of the fine-tuned models, keyed by language pair.
fine_tune_addr = {
    "EN-FR": "/*address to fine-tuned on EN-FR model output dir*/",
}

# Generating Denoising Pre-Training Data

In [16]:
# Fetch the NLTK "punkt" tokenizer data.
import nltk

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [19]:
input_corpus = corpus_addr['EN']
denoising = denosing_output_addr['EN']
!python pretrain/peach/denoising/main.py pretrain/peach/denoising/config-en.json $input_corpus $denoising

In [None]:
input_corpus = corpus_addr['FR']
denoising = denosing_output_addr['FR']
!python pretrain/peach/denoising/main.py pretrain/peach/denoising/config-en.json $input_corpus $denoising

In [None]:
input_corpus = corpus_addr['DE']
denoising = denosing_output_addr['DE']
!python pretrain/peach/denoising/main.py pretrain/peach/denoising/config-en.json $input_corpus $denoising

# Pre-Training Denoising models

In [None]:
!pip install -r PEACH/models/requirements.txt

In [None]:
!pip install -e PEACH/models/

**Note:** First, fill in the address fields in the dataset files according to the provided guidelines, then execute the following cells.

In [20]:
!cp models/peach/datasets/denoising/*.py /usr/local/lib/python3.7/dist-packages/tensorflow_datasets/translate

In [None]:
!cat models/peach/datasets/denoising/imports.txt >> /usr/local/lib/python3.7/dist-packages/tensorflow_datasets/translate/__init__.py

In [None]:
vocab = vocab_addr['EN']
output_dir = output_model_addr['EN']
!python3 models/peach/bin/train.py --params="en-denosing" \
--param_overrides=vocab_filename=$vocab \
--model_dir=$output_dir \

In [None]:
vocab = vocab_addr['FR']
output_dir = output_model_addr['FR']
!python3 models/peach/bin/train.py --params="fr-denosing" \
--param_overrides=vocab_filename=$vocab \
--model_dir=$output_dir \

In [None]:
vocab = vocab_addr['DE']
output_dir = output_model_addr['DE']
!python3 models/peach/bin/train.py --params="de-denosing" \
--param_overrides=vocab_filename=$vocab \
--model_dir=$output_dir \

# Generating Word-By-Word Translation Data

In [40]:
pip install -r PEACH/pretrain/requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Morfessor
  Downloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Installing collected packages: Morfessor
Successfully installed Morfessor-2.0.6


In [None]:
!bash PEACH/pretrain/peach/translation/requirements.sh

In [50]:
input_corpus = corpus_addr['EN']
output_w_by_w = word_by_word_output_addr['EN']
!python pretrain/peach/translation/main.py pretrain/peach/translation/config-en.json $input_corpus $output_w_by_w

In [None]:
input_corpus = corpus_addr['FR']
output_w_by_w = word_by_word_output_addr['FR']
!python pretrain/peach/translation/main.py pretrain/peach/translation/config-fr.json $input_corpus $output_w_by_w

In [None]:
input_corpus = corpus_addr['DE']
output_w_by_w = word_by_word_output_addr['DE']
!python pretrain/peach/translation/main.py pretrain/peach/translation/config-de.json $input_corpus $output_w_by_w

# Improving the Quality of Word-By-Word Translation Data Using Pre-Trained Denoising Models

**Note:** First, fill in the address fields in the dataset files according to the provided guidelines, then execute the following cells.

In [None]:
!cp models/peach/datasets/improve/*.py /usr/local/lib/python3.7/dist-packages/tensorflow_datasets/translate

In [None]:
!cat models/peach/datasets/improve/imports.txt >> /usr/local/lib/python3.7/dist-packages/tensorflow_datasets/translate/__init__.py

In [None]:
import os

In [None]:
vocab = vocab_addr['EN']
output_dir = os.path.join(output_model_addr['EN'],"model.ckpt-500000")
number_of_preds = 1000000
!python3 models/peach/bin/predict.py --params="en-improve" \
--param_overrides=vocab_filename=$vocab \
--model_dir=$output_dir \
--evaluate_test="True" \
--total_predictions=$number_of_preds \

In [None]:
vocab = vocab_addr['FR']
output_dir = os.path.join(output_model_addr['FR'],"model.ckpt-500000")
number_of_preds = 1000000
!python3 models/peach/bin/predict.py --params="fr-improve" \
--param_overrides=vocab_filename=$vocab \
--model_dir=$output_dir/model.ckpt-500000 \
--evaluate_test="True" \
--total_predictions=$number_of_preds \

In [None]:
vocab = vocab_addr['DE']
output_dir = os.path.join(output_model_addr['DE'],"model.ckpt-500000")
number_of_preds = 1000000
!python3 models/peach/bin/predict.py --params="de-improve" \
--param_overrides=vocab_filename=$vocab \
--model_dir=$output_dir/model.ckpt-500000 \
--evaluate_test="True" \
--total_predictions=$number_of_preds \

In [None]:
import glob
import os

# "⁇ xx ⁇ yy ⁇" sequences found in the target lines and the "<xx><yy>" tag each
# one is rewritten to. The replacements are applied in exactly this order.
_LANG_TAG_REPAIRS = (
    ("⁇ de ⁇ fr ⁇", "<de><fr>"),
    ("⁇ en ⁇ mk ⁇", "<en><mk>"),
    ("⁇ mk ⁇ en ⁇", "<mk><en>"),
    ("⁇ de ⁇ en ⁇", "<de><en>"),
    ("⁇ en ⁇ de ⁇", "<en><de>"),
    ("⁇ en ⁇ fr ⁇", "<en><fr>"),
    ("⁇ fr ⁇ de ⁇", "<fr><de>"),
    ("⁇ fr ⁇ en ⁇", "<fr><en>"),
)


def _sorted_glob(directory, pattern):
    """Return the paths in `directory` matching `pattern`, sorted by name."""
    return sorted(glob.glob(os.path.join(directory, pattern)))


def _clean_prediction(line):
    """Drop the "⁇ n ⁇" marker, trim whitespace, then remove any leftover "⁇"."""
    return line.replace("⁇ n ⁇", "").strip().replace("⁇", "")


def _clean_target(line):
    """Drop "⁇ n ⁇", rewrite the language-pair sequences to tags, trim, then remove leftover "⁇"."""
    line = line.replace("⁇ n ⁇", "")
    for residue, tag in _LANG_TAG_REPAIRS:
        line = line.replace(residue, tag)
    return line.strip().replace("⁇", "")


# Collect the prediction / target / input files of every language, in the
# order EN, FR, DE; files are sorted by name within a language.
pred_files, targ_files, inp_files = [], [], []
for lang in ("EN", "FR", "DE"):
    model_dir = output_model_addr[lang]
    pred_files += _sorted_glob(model_dir, "predictions*.txt")
    targ_files += _sorted_glob(model_dir, "targets*.txt")
    inp_files += _sorted_glob(model_dir, "inputs*.txt")

preds = []
targets = []
# inp_files stays in the zip so the file pairing is truncated to the shortest
# list exactly as before; the input files themselves are never read, so they
# are no longer opened.
for pred_file, tar_file, _input_file in zip(pred_files, targ_files, inp_files):
    # The files contain non-ASCII characters ("⁇"), so the encoding is explicit.
    with open(pred_file, encoding="utf-8") as pred_fh, open(tar_file, encoding="utf-8") as targ_fh:
        for line_no, (pred, target) in enumerate(zip(pred_fh, targ_fh), start=1):
            # Only every second line (2nd, 4th, ...) is kept.
            # NOTE(review): presumably the other lines are separators written by predict.py — confirm.
            if line_no % 2 == 0:
                preds.append(_clean_prediction(pred))
                targets.append(_clean_target(target))

# Write the cleaned targets to inputs.txt and the cleaned predictions to
# outputs.txt, one line per example, creating the output directory if needed.
multi_dir = word_by_word_output_addr['Multi']
os.makedirs(multi_dir, exist_ok=True)
with open(os.path.join(multi_dir, "inputs.txt"), "w", encoding="utf-8") as inp, \
        open(os.path.join(multi_dir, "outputs.txt"), "w", encoding="utf-8") as out:
    for target, pred in zip(targets, preds):
        inp.write(target + "\n")
        out.write(pred + "\n")

# Pre-Training PEACH

**Note:** First, fill in the address fields in the dataset files according to the provided guidelines, then execute the following cells.

In [None]:
!cp models/peach/datasets/SPDG/*.py /usr/local/lib/python3.7/dist-packages/tensorflow_datasets/translate

In [None]:
!cat models/peach/datasets/SPDG/imports.txt >> /usr/local/lib/python3.7/dist-packages/tensorflow_datasets/translate/__init__.py

In [None]:
vocab = vocab_addr['Multi']
output_dir = output_model_addr['Multi']
!python3 models/peach/bin/train.py --params="multi-SPDG" \
--param_overrides=vocab_filename=$vocab \
--model_dir=$output_dir \

# Fine-Tuning PEACH on Translation

**Note:** First, fill in the address fields in the dataset files according to the provided guidelines, then execute the following cells.

In [None]:
!cp models/peach/datasets/downstream/*.py /usr/local/lib/python3.7/dist-packages/tensorflow_datasets/translate

In [None]:
!cat models/peach/datasets/downstream/imports.txt >> /usr/local/lib/python3.7/dist-packages/tensorflow_datasets/translate/__init__.py

In [None]:
vocab = vocab_addr['Multi']
output_dir = fine_tune_addr['EN-FR']
base_model = output_dir = os.path.join(output_model_addr['Multi'],"model.ckpt-500000")
!python3 models/peach/bin/train.py --params="en-fr" \
--param_overrides=vocab_filename=$vocab \
--model_dir=$output_dir \
--train_init_checkpoint=$base_model

In [None]:
vocab = vocab_addr['Multi']
output_dir = os.path.join(fine_tune_addr['EN-FR'], "model.ckpt-50000")
!python3 models/peach/bin/evaluate.py --params="en-fr" \
--param_overrides=vocab_filename=$vocab \
--model_dir=$output_dir \
--evaluate_test="True"