# Exploring the limits of Mix-training for NMT task on low resource languages.

Authors:  
- [Anam ur Rehman](mailto:inshaa307@gmail.com)
- [Mohammed El Dor](mailto:mohammed.eldor@studenti.polito.it)
- [Mohamad Mostafa](mailto:mohamad.mostafa@studenti.polito.it)


# 1. Preliminaries
This section is common to all the upcoming training stages and provides the basis for them.  
Here, we import the common set of libraries, initialize the tokenizer, and introduce the utility scripts used while fine-tuning the models.  
The tokenizer is shared among all stages of fine-tuning.


## Imports

In [7]:
# %%capture

# ! pip install transformers==4.12.5
# ! pip install sentencepiece==0.1.96
# ! pip install sacrebleu==2.0.0
# ! pip install datasets==1.16.1

# ! apt-get install sudo -y
# ! sudo apt-get install git-lfs
# ! git lfs install
# ! git clone "https://github.com/DeskDown/NMT.git"

In [9]:
import numpy as np
import pandas as pd
import os, time
from tqdm.auto import tqdm
from datasets import Dataset, load_dataset, load_metric
from transformers import (
    MarianTokenizer,
    MBart50TokenizerFast,
    AutoModelForSeq2SeqLM, 
    DataCollatorForSeq2Seq, 
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer,
    utils
                    )

# Raise the transformers log threshold to 50 (CRITICAL on Python's logging scale)
# so that only the most severe messages are shown.
utils.logging.set_verbosity(50)
# Environment switch read by the HF `tokenizers` backend; "true" opts in to parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [None]:
# Random seed shared by the notebook (used when shuffling the mix-training data).
SEED = 99
# Hugging Face access token used when pushing models to the hub.
# Taken from the HF_TOKEN environment variable when it is set, so the secret
# does not have to be pasted into (and saved with) the notebook; otherwise the
# placeholder below must be replaced by hand.
TOKEN = os.environ.get("HF_TOKEN", "<ENTER YOUR HF TOKEN HERE>")

## Tokenizer

Requirements:
1. Provide the name for helper language ('ja' or 'zh') in the next section of code.  

Method:
1. A pretrained tokenizer is loaded from HF model hub based on the selected helper language.
2. Tokens from ALT dataset languages are added in the vocabulary.
3. Tokens were generated using utility script `./make_tokens.py`

In [None]:
# Helper language whose pretrained Marian checkpoint is fine-tuned: 'zh' or 'ja'.
helper = 'zh'

# Pretrained checkpoint on the HF hub for each supported helper language.
model_name = dict(
    zh="Helsinki-NLP/opus-mt-en-zh",
    ja="Helsinki-NLP/opus-tatoeba-en-ja",
)

utils.logging.set_verbosity(50)
# languages_subset = ['hi']
# all_ds = 'DeskDown/ALTDataset'
# alt_ds = load_dataset(all_ds)
# sub_ds = alt_ds.filter(lambda example: example['lang_yy'] in languages_subset)


def init_tokenizer(model_name):
    """Load a pretrained Marian tokenizer and extend it with the ALT tokens.

    Args:
        model_name: HF hub id (or local path) handed to
            ``MarianTokenizer.from_pretrained``.

    Returns:
        The loaded ``MarianTokenizer`` after every entry of ``NMT/tokens.txt``
        has been added with ``add_tokens(..., special_tokens=True)``.

    Raises:
        FileNotFoundError: if ``NMT/tokens.txt`` does not exist, i.e. the
            GitHub repository has not been cloned into the working directory.
    """
    utils.logging.set_verbosity(50)

    # Check for the token list first so a missing clone fails before any
    # tokenizer download. An explicit raise is used because `assert` is
    # stripped when Python runs with -O.
    tokens_file = "NMT/tokens.txt"
    if not os.path.isfile(tokens_file):
        raise FileNotFoundError(
            f"{tokens_file} not found. Clone the github repository: "
            "git clone https://github.com/DeskDown/NMT.git"
        )

    # Load the fresh tokenizer from HF hub
    tokenizer = MarianTokenizer.from_pretrained(model_name)

    # Add tokens to enable tokenization of languages not originally supported
    # by the Marian tokenizer. The file holds multilingual text, so decode it
    # as UTF-8 explicitly instead of relying on the platform default.
    with open(tokens_file, encoding="utf-8") as fp:
        # NOTE(review): a trailing newline yields an empty-string entry; the
        # split is kept unchanged so the resulting token list (and vocabulary
        # size) stays the same -- confirm whether it should be filtered.
        tokens = fp.read().split("\n")

    tokenizer.add_tokens(tokens, special_tokens=True)
    return tokenizer

marian_tokenizer = init_tokenizer(model_name[helper])

## Preprocessing and compute metrics

Requirements:
Provide values for these parameters in next section  
1. max_input_length
2. max_target_length
3. batch_size



In [None]:
# Fixed token length of the source side; preprocess_function truncates/pads to it.
max_input_length = 128
# Fixed token length of the target (label) side; preprocess_function truncates/pads to it.
max_target_length = 128
# Base batch size; the dataset.map calls below use it (or a multiple of it).
batch_size = 64

# Columns exposed to the trainer when the datasets are switched to torch format.
columns_to_transform = ['input_ids', 'labels', 'attention_mask']
# sacreBLEU metric object used by compute_metrics.
metric = load_metric("sacrebleu")
# Hub id of the pretrained checkpoint for the selected helper language.
marian_model_name = model_name[helper]


def preprocess_function(examples):
    """Tokenize a batch of sentence pairs into fixed-length model features.

    Args:
        examples: batch mapping with a "Sent_en" column (source sentences)
            and a "Sent_yy" column (target sentences), as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the sources, extended with a "labels" entry
        holding the target token ids. Sources are padded/truncated to
        ``max_input_length`` and targets to ``max_target_length``; padding
        positions in "labels" are set to -100.
    """
    utils.logging.set_verbosity(50)
    tokenizer = marian_tokenizer

    sources = list(examples["Sent_en"])
    targets = list(examples["Sent_yy"])

    model_inputs = tokenizer(
        sources, max_length=max_input_length, truncation=True, padding="max_length"
    )
    # NOTE(review): targets are encoded with the same tokenizer call as the
    # sources; confirm whether the tokenizer's target-side mode is wanted here.
    target_enc = tokenizer(
        targets, max_length=max_target_length, truncation=True, padding="max_length"
    )

    # Labels are padded to a fixed length here, so the data collator has no
    # padding left to do. Mark the padded positions with -100 ourselves so
    # they are ignored by the loss (compute_metrics maps -100 back to the pad
    # id before decoding).
    model_inputs["labels"] = [
        [tok if keep else -100 for tok, keep in zip(ids, mask)]
        for ids, mask in zip(target_enc["input_ids"], target_enc["attention_mask"])
    ]
    return model_inputs


def compute_metrics(eval_preds, tokenizer = None):
    """Compute BLEU and mean generated length for one evaluation pass.

    Args:
        eval_preds: pair ``(predictions, labels)`` of token-id arrays; when
            ``predictions`` is a tuple only its first element is used.
        tokenizer: tokenizer used for decoding; defaults to the notebook-wide
            ``marian_tokenizer``.

    Returns:
        dict with "bleu" (sacreBLEU score) and "gen_len" (mean number of
        non-padding tokens per prediction), both rounded to 4 decimals.
    """
    if tokenizer is None:
        tokenizer = marian_tokenizer

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding marker, not a vocabulary id, so it cannot be decoded.
    # It is expected in the labels and may also appear in the predictions;
    # map it back to the pad id in both.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    pred_texts = tokenizer.batch_decode(preds, skip_special_tokens=True)
    label_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing; sacreBLEU takes a list of references per prediction.
    pred_texts = [text.strip() for text in pred_texts]
    references = [[text.strip()] for text in label_texts]

    bleu = metric.compute(predictions=pred_texts, references=references)

    # Length of each prediction, not counting padding.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]

    result = {"bleu": bleu["score"], "gen_len": np.mean(prediction_lens)}
    return {k: round(v, 4) for k, v in result.items()}

# 2. Pure-Finetuning the Pretrained Model

In this section we load the pre-trained model from huggingface hub and perform pure-finetuning using one single language subset from ALT dataset. 

## Dataset

Requirements:
1. Provide `language_code` of ALT dataset language from set {'fil', 'hi', 'id', 'ja', 'khm', 'ms', 'my', 'th', 'vi'}

Method:
1. We load the custom ALT dataset from HF hub `DeskDown/ALTDataset` generated using utility script `make_dataset.py` and later uploaded to HF hub
2. The desired language is filtered from the ALT dataset
3. Encodings are generated using the Marian tokenizer and we resize the embedding matrix of the translation model to match the extended vocabulary


In [None]:
# Hub id of the custom ALT dataset.
all_ds = 'DeskDown/ALTDataset'
# Loaded without a split argument; the cells below index it by "train" / "eval" / "test".
alt_ds = load_dataset(all_ds)

In [None]:
language_code = 'hi'


def _is_selected_language(example):
    """Keep only the rows whose target language is `language_code`."""
    return example['lang_yy'] == language_code


# Data: filter to the selected language, tokenize, and expose torch tensors.
sub_ds = (
    alt_ds
    .filter(_is_selected_language)
    .map(preprocess_function, batched=True, batch_size=batch_size*3)
)
sub_ds.set_format(type='torch', columns=columns_to_transform)

# Model: pretrained checkpoint with embeddings resized to the extended vocabulary.
model = AutoModelForSeq2SeqLM.from_pretrained(marian_model_name)
model.resize_token_embeddings(len(marian_tokenizer))

## Training

Requirements:  
1. Provide Hyper parameters for finetuning process.

In [None]:
data_collator = DataCollatorForSeq2Seq(marian_tokenizer, model=model)
source_lang = f"en-{helper}"
target_lang = language_code
# Dedicated name for the output directory. Rebinding `model_name` here would
# replace the helper -> checkpoint dict that later cells index with `helper`.
pft_model_name = f"marianPFT_{source_lang}-{target_lang}"

args = Seq2SeqTrainingArguments(
    # Misc
    output_dir = pft_model_name,
    seed = SEED,  # single notebook-wide seed instead of a repeated literal
    evaluation_strategy = "epoch",
    log_level = 'warning',
    disable_tqdm = False,
    
    # Hyper parameters
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    warmup_steps = 10,
    num_train_epochs=15,
    predict_with_generate=True,  # metrics are computed on generated sequences
    remove_unused_columns = True,
    fp16=True,

    # Model backup
    save_total_limit=3,
    save_strategy = "epoch",
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=sub_ds["train"],
    eval_dataset=sub_ds["eval"],
    data_collator=data_collator,
    tokenizer=marian_tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run pure fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Evaluate on the held-out test split (the split key is "test", without a trailing space).
trainer.predict(sub_ds["test"])

# 3. Mix-Finetuning

In this section we load the pre-trained model from huggingface hub and perform Mix-finetuning using a subset of languages from ALT dataset and Helper language dataset which was used during pre-training of HF model.

## Dataset

Requirements:
1. Provide `yy_mix_subset`, a list of ALT dataset languages from the set {'fil', 'hi', 'id', 'ja', 'khm', 'ms', 'my', 'th', 'vi'}.  
    This subset is used alongside the helper dataset to further train the model.
2. Provide `helper_size`, the number of helper-corpus sentences used for training. We used the minimum number of sentences of the ALT dataset as `helper_size`.


Method:
1. We load the custom ALT dataset from HF hub `DeskDown/ALTDataset` generated using utility script `make_dataset.py` and later uploaded to HF hub
2. The desired languages are filtered from the ALT dataset
3. We load the helper dataset from the HF hub and concatenate these datasets.
4. Encodings are generated using the Marian tokenizer and we resize the embedding matrix of the translation model to match the extended vocabulary

In [None]:
from datasets import concatenate_datasets

# ALT target languages whose sentence pairs are mixed into the training set.
yy_mix_subset = ['fil', 'hi', 'id', 'ja', 'khm', 'ms', 'my', 'th', 'vi']
# Number of helper-corpus sentence pairs taken for mix training.
helper_size = 18_000


def make_mix_training_dataset(
    ALT_ds,
    helper,
    helper_size = 20_000,
    yy_mix_subset = ("vi",)
    ):
    """Build the shuffled mix-training set from ALT data plus helper data.

    Args:
        ALT_ds: ALT dataset with a "train" split containing the columns
            "SID", "lang_yy", "Sent_en" and "Sent_yy".
        helper: helper language code; the ``en-{helper}`` configuration of
            opus100 is loaded.
        helper_size: number of helper sentence pairs, taken from the start
            of the opus100 train split.
        yy_mix_subset: ALT language codes to keep.

    Returns:
        The ALT subset and the helper subset concatenated into one dataset
        with "Sent_en"/"Sent_yy" columns, shuffled with ``SEED``.
    """
    pad = "*"*4

    def select_subset(example):
        # Keep only rows whose target language was requested.
        return example["lang_yy"] in yy_mix_subset

    def pre_process_helper(example):
        # Flatten the opus100 "translation" dict into the ALT column layout.
        example["Sent_en"] = example["translation"]["en"]
        example["Sent_yy"] = example["translation"][helper]

        return example

    helper_ds = load_dataset("opus100", f'en-{helper}', split = "train")
    alt_train_ds = ALT_ds["train"].filter(select_subset).remove_columns(["SID", "lang_yy"])
    helper_train_ds = helper_ds.select(range(helper_size)).map(pre_process_helper).remove_columns("translation")

    print(f"{pad}Alt dataset used for mix training{pad}\nTotal Languages: {len(yy_mix_subset)}\nSentences: {len(alt_train_ds)}")
    # Report the helper pair actually loaded and its sentence count (not the dataset repr).
    print(f"{pad}Opus dataset used for mix training{pad}\nLanguages: en-{helper}\nSentences: {len(helper_train_ds)}")

    mix_ds = concatenate_datasets([alt_train_ds, helper_train_ds])
    print(f"{pad}Final dataset for Mix Training{pad}\nLanguages: {1+len(yy_mix_subset)}\nSentences: {len(mix_ds)}")
    
    return mix_ds.shuffle(seed = SEED)


mix_ds = make_mix_training_dataset(alt_ds, helper, helper_size = helper_size, yy_mix_subset = yy_mix_subset)

In [None]:
# Tokenize the mixed training set and expose torch tensors to the trainer.
mix_ds = mix_ds.map(preprocess_function, batched=True, batch_size=batch_size*3)
mix_ds.set_format(type='torch', columns=columns_to_transform)

# Evaluation uses only the ALT "eval" split, restricted to the mixed languages.
mix_eval_ds = alt_ds["eval"].filter(lambda x: x["lang_yy"] in yy_mix_subset)
mix_eval_ds = mix_eval_ds.map(preprocess_function, batched=True, batch_size=batch_size)
mix_eval_ds.set_format(type='torch', columns=columns_to_transform)

# Start again from the pretrained checkpoint. `marian_model_name` is used
# because `model_name` may already have been rebound to an output-directory
# string by the pure fine-tuning section, which makes `model_name[helper]` fail.
model = AutoModelForSeq2SeqLM.from_pretrained(marian_model_name)
model.resize_token_embeddings(len(marian_tokenizer))

## Training

Requirements:  
1. Provide Hyper parameters for finetuning process.
2. If you want to push the model to the HF hub, make sure you have provided the correct `TOKEN`.


In [None]:
batch_size = 32
data_collator = DataCollatorForSeq2Seq(marian_tokenizer, model=model)
# NOTE: from here on `model_name` is the output directory / hub repository
# name (a string), not the helper -> checkpoint dict defined earlier.
model_name = f"MarianMix_en-{helper}-10"

args = Seq2SeqTrainingArguments(
    # Misc
    output_dir = model_name,
    seed = SEED,  # single notebook-wide seed instead of a repeated literal
    evaluation_strategy = "steps",
    eval_steps = 10_000,
    log_level = 'warning',
    disable_tqdm = False,
    
    # Hyper parameters
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    warmup_steps = 10,
    num_train_epochs=5,
    predict_with_generate=True,  # metrics are computed on generated sequences
    remove_unused_columns = True,
    fp16=True,

    # Model backup
    save_total_limit=3,
    save_strategy = "steps", #"epoch",
    save_steps = 10_000,
    push_to_hub = True,
    hub_token = TOKEN,
    hub_model_id = model_name,
    hub_strategy = "checkpoint"
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=mix_ds,
    eval_dataset=mix_eval_ds,
    data_collator=data_collator,
    tokenizer=marian_tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run mix fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Upload the mix-trained model. The commit message is built from the actual
# configuration so it stays correct when `yy_mix_subset` or `helper_size` change.
model.push_to_hub(repo_path_or_name = model_name,
                  use_auth_token = TOKEN, 
                  commit_message = (
                      f"Mix Trained marian model over {len(yy_mix_subset)} low resource languages "
                      f"of ALT dataset + {helper_size:,} sentences from en-{helper} opus dataset."
                  ))


# 4. Pure-Finetuning the Mix-trained model

In this section we load the Mix-trained model from our huggingface hub and perform Pure-finetuning using one language from ALT dataset.

## Dataset

Requirements:
1. Provide `language_code` of ALT dataset language from set {'fil', 'hi', 'id', 'ja', 'khm', 'ms', 'my', 'th', 'vi'}  

Method:
1. We load the custom ALT dataset from HF hub `DeskDown/ALTDataset` generated using utility script `make_dataset.py` and later uploaded to HF hub
2. The desired language is filtered from the ALT dataset
3. Encodings are generated using the Marian tokenizer and we resize the embedding matrix of the translation model to match the extended vocabulary
4. We use the Mix-trained model `DeskDown/MarianMix_en-[ja|zh]-10` to be further fine-tuned on selected language


In [None]:
def init_mix_model(size, name):
    """Load the seq2seq checkpoint `name` and resize its token embeddings to `size`."""
    mix_model = AutoModelForSeq2SeqLM.from_pretrained(name)
    # Resizing happens in place on the loaded model.
    mix_model.resize_token_embeddings(size)
    return mix_model

alt_ds = load_dataset('DeskDown/ALTDataset')
# `model_name` has been rebound to an output-directory string by the earlier
# training cells, so the pretrained checkpoint id is taken from `marian_model_name`.
tokenizer = init_tokenizer(marian_model_name)
# Load the mix-trained checkpoint that matches the selected helper language,
# so the model and the tokenizer come from the same en-{helper} pair.
model = init_mix_model(len(tokenizer), name = f"DeskDown/MarianMix_en-{helper}-10")

In [None]:
language_code = 'th'


# Mix-trained checkpoint for the selected helper language; following `helper`
# keeps it consistent with the tokenizer, which is also chosen by `helper`.
mix_trainded_model = f"DeskDown/MarianMix_en-{helper}-10"
fil = lambda x: x["lang_yy"] == language_code
model = init_mix_model(len(tokenizer), mix_trainded_model)
# Keep only the selected language, tokenize, and expose torch tensors.
sub_ds = alt_ds.filter(fil)
sub_ds = sub_ds.map(preprocess_function, batch_size = batch_size*3, batched=True)
sub_ds.set_format(type='torch', columns=columns_to_transform)

## Training

Requirements:  
1. Provide Hyper parameters for finetuning process.
2. If you want to push the model to the HF hub, make sure you have provided the correct `TOKEN`.


In [None]:
batch_size = 32
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
source_lang = "en"
target_lang = language_code
# Output directory name; the final push-to-hub cell uses the same value.
model_name = f"MarianMixFT_en-{language_code}"

args = Seq2SeqTrainingArguments(
    # Misc
    output_dir = model_name, #_{source_lang}_to_{target_lang}",
    seed = SEED,  # single notebook-wide seed instead of a repeated literal
    evaluation_strategy = "epoch",
    log_level = 'warning',
    disable_tqdm = False,
    
    # Hyper parameters
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    warmup_steps = 10,
    num_train_epochs=10,
    predict_with_generate=True,  # metrics are computed on generated sequences
    remove_unused_columns = True,
    fp16=True,

    # Model backup
    save_total_limit=3,
    save_strategy = "epoch",
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=sub_ds["train"],
    eval_dataset=sub_ds["eval"],
    data_collator=data_collator,
    tokenizer= tokenizer,
    compute_metrics= compute_metrics
)

In [None]:
# Fine-tune the mix-trained model on the selected language.
trainer.train()

In [None]:
# Run prediction on the test split of the selected language.
trainer.predict(sub_ds["test"])

In [None]:
# Hub repository name for the fine-tuned model (same value as the training output_dir).
model_name = f"MarianMixFT_en-{language_code}"

# Upload the model, authenticating with TOKEN.
model.push_to_hub(
    model_name,
    use_temp_dir= True,
    use_auth_token=TOKEN
    )