# This model is trained to translate in both directions (Swahili → Arabic and Arabic → Swahili) at the same time


# Installing Dependencies

In [None]:
!pip install wandb -qqq
!pip install simpletransformers -q
!pip install evaluate
!pip install sacrebleu
from google.colab import drive
drive.mount('/content/drive')
import logging
import pandas as pd
import torch
import os
import random
import math
import wandb
#For logging loss
wandb.login(key = "46cb1e2ffa78177f23adbe2d7d16cf09a6176348")
import numpy as np
# import evaluate
from simpletransformers.t5 import  T5Args
from simpletransformers.t5 import T5Model, T5Args
# 46cb1e2ffa78177f23adbe2d7d16cf09a6176348

In [2]:
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), exports ``PYTHONHASHSEED`` and configures
    cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting
    # it here presumably only affects child processes - confirm if hash order matters.
    os.environ['PYTHONHASHSEED'] = str(seed)

    torch.manual_seed(seed)
    # A single manual_seed_all call covers every CUDA device; the original repeated it
    # and additionally seeded the current device, which was redundant.
    torch.cuda.manual_seed_all(seed)

    # Trade speed for reproducibility: deterministic kernels, no autotuning, and cuDNN
    # switched off altogether (kept from the original configuration).
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False


set_seed(7)

# Data Preprocessing & Checking


In [3]:
# Extract the dataset archive. Per the recorded output below it holds eval.tsv, train.tsv
# and the parallel train/test files (*.sw, *.ar).
# NOTE(review): files are extracted relative to the working directory; the next cell reads
# them from /content, so this presumably runs with /content as the cwd - confirm.
!unzip /content/MT5_Dataset.zip

Archive:  /content/MT5_Dataset.zip
  inflating: eval.tsv                
  inflating: test.ar                 
  inflating: test.sw                 
  inflating: train.ar                
  inflating: train.sw                
  inflating: train.tsv               


In [5]:
def _read_lines(path):
    """Return the lines of a UTF-8 text file with their trailing newlines removed."""
    # Always decode as UTF-8: the original opened the Arabic files with the platform
    # default encoding, which corrupts or rejects Arabic text on non-UTF-8 locales.
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip("\n") for line in f]


def _build_bidirectional_frame(sw_text, arabic_text):
    """Turn aligned Swahili/Arabic sentences into one frame with both directions."""
    data = []
    # zip stops at the shorter list, exactly as the original did.
    for swahili, arabic in zip(sw_text, arabic_text):
        data.append(["translate swahili to arabic", swahili, arabic])
        data.append(["translate arabic to swahili", arabic, swahili])
    return pd.DataFrame(data, columns=["prefix", "input_text", "target_text"])


def prepare_translation_datasets(data_path):
    """Build the bidirectional training and evaluation frames.

    Reads the line-aligned parallel files ``train.sw``/``train.ar`` and
    ``test.sw``/``test.ar`` from ``data_path``. Each sentence pair yields two
    rows, one per translation direction, interleaved in file order.

    Args:
        data_path: Directory containing the four parallel text files.

    Returns:
        Tuple ``(train_df, eval_df)`` of DataFrames with the columns
        ``prefix``, ``input_text`` and ``target_text``.

    Raises:
        FileNotFoundError: If one of the expected files is missing.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    train_df = _build_bidirectional_frame(
        _read_lines(os.path.join(data_path, "train.sw")),
        _read_lines(os.path.join(data_path, "train.ar")),
    )
    eval_df = _build_bidirectional_frame(
        _read_lines(os.path.join(data_path, "test.sw")),
        _read_lines(os.path.join(data_path, "test.ar")),
    )

    return train_df, eval_df

# Build both splits from the extracted corpus, then persist each one as a tab-separated
# file without the index column.
train_df, eval_df = prepare_translation_datasets("/content")

for split_frame, tsv_path in ((train_df, "/content/train.tsv"), (eval_df, "/content/eval.tsv")):
    split_frame.to_csv(tsv_path, sep="\t", index=False)


In [None]:
# Sanity check: show the first 10 rows of the training frame.
train_df.head(10)

# Training The Model
Because of limited compute resources, training was not done in a single run: it was resumed from saved checkpoints.

In [None]:
# Show INFO-level logs overall, but only warnings from the "transformers" logger.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Reload the training split from disk, forcing every column to str.
train_df = pd.read_csv("/content/train.tsv", sep="\t").astype(str)

# Checkpoints are written to Google Drive.
model_output_dir = "/content/drive/MyDrive/mt5_training_checkpoints"

# All fine-tuning options in one place; each entry is applied as an attribute of T5Args.
training_options = {
    # Sequence length, batch sizes and schedule.
    "max_seq_length": 100,
    "train_batch_size": 10,
    "eval_batch_size": 10,
    "num_train_epochs": 10,
    "scheduler": "cosine_schedule_with_warmup",
    "learning_rate": 0.0003,
    "optimizer": 'Adafactor',
    "fp16": False,
    "use_multiprocessing": False,
    # Evaluation during training is switched off.
    # NOTE(review): use_early_stopping is enabled while evaluate_during_training is
    # disabled; early stopping presumably needs evaluation to have any effect - confirm.
    "evaluate_during_training": False,
    "evaluate_during_training_steps": 10000,
    "use_early_stopping": True,
    # Checkpointing: one checkpoint per epoch, none by step count.
    "save_steps": -1,
    "save_eval_checkpoints": False,
    "save_model_every_epoch": True,
    "overwrite_output_dir": True,
    "output_dir": model_output_dir,
    "best_model_dir": "/content/drive/MyDrive/best_model",
    # Data handling.
    "no_cache": True,
    "reprocess_input_data": True,
    "do_lower_case": False,
    # Generation and experiment tracking.
    "num_return_sequences": 1,
    "wandb_project": "bidirectional_fine_tuning_with_mt5",
}

model_args = T5Args()
for option_name, option_value in training_options.items():
    setattr(model_args, option_name, option_value)

# Start from the pretrained google/mt5-base weights.
model = T5Model("mt5", "google/mt5-base", args=model_args)

# Because of limited resources, training was done in several sessions; the line below
# was used instead of the one above to resume from the last saved checkpoint.
# model = T5Model("mt5","/content/drive/MyDrive/ddd/checkpoint-29535-epoch-1", args=model_args)

# Train the model
model.train_model(train_df)



# Generating Predictions (Translations) & Calculating BLEU Scores for Both Directions

In [None]:
import os
import logging
import sacrebleu
import pandas as pd
from simpletransformers.t5 import T5Model, T5Args
import torch
torch.manual_seed(0)
import random
random.seed(0)

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)


def _direction_pairs(frame, prefix):
    """Return (source sentences, reference translations) for the rows with `prefix`."""
    rows = frame.loc[frame["prefix"] == prefix]
    return rows["input_text"].tolist(), rows["target_text"].tolist()


def _write_report(path, sources, references, predictions):
    """Write one src/real/pred triplet per example to `path` for human evaluation."""
    # Separator text kept byte-identical to the earlier reports.
    separator = " --------------     --------------  ----------------   -------------  \n "
    with open(path, "w", encoding="utf-8") as report:
        for source, reference, prediction in zip(sources, references, predictions):
            report.write("src: " + source + "\n")
            report.write("real: " + str(reference) + "\n")
            report.write("pred: " + str(prediction) + "\n")
            report.write(separator)


# Generation settings used for prediction (beam search with 5 beams).
model_args = T5Args()
model_args.max_length = 128
model_args.length_penalty = 1
model_args.num_beams = 5
model_args.eval_batch_size = 32

model_output_dir = "/content/drive/MyDrive/mt5_training_checkpoints"

# for the resumed checkpoints
# model_output_dir = "/content/drive/MyDrive/ddd/checkpoint-29535-epoch-1"

# Initialize the trained model from the checkpoint directory.
model = T5Model("mt5", model_output_dir, args=model_args)  # cuda_devices=[6]

eval_df = pd.read_csv("/content/eval.tsv", sep="\t").astype(str)

# Split the evaluation frame by direction once, instead of re-filtering it three times.
to_arabic, ar_truth_list = _direction_pairs(eval_df, "translate swahili to arabic")
to_swahili, swahili_truth_list = _direction_pairs(eval_df, "translate arabic to swahili")

# sacrebleu.corpus_bleu takes a list of reference streams, hence the extra nesting.
ar_truth = [ar_truth_list]
swahili_truth = [swahili_truth_list]

# The model input is "<prefix>: <sentence>".
sw2ar = "translate swahili to arabic: "
ar2sw = "translate arabic to swahili: "
to_arabic_ = [sw2ar + s for s in to_arabic]
to_swahili_ = [ar2sw + s for s in to_swahili]

# Swahili -> Arabic: predict, score, and save the translations.
arabic_preds = model.predict(to_arabic_)

sw_ar_bleu = sacrebleu.corpus_bleu(arabic_preds, ar_truth)
print("--------------------------")
print("swahili to Arabic: ", sw_ar_bleu.score)

# Fixed: the original f-string had no braces and printed the literal text
# "type(arabic_preds)" instead of the actual type.
print(f"the type of the prediction is {type(arabic_preds)}")

_write_report("Swahili_to_Arabic_REPORT_RESULTS.txt", to_arabic, ar_truth_list, arabic_preds)

# Arabic -> Swahili: predict, score, and save the translations.
swahili_preds = model.predict(to_swahili_)

ar_sw_bleu = sacrebleu.corpus_bleu(swahili_preds, swahili_truth)
print("Arabic to Swahili: ", ar_sw_bleu.score)

_write_report("Arabic_to_Swahili_REPORT_RESULTS.txt", to_swahili, swahili_truth_list, swahili_preds)

Generating outputs:   0%|          | 0/103 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/3281 [00:00<?, ?it/s]

--------------------------
swahili to Arabic:  11.666466786466126
the type of the prediction is type(arabic_preds)


Generating outputs:   0%|          | 0/103 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/3281 [00:00<?, ?it/s]

Arabic to Swahili:  20.147580018140495


In [None]:
# 3 epochs
# swahili to Arabic:  9.567610359033786
# Arabic to Swahili:  17.312042765944515

# 4 epochs
# swahili to Arabic:  10.62605757681651
# Arabic to Swahili:  18.327960441827823

# 5 epochs
# swahili to Arabic:  10.543739139463039
# Arabic to Swahili:  18.357842430345453

# 6 epochs
# swahili to Arabic:  11.666466786466126
# Arabic to Swahili:  20.147580018140495