# Using Transformers for Language Translation (English to French)
**Introduction:**
Based on knowledge gained from analyzing the original implementation of the transformer in ["Attention Is All You Need"](https://arxiv.org/pdf/1706.03762.pdf) (Google, 2017), that work can be replicated -- and even trained using ROSIE -- in order to break down the individual components and their relationships within the overall structure of the Transformer.<br/>
The transformer architecture has been a revolutionary contribution to the world of sequence processing. Originally introduced as a seq2seq model, it has since been adapted by multiple implementations -- including modern BERT and GPT -- that use variations of this architecture to process input sequences in parallel, accept large input sequences limited only by onboard memory, and still provide generalized insights during the decoding process of the embedding space due to the introduction of self-attention.

**Walkthrough by Hugging Face:** https://huggingface.co/docs/transformers/tasks/translation <br/>
**Additional Documentation Added By:** Ben Paulson & John Cisler

# Part 1: Data Importing

**Ensure all Dependencies Installed**

In [None]:
# !pip install transformers datasets evaluate sacrebleu

**Import All Required Packages**

In [None]:
from huggingface_hub import notebook_login

# Authenticate this notebook session with the Hugging Face Hub.
notebook_login()

In [None]:
import argparse
import os
from datetime import datetime

import evaluate
import numpy as np
import tensorflow as tf
from datasets import load_dataset
from transformers import AdamWeightDecay
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import TFAutoModelForSeq2SeqLM
from transformers import pipeline
from transformers import AutoTokenizer
from transformers.keras_callbacks import KerasMetricCallback


**Import Data**<br/>
Data includes input tensors, as well as the tokenizer used to create the tensors from the loaded corpus

In [None]:
# Load the "en-fr" (English-French) configuration of the "opus_books" dataset.
books = load_dataset("opus_books", "en-fr")

In [None]:
# Re-split the original "train" split, holding out 20% of it as a test set.
books = books["train"].train_test_split(test_size=0.2)

In [None]:
# Pretrained checkpoint name; reused below for the tokenizer, the data
# collator and the model itself.
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Keys of each translation pair, plus the task prefix prepended to every
# source sentence before tokenization.
source_lang = "en"
target_lang = "fr"
prefix = "translate English to French: "


def preprocess_function(examples):
    """
    Tokenize a batch of translation pairs.
    :param dict examples: Batch whose "translation" entry is a sequence of
        mappings holding a sentence under both language keys
    :return: Tokenizer output for the prefixed source sentences, with the
        target sentences passed as ``text_target``; both are truncated to at
        most 128 tokens
    """
    inputs = []
    targets = []
    for pair in examples["translation"]:
        inputs.append(prefix + pair[source_lang])
        targets.append(pair[target_lang])

    return tokenizer(inputs, text_target=targets, max_length=128, truncation=True)

In [None]:
# Tokenize the dataset by applying preprocess_function in batched mode.
tokenized_books = books.map(preprocess_function, batched=True)

In [None]:
# Collator used to assemble batches as TensorFlow tensors.
# NOTE(review): `model` receives the checkpoint name (a string), not a model
# object -- confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")

**Batch File Run Arguments**

In [None]:
# Potentially specified by the command line (default values)
EXP_FOLDER = None       # Folder receiving checkpoints/plots; None disables saving
num_epochs = 5          # Number of training epochs
save_iterations = 10    # Save weights and a loss plot every N epochs

# Don't Touch -- Only for Batch Job
IS_PYTHON = False

if IS_PYTHON:
    # Only parse the command line when running as a batch script; inside a
    # notebook sys.argv carries the kernel's own arguments.
    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name', type=str, default='Experiment') # EXP_FOLDER
    parser.add_argument('--epochs', type=int, default=num_epochs) # num_epochs
    parser.add_argument('--save_iterations', type=int, default=save_iterations) # save_iterations

    args = parser.parse_args()
    if args.save_iterations < 1:
        # Used later as a modulus, so zero or a negative value must be rejected.
        parser.error('--save_iterations must be a positive integer')

    experiment_name = args.exp_name
    num_epochs = args.epochs
    save_iterations = args.save_iterations

    # Create a timestamped folder to hold sbatch runtime data
    now = datetime.now().strftime("%Y_%m_%d-%I_%M_%S_%p")
    EXP_FOLDER = experiment_name + ' - ' + str(now)
    # makedirs also handles nested experiment names and an existing folder
    os.makedirs(EXP_FOLDER, exist_ok=True)

print("\n")
print("RUN TIME ARGUMENTS: ")
print("EXP FOLDER: ", EXP_FOLDER)
print("NUM EPOCHS: ", num_epochs)
print("SAVE ITERATIONS: ", save_iterations)
print("\n")

# Part 2: Metrics During Training (Evaluation Callback)
Outline the evaluated metrics that will be output during the training process in order to visualize the accuracy of the language-translation model without requiring inference by a human -- will produce visuals required for associated presentation in Intro to Artificial Intelligence class at MSOE.

**BLEU Score**<br/>
BLEU score will be used to best match the metrics output by the "Attention Is All You Need" paper.

In [None]:
# Load the "sacrebleu" metric; compute_metrics below reads its "score" field.
metric = evaluate.load("sacrebleu")

In [None]:
def postprocess_text(preds, labels):
    """
    Strip surrounding whitespace from predictions and references.
    :param list preds: Decoded prediction strings
    :param list labels: Decoded reference strings
    :return: Tuple of (stripped predictions, stripped references), where each
        reference is wrapped in its own single-element list
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels

In [None]:
def compute_metrics(eval_preds):
    """
    Compute the BLEU score and mean generated length for an evaluation pass.
    :param tuple eval_preds: Pair of (predictions, labels) token-id arrays;
        if predictions is itself a tuple, only its first element is used
    :return: Dictionary with "bleu" and "gen_len", each rounded to 4 decimals
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    # Label entries equal to -100 are swapped for the pad token id so that
    # the labels can be decoded.
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds, decoded_labels = postprocess_text(
        tokenizer.batch_decode(preds, skip_special_tokens=True),
        tokenizer.batch_decode(labels, skip_special_tokens=True),
    )

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)["score"]

    # Length of each prediction, counted as its number of non-pad tokens.
    gen_len = np.mean([np.count_nonzero(pred != pad_id) for pred in preds])

    return {"bleu": round(bleu, 4), "gen_len": round(gen_len, 4)}

In [None]:
if EXP_FOLDER is not None:
    class model_per_epoch(tf.keras.callbacks.Callback):
        """
        Simple callbacks class for transformer.
        Saves the model weights and a validation-loss plot every
        ``save_iterations`` epochs.

        NOTE(review): ``plt`` (matplotlib.pyplot) is not imported anywhere in
        this file; it has to be imported as ``plt`` before plots can be made.
        """

        def __init__(self, model, filepath):
            """
            Instantiating the callback for the model.
            Outline properties to watch.
            :param tf.Model model: Model this callback is for
            :param str filepath: Folder the weights and plots are saved to
            """
            super().__init__()
            self.filepath = filepath
            # Use the base-class setter rather than assigning the attribute
            # directly, so the callback stays valid if `model` is a property.
            self.set_model(model)
            self.losses = []       # Validation losses compiled for graphing
            self.loss_epochs = []  # Epoch number recorded with each loss
            self.current_epoch = 0

        def on_epoch_end(self, epoch, logs=None):
            """
            What should occur on epoch end
            :param int epoch: Epoch number
            :param dictionary logs: Logs for the current training
            """
            self.current_epoch = epoch
            if epoch % save_iterations != 0:  # Only save/display on save iterations
                return

            # Save the model weights at this epoch
            name = 'Transformer_Weights_Epoch' + str(self.current_epoch) + '.h5'
            self.model.save_weights(os.path.join(self.filepath, name))

            # Display a loss plot; skipped when no validation loss was logged
            val_loss = (logs or {}).get('val_loss')
            if val_loss is not None:
                self.losses.append(val_loss)
                self.loss_epochs.append(epoch)
                self._plot_loss()

        def _plot_loss(self):
            """
            Plot the recorded validation losses against their epoch numbers
            and save the figure into the callback's folder.
            """
            fig = plt.figure()
            plt.plot(self.loss_epochs, self.losses)
            plt.title('Validation Loss')
            plt.xlabel('Epoch')
            plt.ylabel('Loss')

            # Save the plot for later viewing
            plot_name = str(self.current_epoch) + 'Epoch_Loss_Plot.png'
            plt.savefig(os.path.join(self.filepath, plot_name))
            # Close the figure so repeated saves do not accumulate open figures
            plt.close(fig)

    save_dir = EXP_FOLDER

# Part 3: Training the Model
Define the model as pretrained from Hugging Face [t5-small](https://huggingface.co/t5-small) to translate text from English to French as trained from a paired book corpus.

In [None]:
# AdamWeightDecay optimizer with a 2e-5 learning rate and 0.01 weight-decay rate.
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

In [None]:
# Load the pretrained seq2seq model for the same checkpoint as the tokenizer.
model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
# Settings shared by the training and test datasets.
_tf_dataset_options = {"batch_size": 16, "collate_fn": data_collator}

# Only the training data is shuffled; the test data keeps its order.
tf_train_set = model.prepare_tf_dataset(tokenized_books["train"], shuffle=True, **_tf_dataset_options)
tf_test_set = model.prepare_tf_dataset(tokenized_books["test"], shuffle=False, **_tf_dataset_options)

In [None]:
# Compile with the optimizer only; no loss is passed here.
# NOTE(review): presumably the model falls back to its own built-in loss -- confirm.
model.compile(optimizer=optimizer)

In [None]:
!git lfs install

In [None]:
!brew

In [None]:
from transformers.keras_callbacks import PushToHubCallback

# Callback configured to write to the "my_awesome_opus_books_model" directory,
# the same name Part 4 later loads the model and tokenizer from.
push_to_hub_callback = PushToHubCallback(
    output_dir="my_awesome_opus_books_model",
    tokenizer=tokenizer,
)

In [None]:
# Callback that runs compute_metrics against the test dataset.
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set)
# NOTE(review): push_to_hub_callback is not part of this list, so nothing here
# writes "my_awesome_opus_books_model" -- confirm whether that is intended.
callbacks = [metric_callback]

In [None]:
# Add the checkpoint/plot callback only when an experiment folder was created.
if EXP_FOLDER is not None:
    callbacks.append(model_per_epoch(model, save_dir))

In [None]:
# input_ids = list(tf_train_set)[0][0]["input_ids"]
# attention_mask = list(tf_train_set)[0][0]["attention_mask"]

In [None]:
# outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)

# # Decode the generated output tokens
# output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Train for the configured number of epochs and actually pass in the callbacks
# assembled above; previously the list was built but never handed to fit(),
# and the epoch count was hard-coded to 1 regardless of num_epochs.
model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=num_epochs, callbacks=callbacks)

In [None]:
# Quick sanity check of the freshly trained model. `text` is only defined
# later in Part 4, and Keras `predict` cannot take a raw string, so tokenize a
# sample sentence here and decode the generated tokens instead.
sample_text = prefix + "Legumes share resources with nitrogen-fixing bacteria."
sample_inputs = tokenizer(sample_text, return_tensors="tf")
sample_outputs = model.generate(
    input_ids=sample_inputs["input_ids"],
    attention_mask=sample_inputs["attention_mask"],
    max_new_tokens=40,
)
tokenizer.decode(sample_outputs[0], skip_special_tokens=True)

# Part 4: Inference

In [None]:
# Sample sentence, starting with the same task prefix used during preprocessing.
text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."

In [None]:
# Translate the sample through a pipeline built from "my_awesome_opus_books_model".
# NOTE(review): nothing in the training cells visibly writes that model -- confirm
# it exists locally or on the Hub before running this cell.
translator = pipeline("translation", model="my_awesome_opus_books_model")
translator(text)

In [None]:
# Manual alternative to the pipeline: load the saved tokenizer (this rebinds
# the `tokenizer` name used during training) and keep only the input ids.
tokenizer = AutoTokenizer.from_pretrained("my_awesome_opus_books_model")
inputs = tokenizer(text, return_tensors="tf").input_ids

In [None]:
# Load the saved model (this rebinds `model`) and generate up to 40 new tokens.
# Sampling is enabled (top_k=30, top_p=0.95), so output can differ between runs.
model = TFAutoModelForSeq2SeqLM.from_pretrained("my_awesome_opus_books_model")
outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)

In [None]:
# Decode the first generated sequence back to text, dropping special tokens.
tokenizer.decode(outputs[0], skip_special_tokens=True)