# Translation from Ancient to Modern Italian

In [None]:
# Mount Google Drive under /content/drive/ so the project folder stored there
# can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Stage the project files from the mounted Drive into the working directory.
%cp -r /content/drive/MyDrive/Many_Naps_Little_Progress/src/* . # Copy source files in env space
# The `*.*sv` glob matches any file whose extension ends in "sv" (e.g. .csv, .tsv).
%cp -r /content/drive/MyDrive/Many_Naps_Little_Progress/*.*sv . # Copy datasets in env space
# Install script that is executed in the "Env Configuration" step.
%cp /content/drive/MyDrive/Many_Naps_Little_Progress/colab_install.sh .
# List the working directory to check that the copies landed.
%ls

## Env Configuration

Install the additional libraries required for training/testing

In [None]:
# Run the install script copied from Drive. Only stdout is redirected to
# /dev/null, so anything the script writes to stderr still shows up here.
!bash colab_install.sh >> /dev/null

## Hugging-Face Login

Log in to Hugging Face (needed to download the pre-trained network)

In [None]:
import os

from huggingface_hub import login

# Keep the access token out of the notebook: it is taken from the HF_TOKEN
# environment variable when that is set, and is empty otherwise.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# An empty string is not a usable token, so None is passed in that case;
# login() then asks for the token interactively instead of failing.
login(token=HF_TOKEN or None)

## Import Necessary Libraries

In [None]:
# Import Datasets to work with Transformers by Hugging-Face
from datasets import Dataset
import pandas as pd

# Imports for Transformers
from transformers import AutoTokenizer  # Datasets
from transformers import DataCollatorWithPadding

from transformers import AutoModelForSequenceClassification  # Model
from transformers import TrainingArguments, Trainer

import numpy as np  # Evaluation
import evaluate

from torch import nn
from datasets import Dataset

import matplotlib.pyplot as plt
import seaborn as sns

## Load The Dataset

In [None]:
# Read the dataset CSV from the working directory; the file is
# comma-separated, which is also the pandas default delimiter.
dataset = pd.read_csv("the_old_english_dataset.csv")
# Preview the first ten rows as the cell output.
dataset.head(10)

In [None]:
# Split by position with half-open slices so the two sets are disjoint:
# the first 2000 rows are used for training, everything after for validation.
# (Label-based `.loc[0:2000]` / `.loc[2000:]` include both endpoints, which
# placed row 2000 in the training set and in the validation set.)
hf_train = dataset.iloc[:2000]
hf_val = dataset.iloc[2000:]

# Wrap the pandas frames as Hugging Face datasets.
hf_train = Dataset.from_pandas(hf_train, split="train")
hf_val = Dataset.from_pandas(hf_val, split="validation")

## Tokenization

In [None]:
# Checkpoint identifier; the tokenizer below and the model are both loaded from it.
network = "google-t5/t5-small"
# Tokenizer belonging to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(network)

In [None]:
# Language codes of the source and the target side (both English here).
sourcelang = "en"
targetlang = "en"
# Task prefix that is prepended verbatim to every source sentence. It ends
# with ": " so the instruction stays separated from the sentence; without the
# separator the last prefix word and the first source word are glued together.
prefix = "Translate Old English to Modern English: "

In [None]:
def preprocess(examples):
    """Tokenize one batch of sentence pairs for sequence-to-sequence training.

    The task prefix is put in front of every text of the "original" column,
    and the "translation" column is passed as the target text. Tokenization
    runs with ``max_length=128`` and truncation enabled.

    Returns:
        The tokenizer output for the batch.
    """
    sources = [prefix + sentence for sentence in examples["original"]]
    references = list(examples["translation"])
    return tokenizer(
        sources,
        text_target=references,
        max_length=128,
        truncation=True,
    )

In [None]:
# Tokenize both splits batch-wise with the same preprocessing function
# (training split first, then validation).
tkd_sentence_train, tkd_sentence_val = (
    split.map(preprocess, batched=True) for split in (hf_train, hf_val)
)

In [None]:
# Show a summary of each tokenized split, one per line.
print(tkd_sentence_train, tkd_sentence_val, sep="\n")

In [None]:
from transformers import DataCollatorForSeq2Seq

# Batch collator built from the tokenizer; its `model` argument receives the
# checkpoint name.
# NOTE(review): `network` is the checkpoint string, not a loaded model
# object — confirm that this is what the collator is meant to get.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=network,
)

In [None]:
# sacreBLEU metric object; compute_metrics() calls it on the decoded texts.
metric = evaluate.load("sacrebleu")

In [None]:
def postprocess_text(preds, labels):
    """Trim surrounding whitespace from decoded predictions and references.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        tuple: the stripped predictions as a flat list, and the stripped
        labels as a list of one-element lists (one reference per entry).
    """
    stripped_preds = [text.strip() for text in preds]
    wrapped_refs = [[text.strip()] for text in labels]
    return stripped_preds, wrapped_refs


def compute_metrics(eval_preds):
    """Score generated translations with sacreBLEU and report their length.

    Args:
        eval_preds: ``(preds, labels)`` pair of token-id arrays. When
            ``preds`` is a tuple, only its first element is used.

    Returns:
        dict: ``"bleu"`` (the sacreBLEU ``score``) and ``"gen_len"`` (mean
        count of non-padding tokens per prediction), rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a placeholder, not a vocabulary id, and cannot be decoded.
    # Generated sequences can carry it as well (not only the labels), so both
    # arrays get it replaced by the pad token before decoding.
    pad_token_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_token_id)
    labels = np.where(labels != -100, labels, pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Positions that held -100 are pad tokens by now, so they no longer count
    # towards the generated length.
    prediction_lens = [np.count_nonzero(pred != pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

## Models

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pre-trained sequence-to-sequence model from the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(network)

## Training Phase

In [None]:
# Fine-tuning configuration: evaluation and checkpointing happen once per
# epoch, and evaluation predictions are produced with generate().
training_args = Seq2SeqTrainingArguments(
    output_dir="t5-small",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="logs",
    logging_steps=10,
)

In [None]:
# Wire the model, the tokenized splits, the collator and the metric callback
# into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tkd_sentence_train,
    eval_dataset=tkd_sentence_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the trainer configured in the previous cells.
trainer.train()

## Testing Phase