# Translation from Old English to Modern English

In [1]:
# Import Dataset to work with Hugging Face Transformers
from datasets import Dataset
import torch
import pandas as pd

# Imports for Transformers
from transformers import AutoTokenizer  # Datasets

import numpy as np  # Evaluation
import evaluate

from torch import nn
from datasets import Dataset

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Mount Google Drive under /content/drive/ so the project folder stored there
# can be read by the copy commands that follow.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Copy the project's sources, datasets (*.csv / *.tsv style files) and install script
# from the mounted Drive into the current working directory, then list it to confirm.
%cp -r /content/drive/MyDrive/Many_Naps_Little_Progress/src/* . # Copy source files in env space
%cp -r /content/drive/MyDrive/Many_Naps_Little_Progress/*.*sv . # Copy datasets in env space
%cp /content/drive/MyDrive/Many_Naps_Little_Progress/colab_install.sh .
%ls

## Env Configuration

Install the additional libraries required for training/testing.

In [None]:
# Run the install script copied from Drive; its stdout is discarded, stderr is still shown.
!bash colab_install.sh >> /dev/null

## Import Necessary Libraries

Log in to Hugging Face (required to download the pre-trained network).

## Load The Dataset

In [2]:
# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
# `is_available` has to be *called*: the bare function object is always truthy,
# which selected 'cuda' even on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
# Read the comma-separated parallel corpus from the working directory and preview
# its first ten rows. Later cells read the "original" and "translation" columns.
csv_path = "the_old_english_dataset.csv"
dataset = pd.read_csv(csv_path, sep=",")
dataset.head(n=10)

In [4]:
# Positional split: the first TRAIN_SIZE rows train the model, the remainder validates it.
# `.iloc` slices are end-exclusive, so the two parts are disjoint. The previous
# label-based `.loc[0:2000]` / `.loc[2000:]` pair is end-inclusive and put row 2000
# in BOTH splits, leaking a training example into validation.
TRAIN_SIZE = 2000
hf_train = Dataset.from_pandas(dataset.iloc[:TRAIN_SIZE], split="train")
hf_val = Dataset.from_pandas(dataset.iloc[TRAIN_SIZE:], split="validation")

## Tokenization

In [5]:
# Checkpoint identifier; the same name is reused later when the model weights are loaded.
network = "google-t5/t5-base"
# Fetch the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(network)

In [6]:
# Language codes of the translation pair (not referenced by any other cell shown here).
sourcelang = "ang"
targetlang = "en"
# Task prefix that is prepended verbatim (`prefix + sentence`) to every source sentence.
# The trailing space separates it from the sentence; without it the first word was
# glued to the prefix ("...English:Geseh" instead of "...English: Geseh").
prefix = "Translate OldEnglish to English: "

In [None]:
def preprocess(examples):
    """Tokenize one batch of sentence pairs for seq2seq fine-tuning.

    Every string in ``examples["original"]`` gets the module-level ``prefix``
    prepended and becomes a model input; the strings in
    ``examples["translation"]`` are passed as ``text_target``. The tokenizer is
    called with ``max_length=512`` and truncation enabled.

    Returns:
        The object produced by the tokenizer call.
    """
    sources = []
    for sentence in examples["original"]:
        sources.append(prefix + sentence)
    references = list(examples["translation"])
    return tokenizer(sources, text_target=references, max_length=512, truncation=True)

In [8]:
# Tokenize both splits with `preprocess`, feeding it whole batches at a time.
tkd_sentence_train, tkd_sentence_val = (
    split.map(preprocess, batched=True) for split in (hf_train, hf_val)
)

In [9]:
# Print the summary of each tokenized split, one per line.
print(tkd_sentence_train, tkd_sentence_val, sep="\n")

In [10]:
from transformers import DataCollatorForSeq2Seq

# Collator that batches the tokenized examples (inputs and labels) for the trainer.
# Its optional `model` argument expects the model *instance* (it is only used to call
# `prepare_decoder_input_ids_from_labels`). The checkpoint-name string that used to be
# passed here has no such method and was silently ignored, and the model itself is not
# loaded yet at this point, so the misleading argument is omitted.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [11]:
# Load the SacreBLEU metric; `compute_metrics` reads its "score" field as the BLEU value.
metric = evaluate.load("sacrebleu")

In [12]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape the texts for the metric.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple: a flat list of stripped predictions, and a
        list in which each stripped reference is wrapped in its own
        single-element list.
    """
    cleaned_preds = []
    for hypothesis in preds:
        cleaned_preds.append(hypothesis.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays; when
            ``predictions`` is a tuple, its first element is used.

    Returns:
        dict with ``"bleu"`` (the metric's ``"score"``) and ``"gen_len"`` (mean
        number of non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a valid vocabulary id, so it must be
    # swapped for the pad token before decoding. Labels carry it, and generated
    # predictions can carry it too when batches of different lengths are padded together.
    # Doing this up front also keeps -100 from being counted as real tokens in `gen_len`.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Length of each prediction = number of positions that are not padding.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]

    result = {"bleu": bleu["score"], "gen_len": np.mean(prediction_lens)}
    return {k: round(v, 4) for k, v in result.items()}

## Models

In [13]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pre-trained seq2seq model for the same checkpoint name the tokenizer was built from.
model = AutoModelForSeq2SeqLM.from_pretrained(network)

## Training Phase

In [None]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, keep at most three
# checkpoints, generate during evaluation, and disable logging.
# NOTE(review): output_dir is "t5-small" although `network` points at the t5-base
# checkpoint — confirm the directory name is intended.
training_args = Seq2SeqTrainingArguments(
    output_dir="t5-small",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=0.001,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    logging_dir=None,
    logging_strategy="no",
)

In [None]:
# Wire the model, the tokenized splits, the collator and the metric function together.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tkd_sentence_train,
    eval_dataset=tkd_sentence_val,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [16]:
# Start fine-tuning with the trainer configured in the previous cells.
trainer.train()

## Testing Phase

In [17]:
# Sample source passage used for the qualitative translation check in the following cells.
text = "Geseh he Matheus in þam morðorcofan, hæleð higerofne under heolstorlocan, secgan dryhtne lof, domweorðinga engla ðeodne. He ðær ana sæt geohðum geomor in þam gnornhofe."

In [20]:
# Prepend the same task prefix that `preprocess` adds during fine-tuning; without it the
# model is queried with an input format it was never trained on.
inputs = tokenizer(prefix + text, return_tensors="pt").input_ids

In [None]:
from transformers import pipeline

# Generation runs on the CPU, matching the device of the tokenizer output in `inputs`.
model = model.to('cpu')
# Make sure dropout is off while generating, regardless of the mode training left behind.
model.eval()
# Sampling (do_sample / top_k / top_p) is stochastic; a fixed seed makes re-running this
# cell produce the same translation.
torch.manual_seed(42)
output = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)


In [22]:
# Turn the first generated sequence back into text, dropping special tokens.
tokenizer.decode(output[0], skip_special_tokens=True)