# Translation from Old English to Modern English

In [1]:
# Import Datasets to work with Transformers by Hugging Face
import torch
import pandas as pd

# Imports for Transformers
from transformers import AutoTokenizer  # Datasets
from transformers import DataCollatorForSeq2Seq
import numpy as np  # Evaluation
import evaluate
from datasets import Dataset, load_dataset
import matplotlib.pyplot as plt
import os
from transformers import TrainerCallback
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType, PeftModelForSeq2SeqLM # Optimize traning for big models! (more than 1B parameters)
import numpy as np
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Report(TrainerCallback):
    """
    Trainer callback that records every logged entry and, once training ends,
    saves the loss curves and one plot per evaluation metric as PNG files.

    Args:
        plotting_dir: Directory the PNG files are written to (created if missing).
    """

    def __init__(self, plotting_dir="./training_plots"):
        self.plotting_dir = plotting_dir
        self.log_history = []
        os.makedirs(self.plotting_dir, exist_ok=True)

    def on_log(self, args, state, control, logs=None, **kwargs):
        """
        Event called after logging the last metrics.
        Stores a copy of the logged values, tagged with the current global step.
        """
        if logs is None:
            return
        # Copy so that later changes to the caller's dict cannot alter the history.
        entry = dict(logs)
        # The logged dict may not carry a 'step' key; take it from the trainer state.
        if entry.get("step") is None:
            entry["step"] = getattr(state, "global_step", None)
        self.log_history.append(entry)

    @staticmethod
    def _is_number(value):
        """Return True for plain int/float values (the only ones that get plotted)."""
        return isinstance(value, (int, float))

    @staticmethod
    def _series(entries, x_key, y_key):
        """
        Build aligned (xs, ys) lists from the entries that contain `y_key`.
        Falls back to 0..n-1 on the x-axis when any entry lacks `x_key`.
        """
        rows = [entry for entry in entries if y_key in entry]
        ys = [entry[y_key] for entry in rows]
        xs = [entry.get(x_key) for entry in rows]
        if any(x is None for x in xs):
            xs = list(range(len(ys)))
        return xs, ys

    def on_train_end(self, args, state, control, **kwargs):
        """
        Event called at the end of training.
        Draws the loss graph and one graph per evaluation metric and saves them
        into `plotting_dir`.
        """
        print("Training done. Generating graphs...")
        os.makedirs(self.plotting_dir, exist_ok=True)

        train_logs = [entry for entry in self.log_history if "loss" in entry]
        eval_logs = [entry for entry in self.log_history if "eval_loss" in entry]

        # Both loss curves must share one x-axis unit: prefer the global step,
        # then the epoch, and only fall back to the position in the log.
        loss_logs = train_logs + eval_logs
        x_key = next(
            (
                key
                for key in ("step", "epoch")
                if loss_logs and all(entry.get(key) is not None for entry in loss_logs)
            ),
            None,
        )
        x_label = {"step": "Step", "epoch": "Epoch", None: "Log index"}[x_key]

        train_x, train_losses = self._series(train_logs, x_key, "loss")
        eval_x, eval_losses = self._series(eval_logs, x_key, "eval_loss")

        # Plot loss
        if train_losses or eval_losses:
            fig, ax = plt.subplots(figsize=(10, 6))
            if train_losses:
                ax.plot(train_x, train_losses, label='Training Loss')
            if eval_losses:
                ax.plot(eval_x, eval_losses, label='Validation Loss')
            ax.set_xlabel(x_label)
            ax.set_ylabel('Loss')
            ax.set_title('Training and Validation Loss')
            ax.legend()
            ax.grid(True)
            fig.savefig(os.path.join(self.plotting_dir, "loss_graph.png"))
            plt.close(fig)

        # Collect the names of all numeric evaluation metrics, in first-seen order.
        metric_names = []
        for entry in eval_logs:
            for key, value in entry.items():
                if (
                    key.startswith('eval_')
                    and key != 'eval_loss'
                    and self._is_number(value)
                    and key not in metric_names
                ):
                    metric_names.append(key)

        # Plot evaluation metrics; x and y are built from the same rows, so they
        # always have the same length even if a metric is missing from some logs.
        for metric_name in metric_names:
            rows = [entry for entry in eval_logs if self._is_number(entry.get(metric_name))]
            metric_x, metric_values = self._series(rows, "epoch", metric_name)
            if not metric_values:
                continue
            pretty_name = metric_name.replace('eval_', '').capitalize()
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.plot(metric_x, metric_values, label=metric_name)
            ax.set_xlabel('Epoch')
            ax.set_ylabel(pretty_name)
            ax.set_title(f'Validation Metric: {pretty_name}')
            ax.legend()
            ax.grid(True)
            fig.savefig(os.path.join(self.plotting_dir, f"{metric_name}.png"))
            plt.close(fig)

In [3]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Data: CSV file plus the column names of the source and target languages.
DATASET = "the_old_english_dataset.csv"
SRC_L, TRG_L = "ang", "en"

# Model: checkpoint name and the task prefix prepended to every source sentence.
network = "google-t5/t5-base"
PROMPT = "translate OldEnglish to English: "

# Dataset Analysis

In [4]:
# Load the raw parallel corpus into pandas (comma-separated, no index column).
df = pd.read_csv(DATASET, index_col=False)

In [5]:
# Average number of whitespace-separated words per text, for each language column.
for lang in (SRC_L, TRG_L):
    mean_words = df[lang].apply(lambda sentence: len(sentence.split())).mean()
    print(f"length mean {lang} text: {mean_words}")

length mean ang text: 58.496155585707825
length mean en text: 81.98507462686567


In [6]:
# Preview the first rows of the raw DataFrame.
df.head()

Unnamed: 0,start,end,text_name,new_match,original_match,en,ang,len_translation,len_original,len_diff
0,0,0,alms_giving.txt,(0-0),,It will be well for that earl who keeps inside...,"Wel bið þam eorle þe him on innan hafað, reþe...",77,54,23
1,628,631,andreas.txt,(628-631),(628-31),And so Andrew gave answer: “What are you askin...,"Him þa Andreas ondsware agef: ""Hwæt frinest ðu...",30,22,8
2,977,980,andreas.txt,(977-980),(977-80),"Then the holy one departed from him, seeking t...","Gewat him þa se halga heofonas secan, eallra c...",37,26,11
3,981,996a,andreas.txt,(981-996a),(981-96a),"Then Andrew, soul-patient and mindful, a warri...","ða wæs gemyndig modgeþyldig, beorn beaduwe hea...",114,77,37
4,996b,1003,andreas.txt,(996b-1003),(996b-1003),Then the holy one prayed to the merciful fathe...,"ða se halga gebæd bilwytne fæder, breostgehygd...",54,35,19


# Proposed Models
* RNN (GRU-cell + attention) : [related paper](https://arxiv.org/pdf/1704.08430)
* Text-Generator(prompt2text) : **google/flan-t5-base**
* Machine Translator : **google-t5/t5-base**
* LLM : **openai-community/gpt2-medium**

## Env Configuration

Install the additional libraries required for training/testing

## Import Necessary Libraries

Log in to Hugging Face (to download the pre-trained network)

## Load The Dataset

In [7]:
from datasets.features import Value, Features

# One seed for both the shuffle and the split: train_test_split shuffles again
# by default, so without its own seed the train/test partition would change
# from run to run.
SPLIT_SEED = 2025

# Declare the source and target text columns as strings.
text_features = Features({
    SRC_L: Value("string"),
    TRG_L: Value("string"),
})

hf = (
    Dataset.from_csv(DATASET, features=text_features)
    .shuffle(seed=SPLIT_SEED)
    .train_test_split(test_size=0.15, seed=SPLIT_SEED)
)

# Show the first five training pairs.
hf["train"][:5]

{'ang': ['Gehyge þu, frea min, fæstlicne ræd. Syle ælmyssan, wes earmra hleo, þinga for ðeodne, ær ðam seo þrah cyme þæt he þec aworpe of woruldrice. Oft metod alæt monige ðeode wyrcan bote, þonne hie woldon sylfe, fyrene fæstan, ær him fær godes þurh egesan gryre aldre gesceode."',
  'Geseah ic þæt fuse beacen wendan wædum ond bleom; hwilum hit wæs mid wætan bestemed, beswyled mid swates gange, hwilum mid since gegyrwed.',
  'We bi sumum hyrdon wrætlice gecynd wildra secgan firum freamærne feorlondum on',
  'þæt fram ham gefrægn Higelaces þegn, god mid Geatum, Grendles dæda; se wæs moncynnes mægenes strengest on þæm dæge þysses lifes, æþele ond eacen. Het him yðlidan godne gegyrwan, cwæð, he guðcyning ofer swanrade secean wolde, mærne þeoden, þa him wæs manna þearf. ðone siðfæt him snotere ceorlas lythwon logon, þeah he him leof wære; hwetton higerofne, hæl sceawedon.',
  'Mæg ic be me sylfum soðgied wrecan, siþas secgan, hu ic geswincdagum earfoðhwile oft þrowade, bitre breostceare g

In [8]:
from datasets import load_dataset

#hf = load_dataset("grosenthal/latin_english_translation")

## Tokenization

In [9]:
# Load the tokenizer that belongs to the checkpoint named by `network`.
tokenizer = AutoTokenizer.from_pretrained(network)

class Preprocessor:
    """
    Callable that tokenizes a batch of sentence pairs with the module-level
    `tokenizer`.

    Args:
        PREFIX: Text prepended to every source sentence.
        src_lan: Key of the source-language column in the batch.
        dest_lan: Key of the target-language column in the batch.
        max_length: Maximum length passed to the tokenizer (truncation is on).
    """

    def __init__(self, PREFIX, src_lan, dest_lan, max_length):
        self.PREFIX, self.src_lan = PREFIX, src_lan
        self.dest_lan, self.ml = dest_lan, max_length

    def __call__(self, examples):
        """Return the tokenizer output for the prefixed sources and their targets."""
        prompts = []
        for sentence in examples[self.src_lan]:
            prompts.append(self.PREFIX + sentence)

        references = list(examples[self.dest_lan])

        return tokenizer(prompts, text_target=references, max_length=self.ml, truncation=True)

In [10]:
# Source and target texts are both truncated to 64 tokens.
prc = Preprocessor(PREFIX=PROMPT, src_lan=SRC_L, dest_lan=TRG_L, max_length=64)

In [11]:
# Tokenize both splits batch by batch with the preprocessor.
hf_tokenized = hf.map(function=prc, batched=True)
print(hf_tokenized)

Map:   0%|          | 0/1879 [00:00<?, ? examples/s]

Map:   0%|          | 0/332 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ang', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1879
    })
    test: Dataset({
        features: ['ang', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 332
    })
})


In [12]:
# Take the second training example and keep its input ids and label ids.
sample_row = hf_tokenized["train"][1]
src, target = sample_row["input_ids"], sample_row["labels"]

In [13]:
# Decode the ids back to text: first the model input, then the reference.
for token_ids in (src, target):
    print(tokenizer.decode(token_ids, skip_special_tokens=True))

translate OldEnglish to English: Geseah ic t fuse beacen wendan wdum ond bleom; hwilum hit ws mid wtan bestemed, beswyled mid s
I witnessed the change, the streaking beacon, warping its own in clad & color: sometimes it was blood steaming, swilling in trills & rills of ruddy sweat; sometimes it was bedazzled with richness.


In [14]:
# Load every evaluation metric once; compute_metrics reads these module-level objects.
(
    sacrebleu_metric,
    rouge_metric,
    meteor_metric,
    chrf_metric,
    ter_metric,
) = [evaluate.load(metric_id) for metric_id in ("sacrebleu", "rouge", "meteor", "chrf", "ter")]


def postprocess_text(preds, labels):
    """
    Strip surrounding whitespace from predictions and references.

    Returns:
        (preds, labels) where preds is a flat list of strings and every label
        is wrapped in its own single-element list (the format SacreBLEU expects).
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """
    Compute translation metrics for one evaluation pass.

    Args:
        eval_preds: Pair (predictions, label_ids). Predictions may be a tuple
            whose first element is used, a 3-D array of logits (reduced with
            argmax over the last axis), or an array of token ids.

    Returns:
        dict with the keys "bleu", "rouge1", "rouge2", "rougeL", "rougeLsum",
        "meteor", "chrf++", "ter" and "gen_len", each rounded to 4 decimals.
    """
    preds_input, label_ids = eval_preds
    pad_id = tokenizer.pad_token_id

    # Some model outputs arrive as a tuple; the predictions are the first element.
    current_preds = preds_input[0] if isinstance(preds_input, tuple) else preds_input

    if hasattr(current_preds, "ndim"):
        if current_preds.ndim == 3:
            # Logits (batch_size, seq_len, vocab_size) -> token ids.
            current_preds = np.argmax(current_preds, axis=-1)
        # Negative ids (e.g. -100 used as padding) cannot be decoded; map them to
        # the pad token, exactly as is done for the labels below.
        current_preds_ids = np.where(current_preds < 0, pad_id, current_preds)
    else:
        current_preds_ids = current_preds

    # Decode predictions and labels
    decoded_preds_raw = tokenizer.batch_decode(current_preds_ids, skip_special_tokens=True)

    # Replace -100 in labels with pad_token_id so they can be decoded.
    processed_label_ids = np.where(label_ids != -100, label_ids, pad_id)
    decoded_labels_raw = tokenizer.batch_decode(processed_label_ids, skip_special_tokens=True)

    processed_preds, processed_labels_for_sacrebleu = postprocess_text(decoded_preds_raw, decoded_labels_raw)

    # The other metrics are given one flat reference string per prediction.
    flat_references = [ref[0] for ref in processed_labels_for_sacrebleu]

    results = {}

    # 1. SacreBLEU
    sacrebleu_output = sacrebleu_metric.compute(
        predictions=processed_preds, references=processed_labels_for_sacrebleu
    )
    results["bleu"] = (sacrebleu_output or {}).get("score", 0.0)

    # 2. ROUGE (rouge1, rouge2, rougeL, rougeLsum)
    rouge_output = rouge_metric.compute(
        predictions=processed_preds, references=flat_references, use_stemmer=True
    )
    for rouge_key in ("rouge1", "rouge2", "rougeL", "rougeLsum"):
        results[rouge_key] = (rouge_output or {}).get(rouge_key, 0.0)

    # 3. METEOR
    meteor_output = meteor_metric.compute(predictions=processed_preds, references=flat_references)
    results["meteor"] = (meteor_output or {}).get("meteor", 0.0)

    # 4. CHRF++ (CHRF with word n-grams): word_order=2, beta=2
    chrf_output = chrf_metric.compute(
        predictions=processed_preds, references=flat_references, word_order=2, beta=2
    )
    results["chrf++"] = (chrf_output or {}).get("score", 0.0)

    # 5. TER (Translation Edit Rate) - the smaller, the better.
    # The 1.0 fallback value is kept unchanged from the previous implementation.
    ter_output = ter_metric.compute(predictions=processed_preds, references=flat_references)
    results["ter"] = (ter_output or {}).get("score", 1.0)

    # Mean number of non-padding tokens per prediction.
    prediction_lengths = [
        np.count_nonzero(np.asarray(pid_seq) != pad_id) for pid_seq in current_preds_ids
    ]
    results["gen_len"] = np.mean(prediction_lengths) if prediction_lengths else 0.0

    # Round every numeric result; NumPy scalar types are accepted too so that
    # no metric is silently dropped because of its dtype.
    numeric_types = (int, float, np.integer, np.floating)
    final_results = {
        name: round(float(value), 4)
        for name, value in results.items()
        if isinstance(value, numeric_types)
    }

    return final_results

[nltk_data] Downloading package wordnet to /home/andrea/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/andrea/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/andrea/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Models

In [15]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pre-trained sequence-to-sequence checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(network)

# The collator's `model` argument must be the model object itself: a checkpoint
# name (a plain string) has none of the model's methods, so passing it has no effect.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

## Training Phase

### PEFT Fine-Tuning

In [16]:
# LoRA adapter settings for a sequence-to-sequence model in training mode.
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    inference_mode=False,
    task_type=TaskType.SEQ_2_SEQ_LM,
)

# Wrap the base model with the adapter and report how many parameters are trainable.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 884,736 || all params: 223,788,288 || trainable%: 0.3953


In [17]:
# Output folder is named after the checkpoint (the text after the last "/").
OUT_DIR = network.rsplit("/", 1)[-1]
EPOCHS, BATCH_SIZE = 10, 16

In [18]:
from transformers import Seq2SeqTrainer

class MyTrainer(Seq2SeqTrainer):
    """Seq2SeqTrainer whose loss computation never passes 'num_items_in_batch' on."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Remove a 'num_items_in_batch' entry from `inputs` when present, then
        delegate to the parent implementation. The `num_items_in_batch`
        argument itself is accepted but not forwarded.
        """
        unwanted_key = 'num_items_in_batch'
        if unwanted_key in inputs:
            inputs = {name: value for name, value in inputs.items() if name != unwanted_key}
        return super().compute_loss(model, inputs, return_outputs=return_outputs)

In [19]:
# NOTE(review): predict_with_generate is not set, so compute_metrics presumably
# receives teacher-forced logits rather than generated text - confirm this is intended.
training_args = Seq2SeqTrainingArguments(
    # Output locations; no external experiment tracker.
    output_dir=OUT_DIR,
    logging_dir=OUT_DIR,
    report_to="none",
    # Optimisation settings.
    num_train_epochs=EPOCHS,
    learning_rate=1e-3,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate, checkpoint and log once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    logging_steps=10,
    load_best_model_at_end=False,
)

trainer = MyTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    processing_class=tokenizer,
    train_dataset=hf_tokenized["train"],
    eval_dataset=hf_tokenized["test"],
    compute_metrics=compute_metrics,
    # Saves the loss/metric plots into the same folder as the checkpoints.
    callbacks=[Report(OUT_DIR)],
)

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [20]:
# Run the fine-tuning loop with the trainer configured in the previous cell.
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Bleu,Rouge1,Rouge2,Rougel,Rougelsum,Meteor,Chrf++,Ter,Gen Len
1,4.1396,3.800684,2.1886,0.3143,0.0442,0.2511,0.2511,0.2024,22.3513,84.735,64.0
2,3.8801,3.687197,2.7725,0.3202,0.0498,0.2619,0.2618,0.2079,22.9452,83.6558,64.0
3,3.768,3.629912,3.0129,0.3315,0.0542,0.2691,0.2693,0.2172,23.4388,83.0753,64.0
4,3.6898,3.588008,3.5717,0.3351,0.0604,0.2732,0.2732,0.2249,24.6696,83.2837,64.0
5,3.6238,3.548133,4.0332,0.3431,0.0645,0.2801,0.2804,0.2283,24.9299,83.0232,64.0
6,3.5699,3.521148,4.0108,0.3428,0.0652,0.281,0.281,0.231,25.0746,82.8074,64.0
7,3.533,3.509242,4.4053,0.3471,0.0679,0.2849,0.2851,0.234,25.7201,82.5246,64.0
8,3.4953,3.495327,4.4071,0.3505,0.0691,0.2881,0.2883,0.2364,25.6672,82.1078,64.0
9,3.4692,3.488515,4.4706,0.3521,0.0703,0.2884,0.2887,0.2373,25.93,81.9291,64.0
10,3.4499,3.486037,4.6334,0.3523,0.071,0.2895,0.2897,0.2372,25.9353,81.8696,64.0


Addestramento terminato. Generazione dei grafici...


TrainOutput(global_step=1180, training_loss=3.6618559239274364, metrics={'train_runtime': 349.1349, 'train_samples_per_second': 53.819, 'train_steps_per_second': 3.38, 'total_flos': 1436673534197760.0, 'train_loss': 3.6618559239274364, 'epoch': 10.0})

In [21]:
model = model.eval()

# Take one held-out sentence pair.
sample = hf_tokenized['test'][2]
text = sample[SRC_L]
target = sample[TRG_L]

# Training inputs were built as PROMPT + sentence, so the same task prefix must
# be used at inference time; `text` itself stays unprefixed for display.
inputs = tokenizer(PROMPT + text, return_tensors="pt")
ids = inputs.input_ids.to(device)
attention = inputs.attention_mask.to(device)

# Sample a translation (top-k / nucleus sampling) and decode it back to text.
output = model.generate(input_ids=ids,  attention_mask=attention, max_new_tokens=120, do_sample=True, top_k=10, top_p=0.95)
output = tokenizer.decode(output[0], skip_special_tokens=True)

In [22]:
# Show the source sentence, its reference translation and the model output.
for heading, sentence in (
    ("Original Sentence", text),
    ("Target Sentence", target),
    ("Translated Sentence", output),
):
    print(f"{heading}:\n {sentence}")

Original Sentence:
 Wundor is to secganne hu mihtig god manna cynne þurh sidne sefan snyttru bryttað, eard ond eorlscipe; he ah ealra geweald. Hwilum he on lufan læteð hworfan monnes modgeþonc mæran cynnes, seleð him on eþle eorþan wynne to healdanne, hleoburh wera, gedeð him swa gewealdene worolde dælas, side rice, þæt he his selfa ne mæg for his unsnyttrum ende geþencean. Wunað he on wiste; no hine wiht dweleð adl ne yldo, ne him inwitsorh on sefan sweorceð, ne gesacu ohwær ecghete eoweð, ac him eal worold wendeð on willan (he þæt wyrse ne con),
Target sentence:
 It is a wonder to speak how Mighty God dispenses wisdom to the kindred of men through a spacious soul, a home to command. He owns the power over all creatures. Sometimes he allows the mind-thoughts of man to rove in love of his famous kinsmen, giving him joy on earth — in order to keep well the sheltering stronghold of mortals, lending him such authority over his worldly share, this broad realm, so that he imagines no end fo