# Translation from Ancient to Modern Italian

In [1]:
# Imports needed to work with Hugging Face Transformers and Datasets
import torch
import pandas as pd

# Imports for Transformers
from transformers import AutoTokenizer  # Datasets
from transformers import DataCollatorForSeq2Seq
import numpy as np  # Evaluation
import evaluate
from datasets import Dataset, load_dataset
import matplotlib.pyplot as plt
import os
from transformers import TrainerCallback
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType, PeftModelForSeq2SeqLM # Optimize traning for big models! (more than 1B parameters)
import numpy as np
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Report(TrainerCallback):
    """
    Trainer callback that collects logged values and plots them when training ends.

    Every log dict received through ``on_log`` is stored (as a copy) together
    with the global step at which it was emitted. At the end of training the
    callback writes ``loss_graph.png`` (training and validation loss) and one
    ``<metric_name>.png`` per numeric ``eval_*`` metric into ``plotting_dir``.
    """
    def __init__(self, plotting_dir="./training_plots"):
        """
        Args:
            plotting_dir: Directory where the PNG files are written; it is
                created if it does not exist.
        """
        self.plotting_dir = plotting_dir
        self.log_history = []
        os.makedirs(self.plotting_dir, exist_ok=True)

    def on_log(self, args, state, control, logs=None, **kwargs):
        """
        Event called after logging the last metrics.
        Stores a copy of ``logs`` annotated with the current global step.
        """
        if logs is None:
            return
        # Copy so that later mutation of the caller's dict cannot change the history.
        entry = dict(logs)
        # The logged dict may not carry a 'step' entry; take it from the trainer
        # state so the training curve can be drawn against real optimisation steps.
        if entry.get('step') is None:
            step = getattr(state, 'global_step', None)
            if step is not None:
                entry['step'] = step
        self.log_history.append(entry)

    @staticmethod
    def _series(logs, x_key, y_key):
        """
        Build aligned x/y lists for the entries of ``logs`` that contain ``y_key``.

        When at least one entry has a non-None ``x_key``, only entries with an
        x value are kept. When none has it, x falls back to 0..n-1. The two
        returned lists always have the same length.
        """
        points = [(log.get(x_key), log[y_key]) for log in logs if y_key in log]
        if any(x is not None for x, _ in points):
            points = [(x, y) for x, y in points if x is not None]
            xs = [x for x, _ in points]
        else:
            xs = list(range(len(points)))
        ys = [y for _, y in points]
        return xs, ys

    def on_train_end(self, args, state, control, **kwargs):
        """
        Event called at the end of training.
        Draws and saves the loss graph and one graph per evaluation metric.
        """
        print("Training done. Generating graphs...")

        train_logs = [log for log in self.log_history if 'loss' in log]
        eval_logs = [log for log in self.log_history if 'eval_loss' in log]

        global_steps, train_losses = self._series(train_logs, 'step', 'loss')
        epochs, eval_losses = self._series(eval_logs, 'epoch', 'eval_loss')

        # Draw both loss curves against the same unit (global step) whenever every
        # point has one; otherwise keep the step/epoch split and say so in the label.
        share_steps = (
            bool(train_logs)
            and bool(eval_logs)
            and all(log.get('step') is not None for log in train_logs + eval_logs)
        )
        if share_steps:
            eval_x, eval_y = self._series(eval_logs, 'step', 'eval_loss')
            loss_xlabel = 'Step'
        else:
            eval_x, eval_y = epochs, eval_losses
            loss_xlabel = 'Step (Training Loss) / Epoch (Validation Loss)'

        # Plot loss
        if train_losses or eval_y:
            fig, ax = plt.subplots(figsize=(10, 6))
            if train_losses:
                ax.plot(global_steps, train_losses, label='Training Loss')
            if eval_y:
                ax.plot(eval_x, eval_y, label='Validation Loss')
            ax.set_xlabel(loss_xlabel)
            ax.set_ylabel('Loss')
            ax.set_title('Training and Validation Loss')
            ax.legend()
            ax.grid(True)
            fig.savefig(os.path.join(self.plotting_dir, "loss_graph.png"))
            plt.close(fig)

        # Names of the numeric evaluation metrics, in first-seen order
        metric_names = []
        for log in eval_logs:
            for key, value in log.items():
                if (
                    key.startswith('eval_')
                    and key != 'eval_loss'
                    and isinstance(value, (int, float))
                    and key not in metric_names
                ):
                    metric_names.append(key)

        # Plot evaluation metrics; x and y are built together per metric so their
        # lengths always match even if a metric is missing from some evaluations.
        for metric_name in metric_names:
            numeric_logs = [log for log in eval_logs if isinstance(log.get(metric_name), (int, float))]
            metric_epochs, metric_values = self._series(numeric_logs, 'epoch', metric_name)
            if not metric_values:
                continue
            pretty_name = metric_name.replace('eval_', '').capitalize()
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.plot(metric_epochs, metric_values, label=metric_name)
            ax.set_xlabel('Epoch')
            ax.set_ylabel(pretty_name)
            ax.set_title(f'Validation Metric: {pretty_name}')
            ax.legend()
            ax.grid(True)
            fig.savefig(os.path.join(self.plotting_dir, f"{metric_name}.png"))
            plt.close(fig)

# Proposed Models
* google/flan-t5-large - google/mt5-small (text2text model) ::NO_WORK
* google/gemma-3-1b-it (LLM) 🚀
* sapienzanlp/Minerva-1B-base-v1.0 🇮🇹 (LLM)
* openai-community/gpt2 (LLM) ::NO IT
* Helsinki-NLP/opus-mt-itc-itc (Machine Translation) 🏆 - use OpusPrompt 
* facebook/nllb-200-3.3B (Translation)
* FacebookAI/xlm-roberta-base (fill-mask)

In [3]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = "cpu"

# CSV file with the parallel corpus and the names of its source/target columns.
DATASET = "dataset_ann.csv"
SRC_L, TRG_L = "Sentence", "Target"

# Hugging Face checkpoint used for the tokenizer and the model.
network = "google/flan-t5-large"

# Dataset Analysis

In [4]:
# Load the whole corpus into pandas for a quick exploratory look.
df = pd.read_csv(filepath_or_buffer=DATASET, index_col=False, sep=",")

In [5]:
# Average number of whitespace-separated words per row, for the source and target columns.
for column_name in (SRC_L, TRG_L):
    word_counts = df[column_name].apply(lambda text: len(text.split()))
    print(f"length mean {column_name} text: {word_counts.mean()}")

length mean Sentence text: 20.04123711340206
length mean Target text: 20.690721649484537


In [6]:
# Show the first rows of the corpus to check the columns that were read.
df.head()

Unnamed: 0,Author,Date,Region,Sentence,Target
0,Brunetto Latini,1260-61,fior.,quella guerra ben fatta l' opera perché etc. E...,Quella guerra fu ben condotta per via delle az...
1,Bono Giamboni,1292,fior.,"crudele, e di tutte le colpe pigli vendetta, c...","È severo, e punisce tutte le colpe come prescr..."
2,Valerio Massimo (red. V1,1336,fior.,Non d' altra forza d' animo fue ornato Ponzio ...,"Ponzio Aufidiano, cavaliere romano, fu dotato ..."
3,Lucano volg. (ed. Marinoni),1330/40,prat.,Se questo piace a tutti e se 'l tempo hae biso...,Se questo è quello che tutti desiderano e se l...
4,Brunetto Latini,1260-61,fior.,Officio di questa arte pare che sia dicere app...,Il compito di quest’arte sembra essere quello ...


# Proposed Models
* Text-Generator(prompt2text) : **google/flan-t5-base**
* Machine Translator : **google-t5/t5-base**
* LLM : **openai-community/gpt2-medium**

## Env Configuration

Install additional libraries required for training/testing

## Import Necessary Libraries

Log in to Hugging Face (needed to download the pre-trained network)

## Load The Dataset

In [7]:
from datasets.features import Value, Features

# One seed for both the shuffle and the split, so the train/test partition is
# identical on every run (an unseeded split changes at each execution).
SPLIT_SEED = 2025

# Read the source/target columns as strings, shuffle, and hold out 10% for evaluation.
hf = (
    Dataset.from_csv(
        DATASET,
        features=Features({
            SRC_L: Value("string"),
            TRG_L: Value("string"),
        }),
    )
    .shuffle(seed=SPLIT_SEED)
    .train_test_split(test_size=0.10, seed=SPLIT_SEED)
)

## Tokenization

In [8]:
# Tokenizer matching the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(network)

# Some tokenizers define no padding token; reuse the end-of-sequence token in that case.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Truncation limit used when encoding prompts and targets.
max_length = tokenizer.model_max_length

    
def base_prompt_la_it(examples):
    """
    Turn a batch of examples into tokenized model inputs.

    Each source sentence (column ``SRC_L``) is prefixed with an Italian
    instruction prompt; the target sentences (column ``TRG_L``) are passed as
    ``text_target``. Both are truncated to ``max_length`` and padded to the
    longest sequence of the batch.

    Args:
        examples: Batch mapping with list-like ``SRC_L`` and ``TRG_L`` entries.

    Returns:
        The object returned by the tokenizer call.
    """
    # NOTE(review): because padding is applied here, label sequences are presumably
    # padded with the pad token id rather than -100 - confirm whether padded
    # positions should be excluded from the loss.
    prompt_prefix = "traduci in Italiano il seguente Testo: "
    prompts = []
    for source_text in examples[SRC_L]:
        prompts.append(prompt_prefix + source_text)
    references = list(examples[TRG_L])

    return tokenizer(
        prompts,
        text_target=references,
        max_length=max_length,
        truncation=True,
        padding="longest",
    )
    

## Tokenizer Parameters

In [9]:
# Preprocessing function applied to the dataset by the mapping step that follows.
map_callback = base_prompt_la_it

In [10]:
# Tokenize every split in batches, then expose only the model columns as torch tensors.
hf_tokenized = hf.map(map_callback, batched=True)
tensor_columns = ['input_ids', 'attention_mask', "labels"]
hf_tokenized.set_format(type="torch", columns=tensor_columns)

# Quick sanity check of the resulting columns and split sizes.
for summary in (hf_tokenized.column_names, hf_tokenized.shape):
    print(summary)

Map:   0%|          | 0/87 [00:00<?, ? examples/s]

Map: 100%|██████████| 87/87 [00:00<00:00, 6678.46 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 2294.23 examples/s]

{'train': ['Sentence', 'Target', 'input_ids', 'attention_mask', 'labels'], 'test': ['Sentence', 'Target', 'input_ids', 'attention_mask', 'labels']}
{'train': (87, 5), 'test': (10, 5)}





In [11]:
# Decode the tokenized training examples back to text to verify the preprocessing.
# The decoded strings are built before formatting: reusing double quotes inside a
# double-quoted f-string is only valid syntax from Python 3.12 onwards.
# decode() takes no attention mask, so none is passed.
for idx, sample in enumerate(hf_tokenized["train"], 1):
    source_text = tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
    target_text = tokenizer.decode(sample["labels"], skip_special_tokens=True)
    print(f"===:(sentence n°{idx}):===")
    print(f"{SRC_L}:{source_text}")
    print(f"{TRG_L}:{target_text}")

===:(sentence n°1):===
Sentence:traduci in Italiano il seguente Testo: Teseo reguard Achelao fortemente meravigliandose e disse cos: "O messer Achelao, io vi prego che voi me diciate in che modo voi perdeste
Target:Teseo guard Achelao con grande meraviglia e disse: "O Signor Achelao vi prego di raccontarmi in che modo avete perso
===:(sentence n°2):===
Sentence:traduci in Italiano il seguente Testo: Per che or chi spererebbe quello che eziandio questi che non vogliono ancora credere in Cristo, già veggiono con noi, e perché nol possono negare, stridono colli denti.
Target:Chi potrebbe ancora dubitare di ci? Anche coloro che non credono in Cristo vedono chiaramente come noi e, non potendo negarlo, provano rabbia e risentimento.
===:(sentence n°3):===
Sentence:traduci in Italiano il seguente Testo: Ora spaventerai li disidirosi cervi con varie e diverse paure, o lo porco cenghiare caggia in terra passato collo spiedo.
Target:O spaventerai i timorosi cervi in diversi modi, oppure il cingh

In [12]:
# Load the evaluation metrics once; they are used by compute_metrics.
sacrebleu_metric, rouge_metric, meteor_metric, chrf_metric, ter_metric = (
    evaluate.load(metric_id)
    for metric_id in ("sacrebleu", "rouge", "meteor", "chrf", "ter")
)


def postprocess_text(preds, labels):
    """
    Strip surrounding whitespace from predictions and labels.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        A tuple ``(preds, labels)`` where ``preds`` is a list of stripped
        strings and ``labels`` is a list of one-element lists (one reference
        per prediction, the layout SacreBLEU is called with).
    """
    cleaned_preds = []
    for pred in preds:
        cleaned_preds.append(pred.strip())

    wrapped_labels = []
    for label in labels:
        wrapped_labels.append([label.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """
    Compute translation metrics for one evaluation pass.

    Args:
        eval_preds: Pair ``(predictions, label_ids)``. ``predictions`` may be a
            tuple whose first item holds the predictions, a 3-D array of logits
            (reduced to token ids with argmax over the last axis), or an array
            of token ids. ``label_ids`` holds the reference token ids, where
            -100 marks positions to ignore.

    Returns:
        Dict with the keys ``bleu``, ``rouge1``, ``rouge2``, ``rougeL``,
        ``rougeLsum``, ``meteor``, ``chrf++``, ``ter`` and ``gen_len``, each
        rounded to four decimals.
    """
    preds_input, label_ids = eval_preds

    # Predictions can arrive wrapped in a tuple; the first item is the one decoded.
    current_preds = preds_input
    if isinstance(current_preds, tuple):
        current_preds = current_preds[0]

    if hasattr(current_preds, "ndim") and current_preds.ndim == 3:
        # Logits (batch_size, seq_len, vocab_size) -> most likely token per position
        current_preds_ids = np.argmax(current_preds, axis=-1)
    else:
        # Otherwise assumed to already be token ids (batch_size, seq_len)
        current_preds_ids = current_preds

    # Predicted ids can contain -100 where sequences were padded; such ids cannot be
    # decoded and must not count as generated tokens, so map them to the pad token
    # exactly as is done for the labels below.
    current_preds_ids = np.asarray(current_preds_ids)
    current_preds_ids = np.where(current_preds_ids != -100, current_preds_ids, tokenizer.pad_token_id)

    decoded_preds_raw = tokenizer.batch_decode(current_preds_ids, skip_special_tokens=True)

    # Replace -100 in labels (ignored positions) with pad_token_id for decoding
    processed_label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
    decoded_labels_raw = tokenizer.batch_decode(processed_label_ids, skip_special_tokens=True)

    processed_preds, processed_labels_for_sacrebleu = postprocess_text(decoded_preds_raw, decoded_labels_raw)

    # SacreBLEU gets a list of reference lists; the other metrics get one flat
    # reference string per prediction.
    flat_references = [ref[0] for ref in processed_labels_for_sacrebleu]

    results = {}

    # 1. SacreBLEU
    sacrebleu_output = sacrebleu_metric.compute(predictions=processed_preds, references=processed_labels_for_sacrebleu)
    if sacrebleu_output and "score" in sacrebleu_output:
        results["bleu"] = sacrebleu_output["score"]
    else:
        results["bleu"] = 0.0  # Fallback

    # 2. ROUGE (rouge1, rouge2, rougeL, rougeLsum)
    # NOTE(review): use_stemmer=True presumably applies an English-oriented stemmer;
    # confirm it is appropriate for Italian text.
    rouge_output = rouge_metric.compute(predictions=processed_preds, references=flat_references, use_stemmer=True)
    if rouge_output:
        for rouge_key in ("rouge1", "rouge2", "rougeL", "rougeLsum"):
            results[rouge_key] = rouge_output.get(rouge_key, 0.0)
    else:
        results.update({"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0, "rougeLsum": 0.0})

    # 3. METEOR
    meteor_output = meteor_metric.compute(predictions=processed_preds, references=flat_references)
    if meteor_output and "meteor" in meteor_output:
        results["meteor"] = meteor_output["meteor"]
    else:
        results["meteor"] = 0.0

    # 4. CHRF++ : CHRF with word n-grams enabled (word_order=2, beta=2)
    chrf_output = chrf_metric.compute(predictions=processed_preds, references=flat_references, word_order=2, beta=2)
    if chrf_output and "score" in chrf_output:
        results["chrf++"] = chrf_output["score"]
    else:
        results["chrf++"] = 0.0

    # 5. TER (Translation Edit Rate) - the smaller, the better
    ter_output = ter_metric.compute(predictions=processed_preds, references=flat_references)
    if ter_output and "score" in ter_output:
        results["ter"] = ter_output["score"]
    else:
        # NOTE(review): fallback value kept from the original code; confirm it is
        # on the same scale as the scores the metric reports.
        results["ter"] = 1.0

    # Mean number of non-padding tokens per prediction
    prediction_lengths = [np.count_nonzero(pid_seq != tokenizer.pad_token_id) for pid_seq in current_preds_ids]
    results["gen_len"] = np.mean(prediction_lengths) if prediction_lengths else 0.0

    # Round every score; float() keeps values of any numeric scalar type instead of
    # silently dropping those that are not Python int/float instances.
    final_results = {k: round(float(v), 4) for k, v in results.items()}

    return final_results

[nltk_data] Downloading package wordnet to /home/andrea/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/andrea/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/andrea/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Models

In [13]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoModelForCausalLM, AutoModel, MT5ForConditionalGeneration
from peft import PeftModel, PeftModelForSeq2SeqLM
from transformers import EarlyStoppingCallback

# Load the pre-trained sequence-to-sequence checkpoint selected by `network`.
model =AutoModelForSeq2SeqLM.from_pretrained(network)


## Training Phase

### PEFT Fine-Tuning

In [31]:
# LoRA adapter configuration for a sequence-to-sequence model in training mode.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.5,
)

# get_peft_model selects the PEFT wrapper that matches the task type declared in
# the config, instead of instantiating the generic PeftModel directly.
# NOTE(review): re-running this cell wraps the already wrapped model again;
# reload the base model first when changing the LoRA settings.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# The collator takes the model object itself (not the checkpoint name string), so it
# can use the model's helper to build decoder inputs from the labels when available.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


trainable params: 884,736 || all params: 583,286,016 || trainable%: 0.1517


In [32]:
# Output folder named after the checkpoint (the part following the organisation).
OUT_DIR = network.split("/")[-1]
EPOCHS = 100
BATCH_SIZE = 10

# Early-stopping settings. These constants now hold the values that were actually
# in effect (the callback previously hard-coded 3 and 0 and ignored them) and are
# passed to the callback so the two can no longer drift apart.
early_stopping_patience = 3     # evaluations without improvement before stopping
early_stopping_threshold = 0.0  # minimum change that counts as an improvement

early_callback = EarlyStoppingCallback(
    early_stopping_patience=early_stopping_patience,
    early_stopping_threshold=early_stopping_threshold,
)

In [33]:
class MyTrainer(Seq2SeqTrainer):
    """
    Seq2SeqTrainer variant that keeps ``num_items_in_batch`` away from the loss computation.
    """
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Delegate to the parent loss computation without ``num_items_in_batch``.

        A ``'num_items_in_batch'`` entry found in ``inputs`` is left out of the
        mapping forwarded to the parent, and the ``num_items_in_batch`` argument
        itself is not forwarded either.
        """
        unwanted_key = 'num_items_in_batch'
        if unwanted_key in inputs:
            forwarded_inputs = {
                name: value for name, value in inputs.items() if name != unwanted_key
            }
        else:
            forwarded_inputs = inputs
        return super().compute_loss(model, forwarded_inputs, return_outputs=return_outputs)

In [34]:
# NOTE(review): predict_with_generate is not set, so the metrics are presumably
# computed from the logits of the evaluation forward pass rather than from
# generated text - confirm this is intended.
training_args = Seq2SeqTrainingArguments(
    # Where checkpoints and logs are written
    output_dir=OUT_DIR,
    logging_dir=OUT_DIR,
    report_to="none",
    # Optimisation
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate, save and log once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # NOTE(review): logging_steps presumably has no effect while logging_strategy is "epoch".
    logging_steps=10,
    save_total_limit=3,
    # Best-checkpoint selection: lowest validation loss
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    label_names=['labels'],
)

# Plot-generating callback plus early stopping.
trainer_callbacks = [Report(OUT_DIR), early_callback]

trainer = MyTrainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=hf_tokenized["train"],
    eval_dataset=hf_tokenized["test"],
    compute_metrics=compute_metrics,
    callbacks=trainer_callbacks,
)

In [35]:
# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Rouge1,Rouge2,Rougel,Rougelsum,Meteor,Chrf++,Ter,Gen Len
1,13.7043,11.184262,0.1334,0.0079,0.0,0.0079,0.0079,0.0344,5.7366,105.4054,22.7
2,14.3763,11.244613,0.1439,0.03,0.0,0.0299,0.0289,0.0312,5.4687,100.4505,22.1
3,14.4396,11.064319,0.1455,0.0231,0.0,0.0229,0.022,0.0379,5.7774,99.5495,20.6
4,14.2866,11.754907,0.1647,0.0323,0.0,0.0323,0.0323,0.0418,5.9185,99.5495,21.9
5,13.8707,12.492759,0.1094,0.0074,0.0,0.0074,0.0074,0.0246,5.3444,106.3063,20.8
6,13.0662,11.507318,0.1301,0.0061,0.0,0.0061,0.0061,0.0276,5.4715,100.4505,20.3


Training done. Generating graphs...


TrainOutput(global_step=54, training_loss=13.95726097954644, metrics={'train_runtime': 21.2727, 'train_samples_per_second': 408.976, 'train_steps_per_second': 42.308, 'total_flos': 85766519623680.0, 'train_loss': 13.95726097954644, 'epoch': 6.0})

In [36]:
# Imposta il modello in modalità valutazione e spostalo sul device
# Put the model in evaluation mode and move it to the selected device.
model = model.eval().to(device)

# Iterate over the tokenized training split in batches of eight.
# NOTE(review): generation is run on the training split - confirm whether the
# held-out split was intended.
loader = torch.utils.data.DataLoader(hf_tokenized["train"], batch_size=8)

print(f"Inizio generazione su {device}")
print("=============================")

# Sampling settings shared by every generate() call.
generation_kwargs = dict(
    max_new_tokens=120,
    do_sample=True,
    top_k=10,
    top_p=0.95,
)

for idx, batch in enumerate(loader, 1):
    # The loader yields a dict of tensors; move each one to the device.
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    # generate() only needs the encoded prompts and their attention mask.
    result_ids = model.generate(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        **generation_kwargs,
    )

    # Decode the original prompts and the generated sequences back to text.
    decoded_prompts = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)
    decoded_results = tokenizer.batch_decode(result_ids, skip_special_tokens=True)

    # Print every prompt of the batch next to the text generated for it.
    for prompt_text, generated_text in zip(decoded_prompts, decoded_results):
        print("======")
        print(f"prompt -> {prompt_text}")
        print(f"===:(model {network} prompt):===")
        print(generated_text)
        print("======")

print("\nGenerazione completata.")

Inizio generazione su cuda
prompt -> parafrasa in italiano moderno: Se questo piace a tutti e se 'l tempo hae bisogno d'avere Pompeio per cavaliere e non per compagno, non riterrò più i fati.
===:(model google/mt5-base prompt):===
<pad> <extra_id_0>. A queste notizie <pad>
prompt -> parafrasa in italiano moderno: Unde gli poeti, parlando de lloro, dicono le virtute loro e dicono fabolosamente gli loro difetti, quando alcuna passava l'ordine a lloro deputato
===:(model google/mt5-base prompt):===
<pad> <extra_id_0> a l'insegnamento
prompt -> parafrasa in italiano moderno: ne salìo in su l'argine del fosso, e in su lo steccato, se da alto si potessero difendere, o per alcuna maniera passare oltre e scampare.
===:(model google/mt5-base prompt):===
<pad> <extra_id_0> il tocco umano <pad>
prompt -> parafrasa in italiano moderno: da' monti de' Romani si feciero nuovi nemici; contra i quali è conbactuto cum diversa ventura: perké nela primaia battaglia, essendo consolo Valerio, MMMD ne moriro