# Translation from Ancient to Modern Italian

In [86]:
# Imports for working with Hugging Face Datasets and Transformers
import torch
import pandas as pd
from tqdm.auto import tqdm
# Imports for Transformers
from transformers import AutoTokenizer  # Datasets
from datasets import Dataset, DatasetDict
from utils import Report
import numpy as np
import evaluate

from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments, AutoModelForCausalLM          # imports for causal Learning
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoModelForSeq2SeqLM    # imports for Seq2Seq models
from peft import LoraConfig, TaskType, LoftQConfig, PeftModelForSeq2SeqLM, PeftModelForCausalLM, get_peft_model     # imports for quantization methods (LoRA etc...)
from transformers import EarlyStoppingCallback  


# Proposed Models
* google/gemma-3-1b-it (LLM) 🚀
* mistralai/Mistral-7B-Instruct-v0.2
* sapienzanlp/Minerva-1B-base-v1.0 🇮🇹 (LLM)
* Helsinki-NLP/opus-mt-itc-itc (Machine Translation) 🏆 - use OpusPrompt 

In [87]:
# --- Experiment configuration ---------------------------------------------
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Input CSV and the names of its source / target text columns.
DATASET = "dataset_ann.csv"
SRC_L = "Sentence"
TRG_L = "Target"

# Hub id of the checkpoint to fine-tune and the prompt recipe used for it.
network = "sapienzanlp/Minerva-1B-base-v1.0"
tokenization_method = "minerva_base"

# Output directory: the part of the hub id after the last "/".
OUT_DIR = network.rsplit("/", 1)[-1]

# Training and tokenization hyper-parameters.
EPOCHS = 100
BATCH_SIZE = 8
max_length = 120

# Dataset Analysis

In [88]:
# Read the CSV named by DATASET into a DataFrame for a quick inspection.
# index_col=False: do not use the first column of the file as the index.
df = pd.read_csv(DATASET, sep=",", index_col=False)

In [89]:
# Average number of whitespace-separated words in the source and target columns.
for column in (SRC_L, TRG_L):
    mean_words = df[column].apply(lambda x: len(x.split())).mean()
    print(f"length mean {column} text: {mean_words}")

length mean Sentence text: 20.04123711340206
length mean Target text: 20.690721649484537


In [90]:
# Preview the first rows of the DataFrame.
df.head() 

Unnamed: 0,Author,Date,Region,Sentence,Target
0,Brunetto Latini,1260-61,fior.,quella guerra ben fatta l' opera perché etc. E...,Quella guerra fu ben condotta per via delle az...
1,Bono Giamboni,1292,fior.,"crudele, e di tutte le colpe pigli vendetta, c...","È severo, e punisce tutte le colpe come prescr..."
2,Valerio Massimo (red. V1,1336,fior.,Non d' altra forza d' animo fue ornato Ponzio ...,"Ponzio Aufidiano, cavaliere romano, fu dotato ..."
3,Lucano volg. (ed. Marinoni),1330/40,prat.,Se questo piace a tutti e se 'l tempo hae biso...,Se questo è quello che tutti desiderano e se l...
4,Brunetto Latini,1260-61,fior.,Officio di questa arte pare che sia dicere app...,Il compito di quest’arte sembra essere quello ...


## Load The Dataset

In [91]:
# Build everything that depends on the chosen checkpoint: tokenizer, model,
# data collator, LoRA adapters, early stopping, training arguments and
# generation parameters. Unsupported `network` values raise immediately.
match network:
    
    case "sapienzanlp/Minerva-1B-base-v1.0" | "sapienzanlp/Minerva-7B-base-v1.0":
        
        
        tokenizer = AutoTokenizer.from_pretrained(network)
        # Padding is requested at tokenization time, so a pad token must exist;
        # reuse EOS when the tokenizer does not define one.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token 

        
        # Load the causal LM with float16 weights on the selected device.
        model = AutoModelForCausalLM.from_pretrained(network, device_map=device, torch_dtype=torch.float16)
        # mlm=False selects plain (non-masked) language-modeling collation.
        # NOTE(review): per the transformers docs this collator builds `labels`
        # from `input_ids` itself — confirm against the installed version.
        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
        
        # LoRA adapter configuration actually applied to the model below.
        # NOTE(review): lora_dropout=0.65 is unusually high — confirm it is intended.
        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM, 
            inference_mode=False, 
            r=8, 
            lora_alpha=16, 
            lora_dropout=0.65
            )

        # Alternative LoftQ-initialised (4-bit) configuration. It is defined here
        # but is not the one passed to get_peft_model below.
        qlora_config = LoraConfig(
            init_lora_weights="loftq",
            loftq_config=LoftQConfig(loftq_bits=4),
            r=8,
            lora_alpha=16,
            target_modules="all-linear",
            lora_dropout=0.5,
            bias="none",
            task_type="CAUSAL_LM"
        )

        # Wrap the base model with the LoRA adapters and report how many
        # parameters remain trainable.
        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()

        

        # Early-stopping settings (evaluation runs once per epoch, see
        # eval_strategy below).
        early_stopping_patience = 10
        early_stopping_threshold = 0.01 

        early_callback = EarlyStoppingCallback(
            early_stopping_patience=early_stopping_patience, # stop after this many consecutive evaluations without improvement
            early_stopping_threshold=early_stopping_threshold # improvements smaller than this value do not count
        )

        # NOTE(review): logging_strategy="epoch" and logging_steps=10 are both set;
        # presumably logging_steps has no effect with the epoch strategy — confirm.
        training_args = TrainingArguments(
            output_dir=OUT_DIR,
            learning_rate=6e-5,
            weight_decay=1e-4,
            warmup_steps=80,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            num_train_epochs=EPOCHS,
            eval_strategy="epoch",
            save_strategy="epoch",
            logging_strategy="epoch",
            load_best_model_at_end=True,
            report_to="none",
            save_total_limit=3,
            lr_scheduler_type="cosine",
            logging_dir=OUT_DIR,
            logging_steps=10,
            label_names=['labels'],
            metric_for_best_model="eval_chrf++", 
            greater_is_better=True,
        )

        # Sampling parameters intended for text generation.
        # NOTE(review): in the visible part of the notebook this dict is not
        # passed to model.generate — confirm whether it is used elsewhere.
        params = {
            
            "max_new_tokens": max_length, # max number of new tokens to generate
            "do_sample":True,      # enables sampling for more diverse outputs
            "top_k":100,            # restrict sampling to the 100 most likely tokens
            "top_p":0.95,          # nucleus sampling for further control over variety
            "temperature":1.0,     # 1.0 leaves the token distribution unscaled
            "repetition_penalty":1.0,  # 1.0 applies no repetition penalty
            "num_return_sequences":10,  # number of generated responses
            "pad_token_id":tokenizer.eos_token_id  # avoids warning if padding token is missing
        }

    case _:
        # The message reads "Network {network} cannot be tested".
        raise Exception(f"Rete {network} non testabile")
        

trainable params: 851,968 || all params: 1,007,552,512 || trainable%: 0.0846


In [92]:
class _DropNumItemsInBatchMixin:
    """Shared ``compute_loss`` override for the two trainer subclasses below.

    It removes a ``'num_items_in_batch'`` entry from ``inputs`` (when present)
    before delegating to the parent trainer, so that key is not part of the
    inputs the parent implementation receives.

    NOTE(review): the ``num_items_in_batch`` keyword argument is accepted but
    deliberately not forwarded to the parent, exactly as in the original
    per-class overrides — confirm this is still wanted with the installed
    transformers version.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the loss with the parent trainer after filtering ``inputs``.

        Args:
            model: model being trained or evaluated.
            inputs: mapping of batch tensors; copied without the
                ``'num_items_in_batch'`` key when that key is present.
            return_outputs: forwarded unchanged to the parent implementation.
            num_items_in_batch: accepted for signature compatibility; unused.

        Returns:
            Whatever the parent ``compute_loss`` returns.
        """
        if 'num_items_in_batch' in inputs:
            # Build a filtered copy so the caller's mapping is left untouched.
            inputs = {k: v for k, v in inputs.items() if k != 'num_items_in_batch'}
        return super().compute_loss(model, inputs, return_outputs=return_outputs)


class MyTrainerSeq2Seq(_DropNumItemsInBatchMixin, Seq2SeqTrainer):
    """``Seq2SeqTrainer`` whose loss computation ignores ``num_items_in_batch``."""


class MyTrainer(_DropNumItemsInBatchMixin, Trainer):
    """``Trainer`` whose loss computation ignores ``num_items_in_batch``."""

In [93]:
# Load every evaluation metric once; compute_metrics reads these module-level names.
(
    sacrebleu_metric,
    rouge_metric,
    meteor_metric,
    chrf_metric,
    ter_metric,
) = map(evaluate.load, ("sacrebleu", "rouge", "meteor", "chrf", "ter"))


def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        Tuple ``(preds, labels)``: ``preds`` is a list of stripped strings and
        ``labels`` a list of one-element lists (one reference per prediction),
        the shape passed to SacreBLEU.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        # Each reference is wrapped in its own list.
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels

def _metric_value(output, key, default=0.0):
    """Return ``output[key]`` when the metric produced it, otherwise ``default``."""
    if output and key in output:
        return output[key]
    return default


def compute_metrics(eval_preds):
    """Decode predictions and labels, then score them with several text metrics.

    Args:
        eval_preds: pair ``(predictions, label_ids)``. ``predictions`` may be a
            tuple (only its first element is used), a 3-D array that is treated
            as logits and reduced with argmax over the last axis, or anything
            else, which is treated as token ids.

    Returns:
        dict with the keys ``bleu``, ``rouge1``, ``rouge2``, ``rougeL``,
        ``rougeLsum``, ``meteor``, ``chrf++``, ``ter`` and ``gen_len``; every
        numeric value is rounded to 4 decimals.
    """
    preds_input, label_ids = eval_preds

    # A tuple carries several model outputs; only the first one is scored.
    current_preds = preds_input[0] if isinstance(preds_input, tuple) else preds_input

    if hasattr(current_preds, "ndim") and current_preds.ndim == 3:
        # Treated as logits: keep the most likely token per position.
        # NOTE(review): for a causal LM the logits at position t presumably score
        # token t+1, so these ids may be offset by one w.r.t. label_ids — confirm.
        current_preds_ids = np.argmax(current_preds, axis=-1)
    else:
        # Otherwise assumed to already be token ids.
        current_preds_ids = current_preds

    # -100 is not a valid token id and cannot be decoded. Labels are sanitized
    # below; predictions need the same treatment when they carry the fill value.
    if isinstance(current_preds_ids, np.ndarray):
        current_preds_ids = np.where(
            current_preds_ids != -100, current_preds_ids, tokenizer.pad_token_id
        )

    decoded_preds_raw = tokenizer.batch_decode(current_preds_ids, skip_special_tokens=True)

    # Replace the ignore index (-100) in the labels with the pad id before decoding.
    processed_label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
    decoded_labels_raw = tokenizer.batch_decode(processed_label_ids, skip_special_tokens=True)

    processed_preds, processed_labels_for_sacrebleu = postprocess_text(decoded_preds_raw, decoded_labels_raw)

    # ROUGE, METEOR, chrF and TER are given one flat reference string per prediction.
    flat_references = [ref[0] for ref in processed_labels_for_sacrebleu]

    results = {}

    # 1. SacreBLEU
    sacrebleu_output = sacrebleu_metric.compute(
        predictions=processed_preds, references=processed_labels_for_sacrebleu
    )
    results["bleu"] = _metric_value(sacrebleu_output, "score")

    # 2. ROUGE (rouge1, rouge2, rougeL, rougeLsum)
    rouge_output = rouge_metric.compute(
        predictions=processed_preds, references=flat_references, use_stemmer=True
    )
    for rouge_key in ("rouge1", "rouge2", "rougeL", "rougeLsum"):
        results[rouge_key] = _metric_value(rouge_output, rouge_key)

    # 3. METEOR
    meteor_output = meteor_metric.compute(predictions=processed_preds, references=flat_references)
    results["meteor"] = _metric_value(meteor_output, "meteor")

    # 4. chrF++: chrF with word n-grams enabled (word_order=2, beta=2).
    chrf_output = chrf_metric.compute(
        predictions=processed_preds, references=flat_references, word_order=2, beta=2
    )
    results["chrf++"] = _metric_value(chrf_output, "score")

    # 5. TER (Translation Edit Rate) - the smaller, the better.
    # NOTE(review): the fallback stays at 1.0 as in the original code; the TER
    # values logged during training are around 80-95, so confirm 1.0 is the
    # intended "failure" value.
    ter_output = ter_metric.compute(predictions=processed_preds, references=flat_references)
    results["ter"] = _metric_value(ter_output, "score", default=1.0)

    # Mean number of non-pad token ids per prediction.
    prediction_lengths = [
        np.count_nonzero(pid_seq != tokenizer.pad_token_id) for pid_seq in current_preds_ids
    ]
    results["gen_len"] = np.mean(prediction_lengths) if prediction_lengths else 0.0

    # Round every numeric result; np.float64 passes the check because it subclasses float.
    final_results = {k: round(v, 4) for k, v in results.items() if isinstance(v, (int, float))}

    return final_results

[nltk_data] Downloading package wordnet to /home/andrea/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/andrea/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/andrea/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [94]:
from datasets.features import Value, Features

# One seed for both the shuffle and the train/test split. Without an explicit
# seed the split is re-drawn at random on every run, so the evaluation
# examples (and therefore the reported metrics) would not be reproducible.
SPLIT_SEED = 2025

# Read the CSV with every listed column typed as a string, shuffle it, then
# hold out 10% of the rows as the test split.
hf = Dataset.from_csv(
    DATASET,
    features=Features({
        SRC_L: Value("string"),
        TRG_L: Value("string"),
        "Date": Value("string"),
        "Author": Value("string"),
        "Region": Value("string"),
    }),
).shuffle(SPLIT_SEED).train_test_split(test_size=0.1, seed=SPLIT_SEED)

## Tokenization

In [95]:
# Show the `max_length` value used for truncation/padding by the prompt builders below.
print(f"model max length: {max_length}")


def noprompt_it_it(examples):
    """Tokenize source sentences and targets without any instruction prompt.

    Args:
        examples: batch mapping column name -> list of values; the SRC_L and
            TRG_L columns are read.

    Returns:
        The tokenizer output for the sources, with the targets passed as
        ``text_target``; truncated and padded to ``max_length``.
    """
    sources = list(examples[SRC_L])
    references = list(examples[TRG_L])

    return tokenizer(
        sources,
        text_target=references,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

# Candidate instruction verbs for the prompt:
# I. Riscrivi (rewrite)
# II. Traduci (translate)
# III. Correggi (correct)
def minerva_base_prompt_it_it_train(examples):
    """Build training examples: instruction + source sentence + modern rewrite.

    Each prompt embeds the source sentence, its region and date, and ends with
    the reference translation, so prompt and target form one sequence.

    Args:
        examples: batch mapping column name -> list of values; the SRC_L,
            TRG_L, "Date" and "Region" columns are read.

    Returns:
        The tokenizer output (truncated and padded to ``max_length``) with an
        added ``labels`` entry: a copy of ``input_ids`` in which padded
        positions are replaced by -100.
    """
    prompts = [
        f"""Riscrivi la seguente frase {src} scritta in {dia} italiano arcaico del {dat} in Italiano moderno: {dst}"""
        for src, dst, dat, dia in zip(examples[SRC_L], examples[TRG_L], examples["Date"], examples["Region"])
    ]

    # Tokenize prompt and target together as a single sequence.
    # NOTE(review): no EOS token is appended after the target here — confirm
    # whether the tokenizer adds one, otherwise the model may not learn to stop.
    model_inputs = tokenizer(
        prompts,
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )

    # Mask padding through the attention mask rather than by comparing ids with
    # pad_token_id: the pad token may be the EOS token (it is used as a fallback
    # when the tokenizer defines no pad token), and an id comparison would then
    # also mask genuine EOS tokens.
    model_inputs["labels"] = [
        [(token_id if keep else -100) for token_id, keep in zip(input_ids, attention_mask)]
        for input_ids, attention_mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]
    return model_inputs
    
def minerva_base_prompt_it_it_eval(examples):
    """Build evaluation prompts: the instruction and source sentence only.

    The prompt stops right after the instruction; no target text is appended
    and no ``labels`` entry is created.

    Args:
        examples: batch mapping column name -> list of values; only the SRC_L
            column is read.

    Returns:
        The tokenizer output for the prompts, truncated and padded to
        ``max_length``.
    """
    prompts = []
    for src in examples[SRC_L]:
        prompts.append(
            f"""Riscrivi la seguente frase {src} scritta in Italiano arcaico in Italiano moderno: """
        )

    # Tokenize the prompts only.
    return tokenizer(prompts, max_length=max_length, truncation=True, padding="max_length")

def base_prompt_it_it(examples):
    """Tokenize sources prefixed with a short rewrite instruction, plus targets.

    Args:
        examples: batch mapping column name -> list of values; the SRC_L and
            TRG_L columns are read.

    Returns:
        The tokenizer output for the prefixed sources, with the targets passed
        as ``text_target``; truncated to ``max_length`` and padded to the
        longest sequence in the batch.
    """
    inputs = ["Riscrivi dall'Italiano Antico a l'Italiano Moderno: " + example for example in examples[SRC_L]]
    targets = list(examples[TRG_L])

    # A single tokenizer call handles inputs and targets. The earlier version
    # also tokenized the inputs alone first and then discarded that result.
    model_inputs = tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True, padding="longest")

    return model_inputs

def parafrasi_prompt_it_it(examples):
    """Tokenize sources prefixed with a paraphrase instruction, plus targets.

    Args:
        examples: batch mapping column name -> list of values; the SRC_L and
            TRG_L columns are read.

    Returns:
        The tokenizer output for the prefixed sources, with the targets passed
        as ``text_target``; truncated to ``max_length`` and padded to the
        longest sequence in the batch.
    """
    prefix = "Scrivi la parafrasi di questo testo: "

    prompted_sources = []
    for sentence in examples[SRC_L]:
        prompted_sources.append(prefix + sentence)
    references = list(examples[TRG_L])

    return tokenizer(
        prompted_sources,
        text_target=references,
        max_length=max_length,
        truncation=True,
        padding="longest",
    )

def informative_prompt_it_it(examples):
    """Tokenize sources wrapped in a prompt carrying author, date and dialect.

    Args:
        examples: batch mapping column name -> list of values; the SRC_L,
            TRG_L, "Date", "Region" and "Author" columns are read.

    Returns:
        The tokenizer output for the metadata-rich prompts, with the targets
        passed as ``text_target``; truncated to ``max_length`` and padded to
        the longest sequence in the batch.
    """
    rows = zip(examples[SRC_L], examples["Date"], examples["Region"], examples["Author"])

    prompted_sources = []
    for text, date, region, author in rows:
        prompted_sources.append(
            f"Riscrivi in uno stile più moderno il testo del seguente Autore: '{author}', anno di scrittura: {date}, luogo: Italia, dialetto: '{region}', testo: '{text}'."
        )
    references = list(examples[TRG_L])

    return tokenizer(
        prompted_sources,
        text_target=references,
        max_length=max_length,
        truncation=True,
        padding="longest",
    )
    

model max length: 120


## Tokenizer Parameters

In [96]:
match tokenization_method:
    case "minerva_base":
        map_callback_train = minerva_base_prompt_it_it_train
        map_callback_eval   = minerva_base_prompt_it_it_eval

        hf_tokenized = DatasetDict({
            "train": hf["train"].map(map_callback_train, batched=True),
            "test":  hf["test"].map(map_callback_eval, batched=True)
        })
        
        hf_tokenized.set_format(type="torch", columns=['input_ids', 'attention_mask'])

    
    case _:
        raise ValueError("Tokenization method not avaiable")

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [97]:
# Column names, then (rows, columns) shape, of every tokenized split.
for split_summary in (hf_tokenized.column_names, hf_tokenized.shape):
    print(split_summary)

{'train': ['Sentence', 'Target', 'Date', 'Author', 'Region', 'input_ids', 'attention_mask', 'labels'], 'test': ['Sentence', 'Target', 'Date', 'Author', 'Region', 'input_ids', 'attention_mask']}
{'train': (87, 8), 'test': (10, 7)}


In [98]:
# Sanity check: decode the first five tokenized training examples back to text.
train_preview = hf_tokenized["train"].take(5)
for idx, s in enumerate(train_preview, 1):
    print(f"===:(sentence n°{idx}):===")
    token_ids, mask = s["input_ids"], s["attention_mask"]
    decoded_text = tokenizer.decode(token_ids, attention_mask=mask, skip_special_tokens=True)
    print(f"{SRC_L}:{decoded_text}")


===:(sentence n°1):===
Sentence:Riscrivi la seguente frase Se questo piace a tutti e se 'l tempo hae bisogno d'avere Pompeio per cavaliere e non per compagno, non riterrò più i fati. scritta in prat. italiano arcaico del 1330/40 in Italiano moderno: Se questo è quello che tutti desiderano e se l'attuale situazione richiede che Pompeo sia un leader e non solo un alleato, allora non cercherò più di oppormi al destino.
===:(sentence n°2):===
Sentence:Riscrivi la seguente frase Unde gli poeti, parlando de lloro, dicono le virtute loro e dicono fabolosamente gli loro difetti, quando alcuna passava l'ordine a lloro deputato scritta in umbr.-tosc. italiano arcaico del 1375-77 in Italiano moderno: Perciò i poeti, quando parlano di loro, ne esaltano le virtù e, in modo allegorico o fittizio, ne descrivono i difetti, specialmente quando qualcuno di essi superava i limiti stabiliti dal proprio ruolo.
===:(sentence n°3):===
Sentence:Riscrivi la seguente frase ne salìo in su l'argine del fosso, e i

In [99]:
# Sanity check: decode the first five tokenized test prompts back to text.
test_preview = hf_tokenized["test"].take(5)
for idx, s in enumerate(test_preview, 1):
    print(f"===:(sentence n°{idx}):===")
    token_ids, mask = s["input_ids"], s["attention_mask"]
    decoded_text = tokenizer.decode(token_ids, attention_mask=mask, skip_special_tokens=True)
    print(f"{SRC_L}:{decoded_text}")

===:(sentence n°1):===
Sentence:Riscrivi la seguente frase Io gli apersi, e quelli fuggitte. E che bisogno è, che lo cuore tuo stia chiuso al tuo sposo Cristo? scritta in Italiano arcaico in Italiano moderno: 
===:(sentence n°2):===
Sentence:Riscrivi la seguente frase sì convene che lli desse grande avere perché · lasciasse andare via. scritta in Italiano arcaico in Italiano moderno: 
===:(sentence n°3):===
Sentence:Riscrivi la seguente frase noi iscaciati e dipartiti per debito dela cittade, e tutti iscaciati da fama e da ventura buona. scritta in Italiano arcaico in Italiano moderno: 
===:(sentence n°4):===
Sentence:Riscrivi la seguente frase una villa chiamata Vitermina, con ciò fosse cosa che piusori principi di scherani corressero per ventura a quel tempo a vederlo, Scipione, stimando che venissero per isforzarlo, allogò ne la casa scritta in Italiano arcaico in Italiano moderno: 
===:(sentence n°5):===
Sentence:Riscrivi la seguente frase Creti?  Certo quand'elli si mosse, elli ti

## Models & Training

### PEFT Fine-Tuning

In [100]:
# Assemble the trainer from the objects prepared in the earlier cells.
# NOTE(review): the evaluation split is tokenized from prompts that contain no
# target text — confirm what the evaluation metrics are compared against.
trainer = MyTrainer(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=hf_tokenized["train"],
    eval_dataset=hf_tokenized["test"],
    compute_metrics=compute_metrics,
    # Report comes from the project-local `utils` module (behaviour not shown
    # here); early_callback is the EarlyStoppingCallback built during setup.
    callbacks=[Report(OUT_DIR), early_callback],
)

In [101]:
# Run fine-tuning; as the last expression of the cell, the returned value is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Rouge1,Rouge2,Rougel,Rougelsum,Meteor,Chrf++,Ter,Gen Len
1,4.1562,5.238818,0.4542,0.2543,0.0281,0.1726,0.1729,0.1532,21.1414,92.8349,120.0
2,4.1453,5.215823,0.4956,0.2543,0.0281,0.1726,0.1729,0.1533,21.2637,94.081,120.0
3,4.1239,5.166306,0.6419,0.2528,0.0308,0.1711,0.1682,0.1641,21.7455,95.0156,120.0
4,4.0653,5.070263,0.7463,0.2491,0.0273,0.1673,0.1642,0.167,21.5545,91.9003,120.0
5,3.9778,4.899768,0.9061,0.252,0.0273,0.1716,0.1713,0.1716,22.2529,90.6542,120.0
6,3.8309,4.642314,1.6252,0.2942,0.063,0.2149,0.2145,0.2218,26.406,86.2928,120.0
7,3.6128,4.366135,3.2075,0.316,0.0965,0.2573,0.2578,0.2429,29.5817,83.4891,120.0
8,3.3562,4.128733,3.193,0.3676,0.1146,0.3137,0.3112,0.266,31.2828,82.866,120.0
9,3.0931,3.942444,3.3533,0.3825,0.1441,0.346,0.3508,0.3176,35.0446,81.9315,120.0
10,2.875,3.857544,3.2117,0.3587,0.1005,0.3213,0.3234,0.2774,33.3835,78.5047,120.0


Training done. Generating graphs...


TrainOutput(global_step=209, training_loss=3.1628864416095057, metrics={'train_runtime': 63.7312, 'train_samples_per_second': 136.511, 'train_steps_per_second': 17.26, 'total_flos': 1119278412103680.0, 'train_loss': 3.1628864416095057, 'epoch': 19.0})

In [102]:
# Put the model in evaluation mode and make sure it is on the target device.
model = model.eval()
model = model.to(device)

# Batches of tokenized test prompts.
loader = torch.utils.data.DataLoader(hf_tokenized["test"], batch_size=BATCH_SIZE)


print(f"Inizio generazione su {device}")
print("=============================")

# Open the transcript ONCE, before the loop. Opening it with mode "w" inside
# the loop truncated the file on every batch, so only the last batch survived.
with open(OUT_DIR + "/output_chat.txt", "w", encoding="utf-8") as f:
    example_offset = 0  # running index so entries from different batches stay distinct

    for batch in tqdm(loader):
        # The DataLoader yields a dict of tensors; move the model inputs to the device.
        batch["input_ids"] = batch["input_ids"].to(device)
        batch["attention_mask"] = batch["attention_mask"].to(device)

        # NOTE(review): the prompts were padded to `max_length` at tokenization
        # time; for decoder-only generation left padding is generally
        # recommended — confirm `tokenizer.padding_side`.
        result_ids = model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_new_tokens=max_length,
        )

        # batch["input_ids"] is (batch_size, prompt_len); result_ids is
        # (batch_size, output_len). Decode the prompts and outputs separately.
        decoded_prompts = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)
        decoded_results = tokenizer.batch_decode(result_ids, skip_special_tokens=True)

        pairs = zip(decoded_prompts, decoded_results)
        for i, (prompt_text, result_text) in enumerate(pairs, start=example_offset):
            prompt_header = f"===:{i}-(Model for Prompt)==="
            model_header = f"===:(model {network}):==="

            print(prompt_header)
            print(f"{prompt_text}")
            print("=========================")

            print(model_header)
            print(result_text)
            print("=================================")

            # Same content as the console, one item per line. The original
            # writes had the f-prefix inside the string literal and no line breaks.
            f.write(prompt_header + "\n")
            f.write(f"{prompt_text}\n")
            f.write("=========================\n")

            f.write(model_header + "\n")
            f.write(result_text + "\n")
            f.write("==========================\n")

        example_offset += len(decoded_prompts)

print("\nGenerazione completata.")

Inizio generazione su cuda


  0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


===:0-(Model for Prompt)===
Riscrivi la seguente frase Io gli apersi, e quelli fuggitte. E che bisogno è, che lo cuore tuo stia chiuso al tuo sposo Cristo? scritta in Italiano arcaico in Italiano moderno: 
===:(model sapienzanlp/Minerva-1B-base-v1.0):===
Riscrivi la seguente frase Io gli apersi, e quelli fuggitte. E che bisogno è, che lo cuore tuo stia chiuso al tuo sposo Cristo? scritta in Italiano arcaico in Italiano moderno: 1800 in italiano moderno: Io gli apersi, e quelli fuggirono.
Settima lettera di Giovanni apostolo, scritta in greco in italiano moderno: 1800 in italiano moderno: Io gli apersi, e quelli fuggirono.
Versione del 1800 in italiano moderno
Io gli apersi, e quelli fuggirono.
Io gli apersi, e quelli fuggirono.
Io gli apersi, e quelli fuggirono.
Io gli apersi, e quelli fuggirono.
Io gli apersi, e quelli fugg
===:1-(Model for Prompt)===
Riscrivi la seguente frase sì convene che lli desse grande avere perché · lasciasse andare via. scritta in Italiano arcaico in Italiano