# Translation from Ancient to Modern Italian

In [1]:
# Import Datases to work with Transformers by Hugging-Face
import torch
import pandas as pd
from tqdm.auto import tqdm
# Imports for Transformers
from transformers import AutoTokenizer  # Datasets
from datasets import Dataset, DatasetDict
from utils import Report
import numpy as np
import evaluate

from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments, AutoModelForCausalLM          # imports for causal Learning
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoModelForSeq2SeqLM    # imports for Seq2Seq models
from peft import LoraConfig, TaskType, LoftQConfig, PeftModelForSeq2SeqLM, PeftModelForCausalLM, get_peft_model     # imports for quantization methods (LoRA etc...)
from transformers import EarlyStoppingCallback  


# Promposed Models
* google/gemma-3-1b-it (LLM) 🚀
* mistralai/Mistral-7B-Instruct-v0.2
* sapienzanlp/Minerva-1B-base-v1.0 🇮🇹 (LMM)
* Helsinki-NLP/opus-mt-itc-itc (Machine Translation) 🏆 - use OpusPrompt 

In [2]:
device = ('cuda' if torch.cuda.is_available() else "cpu")
DATASET = "dataset_ann.csv"
SRC_L = "Sentence"
TRG_L = "Target"
network = "sapienzanlp/Minerva-1B-base-v1.0"
tokenization_method = "minerva_base"
OUT_DIR = network.split("/")[-1]
EPOCHS = 100
BATCH_SIZE = 8

# Dataset Analysis

In [3]:
df = pd.read_csv(DATASET, sep=",", index_col=False)

In [4]:
print(f"length mean {SRC_L} text: {df[SRC_L].apply(lambda x: len(x.split())).mean()}")
print(f"length mean {TRG_L} text: {df[TRG_L].apply(lambda x: len(x.split())).mean()}")

length mean Sentence text: 20.04123711340206
length mean Target text: 20.690721649484537


In [5]:
df.head()

Unnamed: 0,Author,Date,Region,Sentence,Target
0,Brunetto Latini,1260-61,fior.,quella guerra ben fatta l' opera perché etc. E...,Quella guerra fu ben condotta per via delle az...
1,Bono Giamboni,1292,fior.,"crudele, e di tutte le colpe pigli vendetta, c...","È severo, e punisce tutte le colpe come prescr..."
2,Valerio Massimo (red. V1,1336,fior.,Non d' altra forza d' animo fue ornato Ponzio ...,"Ponzio Aufidiano, cavaliere romano, fu dotato ..."
3,Lucano volg. (ed. Marinoni),1330/40,prat.,Se questo piace a tutti e se 'l tempo hae biso...,Se questo è quello che tutti desiderano e se l...
4,Brunetto Latini,1260-61,fior.,Officio di questa arte pare che sia dicere app...,Il compito di quest’arte sembra essere quello ...


## Load The Dataset

In [6]:
# Switch to select the network and load the appropriate model and tokenizer
match network:
    
    case "sapienzanlp/Minerva-1B-base-v1.0":
        
        
        tokenizer = AutoTokenizer.from_pretrained(network)
        max_length = 280
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token 

        
        model = AutoModelForCausalLM.from_pretrained(network, device_map=device, torch_dtype=torch.float16)
        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
        
        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM, 
            inference_mode=False, 
            r=8, 
            lora_alpha=16, 
            lora_dropout=0.5
            )

        qlora_config = LoraConfig(
            init_lora_weights="loftq",
            loftq_config=LoftQConfig(loftq_bits=4),
            r=8,
            lora_alpha=16,
            target_modules="all-linear",
            lora_dropout=0.5,
            bias="none",
            task_type="CAUSAL_LM"
        )

        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()

        

        early_stopping_patience = 3 
        early_stopping_threshold = 0.01 

        early_callback = EarlyStoppingCallback(
            early_stopping_patience=early_stopping_patience, # Se la loss di valutazione non migliora per 3 epoche consecutive
            early_stopping_threshold=early_stopping_threshold # Ignora miglioramenti inferiori a 0.001
        )

        training_args = TrainingArguments(
            output_dir=OUT_DIR,
            learning_rate=4e-4,
            weight_decay=1e-5,
            #warmup_steps=20,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            num_train_epochs=EPOCHS,
            eval_strategy="epoch",
            save_strategy="epoch",
            logging_strategy="epoch",
            load_best_model_at_end=True,
            report_to="none",
            save_total_limit=3,
            lr_scheduler_type="linear",
            logging_dir=OUT_DIR,
            logging_steps=10,
            label_names=['labels'],
            metric_for_best_model="eval_loss", 
            greater_is_better=False,
        )

        params = {
            
            "max_new_tokens": 60, # max number of new tokens to generate
            "do_sample":True,      # enables sampling for more diverse outputs
            "top_k":50,            # diversity increase by controlling the candidate words
            "top_p":0.95,          # nucleus sampling for further control over variety
            "temperature":1.0,     # reduces randomness and increases coherence
            "repetition_penalty":0.9,  # penalizza ripetizioni
            "num_return_sequences":10,  # number of generated responses
            "pad_token_id":tokenizer.eos_token_id  # avoids warning if padding token is missing
        }

    case _:
        raise Exception(f"Rete {network} non testabile")
        

trainable params: 851,968 || all params: 1,007,552,512 || trainable%: 0.0846


In [7]:
class MyTrainerSeq2Seq(Seq2SeqTrainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        if 'num_items_in_batch' in inputs:
            inputs = {k: v for k, v in inputs.items() if k != 'num_items_in_batch'}
        return super().compute_loss(model, inputs, return_outputs=return_outputs)

class MyTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        if 'num_items_in_batch' in inputs:
            inputs = {k: v for k, v in inputs.items() if k != 'num_items_in_batch'}
        return super().compute_loss(model, inputs, return_outputs=return_outputs)

In [8]:
sacrebleu_metric = evaluate.load("sacrebleu")
rouge_metric = evaluate.load("rouge")
meteor_metric = evaluate.load("meteor")
chrf_metric = evaluate.load("chrf")
ter_metric = evaluate.load("ter")


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels] # Specific format for SacreBLEU
    return preds, labels

def compute_metrics(eval_preds):
    preds_input, label_ids = eval_preds

    # Dealing with logits or token IDs for predictions
    # If preds_input are logits (es. direct output of training modello)
    current_preds = preds_input
    if isinstance(current_preds, tuple): # Common in HF Trainer, es. (logits, hidden_states)
        current_preds = current_preds[0]
    
    if hasattr(current_preds, "ndim") and current_preds.ndim == 3: # Array of logits (batch_size, seq_len, vocab_size)
        current_preds_ids = np.argmax(current_preds, axis=-1)
    else: # Otherwise, assumed to be token ID (batch_size, seq_len)
        current_preds_ids = current_preds

    # Decode predictions and labels
    decoded_preds_raw = tokenizer.batch_decode(current_preds_ids, skip_special_tokens=True)

    # Replace -100 in labels (common for token to be ignored) with pad_token_id for decoding
    processed_label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
    decoded_labels_raw = tokenizer.batch_decode(processed_label_ids, skip_special_tokens=True)
    
    processed_preds, processed_labels_for_sacrebleu = postprocess_text(decoded_preds_raw, decoded_labels_raw)

    # For other metrics (ROUGE, METEOR, CHRF, TER), usually expects a flat list of reference strings
    flat_references = [ref[0] for ref in processed_labels_for_sacrebleu]

    results = {}

    # 1. SacreBLEU
    sacrebleu_output = sacrebleu_metric.compute(predictions=processed_preds, references=processed_labels_for_sacrebleu)
    if sacrebleu_output and "score" in sacrebleu_output:
        results["bleu"] = sacrebleu_output["score"]
    else:
        results["bleu"] = 0.0 # Fallback

    # 2. ROUGE (rouge1, rouge2, rougeL, rougeLsum)
    rouge_output = rouge_metric.compute(predictions=processed_preds, references=flat_references, use_stemmer=True)
    if rouge_output:
        results["rouge1"] = rouge_output.get("rouge1", 0.0)
        results["rouge2"] = rouge_output.get("rouge2", 0.0)
        results["rougeL"] = rouge_output.get("rougeL", 0.0)
        results["rougeLsum"] = rouge_output.get("rougeLsum", 0.0) # Spesso più robusto per sommario
    else:
        results.update({"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0, "rougeLsum": 0.0})

    # 3. METEOR
    meteor_output = meteor_metric.compute(predictions=processed_preds, references=flat_references)
    if meteor_output and "meteor" in meteor_output:
        results["meteor"] = meteor_output["meteor"]
    else:
        results["meteor"] = 0.0

    # 4. CHRF++ (CHRF with n-grams of words)
    # For CHRF++, word_order (or word_n) is > 0. Default of evaluate.load('chrf') are word_order=0 (CHRF standard).
    # Common parameters for CHRF++: word_order=2, beta=2 (beta=2 default)
    chrf_output = chrf_metric.compute(predictions=processed_preds, references=flat_references, word_order=2, beta=2)
    if chrf_output and "score" in chrf_output:
        results["chrf++"] = chrf_output["score"] # CHRF++ score
    else:
        results["chrf++"] = 0.0
        
    # (Optional) CHRF standard (only characters)
    # chrf_std_output = chrf_metric.compute(predictions=processed_preds, references=flat_references, word_order=0)
    # if chrf_std_output and "score" in chrf_std_output:
    #     results["chrf"] = chrf_std_output["score"]
    # else:
    #     results["chrf"] = 0.0

    # 5. TER (Translation Edit Rate) - the smaller, the better
    ter_output = ter_metric.compute(predictions=processed_preds, references=flat_references)
    if ter_output and "score" in ter_output:
        results["ter"] = ter_output["score"]
    else:
        results["ter"] = 1.0 # Fallback on worst score TER possible

    # Mean length of generated predictions (excluding padding tokens)
    # 'current_preds_ids' are ID token of the predictions
    prediction_lengths = [np.count_nonzero(pid_seq != tokenizer.pad_token_id) for pid_seq in current_preds_ids]
    results["gen_len"] = np.mean(prediction_lengths) if prediction_lengths else 0.0

    # Rounding of all numerical results
    final_results = {k: round(v, 4) for k, v in results.items() if isinstance(v, (int, float))}
    
    return final_results

[nltk_data] Downloading package wordnet to /home/andrea/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/andrea/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/andrea/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [9]:
from datasets.features import Value, Features
hf = Dataset.from_csv(DATASET, features=
    Features({
        SRC_L : Value("string"),
        TRG_L : Value("string"),
        "Date": Value("string"),
        "Author":Value("string"),
        "Region":Value("string")
    })          
                      
    ).shuffle(2025).train_test_split(test_size=0.10)

## Tokenization

In [10]:
print(f"model max length: {max_length}")


def noprompt_it_it(examples):
    inputs = [example for example in examples[SRC_L]]
    targets = [example for example in examples[TRG_L]]
    
    model_inputs = tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True, padding="max_length")
    return model_inputs

def minerva_base_prompt_it_it_train(examples):
    prompts = [
        f"Riscrivi '{src}' dall'Italiano Antico a l'Italiano Moderno\n Risposta\n" 
        for src in examples[SRC_L]
    ]
    targets = [f" {trg}" for trg in examples[TRG_L]]

    # Concatena input e target in un'unica stringa
    full_texts = [prompt + target for prompt, target in zip(prompts, targets)]

    # Tokenizza input+target e crea label con gli stessi token
    model_inputs = tokenizer(
        full_texts,
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )
   
    return model_inputs
    
def minerva_base_prompt_it_it_eval(examples):
    prompts = [
        f"Riscrivi '{src}' dall'Italiano Antico a l'Italiano Moderno\n Risposta\n" 
        for src in examples[SRC_L]
    ]

    # Tokenizza input+target e crea label con gli stessi token
    model_inputs = tokenizer(
        prompts,
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )
   
    return model_inputs

def base_prompt_it_it(examples):
    inputs = ["Riscrivi dall'Italiano Antico a l'Italiano Moderno: " + example for example in examples[SRC_L]]

    # Tokenizza solo gli input
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
    targets = [example for example in examples[TRG_L]]

    model_inputs = tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True, padding="longest")
    
    return model_inputs

def parafrasi_prompt_it_it(examples):
    inputs = ["Scrivi la parafrasi di questo testo: " + example for example in examples[SRC_L]]
    targets = [example for example in examples[TRG_L]]
    
    model_inputs = tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True, padding="longest")
    return model_inputs

def informative_prompt_it_it(examples):
    inputs = [f"Riscrivi in uno stile più moderno il testo del seguente Autore: '{author}', anno di scrittura: {date}, luogo: Italia, dialetto: '{region}', testo: '{text}'." for text, date, region, author in zip(examples[SRC_L], examples["Date"], examples["Region"], examples["Author"]) ]
    targets = [example for example in examples[TRG_L]]
    
    model_inputs = tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True, padding="longest")
    return model_inputs
    

model max length: 280


## Tokenizer Parameters

In [11]:
match tokenization_method:
    case "minerva_base":
        map_callback_train = minerva_base_prompt_it_it_train
        map_callback_eval   = minerva_base_prompt_it_it_eval

        hf_tokenized = DatasetDict({
            "train": hf["train"].map(map_callback_train, batched=True),
            "test":  hf["test"].map(map_callback_eval, batched=True)
        })
        
        hf_tokenized.set_format(type="torch", columns=['input_ids', 'attention_mask'])

    
    case _:
        raise ValueError("Tokenization method not avaiable")

Map:   0%|          | 0/87 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [12]:
print(hf_tokenized.column_names)
print(hf_tokenized.shape)

{'train': ['Sentence', 'Target', 'Date', 'Author', 'Region', 'input_ids', 'attention_mask'], 'test': ['Sentence', 'Target', 'Date', 'Author', 'Region', 'input_ids', 'attention_mask']}
{'train': (87, 7), 'test': (10, 7)}


In [18]:
for idx, s in enumerate(hf_tokenized["train"].take(5), 1):
    print(f"===:(sentence n°{idx}):===")
    print(f"{SRC_L}:{tokenizer.decode(s["input_ids"], attention_mask=s["attention_mask"], skip_special_tokens=True)}" )
    #print(f"{TRG_L}:{tokenizer.decode(s["labels"], skip_special_tokens=True)}")


===:(sentence n°1):===
Sentence:Riscrivi 'Creti?  Certo quand'elli si mosse, elli ti dixe: "O fedele mia donna, fa' che in mio luogo ti sia racomandato il nostro hoste troiano".' dall'Italiano Antico a l'Italiano Moderno
 Risposta
 Creti? Con convinzione, prima di partire, ti disse: 'O mia fedele donna, fa’ in modo che, per tua volontà, il nostro ospite troiano sieda al mio posto.'
===:(sentence n°2):===
Sentence:Riscrivi 'Officio di questa arte pare che sia dicere appostatamente per fare credere, fine è far credere per lo dire.' dall'Italiano Antico a l'Italiano Moderno
 Risposta
 Il compito di quest’arte sembra essere quello di parlare in modo appropriato per convincere, e di far credere qualcosa attraverso le parole.
===:(sentence n°3):===
Sentence:Riscrivi 'l' anima muta la sua forza per la propietade di quello corpo a cui ella si congiunge.' dall'Italiano Antico a l'Italiano Moderno
 Risposta
 L’anima modifica la sua forza in base alla natura del corpo al quale si unisce.
===:(sen

In [19]:
for idx, s in enumerate(hf_tokenized["test"].take(5), 1):
    print(f"===:(sentence n°{idx}):===")
    print(f"{SRC_L}:{tokenizer.decode(s["input_ids"], attention_mask=s["attention_mask"], skip_special_tokens=True)}" )
    #print(f"{TRG_L}:{tokenizer.decode(s["labels"], skip_special_tokens=True)}")

===:(sentence n°1):===
Sentence:Riscrivi 'Io spero in messer Iesù di mandare tosto a voi Timoteo, acciocché io sia d'animo buono' dall'Italiano Antico a l'Italiano Moderno
 Risposta

===:(sentence n°2):===
Sentence:Riscrivi 'noi iscaciati e dipartiti per debito dela cittade, e tutti iscaciati da fama e da ventura buona.' dall'Italiano Antico a l'Italiano Moderno
 Risposta

===:(sentence n°3):===
Sentence:Riscrivi 'Andò nel campo de' Cartaginesi e tutta la legione trasse seco.' dall'Italiano Antico a l'Italiano Moderno
 Risposta

===:(sentence n°4):===
Sentence:Riscrivi 'Altressì uno amante chiamando merzé alla sua donna dice parole e ragioni molte, et ella si difende in suo dire.' dall'Italiano Antico a l'Italiano Moderno
 Risposta

===:(sentence n°5):===
Sentence:Riscrivi 'Ed ecco di subito tutta questa turba degli uccelli si levò a volo dietro all'aquila' dall'Italiano Antico a l'Italiano Moderno
 Risposta



## Models & Traning

### PEFT Fine-Tuning

In [14]:
trainer = MyTrainer(
            model=model,
            args=training_args,
            train_dataset=hf_tokenized["train"],
            eval_dataset=hf_tokenized["test"],
            processing_class=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
            callbacks=[Report(OUT_DIR), early_callback]
        )

In [15]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Rouge1,Rouge2,Rougel,Rougelsum,Meteor,Chrf++,Ter,Gen Len
1,3.8136,3.558442,1.3547,0.2478,0.0612,0.1939,0.2179,0.1296,30.5764,93.7238,280.0
2,2.9039,2.762877,19.3692,0.4367,0.3062,0.3945,0.4193,0.3655,45.8013,74.8954,280.0
3,2.6319,2.671443,18.5974,0.4446,0.3089,0.4043,0.432,0.3656,45.1249,75.3138,280.0
4,2.4882,2.653488,18.299,0.4392,0.3081,0.3985,0.4241,0.3521,44.2479,75.3138,280.0
5,2.3837,2.644812,20.0937,0.4497,0.3099,0.4065,0.4276,0.382,45.8046,74.8954,280.0
6,2.2777,2.674479,20.3143,0.4712,0.2995,0.4334,0.4569,0.4071,46.9554,70.7113,280.0
7,2.1613,2.66369,19.9846,0.4651,0.3025,0.4334,0.4549,0.4057,46.5902,71.9665,280.0


Training done. Generating graphs...


TrainOutput(global_step=77, training_loss=2.6657403475278385, metrics={'train_runtime': 57.9904, 'train_samples_per_second': 150.025, 'train_steps_per_second': 18.969, 'total_flos': 962186705141760.0, 'train_loss': 2.6657403475278385, 'epoch': 7.0})

In [None]:


# Imposta il modello in modalità valutazione e spostalo sul device
model = model.eval()
model = model.to(device)

# Crea il DataLoader
loader = torch.utils.data.DataLoader(hf_tokenized["test"], batch_size=8)


print(f"Inizio generazione su {device}")
print("=============================")

for batch in tqdm(loader):
    # Sposta l'intero batch sul device
    # Nota: DataLoader restituisce un batch come dizionario di tensori
    
    batch["input_ids"] = batch["input_ids"].to(device)
    batch["attention_mask"] = batch["attention_mask"].to(device)

    # Genera l'output per il batch
    # Input al generate devono essere input_ids e attention_mask
    result_ids = model.generate(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        max_new_tokens=60,
    )

    # Decodifica *separatamente* ogni prompt e ogni risultato generato
    # Iteriamo sul batch per decodificare uno per uno
    # batch["input_ids"] ha forma (batch_size, seq_len_input)
    # result_ids ha forma (batch_size, seq_len_output)
    
    # Decodifica i prompt originali
    decoded_prompts = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)
    
    # Decodifica i risultati generati
    decoded_results = tokenizer.batch_decode(result_ids, skip_special_tokens=True)

    # Stampa i risultati per ogni elemento del batch
    with open(OUT_DIR + "/output_chat.txt", "w", encoding="utf-8") as f:
        for i in range(len(decoded_prompts)):
            print("===:(Model for Prompt)===")
            print(f"{decoded_prompts[i]}")
            print("=========================")
            
            print(f"===:(model {network}):===")
            print(decoded_results[i])
            print("=================================")
            
            ############################################


            f.write("===:(Model for Prompt)===")
            f.write(f"{decoded_prompts[i]}")
            f.write("=========================")

            f.write(f"===:(model {network}):===")
            f.write(decoded_results[i])
            f.write("==========================")

print("\nGenerazione completata.")

Inizio generazione su cuda


  0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


===:(Model for Prompt)===
Riscrivi 'Io spero in messer Iesù di mandare tosto a voi Timoteo, acciocché io sia d'animo buono' dall'Italiano Antico a l'Italiano Moderno
 Risposta

===:(model sapienzanlp/Minerva-1B-base-v1.0):===
Riscrivi 'Io spero in messer Iesù di mandare tosto a voi Timoteo, acciocché io sia d'animo buono' dall'Italiano Antico a l'Italiano Moderno
 Risposta
 Spero che il Signore Gesù, il quale è venuto per salvare il mondo, mandi presto a voi Timoteo, affinché io possa essere un buon cristiano. 100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
===:(Model for Prompt)===
Riscrivi 'noi iscaciati e dipartiti per debito dela cittade, e tutti iscaciati da fama e da ventura buona.' dall'Italiano Antico a l'Italiano Moderno
 Risposta

===:(model sapienzanlp/Minerva-1B-base-v1.0):===
R