# Project of Translation from Archaic to Modern Italian

In [None]:
# Imports for working with Hugging Face Datasets and Transformers
import torch
import pandas as pd
import huggingface_hub
# Imports for Transformers
from datasets import Dataset
from transformers import AutoTokenizer  # Datasets
from datasets.features import Value, Features





# Proposed Models

### Prompt Learning
* google/gemma-3-4b-it (LLM) for context learning - use ChatPrompt 🚀  
* MaLA-LM/emma-500-llama2-7b   another LLM for context learning - use ChatPrompt 🦙 
* sapienzanlp/Minerva-7B-instruct-v1.0 🇮🇹
### Fine Tuning
* sapienzanlp/Minerva-1B-base-v1.0  fine-tuned for the translation task (LLM) - use MinervaPrompt (same for fine-tuning)  🇮🇹
* google/mt5-large (Machine Translation) 🤖
### Native Machine Translation Systems
* Helsinki-NLP/opus-mt-itc-itc (Machine Translation) for translation task - use OpusPrompt 🏆  
* facebook/nllb-200-3.3B (Machine Translation) another translation machine  🤖 FLORES-200

### Other Task
* models--prometheus-eval--prometheus-7b-v2.0 as judge - use PrometheusPrompt 🔥 

# System Setup 🖥️

### Additional Dependencies 🐍

In [2]:
#!bash install.sh

### Hugging Face 🤗

In [3]:
# Read the Hugging Face access token from the HF_TOKEN environment variable so
# the secret never has to be pasted into (and saved with) the notebook.
# Falls back to the empty string, the value that was hard-coded before.
import os

TOKEN = os.environ.get("HF_TOKEN", "")
#huggingface_hub.login(token=TOKEN)

# Global Variables

In [4]:
# Run configuration shared by the cells below.
DATASET = "dataset.csv"  # CSV file loaded with Dataset.from_csv

# Every column of the CSV is read as a plain string.
FEATURES = Features(
    {
        column: Value(dtype="string")
        for column in ("Author", "Date", "Region", "Sentence")
    }
)

NET = "sapienzanlp/Minerva-7B-instruct-v1.0"  # checkpoint matched by the network-selection cell
BS = 16  # forwarded as batch_size to evaluate_and_save
PROMPT = "minerva"  # key matched by the prompt-selection cell
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # prefer the GPU when one is visible

## Prompts


### Prompts for Prompt Learning (LLM)

In [5]:
# Module-level placeholders read by the prompt builders below.
# `tokenizer` is rebound by the network-selection cell; `max_length` is the
# token budget handed to the tokenizer calls that reference it.
max_length = 120
tokenizer = None

In [None]:
def seqchat_map(examples):
    """Build and tokenize a multi-turn few-shot chat prompt for each sentence.

    Every conversation consists of the same system instruction and few-shot
    turns, followed by a final user turn that embeds the sentence to rewrite.

    The conversation now ends on that user turn and the template is asked to
    open the assistant reply via ``add_generation_prompt=True``.  Previously a
    trailing assistant message with empty content was appended instead, which
    makes the chat template render a complete (empty) assistant turn rather
    than a prompt the model should continue.

    Args:
        examples: Batch with a ``"Sentence"`` column (iterable of strings).

    Returns:
        Whatever the module-level ``tokenizer.apply_chat_template`` returns
        for the batch (requested as a dict of padded PyTorch tensors).
    """
    system_turn = {"role": "system", "content": "Rispondendo in Italiano riscrivi Le Frasi in Italiano Antico in Italiano Moderno sequendo le indicazioni dello 'user' e rispondendo precisamente alle richieste senza dare spiegazioni"}

    # NOTE(review): the first assistant answer is empty and the later answers
    # paraphrase a different sentence than the one quoted in the first user
    # turn -- confirm that this few-shot example is intended.
    few_shot_turns = [
        {"role": "user", "content": "Frasi in Italiano Antico: 'Orlando, che gran tempo inamorato fu de la bella Angelica', sotituisci i termini poco utilizzati o errati"},
        {"role": "assistant", "content": "Nuova Frase : ''"},
        {"role": "user", "content": "Riordina le parole in modo che la frase risulti più scorrevole"},
        {"role": "assistant", "content": "Nuova Frase : 'A metà del cammino della nostra vita mi sono ritrovato in un selva buia e avevo smarrito la giusta strada'"},
        {"role": "user", "content": "Migliora il significato della frase"},
        {"role": "assistant", "content": "Nuova Frase : 'A metà del cammino della mia vita (mezza età) mi sono ritrovato in un selva buia e avevo perso la giusta strada'"},
    ]

    conversations = [
        [
            system_turn,
            *few_shot_turns,
            {"role": "user", "content": f"Frasi in Italiano Antico: '{example}', sotituisci i termini poco utilizzati o errati"},
        ]
        for example in examples["Sentence"]
    ]

    # NOTE(review): the fixed prefix is long; with truncation enabled and
    # max_length tokens the sentence at the end of the prompt may be cut off,
    # depending on the tokenizer -- verify the token count.
    return tokenizer.apply_chat_template(
        conversations,
        add_generation_prompt=True,  # open the assistant turn for generation
        tokenize=True,
        return_dict=True,
        padding=True,           # pad every sequence of the batch to a common length
        truncation=True,        # cut sequences longer than max_length
        max_length=max_length,
        return_tensors="pt",
    )
def chat_map(examples):
    """Build and tokenize a one-shot translation chat prompt for each sentence.

    Each conversation holds a system instruction, one worked example
    (user request + assistant answer) and a final user turn asking to
    translate the sentence.

    The conversation now ends on that user turn and the template is asked to
    open the assistant reply via ``add_generation_prompt=True``.  Previously a
    trailing assistant message with empty content was appended instead, which
    makes the chat template render a complete (empty) assistant turn rather
    than a prompt the model should continue.

    Args:
        examples: Batch with a ``"Sentence"`` column (iterable of strings).

    Returns:
        Whatever the module-level ``tokenizer.apply_chat_template`` returns
        for the batch (requested as a dict of padded PyTorch tensors).
    """
    preamble = [
        {"role": "system", "content": "Sei un traduttore esperto di Italiano Antico. Devi tradurre in modo conciso i brani che ti vengono data dallo 'user'"},
        {"role": "user", "content": "Traduci 'La corte era in gran fermento.' in Italiano Moderno"},
        {"role": "assistant", "content": "Italiano Antico: 'La corte era in gran fermento.' Italiano Moderno: 'La corte era molto agitata.'"},
    ]

    conversations = [
        [*preamble, {"role": "user", "content": f"Traduci '{example}' in Italiano Moderno"}]
        for example in examples["Sentence"]
    ]

    # NOTE(review): with truncation enabled, prompts longer than max_length
    # tokens lose their tail, i.e. the sentence to translate -- verify the
    # token count for the tokenizer in use.
    return tokenizer.apply_chat_template(
        conversations,
        add_generation_prompt=True,  # open the assistant turn for generation
        tokenize=True,
        return_dict=True,
        padding=True,           # pad every sequence of the batch to a common length
        truncation=True,        # cut sequences longer than max_length
        max_length=max_length,
        return_tensors="pt",
    )

### Prompts for Machine Translation Systems

In [7]:
def opus_prompt(examples):
    """Tokenize a batch after prefixing every sentence with ``>>ita<<``.

    The prefix is presumably the target-language tag expected by the
    OPUS-MT checkpoints -- confirm against the model card.

    Args:
        examples: Batch with a ``"Sentence"`` column.

    Returns:
        The output of the module-level ``tokenizer`` for the tagged batch.
    """
    tagged_sentences = [f'>>ita<< {sentence}' for sentence in examples["Sentence"]]
    return tokenizer(tagged_sentences, max_length=max_length, padding=True)

def nllb_prompt(examples):
    """Tokenize the raw sentences of a batch, with no textual prompt added.

    Args:
        examples: Batch with a ``"Sentence"`` column.

    Returns:
        The output of the module-level ``tokenizer`` for the batch.
    """
    sentences = [*examples["Sentence"]]  # materialise the column as a plain list
    return tokenizer(sentences, max_length=max_length, padding=True)

### Prompts For Fine-Tuned Models

### Prompts for Other Task (Judge etc...)

In [8]:
def ft5_std_map(examples):
    """Tokenize a batch using the Italian "translate from the vernacular" instruction.

    Args:
        examples: Batch with a ``"Sentence"`` column.

    Returns:
        The output of the module-level ``tokenizer`` for the prompts.
    """
    prompts = []
    for sentence in examples["Sentence"]:
        prompts.append(f"Traduci dal volgare all’italiano moderno: {sentence}")
    return tokenizer(prompts, max_length=128, padding=True)

def mask_std_map(examples):
    """Tokenize a batch as "Old Italian: ... Modern Italian [MASK]" prompts.

    Args:
        examples: Batch with a ``"Sentence"`` column.

    Returns:
        The output of the module-level ``tokenizer`` for the prompts.
    """
    prompts = [
        f"Old Italian: {sentence} Modern Italian [MASK]"
        for sentence in examples['Sentence']
    ]
    return tokenizer(prompts, max_length=128, padding=True)

def minerva_map(examples):
    """Tokenize a batch with the Italian translation instruction for Minerva.

    Side effect: sets ``pad_token`` of the module-level ``tokenizer`` to its
    ``eos_token`` before tokenizing, so that padding is possible.

    Args:
        examples: Batch with a ``"Sentence"`` column.

    Returns:
        The output of the module-level ``tokenizer`` for the prompts.
    """
    tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token
    prompts = [
        f"Traduci dal volgare all’italiano moderno: {sentence}"
        for sentence in examples["Sentence"]
    ]
    return tokenizer(prompts, max_length=128, padding=True)

def style_map(examples):
    """Tokenize a batch with an English prompt that frames each sentence as Dolce Stil Novo.

    Args:
        examples: Batch with a ``"Sentence"`` column.

    Returns:
        The output of the module-level ``tokenizer`` for the prompts.
    """
    prompts = []
    for sentence in examples["Sentence"]:
        prompts.append(
            f"The following sentence represents an example from the Dolce Stil Novo (sweet new style) literary movement, developed in the 13th and 14th century in Italy: {sentence} Translate it to modern Italian: "
        )
    return tokenizer(prompts, max_length=128, padding=True)

def period_region_map(examples):
    """Tokenize prompts that mention each sentence's date and region.

    With ``Dataset.map(..., batched=True)`` the batch is a mapping from
    column name to a list of values.  Iterating it directly yields the column
    names, so the previous ``example['Sentence']`` lookup raised ``TypeError``.
    The columns are now read in lockstep; a plain iterable of row dicts is
    still accepted for backward compatibility.

    Args:
        examples: Either a column-oriented batch with ``"Sentence"``,
            ``"Date"`` and ``"Region"`` columns, or an iterable of row dicts
            with those keys.

    Returns:
        The output of the module-level ``tokenizer`` for the prompts.
    """
    if hasattr(examples, "keys"):
        # Column-oriented batch: walk the three columns together.
        rows = zip(examples["Sentence"], examples["Date"], examples["Region"])
    else:
        # Row-oriented input: one dict per example.
        rows = ((row["Sentence"], row["Date"], row["Region"]) for row in examples)

    prompts = [
        f"This sentence {sentence} was written in {date}, in the {region} region. Translate it to Modern Italian"
        for sentence, date, region in rows
    ]
    return tokenizer(prompts, padding=True, max_length=128)

def author_map(examples):
    """Tokenize prompts that mention each sentence's author.

    With ``Dataset.map(..., batched=True)`` the batch is a mapping from
    column name to a list of values.  Iterating it directly yields the column
    names, so the previous ``example['Sentence']`` lookup raised ``TypeError``.
    The columns are now read in lockstep; a plain iterable of row dicts is
    still accepted for backward compatibility.

    Args:
        examples: Either a column-oriented batch with ``"Sentence"`` and
            ``"Author"`` columns, or an iterable of row dicts with those keys.

    Returns:
        The output of the module-level ``tokenizer`` for the prompts.
    """
    if hasattr(examples, "keys"):
        # Column-oriented batch: walk the two columns together.
        rows = zip(examples["Sentence"], examples["Author"])
    else:
        # Row-oriented input: one dict per example.
        rows = ((row["Sentence"], row["Author"]) for row in examples)

    prompts = [
        f"This sentence: {sentence} was written by {author}. Translate it to Modern Italian"
        for sentence, author in rows
    ]
    return tokenizer(prompts, padding=True, max_length=128)

def question_map(examples):
    """Tokenize a batch phrased as an Italian request for a more colloquial rewrite.

    Args:
        examples: Batch with a ``"Sentence"`` column.

    Returns:
        The output of the module-level ``tokenizer`` for the prompts.
    """
    prompts = []
    for sentence in examples['Sentence']:
        prompts.append(f"Puoi riscrivere questa frase: {sentence} in uno stile più colloquiale?")
    return tokenizer(prompts, max_length=128, padding=True)

### Prompt Selection

In [9]:
# Bind `tr` to the prompt-building function named by PROMPT.
# Each branch looks its function up only when it is actually selected.
# NOTE(review): `gemma_1b_map` and `llma_1b_map` are not defined in the visible
# part of this notebook; selecting "gemma-1b" or "llma-1b" raises NameError
# unless they are defined elsewhere.
if PROMPT == "ft5_std":
    tr = ft5_std_map
elif PROMPT == "gemma-1b":
    tr = gemma_1b_map
elif PROMPT == "mask_std":
    tr = mask_std_map
elif PROMPT == "style":
    tr = style_map
elif PROMPT == "period_region":
    tr = period_region_map
elif PROMPT == "author":
    tr = author_map
elif PROMPT == "question":
    tr = question_map
elif PROMPT == "minerva":
    tr = minerva_map
elif PROMPT == "llma-1b":
    tr = llma_1b_map
elif PROMPT == "chat":
    tr = chat_map
elif PROMPT == "seqchat":
    tr = seqchat_map
elif PROMPT == "opus":
    tr = opus_prompt
elif PROMPT == "nllb":
    tr = nllb_prompt
else:
    raise ValueError("Unknown prompt type")

### Network Selection (Configuration + Tokenizer)

In [None]:
# Select the network named by NET and bind three module-level names:
#   tokenizer - tokenizer loaded from the same checkpoint
#   model     - model placed on DEVICE via device_map
#   params    - keyword arguments later passed as `config` to evaluate_and_save
# An unsupported NET value raises an Exception.
match NET:

    # T5-family checkpoints, loaded in float16.
    # NOTE(review): "google/mt5-small" is loaded with the T5 classes here, while
    # "google/mt5-base" below goes through the Auto classes -- confirm intended.
    case "google/flan-t5-small" | "google-t5/t5-small" | "google/mt5-small" | "google/flan-t5-large":
        from transformers import T5Tokenizer, T5ForConditionalGeneration
        tokenizer = T5Tokenizer.from_pretrained(NET)
        model = T5ForConditionalGeneration.from_pretrained(NET, device_map=DEVICE, torch_dtype=torch.float16)


        params = {

            "max_new_tokens": 120, # max number of new tokens to generate
            "do_sample":True,      # enables sampling for more diverse outputs
            "top_k":50,            # sample only among the 50 most likely tokens
            "top_p":0.90,          # nucleus sampling for further control over variety
            "temperature":1.0,     # sampling temperature (1.0 leaves the distribution unscaled)
            "repetition_penalty":1.0,  # repetition penalty (1.0 applies no penalty)
            "num_return_sequences":10,  # number of generated responses
            "pad_token_id":tokenizer.eos_token_id  # avoids warning if padding token is missing
        }

    # mT5 base through the Auto classes, loaded in float16.
    case "google/mt5-base":
        from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
        tokenizer = AutoTokenizer.from_pretrained(NET)
        model = AutoModelForSeq2SeqLM.from_pretrained(NET, device_map=DEVICE, torch_dtype=torch.float16)


        params = {

            "max_new_tokens": 120,
            "do_sample":True,
            "top_k":10,
            "top_p":0.90,
            "temperature":1.0,
            "repetition_penalty":1.0,
            "num_return_sequences":10,
            "pad_token_id":tokenizer.eos_token_id
        }

    # Gemma 3 1B instruct, loaded with 8-bit bitsandbytes quantization.
    case "google/gemma-3-1b-it":
        from transformers import BitsAndBytesConfig, Gemma3ForCausalLM, AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(NET)
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        model = Gemma3ForCausalLM.from_pretrained(NET, device_map=DEVICE, quantization_config=quantization_config)

        # Only sampling and the number of returned sequences are set here;
        # the commented-out keys are not passed.
        params = {

            #"max_new_tokens": 120,
            "do_sample":True,
            #"top_k":10,
            #"top_p":0.90,
            #"temperature":1.0,
            #"repetition_penalty":1.0,
            "num_return_sequences":10,
            "pad_token_id":tokenizer.eos_token_id
        }

    # OPUS-MT translation checkpoints (sequence-to-sequence), default dtype.
    case "Helsinki-NLP/opus-mt-tc-bible-big-itc-fra_ita_por_spa" | "Helsinki-NLP/opus-mt-itc-itc":

        from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
        tokenizer = AutoTokenizer.from_pretrained(NET)
        model = AutoModelForSeq2SeqLM.from_pretrained(NET, device_map=DEVICE)

        params = {

            "max_new_tokens": 120,
            "do_sample":True,
            "top_k":10,
            "top_p":0.90,
            "temperature":1.0,
            "repetition_penalty":1.0,
            "num_return_sequences":10,
            "pad_token_id":tokenizer.eos_token_id
        }

    # Minerva causal language models, default dtype and no quantization.
    # NOTE(review): the tokenizer keeps its default padding side here, whereas
    # the Llama branch below requests padding_side='left' -- confirm which side
    # batched generation should use for this model.
    case "sapienzanlp/Minerva-1B-base-v1.0" | "sapienzanlp/Minerva-7B-instruct-v1.0":

        from transformers import AutoTokenizer, AutoModelForCausalLM
        tokenizer = AutoTokenizer.from_pretrained(NET)
        model = AutoModelForCausalLM.from_pretrained(NET, device_map=DEVICE)

        params = {

            "max_new_tokens": 120,
            "do_sample":True,
            "top_k":10,            # sample only among the 10 most likely tokens
            "top_p":0.90,          # nucleus sampling for further control over variety
            "temperature":1.0,     # sampling temperature (1.0 leaves the distribution unscaled)
            "repetition_penalty":1.0,  # repetition penalty (1.0 applies no penalty)
            "num_return_sequences":10,  # number of generated responses
            "pad_token_id":tokenizer.eos_token_id  # avoids warning if padding token is missing
        }
    # EMMA-500 (Llama 2 7B), 8-bit quantized; slow tokenizer padded on the left
    # with EOS reused as the padding token.
    case "MaLA-LM/emma-500-llama2-7b":
        from transformers import LlamaForCausalLM, AutoTokenizer
        from transformers import BitsAndBytesConfig
        quantization = BitsAndBytesConfig(load_in_8bit=True)
        tokenizer = AutoTokenizer.from_pretrained(NET, padding_side='left', use_fast=False)
        tokenizer.pad_token = tokenizer.eos_token
        model = LlamaForCausalLM.from_pretrained(NET, device_map=DEVICE, quantization_config=quantization)
        params = {

            #"max_new_tokens": 120,
            "do_sample":True,
            #"top_k":10,            # sample only among the 10 most likely tokens
            #"top_p":0.90,          # nucleus sampling for further control over variety
            #"temperature":1.0,     # sampling temperature
            #"repetition_penalty":1.0,  # repetition penalty
            #"num_return_sequences":10,  # number of generated responses
            "pad_token_id":tokenizer.eos_token_id  # avoids warning if padding token is missing
        }

    # NLLB-200 3.3B, 8-bit quantized; the source language is set to Italian.
    case "facebook/nllb-200-3.3B":
        from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
        from transformers import BitsAndBytesConfig
        quantization = BitsAndBytesConfig(load_in_8bit=True)
        tokenizer = AutoTokenizer.from_pretrained(NET, use_fast=False)
        tokenizer.src_lang = "ita_Latn"
        # Fall back to EOS only when the tokenizer has no padding token.
        if not tokenizer.pad_token:
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForSeq2SeqLM.from_pretrained(NET, device_map=DEVICE, quantization_config=quantization)
        # The only key passed forces the first generated token to the
        # "ita_Latn" language code; all sampling keys are commented out.
        params = {

            #"max_new_tokens": 120,
            #"do_sample":True,
            #"top_k":10,            # sample only among the 10 most likely tokens
            #"top_p":0.90,          # nucleus sampling for further control over variety
            #"temperature":1.0,     # sampling temperature
            #"repetition_penalty":1.0,  # repetition penalty
            #"num_return_sequences":10,  # number of generated responses
            #"pad_token_id":tokenizer.eos_token_id,  # avoids warning if padding token is missing
            "forced_bos_token_id":tokenizer.convert_tokens_to_ids("ita_Latn")
        }

    # Any other checkpoint name is rejected.
    case _:
        raise Exception(f"Rete {NET} non testabile")
        

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   1%|1         | 62.9M/4.96G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   1%|          | 41.9M/4.96G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   1%|1         | 52.4M/5.04G [00:00<?, ?B/s]

# Dataset

In [None]:
# Load the CSV with the declared string schema, then shuffle the rows with a
# fixed seed so the order is reproducible.
hf = Dataset.from_csv(DATASET, features=FEATURES)
hf = hf.shuffle(seed=42)

In [None]:
# Apply the selected prompt builder to the dataset in batches; the builder
# receives a dict of column lists and its output columns are added to the rows.
tokenized = hf.map(function=tr, batched=True)

In [None]:
# Show the resulting columns and the first five tokenized samples, decoding
# the ids back to text as a sanity check on the prompt.
print(tokenized.column_names)
for idx, s in enumerate(tokenized.take(5), 1):
    # Decode outside the f-string: reusing double quotes inside a double-quoted
    # f-string is a SyntaxError before Python 3.12.  `attention_mask` is not a
    # parameter of `decode`, so it is no longer passed; padding is dropped by
    # skip_special_tokens.
    decoded = tokenizer.decode(s["input_ids"], skip_special_tokens=True)
    print(f"sample n°{idx}: {s}")
    print(f"Decode ids: {decoded}")
    

['Author', 'Date', 'Region', 'Sentence', 'input_ids', 'attention_mask']
sample n°1: {'Author': 'Guido da Pisa', 'Date': '1337', 'Region': 'tosc.', 'Sentence': "Ed ecco di subito tutta questa turba degli uccelli si levò a volo dietro all'aquila", 'input_ids': [256077, 6721, 168611, 150, 105668, 44059, 23712, 1595, 103, 24490, 54279, 15284, 219, 15859, 248377, 9, 93259, 175204, 1910, 248116, 14993, 806, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
Decode ids: Ed ecco di subito tutta questa turba degli uccelli si levò a volo dietro all'aquila
sample n°2: {'Author': 'Bart. da San Concordio', 'Date': '1313', 'Region': 'tosc.', 'Sentence': "la seconda suole talora per la grande provedenzia fare timoroso, e la prima per l'ardire rendere altrui m

In [None]:
from utils import evaluate_and_save


# Hand the model, tokenizer and tokenized dataset to the project helper
# `utils.evaluate_and_save` (presumably runs generation and stores the
# results -- see utils for the exact behaviour).
df = evaluate_and_save(
    model=model,
    tokenizer=tokenizer,
    tokenized_dataset=tokenized,
    config=params,       # generation settings chosen with the network
    batch_size=BS,
    device=DEVICE,
    output_prefix=NET.rsplit("/", 1)[-1],  # checkpoint name without the organisation prefix
)

  0%|          | 0/7 [00:00<?, ?it/s]

['And suddenly this whole flock of birds flew up behind the eagle', 'The second sometimes for the great providence makes fearful, and the first for the boldness makes others mad.', 'And so, since these things are so, Cat, and you cannot well dwell here, you doubt to go to any land and use this life by fleeing to the deserts', 'In Milan the wickedness of a woman was reprinted in a similar lie, in the very time of this gentleman of the republic, in this way:', "A place where spears are sent , which the horsemen call the pig 's head .", 'When the serpents poisoned them by day any Roman, then it was a wonder to see how the Psille fought with the poison, for they imolate everything unassailable with their saliva', 'He begged him, "Give us this man\'s money. " So the man gave him his money back. He wrote a letter of release along with it.', 'In vain is the question asked who wrote this Book, with this being something that faithfully must be believed that the Author of that was the Holy Spiri