# Archaic to Modern Italian with Context Learning

In [1]:
# Import Datases to work with Transformers by Hugging-Face
import torch
from tqdm.auto import tqdm
# Imports for Transformers
from datasets import Dataset
from transformers import AutoTokenizer  # Datasets
import pandas as pd
from datasets.features import Value, Features

  from .autonotebook import tqdm as notebook_tqdm


## Globals

# Promposed Models
* google/flan-t5-small - google/mt5-small (text2text model) ::NO_WORK
* google/gemma-3-1b-it (LLM) 🚀
* sapienzanlp/Minerva-1B-base-v1.0 🇮🇹 (LMM)
* openai-community/gpt2 (LLM) ::NO IT
* Helsinki-NLP/opus-mt-itc-itc (Machine Translation) 🏆 - use OpusPrompt 
* facebook/nllb-200-3.3B (Translation)
* FacebookAI/xlm-roberta-base (fill-mask)

In [2]:
DATASET = "dataset.csv"
FEATURES = Features(
    {
        "Author": Value(dtype="string"),
        "Date": Value(dtype="string"),
        "Region": Value(dtype="string"),
        "Sentence": Value(dtype="string")
    }
)
NET = "t5-base/checkpoint-2982"
BS = 8
DEVICE = ('cuda' if torch.cuda.is_available() else "cpu")

## Define Network Pipline


In [3]:
tokenizer = None
def ft5_std_map(examples):
    return tokenizer(
        [f"Traduci dal volgare all’italiano moderno: {example}" for example in examples["Sentence"]],  
        padding=True, 
        max_length=128,
        )

def gemma_1b_map(examples):
    chat = tokenizer.apply_chat_template([
    [
        {"role": "system",   "content": "Sei un traduttore esperto di Italiano Antico "},
        {"role": "user",     "content": "Traduci 'La corte era in gran fermento.' in Italiano Moderno"},
        {"role": "assistant","content": "Italiano Antico: 'La corte era in gran fermento.' Italiano Moderno: 'La corte era molto agitata.'"},
        {"role": "user",      "content": f"Traduci '{example}' in Italiano Moderno"},
        {"role": "assistant", "content": ""}
    ] for example in examples["Sentence"] ], 
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        padding=True,         # <== aggiunge zeri per rendere le sequenze uguali
        truncation=True,      # <== taglia sequenze troppo lunghe
        max_length=250,       # (opzionale) puoi specificare una lunghezza massima
        return_tensors="pt"
    )

    return chat

def mask_std_map(examples):
    return tokenizer(
        [f"Old Italian: {example} Modern Italian [MASK]" for example in examples["Sentence"]],  
        padding=True, 
        max_length=128)

def minerva_map(examples):
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer(
        [f"Traduci dal volgare all’italiano moderno: {example}" for example in examples["Sentence"]],  
        padding=True, 
        max_length=128,
        )
    


In [5]:
match NET:
    
    case "google-t5/t5-small" |  "google/mt5-small" | "t5-base/best" | "t5-base/checkpoint-2982":
        from transformers import T5Tokenizer, T5ForConditionalGeneration
        tokenizer = T5Tokenizer.from_pretrained(NET)
        model = T5ForConditionalGeneration.from_pretrained(NET, device_map=DEVICE, torch_dtype=torch.float16)
        tr = ft5_std_map

        params = {
            
            "max_new_tokens": 120,
            "do_sample":True,
            "top_k":50,            # aumento della diversità controllando le parole candidate
            "top_p":0.90,          # campionamento nucleus per ulteriori controlli sulla varietà
            "temperature":1.0,     # riduce la casualità e aumenta la coerenza
            "repetition_penalty":1.0,  # penalizza ripetizioni
            "num_return_sequences":10,  # numero di risposte generate
            "pad_token_id":tokenizer.eos_token_id  # evita warning se manca un token di padding
        }
    case "google/mt5-base":
        from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
        tokenizer = AutoTokenizer.from_pretrained(NET)
        model = AutoModelForSeq2SeqLM.from_pretrained(NET, device_map=DEVICE, torch_dtype=torch.float16)
        tr = ft5_std_map

        params = {
            
            "max_new_tokens": 120,
            "do_sample":True,
            "top_k":10,            # aumento della diversità controllando le parole candidate
            "top_p":0.90,          # campionamento nucleus per ulteriori controlli sulla varietà
            "temperature":1.0,     # riduce la casualità e aumenta la coerenza
            "repetition_penalty":1.0,  # penalizza ripetizioni
            "num_return_sequences":10,  # numero di risposte generate
            "pad_token_id":tokenizer.eos_token_id  # evita warning se manca un token di padding
        }
    case "google/gemma-3-1b-it":
        from transformers import BitsAndBytesConfig, Gemma3ForCausalLM, AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(NET)
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        model = Gemma3ForCausalLM.from_pretrained(NET, device_map=DEVICE, quantization_config=quantization_config)
        tr = gemma_1b_map
        params = {
            
            "max_new_tokens": 120,
            "do_sample":True,
            "top_k":10,            # aumento della diversità controllando le parole candidate
            "top_p":0.90,          # campionamento nucleus per ulteriori controlli sulla varietà
            "temperature":1.0,     # riduce la casualità e aumenta la coerenza
            "repetition_penalty":1.0,  # penalizza ripetizioni
            "num_return_sequences":10,  # numero di risposte generate
            "pad_token_id":tokenizer.eos_token_id  # evita warning se manca un token di padding
        }
    
    case "FacebookAI/xlm-roberta-base":
        
        from transformers import AutoTokenizer, AutoModelForMaskedLM
        tokenizer = AutoTokenizer.from_pretrained(NET)
        model = AutoModelForMaskedLM.from_pretrained(NET)
        tr = mask_std_map
        params = {
            
            "max_new_tokens": 120,
            "do_sample":True,
            "top_k":10,            # aumento della diversità controllando le parole candidate
            "top_p":0.90,          # campionamento nucleus per ulteriori controlli sulla varietà
            "temperature":1.0,     # riduce la casualità e aumenta la coerenza
            "repetition_penalty":1.0,  # penalizza ripetizioni
            "num_return_sequences":10,  # numero di risposte generate
            "pad_token_id":tokenizer.eos_token_id  # evita warning se manca un token di padding
        }
    case "sapienzanlp/Minerva-1B-base-v1.0":
        
        from transformers import AutoTokenizer, AutoModelForCausalLM
        tokenizer = AutoTokenizer.from_pretrained(NET)
        model = AutoModelForCausalLM.from_pretrained(NET, device_map=DEVICE)
        tr = minerva_map
        params = {
            
            "max_new_tokens": 120,
            "do_sample":True,
            "top_k":10,            # aumento della diversità controllando le parole candidate
            "top_p":0.90,          # campionamento nucleus per ulteriori controlli sulla varietà
            "temperature":1.0,     # riduce la casualità e aumenta la coerenza
            "repetition_penalty":1.0,  # penalizza ripetizioni
            "num_return_sequences":10,  # numero di risposte generate
            "pad_token_id":tokenizer.eos_token_id  # evita warning se manca un token di padding
        }
    case "openai-community/gpt2":
        from transformers import GPT2Tokenizer, AutoModelForCausalLM
        tokenizer = GPT2Tokenizer.from_pretrained(NET)
        model = AutoModelForCausalLM.from_pretrained(NET, device_map=DEVICE)
        tr = minerva_map
        params = {
            
            "max_new_tokens": 120,
            "do_sample":True,
            "top_k":10,            # aumento della diversità controllando le parole candidate
            "top_p":0.90,          # campionamento nucleus per ulteriori controlli sulla varietà
            "temperature":1.0,     # riduce la casualità e aumenta la coerenza
            "repetition_penalty":1.0,  # penalizza ripetizioni
            "num_return_sequences":10,  # numero di risposte generate
            "pad_token_id":tokenizer.eos_token_id  # evita warning se manca un token di padding
        }

    case _:
        raise Exception(f"Rete {NET} non testabile")
        

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


# Dataset

In [6]:
hf = Dataset.from_csv(DATASET, features=FEATURES).shuffle(seed=42)

In [7]:
tokenized =hf.map(tr, batched=True)

In [8]:
print(tokenized.column_names)
for idx, s in enumerate(tokenized.take(5), 1):
    print(f"sample n°{idx}: {s}")
    print(f"Decode ids: {tokenizer.decode(s["input_ids"], attention_mask=s["attention_mask"], skip_special_tokens=True)}")

['Author', 'Date', 'Region', 'Sentence', 'input_ids', 'attention_mask']
sample n°1: {'Author': 'Guido da Pisa', 'Date': '1337', 'Region': 'tosc.', 'Sentence': "Ed ecco di subito tutta questa turba degli uccelli si levò a volo dietro all'aquila", 'input_ids': [3083, 4817, 23, 3, 26, 138, 5063, 1478, 15, 66, 22, 9538, 20028, 941, 32, 10, 4857, 3, 15, 75, 509, 1227, 769, 23, 235, 13829, 17, 9, 13118, 9, 3, 2905, 115, 9, 20, 4707, 3, 17431, 7999, 108, 3, 10912, 2, 3, 9, 5063, 32, 1227, 15252, 66, 31, 9, 1169, 521, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [9]:
output = []
model= model.eval()
with torch.no_grad():
    tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask'])
    loader = torch.utils.data.DataLoader(tokenized, batch_size=BS)
    size = len(tokenized)
    for batch in tqdm(loader, dynamic_ncols=True, leave=True):
        
        input_ids=batch["input_ids"].to(DEVICE)
        attention_mask=batch["attention_mask"].to(DEVICE)

        pred = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        **params  
        )
        output.extend(pred)

100%|██████████| 13/13 [00:23<00:00,  1.79s/it]


In [10]:
df = pd.DataFrame(columns=["Original", "Translation(Generated)", "Evaluation"]) 
for idx, (y, y_pred) in enumerate(zip(hf, pred), 1):
    print(f"Sample n°{idx}")
    print(f"User say:\n Prompt + {y["Sentence"]}")
    response = tokenizer.decode(y_pred , skip_special_tokens=True)
    print(f"Model say:\n {response}")
    print(f"=======End Sentence n°{idx}=======")

    df.loc[len(df), "Original"] = y["Sentence"]
    df.loc[len(df) -1, "Translation(Generated)"] = response

df.to_csv(f"./Translation model({NET.split('/')[0]}).tsv", index=False, quotechar="\'", encoding='utf-8', sep="\t")


Sample n°1
User say:
 Prompt + Ed ecco di subito tutta questa turba degli uccelli si levò a volo dietro all'aquila
Model say:
 That’s what you’re saying, a ferocio dalmio dalata dala all’s singing. In your words of love, the exact version of this phrase is fierce and insuasive, and eglia in the sky in its enunciated words, egligligli dalma, egliosa d’ta egli d’a glioglia d’a eglio
Sample n°2
User say:
 Prompt + la seconda suole talora per la grande provedenzia fare timoroso, e la prima per l'ardire rendere altrui matto.
Model say:
 That he's written on his own soul with the same expression, the same song, as he wrote it, for all of them to describe him to him—the best description of each male subject—and the very words that contain the song.
Sample n°3
User say:
 Prompt + E dunque, da che queste cose son così, Catellina, e tu non puoi buonamente qui dimorare, dubiti tu d'andartene in alcuna terra ed usare questa vita fuggendo per li diserti
Model say:
 Acciocchi, no. egligli, each man 