# Archaic to Modern Italian with Context Learning

In [1]:
# Import datasets to work with Hugging Face Transformers
import torch
from tqdm.auto import tqdm
# Imports for Transformers
from datasets import Dataset
from transformers import AutoTokenizer  # Datasets
import pandas as pd
from datasets.features import Value, Features

# Proposed Models
* google/gemma-3-4b-it - google/gemma-3n-E4B-it-litert-preview (LLM) 🚀
* sapienzanlp/Minerva-1B-base-v1.0 🇮🇹 (LLM)
* Helsinki-NLP/opus-mt-itc-itc (Machine Translation) 🏆 - use OpusPrompt 
* facebook/nllb-200-3.3B (Translation)
* meta-llama/Llama-3.2-3B
* mistralai/Mistral-7B-Instruct-v0.2

In [2]:
# Run configuration: input file, column schema, model id, batch size,
# prompt style and compute device.
DATASET = "dataset.csv"

# Every CSV column is read as a plain string.
FEATURES = Features(
    {
        column: Value(dtype="string")
        for column in ("Author", "Date", "Region", "Sentence")
    }
)

NET = "google/gemma-3-4b-it"
BS = 16
PROMPT = "chat"

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

## Network Pipeline


### Styling of Prompts

In [3]:
# Placeholder; the model-selection cell rebinds this to the real tokenizer.
tokenizer = None


def ft5_std_map(examples):
    """Tokenize a batch using the plain Italian translation instruction.

    Args:
        examples: batch dict holding a "Sentence" list.

    Returns:
        Whatever the module-level `tokenizer` returns for the prompts,
        called with `padding=True` and `max_length=128`.
    """
    prompts = [
        f"Traduci dal volgare all’italiano moderno: {sentence}"
        for sentence in examples["Sentence"]
    ]
    return tokenizer(prompts, padding=True, max_length=128)


def _few_shot_chat_map(examples, system_prompt):
    """Build one few-shot chat per sentence and tokenize the whole batch.

    Each conversation holds the system prompt, one worked example
    (user request + assistant answer) and the user request for the sentence
    to translate. The conversation deliberately ends on the user turn:
    `add_generation_prompt=True` asks the chat template to open the
    assistant turn, instead of appending an empty assistant message that
    the template would render as an already finished turn.

    Args:
        examples: batch dict holding a "Sentence" list.
        system_prompt: text of the system message.

    Returns:
        The result of `tokenizer.apply_chat_template` for the batch.
    """
    conversations = [
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": "Traduci 'La corte era in gran fermento.' in Italiano Moderno"},
            {"role": "assistant", "content": "Italiano Antico: 'La corte era in gran fermento.' Italiano Moderno: 'La corte era molto agitata.'"},
            {"role": "user", "content": f"Traduci '{sentence}' in Italiano Moderno"},
        ]
        for sentence in examples["Sentence"]
    ]
    return tokenizer.apply_chat_template(
        conversations,
        add_generation_prompt=True,  # leave the assistant turn open for generation
        tokenize=True,
        return_dict=True,
        padding=True,         # pad all sequences in the batch to a common length
        truncation=True,      # cut sequences that are too long
        max_length=250,       # upper bound on the sequence length
        return_tensors="pt",
    )


def chat_map(examples):
    """Tokenize a batch as few-shot chats with the detailed translator system prompt.

    Args:
        examples: batch dict holding a "Sentence" list.

    Returns:
        Tokenized chat prompts for the batch (see `_few_shot_chat_map`).
    """
    return _few_shot_chat_map(
        examples,
        "Sei un traduttore esperto di Italiano Antico. Devi tradurre in modo conciso i brani che ti vengono data dallo 'user'",
    )


def gemma_1b_map(examples):
    """Tokenize a batch as few-shot chats with the short translator system prompt.

    Args:
        examples: batch dict holding a "Sentence" list.

    Returns:
        Tokenized chat prompts for the batch (see `_few_shot_chat_map`).
    """
    return _few_shot_chat_map(
        examples,
        "Sei un traduttore esperto di Italiano Antico ",
    )

def mask_std_map(examples):
    """Tokenize a batch as fill-in-the-mask prompts.

    The prompt ends with the tokenizer's own mask token rather than a
    hard-coded "[MASK]", because not every masked-LM tokenizer uses that
    literal. "[MASK]" is kept as the fallback when the tokenizer exposes
    no mask token.

    Args:
        examples: batch dict holding a "Sentence" list.

    Returns:
        Whatever the module-level `tokenizer` returns for the prompts,
        called with `padding=True` and `max_length=128`.
    """
    mask_token = getattr(tokenizer, "mask_token", None) or "[MASK]"
    prompts = [
        f"Old Italian: {sentence} Modern Italian {mask_token}"
        for sentence in examples["Sentence"]
    ]
    return tokenizer(prompts, padding=True, max_length=128)

def minerva_map(examples):
    """Tokenize a batch with the Italian translation instruction, padding with EOS.

    The tokenizer's pad token is set to its EOS token before tokenizing.

    Args:
        examples: batch dict holding a "Sentence" list.

    Returns:
        Whatever the module-level `tokenizer` returns for the prompts,
        called with `padding=True` and `max_length=128`.
    """
    # Reuse EOS as the padding token so that padding=True can be applied.
    tokenizer.pad_token = tokenizer.eos_token

    prompts = [
        f"Traduci dal volgare all’italiano moderno: {sentence}"
        for sentence in examples["Sentence"]
    ]
    encoded = tokenizer(prompts, padding=True, max_length=128)
    return encoded

def style_map(examples):
    """Tokenize a batch with an English prompt that frames the sentence as Dolce Stil Novo.

    Args:
        examples: batch dict holding a "Sentence" list.

    Returns:
        Whatever the module-level `tokenizer` returns for the prompts,
        called with `padding=True` and `max_length=128`.
    """
    prompts = []
    for sentence in examples["Sentence"]:
        prompts.append(
            f"The following sentence represents an example from the Dolce Stil Novo (sweet new style) literary movement, developed in the 13th and 14th century in Italy: {sentence} Translate it to modern Italian: "
        )
    return tokenizer(prompts, padding=True, max_length=128)

def period_region_map(examples):
    """Tokenize a batch with a prompt that states each sentence's date and region.

    `examples` is a batch in column form (a mapping from column name to a
    list of values), so the columns are zipped together row by row.
    Iterating over `examples` directly would yield the column names, not
    the rows.

    Args:
        examples: batch dict holding "Sentence", "Date" and "Region" lists.

    Returns:
        Whatever the module-level `tokenizer` returns for the prompts,
        called with `padding=True` and `max_length=128`.
    """
    rows = zip(examples["Sentence"], examples["Date"], examples["Region"])
    prompts = [
        f"This sentence {sentence} was written in {date}, in the {region} region. Translate it to Modern Italian"
        for sentence, date, region in rows
    ]
    return tokenizer(prompts, padding=True, max_length=128)

def author_map(examples):
    """Tokenize a batch with a prompt that names each sentence's author.

    `examples` is a batch in column form (a mapping from column name to a
    list of values), so the columns are zipped together row by row.
    Iterating over `examples` directly would yield the column names, not
    the rows.

    Args:
        examples: batch dict holding "Sentence" and "Author" lists.

    Returns:
        Whatever the module-level `tokenizer` returns for the prompts,
        called with `padding=True` and `max_length=128`.
    """
    rows = zip(examples["Sentence"], examples["Author"])
    prompts = [
        f"This sentence: {sentence} was written by {author}. Translate it to Modern Italian"
        for sentence, author in rows
    ]
    return tokenizer(prompts, padding=True, max_length=128)

def question_map(examples):
    """Tokenize a batch phrased as an Italian request to rewrite the sentence colloquially.

    Args:
        examples: batch dict holding a "Sentence" list.

    Returns:
        Whatever the module-level `tokenizer` returns for the prompts,
        called with `padding=True` and `max_length=128`.
    """
    questions = [
        f"Puoi riscrivere questa frase: {sentence} in uno stile più colloquiale?"
        for sentence in examples["Sentence"]
    ]
    return tokenizer(questions, padding=True, max_length=128)

In [4]:
# Switch to select the mapping function based on the prompt type


match PROMPT:
    case "ft5_std":
        tr = ft5_std_map
    case "gemma-1b":
        tr = gemma_1b_map
    case "mask_std":
        tr = mask_std_map
    case "style":
        tr = style_map
    case "period_region":
        tr = period_region_map
    case "author":
        tr = author_map
    case "question":
        tr = question_map
    case "minerva":
        tr = minerva_map
    case "llma-1b":
        tr = llma_1b_map
    case "chat":
        tr = chat_map
        
    case _:
        raise ValueError("Unknown prompt type")

### Load Appropriate Model and Tokenizer

In [5]:
# Switch to select the network and load the appropriate model and tokenizer
# Load the tokenizer and model that match NET and define `params`, the keyword
# arguments later forwarded to `model.generate()`. Each branch imports only the
# transformers classes it needs. An unsupported NET raises an Exception.
match NET:
    
    # T5-family checkpoints, loaded in float16 on DEVICE.
    case "google/flan-t5-small" | "google-t5/t5-small" | "google/mt5-small" | "google/flan-t5-large":
        from transformers import T5Tokenizer, T5ForConditionalGeneration
        tokenizer = T5Tokenizer.from_pretrained(NET)
        model = T5ForConditionalGeneration.from_pretrained(NET, device_map=DEVICE, torch_dtype=torch.float16)
     

        params = {
            
            "max_new_tokens": 120, # max number of new tokens to generate
            "do_sample":True,      # sample instead of decoding greedily
            "top_k":50,            # size of the candidate-token pool used when sampling
            "top_p":0.90,          # nucleus sampling threshold
            "temperature":1.0,     # sampling temperature
            "repetition_penalty":1.0,  # repetition penalty factor
            "num_return_sequences":10,  # number of sequences generated per prompt
            "pad_token_id":tokenizer.eos_token_id  # use EOS as the padding id during generation
        }
        
    # mT5 base, loaded through the generic seq2seq auto class in float16.
    case "google/mt5-base":
        from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
        tokenizer = AutoTokenizer.from_pretrained(NET)
        model = AutoModelForSeq2SeqLM.from_pretrained(NET, device_map=DEVICE, torch_dtype=torch.float16)
 

        params = {
            
            "max_new_tokens": 120,
            "do_sample":True,
            "top_k":10,          
            "top_p":0.90,          
            "temperature":1.0,  
            "repetition_penalty":1.0,
            "num_return_sequences":10, 
            "pad_token_id":tokenizer.eos_token_id 
        }

    # Gemma 3 checkpoints, loaded with 8-bit bitsandbytes quantization.
    case "google/gemma-3-1b-it" | "google/gemma-3-4b-it" | "google/gemma-3n-E4B-it-litert-preview":
        from transformers import BitsAndBytesConfig, Gemma3ForCausalLM, AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(NET)
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        model = Gemma3ForCausalLM.from_pretrained(NET, device_map=DEVICE, quantization_config=quantization_config)
    
        params = {
            
            "max_new_tokens": 120,
            "do_sample":True,
            "top_k":10,     
            "top_p":0.90,  
            "temperature":1.0,     
            "repetition_penalty":1.0,
            "num_return_sequences":10,  
            "pad_token_id":tokenizer.eos_token_id 
        }
    
    # XLM-RoBERTa masked language model.
    # NOTE(review): unlike the other branches, this model is loaded without
    # `device_map=DEVICE`, while evaluate_and_save moves its inputs to the
    # device it is given. Confirm this is intended.
    case "FacebookAI/xlm-roberta-base":
        
        from transformers import AutoTokenizer, AutoModelForMaskedLM
        tokenizer = AutoTokenizer.from_pretrained(NET)
        model = AutoModelForMaskedLM.from_pretrained(NET)

        params = {
            
            "max_new_tokens": 120,
            "do_sample":True,
            "top_k":10,            
            "top_p":0.90,         
            "temperature":1.0,    
            "repetition_penalty":1.0, 
            "num_return_sequences":10,  
            "pad_token_id":tokenizer.eos_token_id 
        }
    # Minerva 1B base causal language model.
    case "sapienzanlp/Minerva-1B-base-v1.0":
        
        from transformers import AutoTokenizer, AutoModelForCausalLM
        tokenizer = AutoTokenizer.from_pretrained(NET)
        model = AutoModelForCausalLM.from_pretrained(NET, device_map=DEVICE)
  
        params = {
            
            "max_new_tokens": 120,
            "do_sample":True,
            "top_k":10,            # size of the candidate-token pool used when sampling
            "top_p":0.90,          # nucleus sampling threshold
            "temperature":1.0,     # sampling temperature
            "repetition_penalty":1.0,  # repetition penalty factor
            "num_return_sequences":10,  # number of sequences generated per prompt
            "pad_token_id":tokenizer.eos_token_id  # use EOS as the padding id during generation
        }
    # Llama 3.2 3B instruct checkpoint, loaded with 8-bit quantization; the
    # tokenizer is the slow implementation and pads on the left.
    case "meta-llama/Llama-3.2-3B-Instruct-QLORA_INT4_EO8":
        from transformers import LlamaForCausalLM, AutoTokenizer
        from transformers import BitsAndBytesConfig
        quantization = BitsAndBytesConfig(load_in_8bit=True)
        tokenizer = AutoTokenizer.from_pretrained(NET, padding_side='left', use_fast=False)
        
        model = LlamaForCausalLM.from_pretrained(NET, device_map=DEVICE, quantization_config=quantization)
        params = {
            
            "max_new_tokens": 120,
            "do_sample":True,
            "top_k":10,            # size of the candidate-token pool used when sampling
            "top_p":0.90,          # nucleus sampling threshold
            "temperature":1.0,     # sampling temperature
            "repetition_penalty":1.0,  # repetition penalty factor
            "num_return_sequences":10,  # number of sequences generated per prompt
            "pad_token_id":tokenizer.eos_token_id  # use EOS as the padding id during generation
        }
    
    # Mistral 7B instruct, loaded with 4-bit NF4 quantization; the tokenizer
    # pads on the left and reuses EOS as its pad token.
    case "mistralai/Mistral-7B-Instruct-v0.2":
        from transformers import AutoTokenizer, AutoModelForCausalLM
        from transformers import BitsAndBytesConfig
        quantization = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
        tokenizer = AutoTokenizer.from_pretrained(NET, padding_side='left')
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(NET, device_map=DEVICE, quantization_config=quantization, torch_dtype=torch.bfloat16, attn_implementation="sdpa")
        params = {
            
            "max_new_tokens": 512,
            "do_sample":True,
            "top_k":10,            # size of the candidate-token pool used when sampling
            "top_p":0.90,          # nucleus sampling threshold
            "temperature":0.85,     # sampling temperature
            "repetition_penalty":1.0,  # repetition penalty factor
            # "num_return_sequences":10,  # (disabled) number of sequences generated per prompt
            "pad_token_id":tokenizer.eos_token_id  # use EOS as the padding id during generation
        }

    # Any other NET value is rejected.
    case _:
        raise Exception(f"Rete {NET} non testabile")
        

Exception: Rete facebook/nllb-200-3.3B non testabile

# Dataset

In [None]:
import torch
import pandas as pd
from tqdm.auto import tqdm

def evaluate_and_save(
    model,
    tokenizer,
    tokenized_dataset,
    original_dataset,
    output_prefix: str,
    device: str = "cuda",
    batch_size: int = 32,
    generate_params: dict = None
):
    """
    Generate translations for each example in `tokenized_dataset`, compare against
    `original_dataset`, and save a TSV with columns ["Original", "Translation(Generated)", "Evaluation"].

    Args:
        model           : a HuggingFace seq2seq model
        tokenizer       : corresponding tokenizer
        tokenized_dataset : a Dataset with fields "input_ids" & "attention_mask"
        original_dataset  : the original (un-tokenized) dataset with field "Sentence"
        output_prefix   : prefix for the output file; final name will be
                          f"{output_prefix}({model.__class__.__name__}).tsv"
        device          : device to run on, e.g. "cuda" or "cpu"
        batch_size      : generation batch size
        generate_params : additional kwargs for `model.generate()`
    Returns:
        pandas.DataFrame with columns ["Original", "Translation(Generated)", "Evaluation"]
    """
    # Prep
    model = model.eval()
    generate_params = generate_params or {}
    tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
    loader = torch.utils.data.DataLoader(tokenized_dataset, batch_size=batch_size)

    # Results container
    df = pd.DataFrame(columns=["Original", "Translation(Generated)", "Evaluation"])

    # Generation loop
    example_idx = 0
    for batch in tqdm(loader, dynamic_ncols=True, leave=True):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        with torch.no_grad():
            preds = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                **generate_params
            )

        # Decode & append
        for pred in preds:
            src_sentence = original_dataset[example_idx]["Sentence"]
            gen_translation = tokenizer.decode(pred, skip_special_tokens=True)

            df.loc[len(df)] = [src_sentence, gen_translation, ""]
            example_idx += 1

    # Save to TSV
    filename = f"{output_prefix}({model.__class__.__name__}).tsv"
    df.to_csv(filename, index=False, quotechar="'", encoding="utf-8", sep="\t")

    print(f"Saved translations to {filename}")
    return df


In [None]:
# Load the CSV with the declared string schema, then shuffle it with a fixed
# seed so that the row order is reproducible.
hf = Dataset.from_csv(DATASET, features=FEATURES)
hf = hf.shuffle(seed=42)

In [None]:
# Apply the selected prompt-mapping function to the dataset in batches.
tokenized = hf.map(function=tr, batched=True)

In [None]:
# Preview the tokenized dataset: list its columns, then print the first five
# samples together with the text decoded from their input ids.
print(tokenized.column_names)
for idx, s in enumerate(tokenized.take(5), 1):
    print(f"sample n°{idx}: {s}")
    # Decode outside the f-string: reusing double quotes inside an f-string
    # only parses on Python 3.12+. The former `attention_mask` keyword was
    # dropped because it is not a documented `decode` parameter.
    decoded = tokenizer.decode(s["input_ids"], skip_special_tokens=True)
    print(f"Decode ids: {decoded}")
    

['Author', 'Date', 'Region', 'Sentence', 'input_ids', 'attention_mask']
sample n°1: {'Author': 'Guido da Pisa', 'Date': '1337', 'Region': 'tosc.', 'Sentence': "Ed ecco di subito tutta questa turba degli uccelli si levò a volo dietro all'aquila", 'input_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 105, 2364, 107, 1869, 236747, 723, 3031, 196848, 15112, 1071, 1001, 168248, 5307, 2486, 236761, 68131, 3031, 57648, 528, 18573, 3300, 11208, 858, 1615, 3236, 1273, 3163, 67404, 1262, 136658, 756, 2364, 236789, 108, 2035, 25593, 1287, 756, 4967, 50508, 6933, 528, 9085, 28196, 236748, 7085, 528, 168248, 13806, 236748, 106, 107, 105, 4368, 107, 64835, 236748, 5307, 2486, 236787, 756, 4967, 50508, 6933, 528, 9085, 28196, 236748, 7085, 168248, 13806, 236748, 236787, 756, 4967, 50508, 6933, 25965, 93160, 805, 7085, 106, 107, 105, 2364, 107, 2035, 25593, 1287, 756, 4675, 199304, 1001, 79727, 67169, 24903, 7909, 3604, 21751, 98474, 

In [None]:
# Run generation over the tokenized dataset and write the TSV report, using
# the model, tokenizer and generation parameters selected above.
df = evaluate_and_save(
    model,
    tokenizer,
    tokenized,
    hf,
    "./Translation model",
    device=DEVICE,
    batch_size=BS,
    generate_params=params,
)

  0%|          | 0/7 [00:00<?, ?it/s]

IndexError: Invalid key: 97 is out of bounds for size 97