# Project of Translation from Archaic to Modern Italian

## System Setup 🖥️

### Drive Interface 📁

In [1]:
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    %cp /content/drive/MyDrive/MNLP_HW_2/Many_Naps_Little_Progress/*.* .
    %ls
except ModuleNotFoundError:
    print("Local env dected")

Local env dected


### Additional Dependencies 🐍

In [2]:
!bash ../install_colab.sh >> /dev/null

### Hugging Face 🤗

In [3]:
import huggingface_hub
TOKEN = "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"
huggingface_hub.login(token=TOKEN)

## Proposed Models

### Prompt Learning
* google/gemma-3-1b-it (LLM) for context learning - use ChatPrompt 🚀  ok
* MaLA-LM/emma-500-llama2-7b   another LLM for context learning - use ChatPrompt 🦙 NO
* sapienzanlp/Minerva-7B-instruct-v1.0 🇮🇹
### Fine Tuning
* sapienzanlp/Minerva-1B-base-v1.0  fine-tuned for transaltion task(LMM) - use MinervaPrompt (same for fine-tuning)  🇮🇹
* google/mt5-large (Machine Translation) 🤖
### Native Machine Translation Systems
* Helsinki-NLP/opus-mt-itc-itc (Machine Translation) for translation task - use OpusPrompt 🏆  
* facebook/nllb-200-3.3B (Machine Translation) another translation machine  🤖 FLORES-200
### Other Task
* models--prometheus-eval--prometheus-7b-v2.0 has jugde - use PrometheusPrompt 🔥

In [4]:
# Import Datases to work with Transformers by Hugging-Face
import torch
import pandas as pd
# Imports for Transformers
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer  # Datasets
from datasets.features import Value, Features

## Global Variables

In [5]:
DATASET_EVAL = "test_dataset_ann.csv"
FEATURES = Features(
    {
        "Author": Value(dtype="string"),
        "Date": Value(dtype="string"),
        "Region": Value(dtype="string"),
        "Sentence": Value(dtype="string")
    }
)
NET = "sapienzanlp/Minerva-7B-instruct-v1.0"
BS = 1
PROMPT = "chat"
DEVICE = ('cuda' if torch.cuda.is_available() else "cpu")
SRC_L = "Sentence"
TRG_L = "Target"

## Prompts

### Prompts for Prompt Learning (LLM)

In [6]:
tokenizer = None
max_length=800

In [7]:
def seqchat_map(examples):
    chat = tokenizer.apply_chat_template([
    [
        {"role": "system",   "content": "rispondendo in Italiano riscrivi Le Frasi in Italiano Antico in Italiano Moderno seguendo le indicazioni dello 'user' e rispondendo precisamente alle richieste senza dare spiegazioni"},
        {"role": "user",     "content": "frase in Italiano Antico: 'Orlando, che gran tempo inamorato fu de la bella Angelica', sotituisci i termini poco utilizzati o errati"},
        {"role": "assistant","content": "nuova Frase: 'Orlando che da molto tempo innamorato è della bella Agelica"},
        {"role": "user",     "content": "riordina le parole in modo che la frase risulti più scorrevole"},
        {"role": "assistant","content": "nuova Frase: 'Orlando che è innamorato della bella Angelica da molto tempo'"},
        {"role": "user",     "content": "migliora il significato della frase"},
        {"role": "assistant","content": "nuova Frase: 'Orlando è innamorato della bella Angelica da molto tempo'"},
        {"role": "user",     "content": f"Frase in Italiano Antico: '{example}', sostituisci i termini poco utilizzati o errati"},
        {"role": "assistant", "content": ""}
    ] for example in examples["Sentence"]],
        tokenize=True,
        return_dict=True,
        padding=True,         # adds zeros to make all sequences the same length
        truncation=True,      # cuts sequences that are too long
        max_length=max_length,       # (optional) max length of the sequences
        return_tensors="pt")

    return chat

def chat_map(examples):
    chat = tokenizer.apply_chat_template([
    [
        {"role": "system",   "content": "Sei un traduttore esperto di Italiano Antico. Devi tradurre in modo conciso i brani che ti vengono data dallo 'user'"},
        {"role": "user",     "content": "Traduci 'Orlando, che gran tempo inamorato fu de la bella Angelica.' in Italiano Moderno"},
        {"role": "assistant","content": "Italiano Moderno: 'Orlando è innamorato della bella Angelica da molto tempo'"},
        {"role": "user",      "content": f"Traduci '{example}' in Italiano Moderno"},
        {"role": "assistant", "content": ""}
    ] for example in examples["Sentence"]],
        tokenize=True,
        return_dict=True,
        padding="max_length",         # adds zeros to make all sequences the same length
        truncation=True,      # cuts sequences that are too long
        max_length=max_length,       # (optional) max length of the sequences
        return_tensors="pt")

    return chat

### Prompts for Machine Translation Systems

In [8]:
def opus_prompt(examples):
    return tokenizer([
            f'>>ita<< {example}'
            for example in examples["Sentence"]
        ],
        padding=True,
        max_length=max_length
        )

def nllb_prompt(examples):
    return tokenizer(list(examples["Sentence"]), padding=True, max_length=max_length)

### Prompts for Fine-Tuned Models

In [9]:
print(f"model max length: {max_length}")

def noprompt_it_it(examples):
    inputs = [example for example in examples[SRC_L]]
    targets = [example for example in examples[TRG_L]]

    model_inputs = tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True, padding="max_length")
    return model_inputs

# I. Rewrite
# II. Translate
# III. Correct

def minerva_base_prompt_it_it_train(examples):

    prompts = [
        f"""riscrivi la seguente frase '{src}' scritta in italiano arcaico in Italiano moderno: {dst}"""
        for src, dst, dat, dia in zip(examples[SRC_L], examples[TRG_L], examples["Date"], examples["Region"])
    ]

    # Tokenizes input+target and creates label with same tokens
    model_inputs = tokenizer(
        prompts,
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )

    model_inputs["labels"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in input_ids]
        for input_ids in model_inputs["input_ids"]
    ]
    return model_inputs

def minerva_base_prompt_it_it_eval(examples):
    prompts = [

        f"""riscrivi la seguente frase '{src}' scritta in italiano arcaico in Italiano moderno: """
        for src in examples[SRC_L]
    ]

    # Tokenizes input+target and creates label with same tokens
    model_inputs = tokenizer(
        prompts,
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )

    return model_inputs

def base_prompt_en(examples):
    inputs = ["translate from Ancient Italian to Modern Italian: " + example for example in examples[SRC_L]]

    # Tokenizes only inputs
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")

    return model_inputs

def base_prompt_it_it(examples):
    inputs = ["Riscrivi dall'Italiano Antico a l'Italiano Moderno: " + example for example in examples[SRC_L]]

    # Tokenizes only inputs
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
    targets = [example for example in examples[TRG_L]]

    model_inputs = tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True, padding="longest")

    return model_inputs

def parafrasi_prompt_it_it(examples):
    inputs = ["Scrivi la parafrasi di questo testo: " + example for example in examples[SRC_L]]
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
    return model_inputs

def informative_prompt_it_it(examples):
    inputs = [f"Riscrivi in uno stile più moderno il testo del seguente Autore: '{author}', anno di scrittura: {date}, luogo: Italia, dialetto: '{region}', testo: '{text}'." for text, date, region, author in zip(examples[SRC_L], examples["Date"], examples["Region"], examples["Author"]) ]
    targets = [example for example in examples[TRG_L]]

    model_inputs = tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True, padding="longest")
    return model_inputs

model max length: 800


### Prompts for Other Tasks (Judge, etc...)

In [10]:
def ft5_std_map(examples):
    return tokenizer(
        [f"Traduci dal volgare all’italiano moderno: {example}" for example in examples["Sentence"]],
        padding=True,
        max_length=128)

def mask_std_map(examples):
    return tokenizer(
        [f"Old Italian: {example} Modern Italian [MASK]" for example in examples['Sentence']],
        padding=True,
        max_length=128)

def minerva_map(examples):
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer(
        [f"Traduci dal volgare all’italiano moderno: {example}" for example in examples["Sentence"]],
        padding=True,
        max_length=128)

def style_map(examples):
    return tokenizer(
        [f"The following sentence represents an example from the Dolce Stil Novo (sweet new style) literary movement, developed in the 13th and 14th century in Italy: '{example}' Translate it to modern Italian: " for example in examples["Sentence"]],
        padding=True,
        max_length=128)

def period_region_map(examples):
    return tokenizer(
        [f"This sentence '{example['Sentence']}' was written in {example['Date']}, in the {example['Region']} region. Translate it to Modern Italian" for example in examples],
        padding=True,
        max_length=128)

def author_map(examples):
    return tokenizer(
        [f"This sentence: {example['Sentence']} was written by {example['Author']}. Translate it to Modern Italian" for example in examples],
        padding=True,
        max_length=128)

def question_map(examples):
    return tokenizer(
        [f"Puoi riscrivere questa frase: {example} in uno stile più colloquiale?" for example in examples['Sentence']],
        padding=True,
        max_length=128)

### Prompt Selection

In [11]:
# Switch to select the mapping function based on the prompt type

match PROMPT:
    # prompt learning
    case "chat":
        tr = chat_map
    case "seqchat":
        tr = seqchat_map
    # machine tranlation systems
    case "opus":
        tr = opus_prompt
    case "nllb":
        tr = nllb_prompt
    # fine-tuned
    case "minerva_eval":
        tr = minerva_base_prompt_it_it_eval
    # other task
    case "ft5_std":
        tr = ft5_std_map
    case "mask_std":
        tr = mask_std_map
    case "style":
        tr = style_map
    case "period_region":
        tr = period_region_map
    case "author":
        tr = author_map
    case "question":
        tr = question_map
    case "minerva":
        tr = minerva_map
    case "base_prompt_en":
        tr = base_prompt_en
    case "parafrasi":
        tr = parafrasi_prompt_it_it
    case _:
        raise ValueError("Unknown prompt type")

### Network Selection (Configuration + Tokenizer)

In [12]:
# Switch to select the network and load the appropriate model and tokenizer
match NET:

    case "google/mt5-base":
        from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
        tokenizer = AutoTokenizer.from_pretrained(NET)
        model = AutoModelForSeq2SeqLM.from_pretrained(NET, device_map=DEVICE, torch_dtype=torch.float16)

        params = {
            "max_new_tokens": 120,
            "do_sample":True,
            "top_k":10,
            "top_p":0.90,
            "temperature":1.0,
            "repetition_penalty":1.0,
            "num_return_sequences":10,
            "pad_token_id":tokenizer.eos_token_id
        }

    case "google/gemma-3-1b-it":
        from transformers import BitsAndBytesConfig, Gemma3ForCausalLM, AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(NET)
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        model = Gemma3ForCausalLM.from_pretrained(NET, device_map=DEVICE, quantization_config=quantization_config)

        params = {
            #"max_new_tokens": max_length,
            "do_sample":True,
            #"top_k":10,
            #"top_p":0.90,
            #"temperature":1.0,
            #"repetition_penalty":1.0,
            #"num_return_sequences":10,
            "max_new_tokens": max_length,
            "pad_token_id":tokenizer.eos_token_id
        }

    case "Helsinki-NLP/opus-mt-itc-itc":
        from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
        tokenizer = AutoTokenizer.from_pretrained(NET)
        model = AutoModelForSeq2SeqLM.from_pretrained(NET, device_map=DEVICE)

        params = {
            #"max_new_tokens": 120,
            "do_sample":True,
            #"top_k":10,
            #"top_p":0.90,
            #"temperature":1.0,
            #"repetition_penalty":1.0,
            #"num_return_sequences":10,
            #"pad_token_id":tokenizer.eos_token_id
        }

    case "sapienzanlp/Minerva-1B-base-v1.0":

        from transformers import AutoTokenizer, AutoModelForCausalLM
        from transformers import BitsAndBytesConfig
        quantization = BitsAndBytesConfig(load_in_8bit=True)
        tokenizer = AutoTokenizer.from_pretrained(NET, use_fast=True)
        tokenizer.padding_side = "left"
        model = AutoModelForCausalLM.from_pretrained(NET, device_map=DEVICE, quantization_config=quantization)

        params = {
            "max_new_tokens": 120,
            "do_sample":True,
            "top_k":10,            # aumento della diversità controllando le parole candidate
            "top_p":0.90,          # campionamento nucleus per ulteriori controlli sulla varietà
            "temperature":1.0,     # riduce la casualità e aumenta la coerenza
            "repetition_penalty":1.0,  # penalizza ripetizioni
            "num_return_sequences":10,  # numero di risposte generate
            "pad_token_id":tokenizer.eos_token_id  # evita warning se manca un token di padding
        }

    case "sapienzanlp/Minerva-7B-instruct-v1.0":

        from transformers import AutoModelForCausalLM, AutoTokenizer
        from transformers import BitsAndBytesConfig
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,  # o torch.float16 se bfloat16 non è supportato
        )
        tokenizer = AutoTokenizer.from_pretrained(NET, use_fast=True)
        
        model = AutoModelForCausalLM.from_pretrained(NET, device_map="auto", quantization_config=bnb_config)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            model.config.pad_token_id = tokenizer.pad_token_id
        
        tokenizer.padding_side = "left"
        params = {
            #"max_new_tokens": 120,
            "do_sample":True,
            #"top_k":10,            # aumento della diversità controllando le parole candidate
            #"top_p":0.90,          # campionamento nucleus per ulteriori controlli sulla varietà
            #"temperature":1.0,     # riduce la casualità e aumenta la coerenza
            #"repetition_penalty":1.0,  # penalizza ripetizioni
            #"num_return_sequences":10,  # numero di risposte generate
            "pad_token_id":tokenizer.eos_token_id  # evita warning se manca un token di padding
        }

    case "MaLA-LM/emma-500-llama2-7b":
        from transformers import LlamaForCausalLM, AutoTokenizer
        from transformers import BitsAndBytesConfig
        quantization = BitsAndBytesConfig(load_in_8bit=True)
        tokenizer = AutoTokenizer.from_pretrained(NET, use_fast=False)
        tokenizer.padding_side = "left"
        tokenizer.pad_token = tokenizer.eos_token
        model = LlamaForCausalLM.from_pretrained(NET, device_map=DEVICE, quantization_config=quantization)
        params = {
            #"max_new_tokens": 120,
            "do_sample":True,
            #"top_k":10,            # aumento della diversità controllando le parole candidate
            #"top_p":0.90,          # campionamento nucleus per ulteriori controlli sulla varietà
            #"temperature":1.0,     # riduce la casualità e aumenta la coerenza
            #"repetition_penalty":1.0,  # penalizza ripetizioni
            #"num_return_sequences":10,  # numero di risposte generate
            "pad_token_id":tokenizer.eos_token_id  # evita warning se manca un token di padding
        }

    case "facebook/nllb-200-3.3B":
        from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
        from transformers import BitsAndBytesConfig
        quantization = BitsAndBytesConfig(load_in_8bit=True)
        tokenizer = AutoTokenizer.from_pretrained(NET, use_fast=False, padding_side='left')
        tokenizer.src_lang = "ita_Latn"
        if not tokenizer.pad_token:
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForSeq2SeqLM.from_pretrained(NET, device_map=DEVICE, quantization_config=quantization)
        params = {
            #"max_new_tokens": 120,
            #"do_sample":True,
            #"top_k":10,            # aumento della diversità controllando le parole candidate
            #"top_p":0.90,          # campionamento nucleus per ulteriori controlli sulla varietà
            #"temperature":1.0,     # riduce la casualità e aumenta la coerenza
            #"repetition_penalty":1.0,  # penalizza ripetizioni
            #"num_return_sequences":10,  # numero di risposte generate
            #"pad_token_id":tokenizer.eos_token_id,  # evita warning se manca un token di padding
            "forced_bos_token_id":tokenizer.convert_tokens_to_ids("ita_Latn")
        }

    case _:
        raise Exception(f"Rete {NET} non testabile")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

## Dataset

In [13]:
hf = Dataset.from_csv(DATASET_EVAL, features=FEATURES, keep_in_memory=True)

In [14]:
tokenized = hf.map(tr, batched=True)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [15]:
print(tokenized.column_names)
for idx, s in enumerate(tokenized.take(5), 1):
    print(f"sample n°{idx}: {s}")
    print(f"Decode ids: {tokenizer.decode(s['input_ids'], attention_mask=s['attention_mask'], skip_special_tokens=True)}")

['Author', 'Date', 'Region', 'Sentence', 'input_ids', 'attention_mask']
sample n°1: {'Author': 'Brunetto Latini', 'Date': '1260-61', 'Region': 'fior.', 'Sentence': 'Altressì uno amante chiamando merzé alla sua donna dice parole e ragioni molte, et ella si difende in suo dire.', 'input_ids': [51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 51202, 

In [16]:
from utils import generate_and_save

df = generate_and_save(
    model=model,
    tokenizer=tokenizer,
    tokenized_dataset=tokenized,
    output_prefix=NET.split("/")[-1],
    device=DEVICE,
    batch_size=BS,
    config=params,
    include_prompt=True
)

Generating:   0%|          | 0/10 [00:00<?, ?it/s]

Saved to Minerva-7B-instruct-v1.0(MistralForCausalLM).csv
