In [1]:
!pip install datasets

import math
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, DataCollatorForWholeWordMask
from datasets import load_dataset
from sklearn.metrics import accuracy_score, recall_score
from torch.utils.data import DataLoader

# Configurazione del modello e parametri
#MODEL_NAME = "./model/roberta-base-latin-v2"  # Sostituisci con il tuo modello
MODEL_NAME = "ClassCat/roberta-base-latin-v2"
#DATASET_PATH = "dataset.txt"      # Percorso del tuo dataset
MASK_PROBABILITY = 0.2
BATCH_SIZE = 8

# Caricamento del tokenizer e del modello
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#torch.cuda.set_per_process_memory_fraction(0.95)
torch.cuda.empty_cache()
model.to(device)
model.eval()

# Caricamento del dataset personalizzato
#Dataset locale
#dataset = load_dataset("parquet", data_dir="./parquet", trust_remote_code=True)
#Dataset caricato da huggingface
#dataset = load_dataset("Cicciokr/CC-100-Latin", revision="refs/convert/parquet")
#Dataset CC 100 lavorato
dataset = load_dataset("pstroe/cc100-latin", data_files="la.nolorem.tok.latalphabetonly.v2.json", field="train")
dataset_split = dataset['train'].train_test_split(test_size=0.0001, shuffle=True)

# Tokenizzazione del dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

# Tokenizza i dati
tokenized_datasets = dataset['test'].map(tokenize_function, batched=True)

# Creazione del data collator per MLM con mascheramento dinamico
data_collator = DataCollatorForWholeWordMask(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=MASK_PROBABILITY
)

# Funzione per calcolare Accuratezza e Perplessità
def evaluate_model(dataloader):
    total_loss = 0
    total_correct = 0
    total_tokens = 0
    all_predictions = []  # Lista per accumulare le predizioni
    all_labels = []  # Lista per accumulare i valori reali

    for batch in dataloader:
        # Prepara un batch di esempio
        #batch = dataset.select(range(i, min(i + batch_size, len(dataset))))
        examples = [tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt") for text in batch["text"]]
        examples_list = batch["text"][:]

        # Combina i batch in un unico dizionario
        input_ids = torch.cat([ex["input_ids"] for ex in examples], dim=0).to(device)
        attention_mask = torch.cat([ex["attention_mask"] for ex in examples], dim=0).to(device)
        labels = input_ids.clone()


        # Applica il data collator per mascherare dinamicamente
        #Sposto i tensori sulla cpu perchè altrimenti il DataCollatorForWholeWordMask non funziona
        batch = data_collator([{
            "input_ids": input_ids[i].cpu().tolist(),
            "attention_mask": attention_mask[i].cpu().tolist(),
            "labels": labels[i].cpu().tolist()
        } for i in range(input_ids.size(0))])

        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)


        with torch.no_grad():
            outputs = model(input_ids, labels=labels)
            logits = outputs.logits
            loss = outputs.loss

        # Calcola il loss totale per la perplessità
        #Valore basso (es. 0.1 - 0.5): Il modello sta facendo previsioni accurate.
        #Valore medio (es. 1 - 2): Prestazioni accettabili, ma con margine di miglioramento.
        #Valore alto (> 2): Modello con scarsa performance; potrebbe richiedere miglioramenti nei dati o nell'addestramento.
        if not torch.isnan(loss):
            total_loss += loss.item() * input_ids.size(0)

            # Calcola l'accuratezza
            predictions = torch.argmax(logits, dim=-1)

            mask = labels != -100  # Maschera per selezionare solo i token mascherati

            correct = (predictions[mask] == labels[mask]).sum().item()
            total_correct += correct
            total_tokens += mask.sum().item()
            # Accumula predizioni e labels
            all_predictions.extend(predictions[mask].cpu().numpy())
            all_labels.extend(labels[mask].cpu().numpy())

    # Accuratezza
    accuracy = total_correct / total_tokens

    # Perplessità
    avg_loss = total_loss / total_tokens
    perplexity = math.exp(avg_loss)
    recall = recall_score(all_labels, all_predictions, average='weighted')

    return accuracy, perplexity, total_correct, total_loss, total_tokens, recall

# Esegui la valutazione
eval_dataloader = DataLoader(tokenized_datasets, batch_size=BATCH_SIZE, shuffle=False)
accuracy, perplexity, total_correct, total_loss, total_tokens, recall = evaluate_model(eval_dataloader)

# Stampa i risultati
# > 80% Ottimo modello
# = 70 Buon modello
# < 70% Mediocre
print(f"Accuratezza: {accuracy*100:.2f}")
#Se la loss è 0, la perplessità sarà 1, indicando previsioni perfette.
#Perplessità > 1 indica un grado di incertezza crescente
print(f"Perplessità: {perplexity:.4f}")
print(f"Recall: {recall:.4f}")  # Stampa il valore del Recall
print(f"Total correct: {total_correct:.2f}")
print(f"Total Loss: {total_loss:.2f}")
print(f"Total Token: {total_tokens:.2f}")
print(f"Length Dataset: {len(tokenized_datasets):.2f}")

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/431 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/845k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/505k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


la.nolorem.tok.latalphabetonly.v2.json:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/934 [00:00<?, ? examples/s]



Accuratezza: 33.14
Perplessità: 2.4693
Recall: 0.3314
Total correct: 1429.00
Total Loss: 3897.73
Total Token: 4312.00
Length Dataset: 934.00


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
