In [9]:
# Montaggio Google Drive
from google.colab import drive
drive.mount('/content/drive')


# Installazione librerie necessarie
!pip install -U transformers datasets accelerate evaluate lxml pandas scikit-learn

import xml.etree.ElementTree as ET
import pandas as pd
import csv
import os


# Definizione dei percorsi

drive_root = "/content/drive/MyDrive"
dataset_folder = f"{drive_root}/e-rte-3-it"
script_path = f"{drive_root}/run_glue_no_trainer_italiano.py"

### INIZIA LA PARTE DI CONVERSIONE IN CSV ###

def parse_rte_xml(xml_file):
    # Legge file binario grezzo
    with open(xml_file, "rb") as f:
        raw = f.read()

  # Decodifica con pulizia
    text = raw.decode("utf-8-sig", errors="replace")

    try:
        root = ET.fromstring(text)
    except ET.ParseError as e:
        raise ValueError(f"Errore di parsing in {xml_file}: {e}")

    # Estrae coppie T-H ed etichette
    data = []
    for pair in root.findall("pair"):
        id_ = pair.get("id")
        entailment = pair.get("entailment")
        t = pair.find("t").text.strip() if pair.find("t") is not None else ""
        h = pair.find("h").text.strip() if pair.find("h") is not None else ""
        data.append({"id": id_, "sentence1": t, "sentence2": h, "label": entailment})
    return pd.DataFrame(data)

# Converte e pulisce DEV.xml
dev_df = parse_rte_xml(os.path.join(dataset_folder, "DEV.xml"))

# Normalizza etichette
dev_df["label"] = dev_df["label"].str.strip().str.lower()

# Mostra distribuzione
print("Distribuzione etichette in DEV:")
print(dev_df["label"].value_counts())

# Mappa le etichette
label_map = {"yes": 0, "no": 1, "unknown": 2}
dev_df = dev_df[dev_df["label"].isin(label_map)]  # rimuove le righe fuori mappa
dev_df["label"] = dev_df["label"].map(label_map)

print("Dopo la mappatura:")
print(dev_df["label"].value_counts())

# Split train/val
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(dev_df, test_size=0.2, stratify=dev_df["label"], random_state=42)

# Converte in csv
train_df.to_csv("train.csv", index=False)
val_df.to_csv("validation.csv", index=False)

# Crea la cartella di output
output_folder = f"{drive_root}/e-rte3-it-model"
os.makedirs(output_folder, exist_ok=True)

# Salva il file .csv
!cp train.csv "{output_folder}/train.csv"
!cp validation.csv "{output_folder}/validation.csv"

### FINISCE LA PARTE DI CONVERSIONE IN CSV ###

### ADDESTRAMENTO DEL MODELLO ###
!python "{script_path}" \
  --train_file train.csv \
  --validation_file validation.csv \
  --model_name_or_path dbmdz/bert-base-italian-xxl-cased \
  --per_device_train_batch_size 8 \
  --per_device_eval_batch_size 8 \
  --learning_rate 1e-5 \
  --num_train_epochs 10 \
  --output_dir "{output_folder}" \
  --max_length 128 \
  --pad_to_max_length \
  --seed 31 \
  --checkpointing_steps epoch \
  --load_best_model_at_end \
  --metric_for_best_model accuracy \
  --greater_is_better True


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Distribuzione etichette in DEV:
label
yes        412
unknown    308
no          80
Name: count, dtype: int64
Dopo la mappatura:
label
0    412
2    308
1     80
Name: count, dtype: int64
2025-06-24 14:18:02.171757: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750774682.370214   11403 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750774682.427111   11403 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Generating train split: 640 examples [00:00, 53428.50 examples/s]
Generating validation split: 160 examples [00:00, 33684

In [10]:
### CONVERSIONE IN CSV DEL TEST SET ###
test_df = parse_rte_xml(os.path.join(dataset_folder, "TEST.xml"))

test_df["label"] = test_df["label"].str.strip().str.lower()

print("Distribuzione etichette in TEST:")
print(test_df["label"].value_counts())

# Mappa le etichette
label_map = {"yes": 0, "no": 1, "unknown": 2}
test_df = test_df[test_df["label"].isin(label_map)]  # rimuove le righe fuori mappa
test_df["label"] = test_df["label"].map(label_map)

print("Dopo la mappatura:")
print(test_df["label"].value_counts())

test_df.to_csv("test.csv", index=False)

!cp test.csv "{output_folder}/test.csv"

Distribuzione etichette in TEST:
label
yes        388
unknown    342
no          70
Name: count, dtype: int64
Dopo la mappatura:
label
0    388
2    342
1     70
Name: count, dtype: int64


In [11]:
### EVALUATION SUL TEST SET ###
# Importazione librerie necessarie
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
import evaluate
import torch
from torch.utils.data import DataLoader
import os

# Selezione del modello migliore dopo il processo di addestramento

device     = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = output_folder
best_dir = os.path.join(output_folder, "best_model")

tokenizer = AutoTokenizer.from_pretrained(best_dir)
model     = AutoModelForSequenceClassification.from_pretrained(best_dir).to(device)
model.eval()

# Importazione del test set da CSV
test_dataset = load_dataset("csv", data_files={"test": "test.csv"})["test"]

# Tokenizzazione
def preprocess(example):
    return tokenizer(
        example["sentence1"],
        example["sentence2"],
        truncation=True,
        padding="max_length",
        max_length=128
    )
encoded_test = test_dataset.map(preprocess, batched=True)
encoded_test.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Dataloader
test_loader = DataLoader(encoded_test, batch_size=32)

# Inferenza
preds, refs = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids      = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels         = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        pred    = outputs.logits.argmax(dim=-1)

        preds.extend(pred.cpu().tolist())
        refs.extend(labels.cpu().tolist())

# Calcolo accuracy
accuracy = evaluate.load("accuracy")
result   = accuracy.compute(predictions=preds, references=refs)
print("Accuracy sul test set:", result["accuracy"])

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Accuracy sul test set: 0.64375
