In [None]:
import torch
from modules import paths
from datasets import Dataset
from transformers import AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, set_seed, AutoModelForSequenceClassification
from modules.transformer_model import build_model, metrics
import nest_asyncio
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import optuna
import pandas as pd

In [None]:
# Patch asyncio so nested event loops are permitted inside the notebook kernel,
# which already runs its own loop.
# NOTE(review): which later call actually needs this is not visible in this notebook — confirm it is still required.
nest_asyncio.apply()

In [None]:
# Fix the global RNG seed up front so every later cell starts from the same state.
set_seed(0)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Utilizzo:", device)

In [None]:
# Load the train/validation CSV splits from the Hugging Face Hub.
DATASET_ROOT = "hf://datasets/sapienzanlp/nlp2025_hw1_cultural_dataset"
df_train: pd.DataFrame = pd.read_csv(f"{DATASET_ROOT}/train.csv")
df_val: pd.DataFrame = pd.read_csv(f"{DATASET_ROOT}/valid.csv")

# Encode the string labels as integer ids. Sorting makes the mapping deterministic.
labels = sorted(df_train['label'].unique())
lab2id = {l: i for i, l in enumerate(labels)}
df_train['label'] = df_train['label'].map(lab2id)
df_val['label'] = df_val['label'].map(lab2id)

# The mapping is built from the training split only: a validation label that never
# occurs in training would silently become NaN, so fail fast instead.
if df_val['label'].isna().any():
    raise ValueError("Validation split contains labels that are missing from the training split")

# Small fixed-seed subsamples used for the hyperparameter search.
# min() keeps sample() from raising when a split has fewer rows than requested.
df_train_sample = df_train.sample(n=min(1000, len(df_train)), random_state=0).reset_index(drop=True)
df_val_sample = df_val.sample(n=min(300, len(df_val)), random_state=0).reset_index(drop=True)

# Keep only the text and label columns for the Hugging Face datasets.
train_ds = Dataset.from_pandas(df_train_sample[['description', 'label']])
val_ds = Dataset.from_pandas(df_val_sample[['description', 'label']])


In [None]:
# Name of the pretrained checkpoint passed to the tokenizer and to build_model.
model_type = "xlm-roberta-base"
# Number of output classes, derived from the label set built in the data-loading cell
# so it cannot drift from lab2id (previously hard-coded to 3).
classes = len(labels)

In [None]:
# Tokenizer matching the pretrained checkpoint selected in `model_type`.
tokenizer = AutoTokenizer.from_pretrained(model_type)


def tokenize(examples):
    """Tokenize the ``description`` field of ``examples`` with truncation enabled.

    No padding is requested here.

    Args:
        examples: mapping with a ``"description"`` entry (used with
            ``Dataset.map(..., batched=True)`` in this notebook).

    Returns:
        Whatever ``tokenizer`` returns for those descriptions.
    """
    descriptions = examples["description"]
    encoded = tokenizer(descriptions, truncation=True)
    return encoded

In [None]:
# Tokenize both sampled splits in batches, then rename the target column
# from "label" to "labels".
train_data = (
    train_ds
    .map(tokenize, batched=True)
    .rename_column("label", "labels")
)
val_data = (
    val_ds
    .map(tokenize, batched=True)
    .rename_column("label", "labels")
)

In [None]:
# tokenize() does not pad, so padding is left to this collator when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def model_init():
    """Create a new model via ``build_model(model_type, classes)`` and move it to ``device``.

    Handed to ``Trainer`` as ``model_init`` so each trainer builds its own model
    instead of sharing one instance.
    """
    model = build_model(model_type, classes)
    return model.to(device)

def objective(trial):
    """Optuna objective: fine-tune on the sampled data and return validation accuracy.

    Learning rate, weight decay, number of epochs and warmup ratio are sampled
    from ``trial``. The batch size is fixed at 32 and is therefore NOT stored in
    ``trial.params``. The F1 value reported by ``metrics`` is attached to the
    trial as the ``"f1"`` user attribute.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        The ``"accuracy"`` entry of the ``metrics`` result computed on ``val_data``.
    """
    lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    batch_size = 32  # fixed, not searched
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.3)
    num_epochs = trial.suggest_int("num_train_epochs", 3, 8)
    warmup_ratio = trial.suggest_float("warmup_ratio", 0.0, 0.2)

    # NOTE(review): every trial writes its checkpoints to the same output_dir;
    # consider a per-trial directory if checkpoints must be kept apart.
    trainargs = TrainingArguments(
        output_dir="./results",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=weight_decay,
        warmup_ratio=warmup_ratio,
        load_best_model_at_end=True,
        eval_strategy="epoch",
        save_strategy="epoch",
        metric_for_best_model="accuracy",
        greater_is_better=True,
        report_to="none"
    )

    trainer = Trainer(
        model_init=model_init,
        args=trainargs,
        train_dataset=train_data,
        eval_dataset=val_data,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=metrics,
    )
    trainer.train()

    # load_best_model_at_end=True makes train() restore the best checkpoint into
    # trainer.model, so no manual reload is needed. Reloading through
    # AutoModelForSequenceClassification would also bypass build_model.
    # predict() is the public Trainer API, already used for the final evaluation.
    raw_preds = trainer.predict(val_data)

    result = metrics((raw_preds.predictions, raw_preds.label_ids))
    f1 = result.get("f1")
    acc = result.get("accuracy")

    # Keep F1 alongside the optimized value so it can be inspected after the search.
    trial.set_user_attr("f1", f1)

    return acc

In [None]:
# Maximize the value returned by `objective` (validation accuracy).
# The sampler is seeded explicitly: set_seed(0) does not seed Optuna's own RNG,
# so without this the suggested hyperparameters differ on every run.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=0),
)
study.optimize(objective, n_trials=15)

In [None]:
# Summarize the best trial. The optimized value is the accuracy returned by
# `objective`; F1 is only stored as a user attribute on each trial.
best_trial = study.best_trial
print("Best trial:")
print("  Value (accuracy):", best_trial.value)
print("  F1:", best_trial.user_attrs.get("f1"))
print("  Params:")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")


In [None]:
# Retrain on the full train split with the best hyperparameters found by the study.
best_params = study.best_trial.params

# The batch size was fixed at 32 inside `objective` and never sampled through the
# trial, so it is not in best_params; indexing it directly raised KeyError.
final_batch_size = best_params.get("batch_size", 32)

# Rebuild the datasets from the full (non-sampled) frames. This intentionally
# rebinds train_data / val_data, which the evaluation cell reads afterwards.
train_ds = Dataset.from_pandas(df_train[['description', 'label']])
val_ds = Dataset.from_pandas(df_val[['description', 'label']])
train_data = train_ds.map(tokenize, batched=True).rename_column("label", "labels")
val_data = val_ds.map(tokenize, batched=True).rename_column("label", "labels")

trainargs = TrainingArguments(
    output_dir="./best_model",
    learning_rate=best_params["lr"],
    per_device_train_batch_size=final_batch_size,
    per_device_eval_batch_size=final_batch_size,
    num_train_epochs=best_params["num_train_epochs"],
    weight_decay=best_params["weight_decay"],
    warmup_ratio=best_params["warmup_ratio"],
    eval_strategy="epoch",
    save_strategy="epoch",
    # Keep the checkpoint with the highest validation accuracy.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    report_to="none"
)

final_trainer = Trainer(
    model_init=model_init,
    args=trainargs,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=metrics
)

final_trainer.train()


In [None]:
# Show the study diagnostics: optimization history first, then parameter importances.
for make_figure in (
    optuna.visualization.plot_optimization_history,
    optuna.visualization.plot_param_importances,
):
    make_figure(study).show()

In [None]:
# Evaluate the retrained model on the validation data and plot its confusion matrix.
preds = final_trainer.predict(val_data)
y_true = preds.label_ids
y_pred = preds.predictions.argmax(-1)

# Pass the full id list explicitly: without `labels=`, a class that appears in
# neither y_true nor y_pred is dropped and the matrix no longer matches the
# display labels. Lists are used instead of dict views.
class_names = list(lab2id.keys())
class_ids = list(lab2id.values())

cm = confusion_matrix(y_true, y_pred, labels=class_ids)
disp = ConfusionMatrixDisplay(cm, display_labels=class_names)
disp.plot(cmap="Blues", xticks_rotation=45)
disp.ax_.set_title("Confusion Matrix")
plt.tight_layout()
plt.show()