In [None]:
from datasets import Dataset
import numpy as np
from numpy.typing import NDArray
import pandas as pd
import optuna
from sklearn.preprocessing import LabelEncoder
from transformers.data.data_collator import DataCollatorWithPadding
from transformers.models.auto.modeling_auto import AutoModelForSequenceClassification
from transformers.models.auto.tokenization_auto import AutoTokenizer
from transformers.modeling_utils import PreTrainedModel
from transformers.tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase
from transformers.trainer import Trainer
from transformers.trainer_callback import EarlyStoppingCallback
from transformers.training_args import TrainingArguments

from modules import models, paths, utils

**TODO:** choose the final model and tune `eval_steps` (it affects early stopping). Add wandb logging. Once the dataset is finalized, fix the `df['label']`-style column accesses and similar ones (e.g. `train_df[['description','label']]`).

In [None]:
# Hub identifier of the pretrained checkpoint; the tokenizer and the classifier are both loaded from it.
model_name: str = "sentence-transformers/all-MiniLM-L6-v2"

### Data

In [None]:
# Read the training and validation CSV splits straight from the Hugging Face Hub.
train_df: pd.DataFrame
val_df: pd.DataFrame
train_df, val_df = (
    pd.read_csv(split_url)
    for split_url in (
        'hf://datasets/sapienzanlp/nlp2025_hw1_cultural_dataset/train.csv',
        'hf://datasets/sapienzanlp/nlp2025_hw1_cultural_dataset/valid.csv',
    )
)

In [None]:
# Turn the raw labels into integer class ids.
# The encoder is fitted on the training split only; the same mapping is then
# applied to both splits, overwriting their 'label' column.
label_encoder: LabelEncoder = LabelEncoder()
label_encoder.fit(train_df['label'])
train_df['label'] = label_encoder.transform(train_df['label'])
val_df['label'] = label_encoder.transform(val_df['label'])

In [None]:
# Tokenizer that belongs to the chosen checkpoint, plus a collator that uses it
# to pad the examples of a batch.
tokenizer: PreTrainedTokenizerBase = AutoTokenizer.from_pretrained(model_name)
data_collator: DataCollatorWithPadding = DataCollatorWithPadding(tokenizer)

In [None]:
# Give the data a suitable format

def tokenize(samples) -> BatchEncoding:
    """Tokenize the ``description`` field of a batch of samples.

    Sequences are truncated by the tokenizer but intentionally NOT padded here:
    padding inside ``Dataset.map`` would pad every example to the longest item
    of its map batch and store the padding in the dataset. Both Trainers in this
    notebook pad dynamically per training batch instead (through
    ``DataCollatorWithPadding``), which is cheaper.

    Args:
        samples: Batch passed in by ``Dataset.map(..., batched = True)``; its
            ``'description'`` entry holds the texts to tokenize.

    Returns:
        The tokenizer output for the batch (unpadded, truncated).
    """
    return tokenizer(samples['description'], truncation = True)

# Keep only the text and target columns, wrap each split in a Dataset and
# tokenize it in batches.
text_and_label: list[str] = ['description', 'label']
train_data: Dataset = Dataset.from_pandas(train_df[text_and_label]).map(tokenize, batched = True)
val_data: Dataset = Dataset.from_pandas(val_df[text_and_label]).map(tokenize, batched = True)

### Model

In [None]:
# Load the pretrained checkpoint with a classification head that has one output
# per encoded label. Size mismatches between checkpoint and head are tolerated.
n_classes: int = len(label_encoder.classes_)
model: PreTrainedModel = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels = n_classes,
    ignore_mismatched_sizes = True,
)

### Tuning

In [None]:
def model_init() -> PreTrainedModel:
    """Build a fresh, untrained-head classifier for one hyperparameter-search trial.

    Passed to ``Trainer(model_init=...)`` so that every trial starts from the
    pretrained checkpoint instead of reusing weights from a previous trial.
    The model is constructed exactly like the notebook's main ``model`` cell:
    same checkpoint, one output per encoded label.

    Returns:
        A sequence-classification model loaded from ``model_name``.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels = len(label_encoder.classes_),
        ignore_mismatched_sizes = True,
    )

def objective(trial):
    """Optuna objective: fine-tune the classifier once and score it on the validation set.

    Samples the learning rate, weight decay, number of epochs and warmup ratio
    from ``trial``, trains a fresh model obtained from ``model_init`` and
    evaluates the best checkpoint (selected by validation accuracy) on
    ``val_data``.

    Args:
        trial: Optuna trial used to sample the hyperparameters. The validation
            F1 score is stored on it as the ``"f1"`` user attribute.

    Returns:
        Validation accuracy of the best checkpoint; this is the value the study
        optimizes.

    Raises:
        KeyError: If the metrics function does not report ``accuracy`` or ``f1``.
    """
    lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    batch_size = 32  # fixed on purpose: not part of the search space
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.3)
    num_epochs = trial.suggest_int("num_train_epochs", 3, 8)
    warmup_ratio = trial.suggest_float("warmup_ratio", 0.0, 0.2)

    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=weight_decay,
        warmup_ratio=warmup_ratio,
        load_best_model_at_end=True,
        eval_strategy="epoch",
        save_strategy="epoch",
        metric_for_best_model="accuracy",
        greater_is_better=True,
        report_to="none"
    )

    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=val_data,
        # Same keyword as the final training cell; `tokenizer=` is the deprecated spelling.
        processing_class=tokenizer,
        data_collator=data_collator,
        # Same metrics function as the final training run, so scores are comparable.
        compute_metrics=models.transformer_metrics,
    )
    trainer.train()

    # load_best_model_at_end=True has already restored the best checkpoint into
    # the trainer, so a plain evaluate() scores that checkpoint; no manual
    # reload or device placement is needed.
    eval_results = trainer.evaluate()
    acc = eval_results["eval_accuracy"]
    f1 = eval_results["eval_f1"]

    # Keep F1 alongside the optimized value for later inspection.
    trial.set_user_attr("f1", f1)

    return acc

In [None]:
# Hyperparameter search: larger objective values are better.
study = optuna.create_study(direction = "maximize")
study.optimize(func = objective, n_trials = 1)

In [None]:
# Summarize the winning trial. The objective returns validation accuracy, so
# that is what `value` holds; F1 is only recorded as a user attribute.
best_trial = study.best_trial
print("Best trial:")
print("  Value (accuracy):", best_trial.value)
print("  F1:", best_trial.user_attrs.get("f1"))
print("  Params:")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

In [None]:
# Objective value per trial.
optuna.visualization.plot_optimization_history(study).show()

# Parameter importances cannot be estimated from fewer than two completed
# trials (Optuna raises in that case), so only plot them when enough exist.
completed_trials = study.get_trials(deepcopy = False, states = (optuna.trial.TrialState.COMPLETE,))
if len(completed_trials) > 1:
    optuna.visualization.plot_param_importances(study).show()
else:
    print("Skipping parameter importances: at least two completed trials are required.")

### Training

In [None]:
# Fine-tune the model.
# The epoch count is only a generous upper bound: the early-stopping callback
# below is what ends the run.
trainargs: TrainingArguments = TrainingArguments(
    output_dir = str(paths.TRANSFORMER_MODEL_DIR),
    num_train_epochs = 1000,
    auto_find_batch_size = True,
    fp16 = True,
    # Evaluate every 100 steps; keep only the best checkpoint (by F1) and
    # reload it when training finishes.
    eval_strategy = 'steps',
    eval_steps = 100,
    save_strategy = 'best',
    save_total_limit = 1,
    metric_for_best_model = 'f1',
    load_best_model_at_end = True,
    report_to = 'none',    # TODO: switch to wandb
)

early_stopping: EarlyStoppingCallback = EarlyStoppingCallback(early_stopping_patience = 10)

trainer: Trainer = Trainer(
    model = model,
    args = trainargs,
    train_dataset = train_data,
    eval_dataset = val_data,
    processing_class = tokenizer,
    compute_metrics = models.transformer_metrics,
    callbacks = [early_stopping],
)

trainer.train()

In [None]:
# Score the trained model on the validation set and print each metric with
# three decimals.
val_results: dict[str, float] = trainer.evaluate()
for metric_title, metric_key in (
    ('Loss', 'eval_loss'),
    ('Accuracy', 'eval_accuracy'),
    ('F1 score', 'eval_f1'),
    ('Precision', 'eval_precision'),
    ('Recall', 'eval_recall'),
):
    print(f"{metric_title}: {val_results[metric_key]:.3f}")

In [None]:
# Confusion matrix on the validation set: predicted class = index of the
# largest logit, compared against the encoded validation labels.
val_prediction_output = trainer.predict(val_data)
logits: NDArray[np.float32] = np.array(val_prediction_output.predictions)    # type: ignore
predictions_encoded: NDArray[np.intp] = logits.argmax(axis = 1)
utils.plot_confusion_matrix(val_df['label'], predictions_encoded, label_encoder)

### Test