In [None]:
from datasets import Dataset, load_metric
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments, AutoModelForSequenceClassification, Trainer
import numpy as np
import pandas as pd

from huggingface_hub import notebook_login
from collections import Counter

Preparazione dataset

In [None]:
# Read the train / eval CSV splits into DataFrames (train first, then eval).
train_pd, eval_pd = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/modelSelection/train_set.csv',
        'data/modelSelection/eval_set.csv',
    )
)

# Wrap each DataFrame as a `datasets.Dataset` so it can be mapped and fed to the Trainer.
train_dataset, eval_dataset = map(Dataset.from_pandas, (train_pd, eval_pd))

Definizione modello e training

In [None]:
# Hub identifier of the pretrained checkpoint; reused below for both tokenizer and model.
checkpoint = "nlpaueb/legal-bert-base-uncased"

# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)
def preprocess_function(examples):
    """Tokenize the ``"Processed_text"`` field of a batch of examples.

    Args:
        examples: Mapping-like batch that exposes a ``"Processed_text"`` entry
            (presumably a list of strings when called via ``Dataset.map`` with
            ``batched=True`` -- confirm against the CSV schema).

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text, with
        truncation enabled and a maximum length of 512.
    """
    texts = examples["Processed_text"]
    return tokenizer(texts, truncation=True, max_length=512)
 
# Tokenize both splits in batched mode (train first, then eval).
tokenized_train, tokenized_eval = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, eval_dataset)
)

# Collator built from the same tokenizer; it is handed to the Trainer to assemble batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(eval_pred):
    """Compute accuracy, macro-F1 and per-class F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is array-like with the
            class scores on the last axis; ``labels`` holds the integer class
            id of each example.

    Returns:
        dict with the keys ``"Accuracy"``, ``"F1_macro"`` and one
        ``"F1_class_<k>"`` entry for every class index ``k`` along the last
        axis of ``logits``. Classes that appear neither in ``labels`` nor in
        the predictions get an F1 of 0.0.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Per-class F1 score.
    # Without an explicit `labels=` argument, f1_score(average=None) returns
    # one value per class *present* in labels/predictions, in sorted order.
    # Indexing that result with enumerate() would attach scores to the wrong
    # class id as soon as any class is missing from the evaluation data, and
    # the set of metric keys would vary between evaluations. Passing the full
    # class range keeps position k == class k.
    class_ids = np.arange(np.asarray(logits).shape[-1])
    f1_scores = f1_score(
        labels,
        predictions,
        labels=class_ids,
        average=None,
        zero_division=0,  # absent classes score 0.0 instead of raising a warning
    )

    # Aggregated (macro-averaged) F1 score; left exactly as before so the
    # headline number stays comparable with earlier runs.
    f1_macro = f1_score(labels, predictions, average='macro')

    # Accuracy.
    accuracy = accuracy_score(labels, predictions)

    # Collect everything in a single dictionary.
    metrics = {"Accuracy": accuracy, "F1_macro": f1_macro}
    for class_id, f1 in zip(class_ids, f1_scores):
        metrics[f"F1_class_{int(class_id)}"] = f1

    return metrics


In [None]:
from transformers import TrainingArguments
# Fine-tuning configuration handed to the Trainer.
training_args = TrainingArguments(
    # Output location and Hub publishing.
    output_dir="DayOne",
    push_to_hub=True,
    # Optimisation hyper-parameters.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=20,
    # Batch sizes per device.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Step-based evaluation with logging every 250 steps.
    # NOTE(review): no eval_steps/save_steps are set, so the library defaults
    # apply -- confirm they are compatible with load_best_model_at_end.
    evaluation_strategy="steps",
    logging_steps=250,
    load_best_model_at_end=True,
)

In [None]:

# Sequence-classification model loaded from the same checkpoint, configured for 20 labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=20,
)

# Bundle model, configuration, data and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    # Tokenized splits.
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    # Batch assembly and tokenizer.
    data_collator=data_collator,
    tokenizer=tokenizer,
    # Evaluation metrics.
    compute_metrics=compute_metrics,
)

# Run the fine-tuning loop.
trainer.train()

In [None]:
# Upload the model to the Hugging Face Hub (you must be logged in and have a Hugging Face access key).
trainer.push_to_hub()