# Classificador de Sentiments a Xarxes Socials en Català (CSXSC): Model Evaluation

**Author:** Daniel Arias Cámara  
**Date:** 07-2025  

**Description:**

In [1]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score

data_files = {
    "train": "./train.csv",
    "validation": "./validation.csv",
    "test": "./test.csv"
}
dataset = load_dataset("csv", data_files=data_files)

checkpoint = "projecte-aina/roberta-base-ca-v2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.padding_side = "right"

num_labels = 3
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels
)

label2id = {"negative": 0, "neutral": 1, "positive": 2}

def map_label(example):
    if isinstance(example["label"], str):
        label_lower = example["label"].strip().lower()
        if label_lower in label2id:
            example["label"] = label2id[label_lower]
        else:
            raise ValueError(f"Unexpected label: {example['label']}")
    return example

dataset = dataset.map(map_label)

def preprocess_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

tokenized_datasets = dataset.map(preprocess_function, batched=True)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets.set_format("torch")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1_macro": f1_score(labels, predictions, average="macro"),
        "f1_weighted": f1_score(labels, predictions, average="weighted"),
        "qwk": cohen_kappa_score(labels, predictions, weights="quadratic"),
    }

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    logging_strategy="epoch",
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="qwk",
    greater_is_better=True,
    gradient_accumulation_steps=1,
    fp16=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

test_results = trainer.evaluate(tokenized_datasets["test"])
print("\nFinal Evaluation on Test Set:")
for key, value in test_results.items():
    print(f"{key}: {value:.4f}")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at projecte-aina/roberta-base-ca-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted,Qwk
1,0.6465,0.451981,0.813367,0.803018,0.812101,0.843816
2,0.3975,0.448084,0.815048,0.803626,0.812358,0.855087
3,0.3141,0.453484,0.815469,0.80335,0.812367,0.853265
4,0.255,0.48481,0.824296,0.814086,0.823027,0.860137
5,0.2191,0.50631,0.820933,0.810657,0.81986,0.857983



Final Evaluation on Test Set:
eval_loss: 0.4376
eval_accuracy: 0.8369
eval_f1_macro: 0.8186
eval_f1_weighted: 0.8364
eval_qwk: 0.8715
eval_runtime: 13.9756
eval_samples_per_second: 170.2260
eval_steps_per_second: 5.3670
epoch: 5.0000


In [1]:
import numpy as np
import torch
from datasets import load_dataset, ClassLabel
from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    set_seed,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, TaskType

set_seed(42)

MODEL_CHECKPOINT = "./Starling-LM-7B-alpha"
DATA_FILES = {
    "train": "./train.csv",
    "validation": "./validation.csv",
    "test": "./test.csv"
}
MAX_LENGTH = 96

dataset = load_dataset("csv", data_files=DATA_FILES)

label_names = ["negative", "neutral", "positive"]
cl = ClassLabel(names=label_names)
dataset = dataset.cast_column("label", cl)

id2label = {i: label for i, label in enumerate(label_names)}
label2id = {label: i for i, label in id2label.items()}

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH
    )

tokenized_datasets = dataset.map(preprocess_function, batched=True)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets.set_format("torch")

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True,
)

model.config.pad_token_id = tokenizer.pad_token_id

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1_macro": f1_score(labels, predictions, average="macro"),
        "f1_weighted": f1_score(labels, predictions, average="weighted"),
        "qwk": cohen_kappa_score(labels, predictions, weights="quadratic"),
    }

training_args = TrainingArguments(
    output_dir="./results_stirling_qlora_optimized",
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    optim="paged_adamw_8bit",
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_ratio=0.1,
    bf16=True,
    gradient_checkpointing=True,
    load_best_model_at_end=True,
    metric_for_best_model="qwk",
    greater_is_better=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    report_to="none",
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

print("Starting QLoRA fine-tuning for Starling-LM-7B-alpha.")
trainer.train()

print("\nEvaluating the best model on the test set.")
test_results = trainer.evaluate(tokenized_datasets["test"])

print("\nFinal Evaluation on Test Set:")
for key, value in test_results.items():
    print(f"{key}: {value:.4f}")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 3/3 [00:15<00:00,  5.09s/it]
Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at ./Starling-LM-7B-alpha and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


trainable params: 6,828,032 || all params: 7,117,508,608 || trainable%: 0.0959
Starting QLoRA fine-tuning for Starling-LM-7B-alpha.


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted,Qwk
1,14.4836,1.296154,0.656578,0.648504,0.661192,0.652657
2,8.1661,1.128759,0.690626,0.670021,0.682865,0.709337
3,6.6027,1.039839,0.693148,0.678921,0.691578,0.70546





Evaluating the best model on the test set.



Final Evaluation on Test Set:
eval_loss: 1.0908
eval_accuracy: 0.6839
eval_f1_macro: 0.6489
eval_f1_weighted: 0.6767
eval_qwk: 0.7107
eval_runtime: 391.4062
eval_samples_per_second: 6.0780
eval_steps_per_second: 1.5200
epoch: 3.0000


In [1]:
import numpy as np
import torch
from datasets import load_dataset, ClassLabel
from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    set_seed,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, TaskType

set_seed(42)

MODEL_CHECKPOINT = "./salamandra-7b"
DATA_FILES = {
    "train": "./train.csv",
    "validation": "./validation.csv",
    "test": "./test.csv"
}
MAX_LENGTH = 96

dataset = load_dataset("csv", data_files=DATA_FILES)

label_names = ["negative", "neutral", "positive"]
cl = ClassLabel(names=label_names)
dataset = dataset.cast_column("label", cl)

id2label = {i: label for i, label in enumerate(label_names)}
label2id = {label: i for i, label in id2label.items()}

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH
    )

tokenized_datasets = dataset.map(preprocess_function, batched=True)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets.set_format("torch")

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True,
)

model.config.pad_token_id = tokenizer.pad_token_id

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1_macro": f1_score(labels, predictions, average="macro"),
        "f1_weighted": f1_score(labels, predictions, average="weighted"),
        "qwk": cohen_kappa_score(labels, predictions, weights="quadratic"),
    }

training_args = TrainingArguments(
    output_dir="./results_salamandra",
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    optim="paged_adamw_8bit",
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_ratio=0.1,
    bf16=True,
    gradient_checkpointing=True,
    load_best_model_at_end=True,
    metric_for_best_model="qwk",
    greater_is_better=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    report_to="none",
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

print("Starting QLoRA fine-tuning for Salamandra-7b.")
trainer.train()

print("\nEvaluating the best model on the test set.")
test_results = trainer.evaluate(tokenized_datasets["test"])

print("\nFinal Evaluation on Test Set:")
for key, value in test_results.items():
    print(f"{key}: {value:.4f}")

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 19030/19030 [00:00<00:00, 25929.27 examples/s]
Map: 100%|██████████| 2379/2379 [00:00<00:00, 25696.74 examples/s]
Map: 100%|██████████| 2379/2379 [00:00<00:00, 24748.87 examples/s]
Loading checkpoint shards: 100%|██████████| 4/4 [00:13<00:00,  3.35s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at ./salamandra-7b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


trainable params: 6,828,032 || all params: 6,726,381,568 || trainable%: 0.1015
Starting QLoRA fine-tuning for Salamandra-7b.


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted,Qwk
1,5.902,0.587382,0.741068,0.726644,0.738072,0.745122
2,4.4102,0.563246,0.745691,0.732247,0.743404,0.764441
3,4.0926,0.555918,0.746953,0.731599,0.743085,0.760114





Evaluating the best model on the test set.



Final Evaluation on Test Set:
eval_loss: 0.5455
eval_accuracy: 0.7533
eval_f1_macro: 0.7301
eval_f1_weighted: 0.7512
eval_qwk: 0.7872
eval_runtime: 333.4366
eval_samples_per_second: 7.1350
eval_steps_per_second: 1.7840
epoch: 3.0000
