# Classificador de Sentiments a Xarxes Socials en Català (CSXSC): Fine-Tuning Model

**Author:** Daniel Arias Cámara  
**Date:** 07-2025  

**Description:** This notebook presents the fine-tuning process of the **CSXSC** model, which classifies Catalan-language social media reviews into three sentiment categories: positive, negative, and neutral.

The base model for CSXSC is [roberta-base-ca-v2](https://huggingface.co/projecte-aina/roberta-base-ca-v2), a Catalan adaptation of RoBERTa, an encoder-only transformer architecture well-suited for text classification tasks.

The base model is provided by the Aina Kit and was trained on a diverse Catalan corpus, including Catalan Crawling, Wikipedia, the Official Gazette of the Government of Catalonia (DOGC), and other publicly available datasets.

In [1]:
from datasets import load_dataset

# One CSV file per split, resolved relative to the notebook's working directory.
data_files = dict(
    train="./train.csv",
    validation="./validation.csv",
    test="./test.csv",
)

# The "csv" builder reads each file into the split named by its key.
dataset = load_dataset("csv", data_files=data_files)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Peek at the first training row to confirm the column names and the raw label format.
first_train_example = dataset["train"][0]
print(first_train_example)

{'text': 'Desvergonyidament robat de Tres Sabors Cornetto Posting', 'label': 'negative'}


In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer and classifier are both loaded from the same pretrained checkpoint.
checkpoint_name = "projecte-aina/roberta-base-ca-v2"

tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
tokenizer.padding_side = "right"

# num_labels=3 sizes the classification head for negative / neutral / positive.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=3,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at projecte-aina/roberta-base-ca-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Integer class id assigned to each sentiment name.
label2id = {
    "negative": 0,
    "neutral": 1,
    "positive": 2,
}


def map_label(example):
    """Convert a string ``label`` field to its integer class id, in place.

    String labels are matched after stripping surrounding whitespace and
    lower-casing. Labels that are not strings are left untouched, so applying
    the function to already-converted rows is a no-op.

    Args:
        example: Mapping with a ``"label"`` entry.

    Returns:
        The same ``example`` object, with ``"label"`` replaced when it was a string.

    Raises:
        ValueError: If a string label is not one of the keys of ``label2id``.
    """
    raw_label = example["label"]

    # Anything that is not a string is passed through unchanged.
    if not isinstance(raw_label, str):
        return example

    normalized = raw_label.strip().lower()
    if normalized not in label2id:
        raise ValueError(f"Unexpected label: {example['label']}")

    example["label"] = label2id[normalized]
    return example

# Apply the label conversion row by row to every split.
dataset = dataset.map(function=map_label)

In [5]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    encoded rows have the same length.

    Args:
        examples: Batch mapping with a ``"text"`` entry.

    Returns:
        The tokenizer's output for the batch.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(examples["text"], **tokenizer_options)


# batched=True hands the function whole batches of rows rather than one row at a time.
tokenized_datasets = dataset.map(function=preprocess_function, batched=True)

Map: 100%|██████████| 2379/2379 [00:00<00:00, 17053.28 examples/s]


In [6]:
# Rename the target column to "labels" and drop the raw text column, leaving
# only the tokenizer outputs and the target.
tokenized_datasets = (
    tokenized_datasets
    .rename_column("label", "labels")
    .remove_columns(["text"])
)

# set_format works in place: rows are returned as torch tensors from here on.
tokenized_datasets.set_format("torch")

In [None]:
from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score

def compute_metrics(eval_pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. The predicted class of each
            row is the argmax of ``predictions`` along axis 1.

    Returns:
        Dict with accuracy, macro-averaged F1, weighted-averaged F1 and
        quadratic-weighted Cohen's kappa, each keyed with an ``eval_`` prefix.
    """
    scores, gold_labels = eval_pred
    predicted_labels = np.argmax(scores, axis=1)

    accuracy = accuracy_score(gold_labels, predicted_labels)
    f1_macro = f1_score(gold_labels, predicted_labels, average="macro")
    f1_weighted = f1_score(gold_labels, predicted_labels, average="weighted")
    # Quadratic weighting penalises a negative<->positive confusion more than
    # a confusion involving the neutral class.
    qwk = cohen_kappa_score(gold_labels, predicted_labels, weights="quadratic")

    return {
        "eval_accuracy": accuracy,
        "eval_f1_macro": f1_macro,
        "eval_f1_weighted": f1_weighted,
        "eval_qwk": qwk,
    }

# Hyperparameter grid: one list of candidate values per knob. Every combination
# is trained once; as configured there is a single combination.
from itertools import product  # stdlib; kept local so this block is self-contained

learning_rates = [2e-5]
batch_sizes = [16]
weight_decays = [0.01]
epoch_options = [4]


def _snapshot_weights(module):
    """Return a detached CPU copy of ``module``'s state dict.

    The copy is independent of the live parameters, so training that happens
    after the snapshot is taken does not change it.
    """
    return {
        name: tensor.detach().cpu().clone()
        for name, tensor in module.state_dict().items()
    }


# Trainer fine-tunes `model` in place. Without a reset, the second and later
# grid configurations would continue from the previous run's weights, so their
# scores would not be comparable. Snapshot the starting weights once and restore
# them before each run.
# NOTE: this captures whatever `model` holds when the cell runs; reload the model
# before re-running this cell, otherwise the "starting" weights are already trained.
initial_weights = _snapshot_weights(model)
best_weights = None

best_score = -float("inf")
best_params = None
training_history = []

# product() iterates in the same order as four nested loops (last list fastest).
for lr, batch_size, wd, epochs in product(
    learning_rates, batch_sizes, weight_decays, epoch_options
):
    print(f"\nTraining with LR={lr}, BS={batch_size}, WD={wd}, Epochs={epochs}")

    # Every configuration starts from identical weights.
    model.load_state_dict(initial_weights)

    training_args = TrainingArguments(
        output_dir="./results",
        eval_strategy="epoch",
        save_strategy="no",  # no checkpoints on disk; the best weights are kept in memory below
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=wd,
        logging_strategy="epoch",
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        # `tokenizer=` is deprecated in Trainer.__init__; `processing_class=` replaces it.
        processing_class=tokenizer,
        compute_metrics=compute_metrics
    )

    train_result = trainer.train()
    eval_metrics = trainer.evaluate()

    # Store metrics + hyperparams so every run can be compared afterwards.
    eval_metrics.update({
        "learning_rate": lr,
        "batch_size": batch_size,
        "weight_decay": wd,
        "epochs": epochs,
        "train_loss": train_result.training_loss
    })
    training_history.append(eval_metrics)

    # Select best model based on QWK (change to eval_f1_macro if preferred)
    if eval_metrics["eval_qwk"] > best_score:
        best_score = eval_metrics["eval_qwk"]
        best_params = (lr, batch_size, wd, epochs)
        best_weights = _snapshot_weights(model)

if best_params is None:
    # Reached when the grid is empty or no run produced a comparable QWK (e.g. NaN).
    raise ValueError(
        "No configuration produced a valid QWK score; check that every "
        "hyperparameter list contains at least one value."
    )

# Leave `model` holding the winning configuration's weights rather than those of
# whichever configuration happened to run last.
model.load_state_dict(best_weights)

print(f"\nBest params: LR={best_params[0]}, BS={best_params[1]}, WD={best_params[2]}, "
      f"Epochs={best_params[3]} (QWK={best_score:.4f})")



Training with LR=2e-05, BS=16, WD=0.01, Epochs=4


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted,Qwk
1,0.52,0.44356,0.81757,0.805957,0.814708,0.847611
2,0.3659,0.442746,0.823455,0.813727,0.822289,0.858866
3,0.281,0.485128,0.827659,0.81725,0.825958,0.860215
4,0.225,0.522067,0.836486,0.827488,0.835554,0.869397



Best params: LR=2e-05, BS=16, WD=0.01, Epochs=4 (QWK=0.8694)
