In [1]:
import torch

# Report whether PyTorch can see a CUDA device before doing any heavy work.
print(
    "GPU is available"
    if torch.cuda.is_available()
    else "GPU is not available"
)

GPU is available


# Classificador de Sentiments a Xarxes Socials en Català (CSXSC): Dataset

**Author:** Daniel Arias Cámara  
**Date:** 25-07-2025  

**Description:**  This notebook aims to build a high-quality dataset for fine-tuning the **CSXSC** model. The dataset is constructed by combining trusted data sources, including structured sentiment corpora and translated social media content. Details on data origin and preprocessing steps are provided in the sections below.


## 1. GuiaCat Dataset

**Description:** This dataset consists of 5,750 restaurant reviews in Catalan, sourced from the GuiaCat platform. Each review includes individual ratings for service, food, price-quality ratio, and atmosphere, along with an overall average score.

**Access:** [projecte-aina/GuiaCat on Hugging Face](https://huggingface.co/datasets/projecte-aina/GuiaCat)

**Source:** Aina Project

**Notes:**  
The dataset is divided into three subsets:  
- **Train:** 4,750 rows  
- **Validation:** 500 rows  
- **Test:** 500 rows  

The original fields are: Service, Food, Price-quality, Environment, Avg, Text, and Label.  
For our purposes, we retain only the Text and Label fields, discarding the rest.

The Label field includes five sentiment categories:  
- Molt bo (Very good)  
- Bo (Good)  
- Regular (Average)  
- Dolent (Bad)  
- Molt dolent (Very bad)

These are grouped into three classes for sentiment classification:  
- **Positive:** Molt bo and Bo  
- **Neutral:** Regular  
- **Negative:** Dolent and Molt dolent


In [5]:
import os

# Make sure the Hugging Face `datasets` package is importable in this kernel,
# installing it on the fly when it is missing.
try:
    import datasets
except ImportError:
    import importlib
    import subprocess
    import sys

    # Run pip through the interpreter that backs this kernel. A bare `pip` is
    # looked up on PATH and may belong to a different Python installation, in
    # which case the package would be installed where this kernel cannot see it.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "datasets"])

    # Let the import system notice the package that was just installed, then
    # bind `datasets` so both branches leave the same name defined.
    importlib.invalidate_caches()
    import datasets

from datasets import load_dataset

def relabel(opinion):
    """Collapse a GuiaCat rating label into a three-class sentiment label.

    The "label" value is compared case-insensitively:
    "molt bo" / "bo" become "positive", "regular" becomes "neutral" and
    "dolent" / "molt dolent" become "negative". Any other value is left
    untouched.

    Args:
        opinion: Mapping with a string "label" entry; it is updated in place.

    Returns:
        The same ``opinion`` mapping, with "label" rewritten when recognised.
    """
    rating_to_sentiment = {
        "molt bo": "positive",
        "bo": "positive",
        "regular": "neutral",
        "dolent": "negative",
        "molt dolent": "negative",
    }
    rating = opinion["label"].lower()
    if rating in rating_to_sentiment:
        opinion["label"] = rating_to_sentiment[rating]
    return opinion


ds_guiacat = load_dataset("projecte-aina/GuiaCat")
ds = {}

# For every split, keep only the review text and its label, then map the
# label onto the positive / neutral / negative scheme. `relabel` is defined
# once above instead of being re-created on each loop iteration.
for split in ds_guiacat:
    drop_columns = [
        col for col in ds_guiacat[split].column_names if col not in ("text", "label")
    ]
    ds[split] = ds_guiacat[split].remove_columns(drop_columns).map(relabel)

# NOTE(review): `output_dirs` is not read by the export loop below. The CSVs
# are written into directories named after the split itself, so validation
# data lands in "validation/", not in the "validate" value listed here.
# Confirm which directory name is intended before relying on this mapping.
output_dirs = dict(train="train", validation="validate", test="test")

# Write each GuiaCat split to <split>/guiacat.csv, creating the directory
# when it does not exist yet.
for split_name in ("train", "validation", "test"):
    os.makedirs(split_name, exist_ok=True)
    ds[split_name].to_csv(os.path.join(split_name, "guiacat.csv"), index=False)

# Show the first example of every split as a quick sanity check.
for heading, split_name in (("Train", "train"), ("Validation", "validation"), ("Test", "test")):
    print(f"{heading}:", ds[split_name][0])

Creating CSV from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 316.32ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 401.64ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 424.27ba/s]

Train: {'text': 'El lloc és acollidor. El tracte, familiar. Els plats són casolans, abundants i de qualitat! Productes de la terra com embutits i altres de la zona. Hi tornarem segur!', 'label': 'positive'}
Validation: {'text': 'Bon Menjar i bon tracte en un restaurant que segú hi tornaràs un altre vegada.', 'label': 'positive'}
Test: {'text': "Fantàstic restaurant ,una carta plena de plats creatius i cuina de temporada que sempre es d'agrair , i el que varem menjar nosaltres molt bé, estic segura que tornaré no ho dubtaré.El tracte bó i l'espai molt acollidor.", 'label': 'positive'}





## 2. Catalan Structured Sentiment Analysis (CaSSA) Dataset

**Description:** The CaSSA dataset contains 6,400 reviews and forum messages in Catalan, annotated at the fine-grained level with polar expressions. Each text instance is labeled with all the sentiment expressions it contains. For each polar expression, the annotation includes the **expression itself**, the **target** (i.e., the object of the sentiment), and the **source** (i.e., the subject expressing the sentiment). In total, 25,453 polar expressions have been annotated.

**Access:** [projecte-aina/CaSSA on Hugging Face](https://huggingface.co/datasets/projecte-aina/CaSSA-catalan-structured-sentiment-analysis)

**Source:** Aina Project

**Notes:**

Each instance in the dataset is a text. Each text can have zero or more polar expressions, which are stored in the "opinions" field. Each opinion contains a source, a target, a polar expression, a polarity value, and an intensity value.

To convert this structured information into a single sentiment label per text, we apply the following strategy:
- Count the Positive, Negative, and Neutral polarities across all opinions of each text.
- Assign the sentiment label based on the dominant polarity.
  - If Positive polar expressions strictly outnumber both Negative and Neutral ones: **Positive**.
  - If Negative polar expressions strictly outnumber both Positive and Neutral ones: **Negative**.
  - Otherwise (Neutral expressions dominate, there is a tie, or the text has no polar expressions): **Neutral**.

In [7]:
import os
from datasets import load_dataset

# Only the "train" split is loaded here; the validation and test subsets are
# carved out of it further down in this cell.
ds_cassa = load_dataset(
    "projecte-aina/CaSSA-catalan-structured-sentiment-analysis",
    split="train",
)

def relabel_from_opinions(item):
    """Derive one sentiment label for a text from its annotated opinions.

    The "Polarity" value of every opinion is tallied as positive, negative or
    neutral (case-insensitive, surrounding whitespace ignored). Opinions whose
    polarity is not a string, or is any other value, are not counted.

    Args:
        item: Mapping with a "text" entry and, optionally, an "opinions" entry
            holding a list of opinion mappings. A missing or ``None``
            "opinions" entry is treated as an empty list, and ``None`` / empty
            opinion entries are skipped.

    Returns:
        dict: ``{"text": item["text"], "label": label}``. ``label`` is
        "positive" or "negative" when that polarity strictly outnumbers both
        of the others, and "neutral" in every other case (neutral plurality,
        a tie, or no counted polar expressions).
    """
    counts = {"positive": 0, "negative": 0, "neutral": 0}

    # `or []` also covers an explicit None value, which `.get(key, [])` would
    # hand back unchanged and which cannot be iterated.
    for opinion in item.get("opinions") or []:
        if not opinion:
            continue

        polarity = opinion.get("Polarity")
        if isinstance(polarity, str):
            polarity = polarity.strip().lower()
            if polarity in counts:
                counts[polarity] += 1

    pos = counts["positive"]
    neg = counts["negative"]
    neu = counts["neutral"]

    # A polarity wins only with a strict plurality; everything else is neutral.
    if pos > neg and pos > neu:
        label = "positive"
    elif neg > pos and neg > neu:
        label = "negative"
    else:
        label = "neutral"

    return {"text": item["text"], "label": label}

# Attach a single sentiment label to every CaSSA text.
ds_cassa_labeled = ds_cassa.map(relabel_from_opinions)

# Hold out 20% of the data, then halve that hold-out: the result is an
# 80/10/10 train/validation/test partition with a fixed seed.
ds_split = ds_cassa_labeled.train_test_split(test_size=0.2, seed=42)
ds_val_test = ds_split["test"].train_test_split(test_size=0.5, seed=42)

# Keep only the text and its label in every partition.
keep = ["text", "label"]
ds_cassa_clean = {}
for split, subset in (
    ("train", ds_split["train"]),
    ("validation", ds_val_test["train"]),
    ("test", ds_val_test["test"]),
):
    drop = [col for col in subset.column_names if col not in keep]
    ds_cassa_clean[split] = subset.remove_columns(drop)

# Write each CaSSA partition to <split>/cassa.csv, creating the directory
# when it does not exist yet.
for split_name in ("train", "validation", "test"):
    os.makedirs(split_name, exist_ok=True)
    ds_cassa_clean[split_name].to_csv(os.path.join(split_name, "cassa.csv"), index=False)

# Show the first example of every partition as a quick sanity check.
for heading, split_name in (("Train", "train"), ("Validation", "validation"), ("Test", "test")):
    print(f"{heading}:", ds_cassa_clean[split_name][0])

Creating CSV from Arrow format: 100%|██████████| 6/6 [00:00<00:00, 164.66ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 196.22ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 199.90ba/s]

Train: {'text': "IE TIO!! MOLT BONA INICIATIVA!! A MI DE VEGADES TAMBÉ EM VENEN GANES DE CANVIAR-LOS, PERÒ, COM LA MAJORIA,MAI M'HE DECIDIT A FER-HO. POT SE ARA JA M'HO PLANTEGE MILLOR....\n", 'label': 'positive'}
Validation: {'text': 'Dels pocs llocs del Ripollès on trobaràs bin peix. Aquí el trobaràs bò i barat.', 'label': 'positive'}
Test: {'text': "Que n'havia fet jo de cassera de dracs amb pals ben llargs per les parets del poble de El Milà per les nits d'estiu de fa anys... Hòstia, que bo, he trobat una fotografia realment bona. Una vista del poble de El Milà on es veu la casa que tenia ara fa uns anys la meva família, amb el tros i tot!!!! Que abandonat que el tenen el tros?!?! Hi ha algú per ací de El Milà o voltants?\n", 'label': 'neutral'}



