In [1]:
import torch

if torch.cuda.is_available():
    print("GPU is available")
else:
    print("GPU is not available")

GPU is available


# Classificador de Sentiments a Xarxes Socials en Català (CSXSC): Dataset

**Author:** Daniel Arias Cámara  
**Date:** 25-07-2025  

**Description:**  This notebook aims to build a high-quality dataset for fine-tuning the **CSXSC** model. The dataset is constructed by combining trusted data sources, including structured sentiment corpora and translated social media content. Details on data origin and preprocessing steps are provided in the sections below.


## 1. GuiaCat Dataset

**Description:** This dataset consists of 5,750 restaurant reviews in Catalan, sourced from the GuiaCat platform. Each review includes individual ratings for service, food, price-quality ratio, and atmosphere, along with an overall average score.

**Access:** [projecte-aina/GuiaCat on Hugging Face](https://huggingface.co/datasets/projecte-aina/GuiaCat)

**Source:** Aina Project

**Notes:**  
The dataset is divided into three subsets:  
- **Train:** 1,750 rows  
- **Validation:** 500 rows  
- **Test:** 500 rows  

The original fields are: Service, Food, Price-quality, Environment, Avg, Text, and Label.  
For our purposes, we retain only the Text and Label fields, discarding the rest.

The Label field includes five sentiment categories:  
- Molt bo (Very good)  
- Bo (Good)  
- Regular (Average)  
- Dolent (Bad)  
- Molt dolent (Very bad)

These are grouped into three classes for sentiment classification:  
- **Positive:** Molt bo and Bo  
- **Neutral:** Regular  
- **Negative:** Dolent and Molt dolent


In [2]:
import os

try:
    import datasets
except ImportError:
    import subprocess
    subprocess.check_call(["pip", "install", "-q", "datasets"])

from datasets import load_dataset

ds_guiacat = load_dataset("projecte-aina/GuiaCat")
ds = {}

for split in ds_guiacat:
    drop_columns = [col for col in ds_guiacat[split].column_names if col not in ["text", "label"]]
    ds[split] = ds_guiacat[split].remove_columns(drop_columns)

    def relabel(opinion):
        label = opinion["label"].lower()
        if label in ["molt bo", "bo"]:
            opinion["label"] = "positive"
        elif label == "regular":
            opinion["label"] = "neutral"
        elif label in ["dolent", "molt dolent"]:
            opinion["label"] = "negative"
        return opinion

    ds[split] = ds[split].map(relabel)

output_dirs = {
    "train": "train",
    "validation": "validate",
    "test": "test"
}

for split in ['train', 'validation', 'test']:
    os.makedirs(split, exist_ok=True)
    output_path = os.path.join(split, "guiacat.csv")
    ds[split].to_csv(output_path, index=False)

print("Train:", ds["train"][0])
print("Validation:", ds["validation"][0])
print("Test:", ds["test"][0])

  from .autonotebook import tqdm as notebook_tqdm
Creating CSV from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 286.28ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 529.52ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 532.14ba/s]

Train: {'text': 'El lloc és acollidor. El tracte, familiar. Els plats són casolans, abundants i de qualitat! Productes de la terra com embutits i altres de la zona. Hi tornarem segur!', 'label': 'positive'}
Validation: {'text': 'Bon Menjar i bon tracte en un restaurant que segú hi tornaràs un altre vegada.', 'label': 'positive'}
Test: {'text': "Fantàstic restaurant ,una carta plena de plats creatius i cuina de temporada que sempre es d'agrair , i el que varem menjar nosaltres molt bé, estic segura que tornaré no ho dubtaré.El tracte bó i l'espai molt acollidor.", 'label': 'positive'}





## 2. Catalan Structured Sentiment Analysis (CaSSA) Dataset

**Description:** The CaSSA dataset contains 6,400 reviews and forum messages in Catalan, annotated at the fine-grained level with polar expressions. Each text instance is labeled with all the sentiment expressions it contains. For each polar expression, the annotation includes the **expression itself**, the **target** (i.e., the object of the sentiment), and the **source** (i.e., the subject expressing the sentiment). In total, 25,453 polar expressions have been annotated.

**Access:** [projecte-aina/CaSSA on Hugging Face](https://huggingface.co/datasets/projecte-aina/CaSSA-catalan-structured-sentiment-analysis)

**Source:** Aina Project

**Notes:**

Each instance in the dataset is a text. For each text, there can be 0 to unlimited polar expressions, which are contained in the "opinions" field. Each opinion contains a source, a target, a polar expression, a polarity value and an intensity value.

To convert this structured information into a single sentiment label per text, we apply the following strategy:
- Count all Positive, Negative, and Neutral polarities per opinion.
- Assign the sentiment label based on the dominant polarity.
  - If Positive polar expressions are the majority: **Positive**.
  - If Negative polar expressions dominate: **Negative**.
  - In case of a tie or no polar expressions: **Neutral**.

In [3]:
import os
from datasets import load_dataset

ds_cassa = load_dataset("projecte-aina/CaSSA-catalan-structured-sentiment-analysis")["train"]

def relabel_from_opinions(item):
    pos = neg = neu = 0

    for opinion in item.get("opinions", []):
        polarity = opinion.get("Polarity")

        if isinstance(polarity, str):
            polarity = polarity.strip().lower()

            if polarity == "positive":
                pos += 1
            elif polarity == "negative":
                neg += 1
            elif polarity == "neutral":
                neu += 1

    if pos > neg and pos > neu:
        label = "positive"
    elif neg > pos and neg > neu:
        label = "negative"
    else:
        label = "neutral"

    return {"text": item["text"], "label": label}

ds_cassa_labeled = ds_cassa.map(relabel_from_opinions)

ds_split = ds_cassa_labeled.train_test_split(test_size=0.2, seed=42)
ds_val_test = ds_split["test"].train_test_split(test_size=0.5, seed=42)

ds_cassa_clean = {
    "train": ds_split["train"],
    "validation": ds_val_test["train"],
    "test": ds_val_test["test"]
}

for split in ds_cassa_clean:
    keep = ["text", "label"]
    drop = [col for col in ds_cassa_clean[split].column_names if col not in keep]
    ds_cassa_clean[split] = ds_cassa_clean[split].remove_columns(drop)

for split in ['train', 'validation', 'test']:
    os.makedirs(split, exist_ok=True)
    output_path = os.path.join(split, "cassa.csv")
    ds_cassa_clean[split].to_csv(output_path, index=False)

print("Train:", ds_cassa_clean["train"][0])
print("Validation:", ds_cassa_clean["validation"][0])
print("Test:", ds_cassa_clean["test"][0])

Creating CSV from Arrow format: 100%|██████████| 6/6 [00:00<00:00, 218.60ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 271.23ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 282.43ba/s]

Train: {'text': "IE TIO!! MOLT BONA INICIATIVA!! A MI DE VEGADES TAMBÉ EM VENEN GANES DE CANVIAR-LOS, PERÒ, COM LA MAJORIA,MAI M'HE DECIDIT A FER-HO. POT SE ARA JA M'HO PLANTEGE MILLOR....\n", 'label': 'positive'}
Validation: {'text': 'Dels pocs llocs del Ripollès on trobaràs bin peix. Aquí el trobaràs bò i barat.', 'label': 'positive'}
Test: {'text': "Que n'havia fet jo de cassera de dracs amb pals ben llargs per les parets del poble de El Milà per les nits d'estiu de fa anys... Hòstia, que bo, he trobat una fotografia realment bona. Una vista del poble de El Milà on es veu la casa que tenia ara fa uns anys la meva família, amb el tros i tot!!!! Que abandonat que el tenen el tros?!?! Hi ha algú per ací de El Milà o voltants?\n", 'label': 'neutral'}





## 3. GoEmotions Dataset

**Description:** The GoEmotions dataset is a large-scale human-annotated corpus of 58k English Reddit comments labeled for 27 emotion categories plus neutral. It was developed by Google AI to support fine-grained sentiment and emotion classification in user-generated content. Each comment may be associated with one or multiple emotion labels, making it suitable for multilabel classification tasks.

**Access:** [https://www.kaggle.com/datasets/debarshichanda/goemotions](https://www.kaggle.com/datasets/debarshichanda/goemotions)

**Source:**  Google AI

**Notes:**  
- The GoEmotions dataset includes annotations for 27 fine-grained emotion categories plus a neutral class. Since each Reddit comment can be associated with **multiple emotions**, we adopt a two-step strategy to simplify the dataset into three general sentiment categories: **Positive**, **Negative**, and **Neutral**.

- In the **first step**, each emotion label is mapped to one of three broader sentiment groups:

  - **Positive**: amusement, excitement, joy, love, desire, optimism, caring, pride, admiration, gratitude, relief, approval.
  
  - **Negative**: fear, nervousness, remorse, embarrassment, disappointment, sadness, grief, disgust, anger, annoyance, disapproval.
  
  - **Ambiguous**: realization, surprise, curiosity, confusion.

- In the **second step**, we count how many mapped emotions of each type are assigned to a given comment, and apply the following decision rule:

  - If **Positive** emotions are the majority: classify as **Positive**.  
  - If **Negative** emotions are the majority: classify as **Negative**.  
  - If there is a tie or no mapped emotions: classify as **Neutral**.

This strategy allows us to transform a complex multilabel emotion task into a simpler, interpretable sentiment classification problem.


In [None]:
import subprocess
import sys

for package in ["ctranslate2", "sentencepiece", "kagglehub[hf-datasets]", "datasets", "tqdm", "huggingface_hub"]:
    pkg_name = package.split("[")[0] if "[" in package else package
    try:
        __import__(pkg_name)
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])

import os
from tqdm import tqdm
from datasets import Dataset
from huggingface_hub import snapshot_download
import kagglehub
from kagglehub import KaggleDatasetAdapter
import ctranslate2
import sentencepiece as spm

file_path = "data/train.tsv"
hf_ds = kagglehub.load_dataset(
    KaggleDatasetAdapter.HUGGING_FACE,
    "debarshichanda/goemotions",
    file_path,
    pandas_kwargs={
        "sep": "\t",
        "names": ["text_en", "labels", "id"],
        "header": 0
    }
)
hf_ds = hf_ds.remove_columns("id")
hf_ds = hf_ds.select(range(10000))

model_dir = snapshot_download(repo_id="projecte-aina/aina-translator-en-ca", revision="main")
sp_model_path = os.path.join(model_dir, "spm.model")

sp = spm.SentencePieceProcessor()
sp.load(sp_model_path)
translator = ctranslate2.Translator(model_dir)

def translate_en_to_ca(text):
    try:
        tokens = sp.encode(text, out_type=str)
        translation = translator.translate_batch([tokens])
        return sp.decode(translation[0][0]["tokens"])
    except Exception as e:
        print(f"Translation error: {text[:50]}... → {e}")
        return ""

tqdm.pandas()
df = hf_ds.to_pandas()
df["text"] = df["text_en"].progress_apply(translate_en_to_ca)


hf_ds = Dataset.from_pandas(df)

def parse_labels(labels_str):

    if isinstance(labels_str, list):
        return labels_str
    if isinstance(labels_str, int):
        return [labels_str]
    
    try:
        if isinstance(labels_str, str) and labels_str:
            return [int(i) for i in labels_str.split(',')]
        return []
    except (ValueError, TypeError):
        return []

emotion_id2label = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring", "confusion",
    "curiosity", "desire", "disappointment", "disapproval", "disgust", "embarrassment",
    "excitement", "fear", "gratitude", "grief", "joy", "love", "nervousness", "optimism",
    "pride", "realization", "relief", "remorse", "sadness", "surprise", "neutral"
]

sentiment_map = {
    "positive": {"amusement", "excitement", "joy", "love", "desire", "optimism", "caring",
                 "pride", "admiration", "gratitude", "relief", "approval"},
    "negative": {"fear", "nervousness", "remorse", "embarrassment", "disappointment",
                 "sadness", "grief", "disgust", "anger", "annoyance", "disapproval"},
    "ambiguous": {"realization", "surprise", "curiosity", "confusion"}
}

def classify_sentiment(emotion_ids):
    """Clasifica el sentimiento basado en una lista de IDs de emoción."""
    counts = {"positive": 0, "negative": 0, "ambiguous": 0}

    for eid in emotion_ids:
        if isinstance(eid, int) and eid < len(emotion_id2label):
            emotion = emotion_id2label[eid]
            for category in sentiment_map:
                if emotion in sentiment_map[category]:
                    counts[category] += 1
                    break
    if counts["positive"] > counts["negative"] and counts["positive"] > counts["ambiguous"]:
        return "positive"
    elif counts["negative"] > counts["positive"] and counts["negative"] > counts["ambiguous"]:
        return "negative"
    else:
        return "neutral"

def process_and_classify(example):
    parsed_labels = parse_labels(example["labels"])
    return {
        "labels": parsed_labels,
        "label": classify_sentiment(parsed_labels)
    }

hf_ds = hf_ds.map(process_and_classify)

print("\nExample samples:\n")
for i in range(50):
    row = hf_ds[i]
    emotions = [emotion_id2label[e] for e in row["labels"]]
    print(f"[{i+1}] EN: {row['text_en']}")
    print(f"    CA: {row['text']}")
    print(f"    Emotions: {emotions}")
    print(f"    Sentiment: {row['label']}")
    print("-" * 10)

hf_ds = hf_ds.remove_columns(["labels", "text_en"])
ds_split = hf_ds.train_test_split(test_size=0.2, seed=42)
ds_val_test = ds_split["test"].train_test_split(test_size=0.5, seed=42)

ds_goemotions_clean = {
    "train": ds_split["train"],
    "validation": ds_val_test["train"],
    "test": ds_val_test["test"]
}

for split in ds_goemotions_clean:
    keep = ["text", "label"]
    drop = [col for col in ds_goemotions_clean[split].column_names if col not in keep]
    ds_goemotions_clean[split] = ds_goemotions_clean[split].remove_columns(drop)

for split in ["train", "validation", "test"]:
    os.makedirs(split, exist_ok=True)
    output_path = os.path.join(split, "goemotions.csv")
    ds_goemotions_clean[split].to_csv(output_path, index=False)

print("\nScript finished and files saved successfully.")

  hf_ds = kagglehub.load_dataset(
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 93206.76it/s]
  return sp.decode(translation[0][0]["tokens"])
100%|██████████| 100/100 [00:59<00:00,  1.69it/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 19798.46 examples/s]



Example samples:

[1] EN: Now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead
    CA: Ara bé, si es treu a si mateix, tothom pensarà que riu fotent-se de la gent en lloc de realment mort
    Emotions: ['neutral']
    Sentiment: neutral
----------
[2] EN: WHY THE FUCK IS BAYLESS ISOING
    CA: PER QUÈ LA MERDA ÉS AIXÒ SENSE BAYLESS
    Emotions: ['anger']
    Sentiment: negative
----------
[3] EN: To make her feel threatened
    CA: Perquè se senti amenaçada
    Emotions: ['fear']
    Sentiment: negative
----------
[4] EN: Dirty Southern Wankers
    CA: brutes del sud Wankers
    Emotions: ['annoyance']
    Sentiment: negative
----------
[5] EN: OmG pEyToN iSn'T gOoD eNoUgH tO hElP uS iN tHe PlAyOfFs! Dumbass Broncos fans circa December 2015.
    CA: OmG pEyToN iSn'T gOoD eNoUgH tO hElP uS iN tHe PlAyOfFs! Tontos fans dels Broncs al voltant de desembre de 2015.
    Emotions: ['surprise']
    Sentiment: neutral
----------
[6] 

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 477.44ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 981.12ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1591.77ba/s]


Script finished and files saved successfully.





## 4. Create the final dataset

In [7]:
import os
import pandas as pd

splits = ["train", "validation", "test"]
expected_columns = ["text", "label"]
row_counts = {}

for split in splits:
    folder_path = os.path.abspath(split)
    if not os.path.isdir(folder_path):
        print(f"Folder not found: {folder_path}")
        continue

    all_csvs = [os.path.join(folder_path, f) for f in os.listdir(folder_path)
                if f.endswith(".csv") and f != f"{split}.csv"]  # Skip previous output if re-run

    if not all_csvs:
        print(f"No CSV files found in: {folder_path}")
        continue

    print(f"\nProcessing '{split}' folder with {len(all_csvs)} CSV files...")

    dfs = []
    for csv_file in all_csvs:
        try:
            df = pd.read_csv(csv_file)
            if set(df.columns) != set(expected_columns):
                print(f"Skipping {csv_file} (columns mismatch: found {list(df.columns)})")
                continue
            dfs.append(df)
        except Exception as e:
            print(f"Failed to read {csv_file}: {e}")

    if dfs:
        combined = pd.concat(dfs, ignore_index=True)
        row_counts[split] = len(combined)

        output_path = os.path.join(folder_path, f"{split}.csv")
        combined.to_csv(output_path, index=False)
        print(f"Saved combined CSV to: {output_path}")
    else:
        print(f"No valid CSV files to combine in: {folder_path}")
        row_counts[split] = 0


print("\nFinal dataset row counts:")
for split in splits:
    print(f"- {split}: {row_counts.get(split, 0)} rows")



Processing 'train' folder with 3 CSV files...
Saved combined CSV to: /home/user/Escritorio/TFM/train/train.csv

Processing 'validation' folder with 3 CSV files...
Saved combined CSV to: /home/user/Escritorio/TFM/validation/validation.csv

Processing 'test' folder with 3 CSV files...
Saved combined CSV to: /home/user/Escritorio/TFM/test/test.csv

Final dataset row counts:
- train: 9950 rows
- validation: 1150 rows
- test: 1150 rows
