In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DefaultDataCollator
)
from datasets import Dataset, DatasetDict, Value, Sequence

# 1. Read CSV files
df1 = pd.read_csv("goemotions/data/full_dataset/goemotions_1.csv")
df2 = pd.read_csv("goemotions/data/full_dataset/goemotions_2.csv")
df3 = pd.read_csv("goemotions/data/full_dataset/goemotions_3.csv")

# Combine them if needed (this step depends on how your CSVs are structured)
df = pd.concat([df1, df2, df3], ignore_index=True)

# Inspect
print("Combined dataset size:", len(df))
print(df.head())


Combined dataset size: 211225
                                                text       id  \
0                                    That game hurt.  eew5j0j   
1   >sexuality shouldn’t be a grouping category I...  eemcysk   
2     You do right, if you don't care then fuck 'em!  ed2mah1   
3                                 Man I love reddit.  eeibobj   
4  [NAME] was nowhere near them, he was by the Fa...  eda6yn6   

                author            subreddit    link_id   parent_id  \
0                Brdd9                  nrl  t3_ajis4z  t1_eew18eq   
1          TheGreen888     unpopularopinion  t3_ai4q37   t3_ai4q37   
2             Labalool          confessions  t3_abru74  t1_ed2m7g7   
3        MrsRobertshaw             facepalm  t3_ahulml   t3_ahulml   
4  American_Fascist713  starwarsspeculation  t3_ackt2f  t1_eda65q2   

    created_utc  rater_id  example_very_unclear  admiration  ...  love  \
0  1.548381e+09         1                 False           0  ...     0   
1  1.54808

In [6]:
# Suppose your CSV has a column "labels" that is a list of emotion strings:
# e.g. row["labels"] might be ["joy", "amusement"]

label_groups = {
    "anger": "anger",
    "annoyance": "anger",
    "disgust": "anger",
    "joy": "joy",
    "amusement": "joy",
    "excitement": "joy",
    "sadness": "sadness",
    "grief": "sadness",
    "disappointment": "sadness",
    "love": "love",
    "caring": "love",
    "fear": "fear",
    "nervousness": "fear",
    "surprise": "surprise",
    # ... etc ...
    "admiration": "admiration",
    "approval": "approval",
    "confusion": "confusion",
    "curiosity": "curiosity",
    "desire": "desire",
    "disapproval": "disapproval",
    "embarrassment": "embarrassment",
    "gratitude": "gratitude",
    "optimism": "optimism",
    "pride": "pride",
    "realization": "realization",
    "relief": "relief",
    "remorse": "remorse",
    "neutral": "neutral"
}

# Build a sorted list of unique new labels:
unique_new_labels = sorted(list(set(label_groups.values())))
print("New label set:", unique_new_labels)
# e.g. ['admiration', 'anger', 'approval', 'confusion', 'curiosity', 'desire', 
#       'disapproval', 'embarrassment', 'fear', 'gratitude', 'joy', 'love', 
#       'neutral', 'optimism', 'pride', 'realization', 'relief', 'remorse', 
#       'sadness', 'surprise']

# Create a mapping from label -> index
new_label2id = {lbl: i for i, lbl in enumerate(unique_new_labels)}

def map_labels_to_new(labels_list):
    """
    labels_list is the list of original label strings for a single example.
    We'll map each to the new label, then build a multi-hot vector.
    """
    # Start all zeros
    multi_hot = [0] * len(unique_new_labels)
    for old_lbl in labels_list:
        if old_lbl in label_groups:
            new_lbl = label_groups[old_lbl]
            idx = new_label2id[new_lbl]
            multi_hot[idx] = 1
        # If there's a label not in label_groups, decide how to handle it (ignore or add it)
    return multi_hot


New label set: ['admiration', 'anger', 'approval', 'confusion', 'curiosity', 'desire', 'disapproval', 'embarrassment', 'fear', 'gratitude', 'joy', 'love', 'neutral', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise']


In [9]:
label_groups = {
    "anger": "anger",
    "disgust": "anger",   # Merge disgust into "anger"

    "joy": "joy",
    "amusement": "joy",
    "excitement": "joy",

    "sadness": "sadness",
    "grief": "sadness",
    "disappointment": "sadness",

    "love": "love",
    "caring": "love",

    "fear": "fear",
    "nervousness": "fear",

    # Keep some as-is:
    "admiration": "admiration",
    "approval":   "approval",
    "confusion":  "confusion",
    "curiosity":  "curiosity",
    "desire":     "desire",
    "disapproval":"disapproval",
    "embarrassment":"embarrassment",
    "gratitude":  "gratitude",
    "optimism":   "optimism",
    "pride":      "pride",
    "realization":"realization",
    "relief":     "relief",
    "remorse":    "remorse",
    "sadness":    "sadness",
    "surprise":   "surprise",
    "neutral":    "neutral"
}


In [14]:
unique_new_labels = sorted(set(label_groups.values()))
print("New label set:", unique_new_labels)
new_label2id = {lbl: i for i, lbl in enumerate(unique_new_labels)}


New label set: ['admiration', 'anger', 'approval', 'confusion', 'curiosity', 'desire', 'disapproval', 'embarrassment', 'fear', 'gratitude', 'joy', 'love', 'neutral', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise']


In [13]:
import numpy as np

def row_to_new_multi_hot(row):
    multi_hot = [0]*len(unique_new_labels)  # all zeros initially
    for emo in emotions:
        if row[emo] == 1:
            # If merging:
            merged_label = label_groups[emo]  # e.g. "anger" for "disgust"
            # If NOT merging, do: merged_label = emo
            idx = new_label2id[merged_label]
            multi_hot[idx] = 1
    return multi_hot

df["new_multi_hot"] = df.apply(row_to_new_multi_hot, axis=1)


NameError: name 'emotions' is not defined