In [8]:
from pathlib import Path

from datasets import load_dataset

In [9]:
# Load GoEmotions and read the label names off the train split's "labels"
# feature, so integer label ids can be translated by indexing emotion_list.
dataset = load_dataset("go_emotions")
train_features = dataset["train"].features
emotion_list = train_features["labels"].feature.names

In [10]:
# Inspect the first training example: its text, its raw label ids, and the
# label names those ids resolve to.
sample = dataset["train"][0]
sample_label_ids = sample["labels"]
sample_emotions = [emotion_list[label_id] for label_id in sample_label_ids]
print("Text:", sample["text"])
print("Raw Labels:", sample_label_ids)
print("Emotions:", sample_emotions)

Text: My favourite food is anything I didn't have to cook myself.
Raw Labels: [27]
Emotions: ['neutral']


In [11]:
# Collapse fine-grained GoEmotions label names into a small custom taxonomy.
# The table is written grouped by target category and flattened into a
# source-label -> custom-category lookup; labels not listed here have no
# custom category.
# NOTE(review): "anticipation" does not look like a GoEmotions label name, so
# that entry may never match -- confirm against emotion_list.
GOEMOTION_TO_CUSTOM = {
    source_label: custom_label
    for custom_label, source_labels in (
        ("Hopeful", ("optimism", "anticipation", "joy")),
        ("Angry", ("anger", "annoyance")),
        ("Fearful", ("fear",)),
        ("Frustrated", ("disappointment",)),
        ("Empowered", ("pride", "gratitude", "approval")),
        ("Neutral", ("neutral",)),
    )
    for source_label in source_labels
}

def map_emotions(label_ids, emotion_list):
    """Translate GoEmotions label ids into the simplified custom categories.

    Args:
        label_ids: Iterable of integer indices into ``emotion_list``.
        emotion_list: Sequence of label names, indexed by label id.

    Returns:
        A sorted list of the distinct custom categories obtained by looking
        each label name up in ``GOEMOTION_TO_CUSTOM``. Label names without an
        entry in that mapping are skipped, so the result is an empty list
        when none of the ids map to a custom category.
    """
    mapped = {
        GOEMOTION_TO_CUSTOM[name]
        for name in (emotion_list[idx] for idx in label_ids)
        if name in GOEMOTION_TO_CUSTOM
    }
    # A set of strings has no stable iteration order across interpreter runs
    # (string hashing is randomized), so sort to keep multi-label results --
    # and anything written to disk from them -- reproducible.
    return sorted(mapped)

In [12]:
# Smoke-test the mapping on the sample example and show the resulting
# custom categories.
mapped = map_emotions(label_ids=sample["labels"], emotion_list=emotion_list)
print("Mapped Emotion(s):", mapped)

Mapped Emotion(s): ['Neutral']


In [13]:
import pandas as pd

# Create a small subset (e.g., 500 examples) for local dev/testing
def extract_subset(dataset, emotion_list, limit=500, split="train"):
    """Collect examples that map to at least one custom emotion.

    Iterates ``dataset[split]`` in order, maps each example's ``"labels"``
    with ``map_emotions`` and keeps the example only when the mapping is
    non-empty. Stops as soon as ``limit`` rows have been collected.

    Args:
        dataset: Mapping from split name to an iterable of examples, where
            each example provides ``"text"`` and ``"labels"``.
        emotion_list: Label names indexed by label id, forwarded to
            ``map_emotions``.
        limit: Maximum number of rows to return. ``None`` means no cap;
            zero or a negative value yields an empty frame.
        split: Name of the split to read from ``dataset``.

    Returns:
        A ``pandas.DataFrame`` with columns ``text`` and
        ``custom_emotions`` (always present, even when no row qualifies).
    """
    columns = ["text", "custom_emotions"]
    rows = []

    # Guard up front: checking the cap only after appending would let one
    # row slip through when limit is 0.
    if limit is not None and limit <= 0:
        return pd.DataFrame(rows, columns=columns)

    for example in dataset[split]:
        custom_emos = map_emotions(example["labels"], emotion_list)
        if not custom_emos:
            # None of this example's labels has a custom category.
            continue
        rows.append({
            "text": example["text"],
            "custom_emotions": custom_emos
        })
        if limit is not None and len(rows) >= limit:
            break

    # Passing columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)

# Build the development subset and write it out as CSV.
df = extract_subset(dataset, emotion_list)

output_path = Path("../data/processed/goemotions_mapped_subset.csv")
# to_csv does not create missing directories, so make sure the target folder
# exists before writing (e.g. on a fresh checkout).
output_path.parent.mkdir(parents=True, exist_ok=True)
# NOTE: custom_emotions holds Python lists; in the CSV each one is stored as
# its string form, so readers need to parse that column back into a list.
df.to_csv(output_path, index=False)