🚀 Step 1: Preprocess and Save the Data (Run Only Once)

In [1]:
from datasets import load_dataset
from transformers import BertTokenizer
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ✅ **1. Load Dataset**
# Fetch only the "train" split of the "go_emotions" dataset; every later
# cell in this notebook transforms this `dataset` object.
print("📥 Loading dataset...")
dataset = load_dataset(path="go_emotions", split="train")

📥 Loading dataset...


In [3]:
# ✅ **1.1 Load Label Names**
# The "labels" column is a sequence feature whose inner feature carries the
# class names; label_names[i] is the name for integer label i.
# NOTE: run this before the label-mapping cell, which overwrites the
# "labels" column of `dataset`.
_labels_feature = dataset.info.features["labels"]
label_names = _labels_feature.feature.names
print("📝 Label Names Loaded:", label_names)


📝 Label Names Loaded: ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


In [4]:
# ✅ **2. Define Emotion Mapping**
# Fine-grained label name -> coarse emotion name (six Ekman-style
# categories plus "neutral"). Entries are listed in the same order as the
# dataset's label names.
# NOTE(review): "disapproval" -> "disgust" and "embarrassment" -> "fear"
# appear to differ from the Ekman grouping published with GoEmotions
# (anger and sadness respectively) — confirm this is intentional.
emotion_mapping = dict(
    admiration="joy",
    amusement="joy",
    anger="anger",
    annoyance="anger",
    approval="joy",
    caring="joy",
    confusion="surprise",
    curiosity="surprise",
    desire="joy",
    disappointment="sadness",
    disapproval="disgust",
    disgust="disgust",
    embarrassment="fear",
    excitement="joy",
    fear="fear",
    gratitude="joy",
    grief="sadness",
    joy="joy",
    love="joy",
    nervousness="fear",
    optimism="joy",
    pride="joy",
    realization="surprise",
    relief="joy",
    remorse="sadness",
    sadness="sadness",
    surprise="surprise",
    neutral="neutral",
)

# Coarse emotion name -> integer class id stored in the processed dataset.
# Ids are alphabetical over the six emotions (anger=0 ... surprise=5), with
# "neutral" appended as 6.
emotion_to_id = dict(
    anger=0,
    joy=3,
    surprise=5,
    sadness=4,
    disgust=1,
    fear=2,
    neutral=6,
)

In [5]:
# ✅ **3. Map Labels**
def map_labels(example):
    """Collapse an example's fine-grained label ids into one coarse class id.

    Every id in ``example['labels']`` is translated id -> name (via
    ``label_names``) -> coarse emotion (via ``emotion_mapping``) -> class id
    (via ``emotion_to_id``). Only the first translated id is kept; an example
    with no labels falls back to the "neutral" id.

    Args:
        example: A dataset row whose ``'labels'`` entry is an iterable of
            indices into ``label_names``.

    Returns:
        The same ``example`` object, with ``'labels'`` replaced by a single
        integer class id.

    Raises:
        KeyError: If a label name is missing from ``emotion_mapping`` or its
            target is missing from ``emotion_to_id``.
    """
    coarse_ids = []
    for fine_id in example['labels']:
        fine_name = label_names[fine_id]
        coarse_name = emotion_mapping[fine_name]
        coarse_ids.append(emotion_to_id[coarse_name])

    # Multi-label rows are reduced to their first label.
    if coarse_ids:
        example['labels'] = coarse_ids[0]
    else:
        example['labels'] = emotion_to_id["neutral"]
    return example

# Apply map_labels to each example; `dataset` is rebound to the mapped
# result, so "labels" holds a single integer id from here on.
print("🔄 Mapping labels...")
dataset = dataset.map(function=map_labels)

🔄 Mapping labels...


In [6]:
# ✅ **4. Tokenize and Format Dataset**
print("📝 Tokenizing and formatting dataset...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Fixed token length for every stored example. With `padding=True` inside a
# batched `map`, each map batch is padded only to its own longest sequence,
# so rows saved to disk end up with different lengths and cannot be stacked
# into a tensor batch once shuffled. Padding/truncating to one fixed length
# keeps every row the same shape.
# NOTE(review): 128 is assumed to be comfortably above the longest comment
# in this dataset — lower it to save space or raise it if truncation occurs.
MAX_SEQ_LENGTH = 128

def tokenize(batch):
    """Tokenize a batch of examples to a uniform sequence length.

    Args:
        batch: A batched mapping with a ``'text'`` entry holding the list of
            raw strings to tokenize.

    Returns:
        The tokenizer output for ``batch['text']``, with every sequence
        truncated or padded to exactly ``MAX_SEQ_LENGTH`` tokens.
    """
    return tokenizer(
        batch['text'],
        truncation=True,
        padding="max_length",
        max_length=MAX_SEQ_LENGTH,
    )

dataset = dataset.map(tokenize, batched=True)
# Expose only the model inputs and the target as torch tensors.
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


📝 Tokenizing and formatting dataset...




In [7]:
# ✅ **5. Save the Processed Dataset**
# Output directory for the processed dataset, relative to the notebook's
# working directory.
PROCESSED_DATA_DIR = "./processed_data"

# exist_ok=True makes directory creation a single race-free call, so
# re-running this cell is safe without a separate existence check.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

dataset.save_to_disk(PROCESSED_DATA_DIR)
print(f"\n✅ Dataset processed and saved to '{PROCESSED_DATA_DIR}'")

Saving the dataset (0/1 shards):   0%|          | 0/43410 [00:00<?, ? examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 43410/43410 [00:00<00:00, 270050.01 examples/s]


✅ Dataset processed and saved to './processed_data'



