In [None]:
%pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: wget
  Building wheel for wget (pyproject.toml): started
  Building wheel for wget (pyproject.toml): finished with status 'done'
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9712 sha256=1a6bbaafeb6d2b62492e27e863f6847b37d2d617c1c5fdf932c88274e91fd4a8
  Stored in directory: c:\users\xiang\appdata\local\pip\cache\wheels\8a\b8\04\0c88fb22489b0c049bee4e977c5689c7fe597d6c4b0e7d0b6a
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Note: you may need to restart the kernel to use updated packages.


Download the GoEmotions Dataset

In [2]:
import wget
import os

# Directory that receives the raw GoEmotions CSV shards (no trailing slash,
# so joined paths do not end up with a doubled "//").
DATA_DIR = "data/full_dataset"

# Create the directory if it doesn't exist
os.makedirs(DATA_DIR, exist_ok=True)

# The three shards of the full GoEmotions dataset
urls = [
    "https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv",
    "https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv",
    "https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_3.csv"
]

for url in urls:
    # Download to an explicit file path rather than a directory: that keeps the
    # destination name under our control and lets a re-run of this cell skip
    # shards that are already on disk instead of fetching them again.
    filename = f"{DATA_DIR}/{os.path.basename(url)}"
    if os.path.exists(filename):
        print(f"Already present, skipping: {filename}")
        continue
    wget.download(url, out=filename)
    print(f"\nDownloaded: {filename}")


Downloaded: data/full_dataset//goemotions_1.csv

Downloaded: data/full_dataset//goemotions_2.csv

Downloaded: data/full_dataset//goemotions_3.csv


Convert 27 Emotions into Ekman Emotions

In [3]:
import pandas as pd

# Lookup table: fine-grained GoEmotions label -> coarse Ekman category.
# It is written grouped by Ekman category and flattened into
# {goemotions_label: ekman_label}; every category also maps to itself.
EKMAN_MAPPING = {
    fine_label: ekman_label
    for ekman_label, fine_labels in {
        'anger': ('anger', 'annoyance', 'disapproval'),
        'disgust': ('disgust',),
        'fear': ('fear', 'nervousness'),
        'joy': (
            'joy', 'amusement', 'approval', 'excitement', 'gratitude', 'love',
            'optimism', 'relief', 'pride', 'admiration', 'desire', 'caring',
        ),
        'sadness': ('sadness', 'disappointment', 'embarrassment', 'grief', 'remorse'),
        'surprise': ('surprise', 'realization', 'confusion', 'curiosity'),
        'neutral': ('neutral',),
    }.items()
    for fine_label in fine_labels
}

# Read the three downloaded shards, then stack them into a single frame
# with a fresh row index.
df1, df2, df3 = (
    pd.read_csv(shard_path)
    for shard_path in (
        "data/full_dataset/goemotions_1.csv",
        "data/full_dataset/goemotions_2.csv",
        "data/full_dataset/goemotions_3.csv",
    )
)
df = pd.concat([df1, df2, df3], ignore_index=True)

# Label columns are the columns whose name is a key of EKMAN_MAPPING; other
# columns (such as 'text' and 'id') are not keys and are therefore skipped.
emotion_cols = [col for col in df.columns if col in EKMAN_MAPPING]

# Fail loudly if a mapped label is absent from the data: otherwise it would
# silently be left out of the aggregation below (or, if a whole category were
# missing, produce an all-NaN column).
missing_labels = sorted(set(EKMAN_MAPPING) - set(emotion_cols))
if missing_labels:
    raise ValueError(f"GoEmotions label columns missing from the data: {missing_labels}")

# Create Ekman emotion columns
ekman_emotions = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral']
for ekman in ekman_emotions:
    # All GoEmotions labels that map to this Ekman emotion
    goemotions = [col for col in emotion_cols if EKMAN_MAPPING[col] == ekman]
    # Row-wise max, not a sum: the Ekman flag is 1 if any of its member labels is 1
    df[f'ekman_{ekman}'] = df[goemotions].max(axis=1)

# Print the first ten rows: the comment text next to its Ekman columns
preview_columns = ['text']
preview_columns.extend(f'ekman_{e}' for e in ekman_emotions)
print(df.loc[:, preview_columns].head(10))

                                                text  ekman_anger  \
0                                    That game hurt.            0   
1   >sexuality shouldn’t be a grouping category I...            0   
2     You do right, if you don't care then fuck 'em!            0   
3                                 Man I love reddit.            0   
4  [NAME] was nowhere near them, he was by the Fa...            0   
5  Right? Considering it’s such an important docu...            0   
6  He isn't as big, but he's still quite popular....            1   
7  That's crazy; I went to a super [RELIGION] hig...            0   
8                                that's adorable asf            0   
9  "Sponge Blurb Pubs Quaw Haha GURR ha AAa!" fin...            0   

   ekman_disgust  ekman_fear  ekman_joy  ekman_sadness  ekman_surprise  \
0              0           0          0              1               0   
1              0           0          0              0               0   
2              0  

Save the cleaned and combined dataset

In [4]:
# Keep the comment text, its id and the Ekman columns; the Ekman columns lose
# their 'ekman_' prefix so the saved header is just the emotion name.
prefixed_to_plain = {f'ekman_{name}': name for name in ekman_emotions}
ekman_df = df[['text', 'id', *prefixed_to_plain]].rename(columns=prefixed_to_plain)

# Write the result without the row index and report how many rows were saved
ekman_df.to_csv("data/goemotions_ekman.csv", index=False)
print(f"Saved {len(ekman_df)} rows to data/goemotions_ekman.csv")

Saved 211225 rows to data/goemotions_ekman.csv
