# Label Presidential Speeches with GoEmotions
Using the pretrained `SamLowe/roberta-base-go_emotions` model to label speeches with all 28 GoEmotions labels.

In [1]:
%pip install transformers torch pandas openpyxl tqdm

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


### Load the GoEmotions Model
Using `SamLowe/roberta-base-go_emotions` - a fine-tuned RoBERTa model for multi-label emotion classification with 28 labels.

In [3]:
# Load the pre-trained GoEmotions model
MODEL_NAME = "SamLowe/roberta-base-go_emotions"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.to(device)
model.eval()

# Get the emotion labels from the model config
emotion_labels = list(model.config.id2label.values())
print(f"Model loaded with {len(emotion_labels)} emotion labels:")
print(emotion_labels)

Model loaded with 28 emotion labels:
['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


### GoEmotions Labels
The model outputs probabilities for all 28 emotion labels.

In [4]:
# Use labels from the model (already loaded above)
GOEMOTIONS = emotion_labels
print(f"GoEmotions ({len(GOEMOTIONS)} labels): {GOEMOTIONS}")

GoEmotions (28 labels): ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


### Load Presidential Speeches Dataset

In [None]:
# Load the presidential speeches dataset
df = pd.read_excel("1presidential_speeches_with_metadata.xlsx")

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
df.head()

Dataset shape: (995, 9)

Columns: ['President', 'Party', 'from', 'until', 'Vice President', 'title', 'date', 'info', 'speech']

First few rows:


Unnamed: 0,President,Party,from,until,Vice President,title,date,info,speech
0,Donald Trump,Republican,2017,2021,1.0,"January 8, 2020: Statement on Iran",2020-01-08 00:00:00,After the killing of General Qasem Soleimani o...,As long as I am President of the United States...
1,Donald Trump,Republican,2017,2021,1.0,"January 3, 2020: Remarks on the Killing of Qas...",2020-01-03 00:00:00,President Trump announces that the US military...,"Hello, everybody. Well, thank you very much. ..."
2,Donald Trump,Republican,2017,2021,1.0,"October 27, 2019: Statement on the the Death o...",2019-10-27 00:00:00,President Donald Trump announces the death of ...,"Last night, the United States brought the worl..."
3,Donald Trump,Republican,2017,2021,1.0,"September 25, 2019: Press Conference",2019-09-25 00:00:00,President Donald Trump holds a press conferenc...,PRESIDENT TRUMP: Thank you very much. Thank...
4,Donald Trump,Republican,2017,2021,1.0,"September 24, 2019: Remarks at the United Nati...",2019-09-24 00:00:00,President Donald Trump speaks to the 74th sess...,PRESIDENT TRUMP: Thank you very much. Mr. ...


### Define Prediction Function
The model outputs multi-label probabilities for all 28 emotions.

In [6]:
def predict_emotions(text, model, tokenizer, threshold=0.3):
    """
    Predict GoEmotions for a given text using the pretrained model.
    
    Args:
        text: Input text string
        model: The GoEmotions model
        tokenizer: The tokenizer
        threshold: Probability threshold for multi-label classification
    
    Returns:
        dict with emotion probabilities and predicted labels
    """
    if pd.isna(text) or not isinstance(text, str) or len(text.strip()) == 0:
        return {
            'probs': {e: 0.0 for e in GOEMOTIONS},
            'primary_emotion': 'neutral',
            'all_emotions': ['neutral']
        }
    
    # Tokenize (model max length is 512)
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.sigmoid(outputs.logits).cpu().numpy()[0]
    
    # Create probability dictionary
    emotion_probs = {emotion: float(probs[i]) for i, emotion in enumerate(GOEMOTIONS)}
    
    # Get predicted emotions above threshold
    predicted_emotions = [e for e, p in emotion_probs.items() if p >= threshold]
    
    # Get primary emotion (highest probability)
    primary_emotion = max(emotion_probs, key=emotion_probs.get)
    
    # If no emotion above threshold, use the highest one
    if not predicted_emotions:
        predicted_emotions = [primary_emotion]
    
    return {
        'probs': emotion_probs,
        'primary_emotion': primary_emotion,
        'all_emotions': predicted_emotions
    }


def predict_emotions_for_long_text(text, model, tokenizer, chunk_size=400, overlap=50, threshold=0.3):
    """
    Process long text by splitting into overlapping chunks and aggregating predictions.
    """
    if pd.isna(text) or not isinstance(text, str) or len(text.strip()) == 0:
        return {
            'probs': {e: 0.0 for e in GOEMOTIONS},
            'primary_emotion': 'neutral',
            'all_emotions': ['neutral']
        }
    
    # Tokenize the full text to get token count
    full_tokens = tokenizer.encode(text, add_special_tokens=False)
    
    # If text is short enough, process normally
    if len(full_tokens) <= chunk_size:
        return predict_emotions(text, model, tokenizer, threshold)
    
    # Split into chunks with overlap
    chunk_probs = []
    
    for i in range(0, len(full_tokens), chunk_size - overlap):
        chunk_tokens = full_tokens[i:i + chunk_size]
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
        
        result = predict_emotions(chunk_text, model, tokenizer, threshold)
        chunk_probs.append(result['probs'])
    
    # Aggregate: take the mean probability across all chunks
    aggregated_probs = {emotion: 0.0 for emotion in GOEMOTIONS}
    for emotion in GOEMOTIONS:
        aggregated_probs[emotion] = np.mean([cp[emotion] for cp in chunk_probs])
    
    # Get predicted emotions above threshold
    predicted_emotions = [e for e, p in aggregated_probs.items() if p >= threshold]
    primary_emotion = max(aggregated_probs, key=aggregated_probs.get)
    
    if not predicted_emotions:
        predicted_emotions = [primary_emotion]
    
    return {
        'probs': aggregated_probs,
        'primary_emotion': primary_emotion,
        'all_emotions': predicted_emotions,
        'num_chunks': len(chunk_probs)
    }


# Test the function
test_texts = [
    "I am so happy and grateful for this wonderful day!",
    "This makes me furious! How dare they do this!",
    "I'm really scared about what might happen next.",
    "Thank you so much for your help!",
    "I love you with all my heart.",
    "Wow, I didn't expect that at all!",
]

print("Testing emotion prediction:")
print("="*60)
for text in test_texts:
    result = predict_emotions(text, model, tokenizer)
    print(f"\nText: {text[:50]}...")
    print(f"Primary Emotion: {result['primary_emotion']}")
    print(f"All Emotions: {result['all_emotions']}")
    top_probs = sorted(result['probs'].items(), key=lambda x: x[1], reverse=True)[:5]
    print(f"Top 5 Probabilities: {', '.join([f'{k}:{v:.2f}' for k, v in top_probs])}")

Testing emotion prediction:

Text: I am so happy and grateful for this wonderful day!...
Primary Emotion: joy
All Emotions: ['joy']
Top 5 Probabilities: joy:0.87, admiration:0.14, gratitude:0.12, excitement:0.04, relief:0.03

Text: This makes me furious! How dare they do this!...
Primary Emotion: anger
All Emotions: ['anger']
Top 5 Probabilities: anger:0.82, neutral:0.09, annoyance:0.08, disapproval:0.02, admiration:0.01

Text: I'm really scared about what might happen next....
Primary Emotion: fear
All Emotions: ['fear']
Top 5 Probabilities: fear:0.90, nervousness:0.09, neutral:0.03, sadness:0.02, optimism:0.02

Text: Thank you so much for your help!...
Primary Emotion: gratitude
All Emotions: ['gratitude']
Top 5 Probabilities: gratitude:0.99, approval:0.01, admiration:0.01, neutral:0.01, optimism:0.01

Text: I love you with all my heart....
Primary Emotion: love
All Emotions: ['love']
Top 5 Probabilities: love:0.96, admiration:0.03, approval:0.02, gratitude:0.01, optimism:0.01

Text:

### Label All Speeches
Process each speech and add emotion labels to the dataset.

In [7]:
# Identify the text column (adjust if needed after seeing the data)
# Common column names: 'text', 'speech', 'transcript', 'content'
text_column = None
for col in ['text', 'speech', 'transcript', 'content', 'Speech', 'Text', 'Transcript']:
    if col in df.columns:
        text_column = col
        break

if text_column is None:
    print("Available columns:", df.columns.tolist())
    print("\nPlease set text_column manually to the column containing speech text")
else:
    print(f"Using column '{text_column}' for speech text")
    print(f"Sample text length: {df[text_column].str.len().describe()}")

Using column 'speech' for speech text
Sample text length: count      995.000000
mean     17063.630151
std      10989.687657
min        482.000000
25%       6694.500000
50%      15204.000000
75%      28168.500000
max      32759.000000
Name: speech, dtype: float64


In [8]:
# Process all speeches and add emotion columns
results = []

print(f"Starting labeling of {len(df)} speeches...")

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Labeling speeches"):
    text = row[text_column] if text_column else ""
    
    # Get emotion prediction (use chunking for long speeches)
    prediction = predict_emotions_for_long_text(text, model, tokenizer)
    
    result = {
        'primary_emotion': prediction['primary_emotion'],
        'all_emotions': ','.join(prediction['all_emotions']),
    }
    
    # Add individual emotion probabilities and binary labels
    threshold = 0.3
    for emotion in GOEMOTIONS:
        result[f'prob_{emotion}'] = prediction['probs'][emotion]
        result[emotion] = 1 if prediction['probs'][emotion] >= threshold else 0
    
    results.append(result)

# Convert results to DataFrame and merge with original
results_df = pd.DataFrame(results)
df_labeled = pd.concat([df.reset_index(drop=True), results_df], axis=1)

print(f"\nLabeling complete!")
print(f"New columns added: {results_df.columns.tolist()}")

Starting labeling of 995 speeches...


Labeling speeches:   0%|          | 0/995 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1383 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1383 > 512). Running this sequence through the model will result in indexing errors
Labeling speeches: 100%|██████████| 995/995 [02:03<00:00,  8.09it/s]


Labeling complete!
New columns added: ['primary_emotion', 'all_emotions', 'prob_admiration', 'admiration', 'prob_amusement', 'amusement', 'prob_anger', 'anger', 'prob_annoyance', 'annoyance', 'prob_approval', 'approval', 'prob_caring', 'caring', 'prob_confusion', 'confusion', 'prob_curiosity', 'curiosity', 'prob_desire', 'desire', 'prob_disappointment', 'disappointment', 'prob_disapproval', 'disapproval', 'prob_disgust', 'disgust', 'prob_embarrassment', 'embarrassment', 'prob_excitement', 'excitement', 'prob_fear', 'fear', 'prob_gratitude', 'gratitude', 'prob_grief', 'grief', 'prob_joy', 'joy', 'prob_love', 'love', 'prob_nervousness', 'nervousness', 'prob_optimism', 'optimism', 'prob_pride', 'pride', 'prob_realization', 'realization', 'prob_relief', 'relief', 'prob_remorse', 'remorse', 'prob_sadness', 'sadness', 'prob_surprise', 'surprise', 'prob_neutral', 'neutral']





### View Results and Statistics

In [9]:
# View sample of labeled data
print("Sample of labeled speeches:")
display_cols = [text_column, 'primary_emotion', 'all_emotions']
df_labeled[display_cols].head(10)

Sample of labeled speeches:


Unnamed: 0,speech,primary_emotion,all_emotions
0,As long as I am President of the United States...,neutral,neutral
1,"Hello, everybody. Well, thank you very much. ...",gratitude,gratitude
2,"Last night, the United States brought the worl...",gratitude,gratitude
3,PRESIDENT TRUMP: Thank you very much. Thank...,neutral,neutral
4,PRESIDENT TRUMP: Thank you very much. Mr. ...,approval,approval
5,"THE PRESIDENT: Thank you very much, everybody...",neutral,neutral
6,"Madam Speaker, Mr. Vice President, Members of...",neutral,neutral
7,"THE PRESIDENT: Just a short time ago, I had th...",neutral,neutral
8,"THE PRESIDENT: Madam President, Mr. Secretary...",approval,approval
9,"THE PRESIDENT: Thank you, Lee. Thank you, Lee...",admiration,admiration


In [10]:
# Emotion distribution statistics
print("\nEmotion Distribution:")
print("="*50)

print("\nPrimary emotion counts:")
print(df_labeled['primary_emotion'].value_counts())

print("\nBinary label distribution (speeches with each emotion above threshold):")
for emotion in GOEMOTIONS:
    count = df_labeled[emotion].sum()
    pct = count / len(df_labeled) * 100
    if count > 0:
        print(f"  {emotion}: {count} ({pct:.1f}%)")

print("\nAverage emotion probabilities:")
for emotion in GOEMOTIONS:
    avg_prob = df_labeled[f'prob_{emotion}'].mean()
    if avg_prob > 0.01:  # Only show significant ones
        print(f"  {emotion}: {avg_prob:.3f}")


Emotion Distribution:

Primary emotion counts:
primary_emotion
neutral        734
approval        96
gratitude       70
optimism        57
admiration      13
curiosity        7
caring           5
sadness          4
desire           4
disapproval      2
remorse          1
love             1
joy              1
Name: count, dtype: int64

Binary label distribution (speeches with each emotion above threshold):
  admiration: 19 (1.9%)
  approval: 159 (16.0%)
  caring: 8 (0.8%)
  curiosity: 4 (0.4%)
  desire: 3 (0.3%)
  disappointment: 1 (0.1%)
  disapproval: 2 (0.2%)
  gratitude: 58 (5.8%)
  joy: 1 (0.1%)
  love: 1 (0.1%)
  optimism: 47 (4.7%)
  remorse: 2 (0.2%)
  sadness: 4 (0.4%)
  neutral: 719 (72.3%)

Average emotion probabilities:
  admiration: 0.062
  annoyance: 0.018
  approval: 0.210
  caring: 0.031
  confusion: 0.017
  curiosity: 0.027
  desire: 0.030
  disappointment: 0.030
  disapproval: 0.042
  gratitude: 0.075
  optimism: 0.117
  realization: 0.038
  remorse: 0.010
  sadness: 

### Save Labeled Dataset

In [11]:
# Save to CSV
output_path = "data/presidential_speeches_goemotions_labeled.csv"
df_labeled.to_csv(output_path, index=False)
print(f"Labeled dataset saved to: {output_path}")

# Also save as Excel if preferred
output_path_xlsx = "data/presidential_speeches_goemotions_labeled.xlsx"
df_labeled.to_excel(output_path_xlsx, index=False)
print(f"Labeled dataset saved to: {output_path_xlsx}")

print(f"\nFinal dataset shape: {df_labeled.shape}")
print(f"Columns: {df_labeled.columns.tolist()}")

Labeled dataset saved to: data/presidential_speeches_goemotions_labeled.csv
Labeled dataset saved to: data/presidential_speeches_goemotions_labeled.xlsx

Final dataset shape: (995, 67)
Columns: ['President', 'Party', 'from', 'until', 'Vice President', 'title', 'date', 'info', 'speech', 'primary_emotion', 'all_emotions', 'prob_admiration', 'admiration', 'prob_amusement', 'amusement', 'prob_anger', 'anger', 'prob_annoyance', 'annoyance', 'prob_approval', 'approval', 'prob_caring', 'caring', 'prob_confusion', 'confusion', 'prob_curiosity', 'curiosity', 'prob_desire', 'desire', 'prob_disappointment', 'disappointment', 'prob_disapproval', 'disapproval', 'prob_disgust', 'disgust', 'prob_embarrassment', 'embarrassment', 'prob_excitement', 'excitement', 'prob_fear', 'fear', 'prob_gratitude', 'gratitude', 'prob_grief', 'grief', 'prob_joy', 'joy', 'prob_love', 'love', 'prob_nervousness', 'nervousness', 'prob_optimism', 'optimism', 'prob_pride', 'pride', 'prob_realization', 'realization', 'prob_r