# Emotion Analysis

In [None]:
### Imports

import pandas as pd
from pandas import json_normalize, Series
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
from sympy.strategies.core import switch
style.use('ggplot')
from textblob import TextBlob
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
#nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
import json
from json import loads, dumps
from pprint import pprint
import torch
from transformers import pipeline
from tqdm import tqdm


In [None]:
### Prepare dataframe for analysis
# Import dataset
df_tweets = pd.read_csv(
    '../data/twitter/tweets_isTweet.csv',
    dtype={'id': 'object'},
    low_memory=False
)
df_tweets = df_tweets[['id', 'createdAt', 'text', 'quoted_tweet']]
df_tweets = df_tweets.rename(columns={'id': 'tweet_id', 'text': 'tweet_text'})

# Normalize json column
quoted_tweets_normalized = pd.json_normalize(
    df_tweets['quoted_tweet'].apply(
        lambda x: json.loads(x) if pd.notna(x) and isinstance(x, str) else None
    )
)
quoted_tweets_normalized = quoted_tweets_normalized.rename(columns={'id': 'quoted_tweet_id', 'text': 'quoted_tweet_text'})

# Link by index
df_tweets.index = quoted_tweets_normalized.index

# Concat both dataframes
df_tweets_normalized = pd.concat([
    df_tweets[['tweet_id', 'createdAt', 'tweet_text']],
    quoted_tweets_normalized[['quoted_tweet_id', 'quoted_tweet_text']]
], axis=1)

# View data
df_tweets_normalized.head()

Unnamed: 0,tweet_id,createdAt,tweet_text,quoted_tweet_id,quoted_tweet_text
0,1917726279195058338,Wed Apr 30 23:42:29 +0000 2025,https://t.co/U6tI9pdin6,,
1,1917693698281787564,Wed Apr 30 21:33:01 +0000 2025,https://t.co/1c1WjFpOva,,
2,1917225430702240067,Tue Apr 29 14:32:17 +0000 2025,This is a big deal,1.9172236516250995e+18,"Last week, Treasury went live with its first a..."
3,1917114631287718009,Tue Apr 29 07:12:01 +0000 2025,https://t.co/6xSd8l67FN,,
4,1917103264417649121,Tue Apr 29 06:26:50 +0000 2025,Whoa,1.9170112797570665e+18,🚨THE INVISIBLE PUPPET MASTERS: AI'S DISTURBING...


In [None]:
### Pre-process data for the analysis
## Variables
ekman_emotions = ['anger', 'fear', 'joy', 'sadness', 'disgust', 'surprise']

## Classifier
# Load Hugging Face's emotion classifier
print("[Info]")
device = "cuda" if torch.cuda.is_available() else "cpu"
classifier = pipeline("text-classification", model="bhadresh-savani/bert-base-uncased-emotion", top_k=None, device=0 if device == "cuda" else -1)

## Functions
# Removing noise from the text
def remove_noise(text):
    text = text.lower()
    text = re.sub(r"https\S+|www\S+httpss\S+", '', text, flags=re.MULTILINE) # Remove Url
    text = re.sub(r"\@w+|\#", '', text) # remove @ and #
    text = re.sub(r"[^\w\s]", '', text) # remove punctuation
    text_tokens = text.split()
    filtered_text = [w for w in text_tokens if not w in stop_words]
    return " ".join(filtered_text)

# Reduction of dimensionality by abstracting word to word stem
stemmer = PorterStemmer()
def stem_words(text):
    words = text.split()
    stemmed_text = [stemmer.stem(word) for word in words]
    return stemmed_text

def compute_emotions(text):
    if not isinstance(text, str) or text.strip() == "":
        if isinstance(text, str):
            print(f"[ComputeEmotions] Invalid text: {text[:10]}...")
        else:
            print("[ComputeEmotions] Invalid text: None or NaN. Returning 0.0 for all emotions.")
        return {emotion: 0.0 for emotion in ekman_emotions}

    try:
        # Remove noise from text - when empty afterwards, return 0.0 for all emotions
        cleaned_text = remove_noise(text)
        if not cleaned_text.strip(): 
            return {emotion: 0.0 for emotion in ekman_emotions}
        
        # Remove dimensionality by stemming and converts list from stemming back to string for classification
        cleaned_text = " ".join(stem_words(cleaned_text)) 

        # Classify emotions using the Hugging Face pipeline and handle errors
        results = classifier(cleaned_text)[0]
        if not results or not isinstance(results, list) or len(results[0]) == 0:
            return {emotion: 0.0 for emotion in ekman_emotions}

        emotion_scores = {result['label']: result['score'] for result in results}
        return {emotion: emotion_scores.get(emotion, 0.0) for emotion in ekman_emotions}

    except Exception as e:
        print(f"[ComputeEmotions] Error while processing text: {cleaned_text[:10]}... Error: {e}")
        return {emotion: 0.0 for emotion in ekman_emotions}

def append_emotions(df, text_column):
    if text_column not in df.columns:
        raise ValueError(f"[AppendEmotions] Column '{text_column}' not found in DataFrame.")
    
    print("[AppendEmotions] Computing emotions for column:", text_column)

    emotion_scores = [compute_emotions(text) for text in tqdm(df[text_column], desc="[AppendEmotions] Processing emotions")]
    emotions_df = pd.DataFrame(emotion_scores)
    emotions_df.index = df.index
    emotions_df.columns = [f"{text_column}_{emotion}" for emotion in ekman_emotions]
    
    # Add dominant emotion column
    dominant = emotions_df.idxmax(axis=1).apply(lambda x: x.split('_')[-1])
    all_zero = (emotions_df == 0.0).all(axis=1)
    dominant[all_zero] = np.nan

    emotions_df[f"{text_column}_dominant_emotion"] = dominant

    # Insert right hand of input text_column
    insert_at = df.columns.get_loc(text_column) + 1

    # DataFrame in drei Teile splitten und zusammenfügen
    left = df.iloc[:, :insert_at]
    right = df.iloc[:, insert_at:]
    result_df = pd.concat([left, emotions_df, right], axis=1)


    return result_df

df_tweets_test = pd.DataFrame(data=df_tweets_normalized.iloc[:20])
for col in ['tweet_text', 'quoted_tweet_text']:
    df_tweets_test = append_emotions(df_tweets_test, text_column=col)

[Info]


Device set to use cpu


[AppendEmotions] Computing emotions for column: tweet_text


[AppendEmotions] Processing emotions: 100%|██████████| 20/20 [00:07<00:00,  2.57it/s]


[AppendEmotions] Computing emotions for column: quoted_tweet_text


[AppendEmotions] Processing emotions:   0%|          | 0/20 [00:00<?, ?it/s]

[ComputeEmotions] Invalid text: None or NaN. Returning 0.0 for all emotions.
[ComputeEmotions] Invalid text: None or NaN. Returning 0.0 for all emotions.


[AppendEmotions] Processing emotions:  15%|█▌        | 3/20 [00:02<00:11,  1.50it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1117 > 512). Running this sequence through the model will result in indexing errors


[ComputeEmotions] Invalid text: None or NaN. Returning 0.0 for all emotions.
[ComputeEmotions] Error while processing text: i n v i s ... Error: The size of tensor a (1117) must match the size of tensor b (512) at non-singleton dimension 1


[AppendEmotions] Processing emotions:  35%|███▌      | 7/20 [00:06<00:12,  1.01it/s]

[ComputeEmotions] Invalid text: None or NaN. Returning 0.0 for all emotions.
[ComputeEmotions] Invalid text: None or NaN. Returning 0.0 for all emotions.


[AppendEmotions] Processing emotions:  50%|█████     | 10/20 [00:08<00:07,  1.25it/s]

[ComputeEmotions] Invalid text: None or NaN. Returning 0.0 for all emotions.
[ComputeEmotions] Invalid text: None or NaN. Returning 0.0 for all emotions.
[ComputeEmotions] Invalid text: None or NaN. Returning 0.0 for all emotions.
[ComputeEmotions] Invalid text: None or NaN. Returning 0.0 for all emotions.
[ComputeEmotions] Invalid text: None or NaN. Returning 0.0 for all emotions.


[AppendEmotions] Processing emotions:  80%|████████  | 16/20 [00:09<00:02,  1.93it/s]

[ComputeEmotions] Invalid text: None or NaN. Returning 0.0 for all emotions.
[ComputeEmotions] Invalid text: None or NaN. Returning 0.0 for all emotions.


[AppendEmotions] Processing emotions: 100%|██████████| 20/20 [00:12<00:00,  1.59it/s]


In [52]:
df_tweets_test.head(20)

Unnamed: 0,tweet_id,createdAt,tweet_text,tweet_text_anger,tweet_text_fear,tweet_text_joy,tweet_text_sadness,tweet_text_disgust,tweet_text_surprise,tweet_text_dominant_emotion,quoted_tweet_id,quoted_tweet_text,quoted_tweet_text_anger,quoted_tweet_text_fear,quoted_tweet_text_joy,quoted_tweet_text_sadness,quoted_tweet_text_disgust,quoted_tweet_text_surprise,quoted_tweet_text_dominant_emotion
0,1917726279195058338,Wed Apr 30 23:42:29 +0000 2025,https://t.co/U6tI9pdin6,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,
1,1917693698281787564,Wed Apr 30 21:33:01 +0000 2025,https://t.co/1c1WjFpOva,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,
2,1917225430702240067,Tue Apr 29 14:32:17 +0000 2025,This is a big deal,0.136955,0.61298,0.159061,0.064732,0.0,0.02157,fear,1.9172236516250995e+18,"Last week, Treasury went live with its first a...",0.128893,0.403817,0.299485,0.068721,0.0,0.074847,fear
3,1917114631287718009,Tue Apr 29 07:12:01 +0000 2025,https://t.co/6xSd8l67FN,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,
4,1917103264417649121,Tue Apr 29 06:26:50 +0000 2025,Whoa,0.623358,0.224247,0.084702,0.053274,0.0,0.006742,anger,1.9170112797570665e+18,🚨THE INVISIBLE PUPPET MASTERS: AI'S DISTURBING...,0.0,0.0,0.0,0.0,0.0,0.0,
5,1917099777327829386,Tue Apr 29 06:12:59 +0000 2025,"Next week, Grok 3.5 early beta release to Supe...",0.11911,0.385693,0.328491,0.067913,0.0,0.07851,fear,1.9170118476239872e+18,🚨GROK 3 SENDS USAGE SOARING – 10X SPIKE IN DOW...,0.112118,0.249669,0.496297,0.063788,0.0,0.057098,joy
6,1917071819003334728,Tue Apr 29 04:21:53 +0000 2025,It is an existential crisis!,0.20568,0.614527,0.067401,0.080749,0.0,0.024567,fear,1.9170591154170148e+18,A friendly reminder to make more babies!\n\n🇯🇵...,0.193061,0.331306,0.308841,0.079331,0.0,0.058072,fear
7,1917048786633036129,Tue Apr 29 02:50:22 +0000 2025,"Knock, knock … it’s @DOGE https://t.co/yFDXlZgnmn",0.161077,0.617711,0.145097,0.042578,0.0,0.029053,fear,,,0.0,0.0,0.0,0.0,0.0,0.0,
8,1917048324215189802,Tue Apr 29 02:48:32 +0000 2025,Population collapse is an existential problem\...,0.268985,0.384552,0.162234,0.093768,0.0,0.072045,fear,,,0.0,0.0,0.0,0.0,0.0,0.0,
9,1917040536378335721,Tue Apr 29 02:17:35 +0000 2025,Starlink is trying out a service plan commitme...,0.141073,0.542726,0.145917,0.087712,0.0,0.069217,fear,1.9170298864323177e+18,$0 for the Standard Kit with 12-month resident...,0.164984,0.50469,0.179418,0.068739,0.0,0.064701,fear


In [None]:
df_test = pd.DataFrame(data=df_tweets_normalized, columns=['tweet_text'])
df_test['tweet_text'] = df_test['tweet_text'].apply(remove_noise)
df_test.head(20)