# Emotion Analysis - Part 1
### <i>Data Preparation and Emotion Analysis with Text-Classification Model</i>

[Imports]

In [86]:
### Imports
import re
import numpy as np
import pandas as pd
from matplotlib import style
from pandas.core.dtypes.common import infer_dtype_from_object

style.use('ggplot')
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
import json
import torch
from transformers import pipeline
from tqdm import tqdm
import pytz


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aklei\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[Data Preparation]<br>
Normalizing the quoted_tweets-dictionary to create a dataframe that contains the text of the quoted and origional tweet

In [87]:
### Prepare dataframe for analysis
# Import dataset
df_tweets = pd.read_csv(
    '../data/twitter/tweets_isTweet.csv',
    dtype={'id': 'object'},
    low_memory=False
)
df_tweets = df_tweets[['id', 'createdAt', 'text', 'quoted_tweet']]
df_tweets = df_tweets.rename(columns={'id': 'tweet_id', 'text': 'tweet_text'})

# Normalize json column
quoted_tweets_normalized = pd.json_normalize(
    df_tweets['quoted_tweet'].apply(
        lambda x: json.loads(x) if pd.notna(x) and isinstance(x, str) else None
    )
)
quoted_tweets_normalized = quoted_tweets_normalized.rename(columns={'id': 'quoted_tweet_id', 'text': 'quoted_tweet_text'})

# Link by index
df_tweets.index = quoted_tweets_normalized.index

# Concat both dataframes
df_tweets_normalized = pd.concat([
    df_tweets[['tweet_id', 'createdAt', 'tweet_text']],
    quoted_tweets_normalized[['quoted_tweet_id', 'quoted_tweet_text']]
], axis=1)

# View data
df_tweets_normalized.head()

Unnamed: 0,tweet_id,createdAt,tweet_text,quoted_tweet_id,quoted_tweet_text
0,1917726279195058338,Wed Apr 30 23:42:29 +0000 2025,https://t.co/U6tI9pdin6,,
1,1917693698281787564,Wed Apr 30 21:33:01 +0000 2025,https://t.co/1c1WjFpOva,,
2,1917225430702240067,Tue Apr 29 14:32:17 +0000 2025,This is a big deal,1.9172236516250995e+18,"Last week, Treasury went live with its first a..."
3,1917114631287718009,Tue Apr 29 07:12:01 +0000 2025,https://t.co/6xSd8l67FN,,
4,1917103264417649121,Tue Apr 29 06:26:50 +0000 2025,Whoa,1.9170112797570665e+18,🚨THE INVISIBLE PUPPET MASTERS: AI'S DISTURBING...


[Data Transformation]<br>
The following cell contains functions for the second step of data preparation, namely removing noise, truncating text, stemming words, and subsequently performing the emotion analysis.

In [88]:
### Pre-process data for the analysis
## Variables
ekman_emotions = ['anger', 'fear', 'joy', 'sadness', 'disgust', 'surprise']

## Classifier
# Load Hugging Face's emotion classifier
print("[Info]")
device = "cuda" if torch.cuda.is_available() else "cpu"
classifier = pipeline("text-classification", model="bhadresh-savani/bert-base-uncased-emotion", top_k=None, device=0 if device == "cuda" else -1)

## Functions
# Removing noise from the text
def remove_noise(text):
    text = text.lower()
    text = re.sub(r"https\S+|www\S+httpss\S+", '', text, flags=re.MULTILINE) # Remove Url
    text = re.sub(r"\@w+|\#", '', text) # remove @ and #
    text = re.sub(r"[^\w\s]", '', text) # remove punctuation
    text_tokens = text.split()
    filtered_text = [w for w in text_tokens if not w in stop_words]
    return " ".join(filtered_text)

# Reduction of dimensionality by abstracting word to word stem
stemmer = PorterStemmer()
def stem_words(text):
    words = text.split()
    stemmed_text = [stemmer.stem(word) for word in words]
    return stemmed_text

def truncate_text(text, max_length=512):
    words = text.split()
    return " ".join(words[:max_length])

def compute_emotions(text):
    if not isinstance(text, str) or text.strip() == "":
        print("[ComputeEmotions] Empty cell after data cleaning. Returning 0.0 for all emotions.")
        return {emotion: 0.0 for emotion in ekman_emotions}

    try:
    #    # Remove noise from text - when empty afterward, return 0.0 for all emotions
    #    cleaned_text = remove_noise(text)
    #    if not cleaned_text.strip(): 
    #        return {emotion: 0.0 for emotion in ekman_emotions}
    #    
    #    # Remove dimensionality by stemming and converts a list from stemming back to string for classification
    #    cleaned_text = " ".join(stem_words(cleaned_text)) 

        # Classify emotions using the Hugging Face pipeline and handle errors
        results = classifier(text)[0]
        if not results or not isinstance(results, list) or len(results[0]) == 0:
            return {emotion: 0.0 for emotion in ekman_emotions}

        emotion_scores = {result['label']: result['score'] for result in results}
        return {emotion: emotion_scores.get(emotion, 0.0) for emotion in ekman_emotions}

    except Exception as e:
        print(f"[ComputeEmotions] Error while processing text: {text[:20]}... Error: {e}")
        return {emotion: 0.0 for emotion in ekman_emotions}

def append_emotions(df, text_column):
    if text_column not in df.columns:
        raise ValueError(f"[AppendEmotions] Column '{text_column}' not found in DataFrame.")
    
    print("[AppendEmotions] Computing emotions for column:", text_column)

    cleaned_column = f"{text_column}_cleaned"
    df[cleaned_column] = df[text_column].apply(
        lambda x: " ".join(stem_words(remove_noise(x))) if isinstance(x, str) and x.strip() else ""
    )

    # Truncate text if cleaned text exceeds 512 tokens
    if (df[cleaned_column].str.split().str.len() > 512).any():
        print("[AppendEmotions] At least one row with more than 512 tokens - truncating text ...")
        df[cleaned_column] = df[cleaned_column].apply(lambda x: truncate_text(x, max_length=512))

    emotion_scores = [compute_emotions(text) for text in tqdm(df[cleaned_column], desc="[AppendEmotions] Processing emotions")]
    emotions_df = pd.DataFrame(emotion_scores)
    emotions_df.index = df.index
    emotions_df.columns = [f"{text_column}_{emotion}" for emotion in ekman_emotions]
    
    # Add dominant emotion column
    dominant = emotions_df.idxmax(axis=1).apply(lambda x: x.split('_')[-1])
    all_zero = (emotions_df == 0.0).all(axis=1)
    dominant[all_zero] = np.nan
    emotions_df[f"{text_column}_dominant_emotion"] = dominant

    # Insert right hand of input text_column
    insert_at = df.columns.get_loc(text_column) + 1

    # DataFrame in drei Teile splitten und zusammenfügen
    left = df.iloc[:, :insert_at]
    right = df.iloc[:, insert_at:].drop(columns=[cleaned_column], errors='ignore')
    result_df = pd.concat([left, df[[cleaned_column]], emotions_df, right], axis=1)

    return result_df


[Info]


Device set to use cpu


[Applying Functions]<br>
This part is only run once to create the new csv-file. Subsequently, the further analysis is performed on the new dataset.

In [89]:
# Perform emotion analysis for columns specified
#for col in ['tweet_text', 'quoted_tweet_text']:
#    df_tweets_normalized = append_emotions(df_tweets_normalized, text_column=col)

# Safe the DataFrame with emotions to csv
#df_tweets_normalized.to_csv('../data/twitter/tweets_isTweet_emotions.csv', index=False)

# Emotion Analysis - Part 2
### <i>Statistical Analysis</i>

In [94]:
# Set timezone
eastern = pytz.timezone("US/Eastern")

# Read necessary data and convert to the same timezone
## Tweet data
df_tweets_normalized = pd.read_csv('../data/twitter/tweets_isTweet_emotions.csv')[
    ['tweet_id', 'createdAt', 'tweet_text_dominant_emotion']
].dropna()
df_tweets_normalized = df_tweets_normalized.rename(columns={'createdAt': 'timestamp', 'tweet_text_dominant_emotion': 'emotion'})
df_tweets_normalized['timestamp'] = pd.to_datetime(
    df_tweets_normalized['timestamp'],
    format="%a %b %d %H:%M:%S %z %Y",
    errors="coerce")
df_tweets_normalized['timestamp'] = df_tweets_normalized['timestamp'].dt.tz_convert(eastern)

## Stock data
df_stock_data = pd.read_csv('../data/stocks/tsla_intraday_202305_202504-1m.csv')
df_stock_data = df_stock_data.rename(columns={'Unnamed: 0': 'timestamp'})
df_stock_data['timestamp'] = pd.to_datetime(df_stock_data['timestamp']).dt.tz_localize(eastern)


print("[df_tweets_normalized]\n", df_tweets_normalized.head(), "\n")
print("[df_stock_data]\n", df_stock_data.head())

[df_tweets_normalized]
               tweet_id                 timestamp emotion
2  1917225430702240067 2025-04-29 10:32:17-04:00     joy
4  1917103264417649121 2025-04-29 02:26:50-04:00     joy
5  1917099777327829386 2025-04-29 02:12:59-04:00     joy
6  1917071819003334728 2025-04-29 00:21:53-04:00     joy
7  1917048786633036129 2025-04-28 22:50:22-04:00   anger 

[df_stock_data]
                   timestamp    open    high     low   close  volume
0 2023-05-01 04:00:00-04:00  164.30  165.00  164.07  164.58    4518
1 2023-05-01 04:01:00-04:00  164.61  164.62  164.50  164.60    2880
2 2023-05-01 04:03:00-04:00  164.30  164.40  164.28  164.28     968
3 2023-05-01 04:04:00-04:00  164.15  164.15  164.15  164.15     100
4 2023-05-01 04:05:00-04:00  164.00  164.00  163.60  163.70    2500


In [96]:
# Correlation Analysis