# Emotion Analysis - Part 1
### <i>Data Preparation and Emotion Analysis with Text-Classification Model</i>

[Imports]

In [None]:
# Imports
import re
from importlib.resources.readers import remove_duplicates

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
from matplotlib import style
import matplotlib.pyplot as plt
from pandas.core.dtypes.common import infer_dtype_from_object
from datetime import datetime, timedelta
from plotly import hist_frame

from scripts.event_study import actual_return

style.use('ggplot')
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
import json
import torch
from transformers import pipeline
from tqdm import tqdm
import pytz

In [None]:
# Switch for the expensive emotion-classification step: the cell that calls
# append_emotions() and writes tweets_isTweet_emotions.csv only runs when this is 1.
RUN_TYPE = 0 # set to 1 to repeat file creation process

[Data Preparation]<br>
Normalizing the `quoted_tweet` JSON column to create a dataframe that contains the text of both the quoted and the original tweet.

In [None]:
# Prepare dataframe for analysis
## Load the raw tweet export; the 'id' column is read as object dtype
df_tweets = pd.read_csv(
    '../data/twitter/tweets_isTweet.csv',
    low_memory=False,
    dtype={'id': 'object'},
)

## Engagement per tweet = sum of all interaction counters
## (used later in the Event Study to weigh emotions per event)
df_tweets['total_engagement'] = (
    df_tweets['retweetCount']
    + df_tweets['likeCount']
    + df_tweets['replyCount']
    + df_tweets['quoteCount']
    + df_tweets['viewCount']
)

## Keep only the columns needed downstream and give them explicit names
df_tweets = (
    df_tweets[['id', 'createdAt', 'text', 'quoted_tweet', 'total_engagement']]
    .rename(columns={'id': 'tweet_id', 'text': 'tweet_text'})
)


## Normalize json column
def _parse_quoted_tweet(raw):
    """Decode one 'quoted_tweet' cell; anything that is not a non-missing string becomes None."""
    if isinstance(raw, str) and pd.notna(raw):
        return json.loads(raw)
    return None


quoted_tweets_normalized = pd.json_normalize(
    df_tweets['quoted_tweet'].apply(_parse_quoted_tweet)
).rename(columns={'id': 'quoted_tweet_id', 'text': 'quoted_tweet_text'})

## Link by index so both frames line up row by row
df_tweets.index = quoted_tweets_normalized.index

## Concat both dataframes side by side
tweet_part = df_tweets[['tweet_id', 'createdAt', 'tweet_text', 'total_engagement']]
quoted_part = quoted_tweets_normalized[['quoted_tweet_id', 'quoted_tweet_text']]
df_tweets_normalized = pd.concat([tweet_part, quoted_part], axis=1)

## View data
print(df_tweets.columns)
df_tweets_normalized.head()

[Data Transformation]<br>
The following cell contains functions for the second step of data preparation, namely removing noise, truncating text, stemming words, and subsequently performing the emotion analysis.

In [None]:
# Pre-process data for the analysis
## Variables
### Emotion labels reported for every text. compute_emotions() returns one score per
### entry and fills in 0.0 for any label that is absent from the classifier output.
### NOTE(review): confirm that the label set of the model loaded below covers all six
### entries - a label the model never emits would always be scored 0.0.
ekman_emotions = ['anger', 'fear', 'joy', 'sadness', 'disgust', 'surprise']

## Classifier
### Load Hugging Face's emotion classifier
### top_k=None is set at construction; compute_emotions() reads a score for every
### label contained in the returned result.
print("[Info]")
### Run on the first CUDA device (index 0) when one is available, otherwise on CPU (-1)
device = "cuda" if torch.cuda.is_available() else "cpu"
classifier = pipeline("text-classification", model="bhadresh-savani/bert-base-uncased-emotion", top_k=None, device=0 if device == "cuda" else -1)

## Functions
### Removing noise from the text
def remove_noise(text):
    """Lower-case a tweet and strip URLs, mentions, hash signs, punctuation and stop words.

    Args:
        text: Raw tweet text (str).

    Returns:
        The cleaned text as a single space-separated string. Words contained in the
        module-level ``stop_words`` set are dropped.
    """
    text = text.lower()
    # Remove URLs. The previous pattern only matched "https..." and a malformed
    # "www...httpss..." alternative, so http:// and plain www. links survived.
    text = re.sub(r"https?://\S+|www\.\S+", '', text)
    # Remove whole @mentions and the '#' sign (the hashtag word itself is kept).
    # The previous pattern "\@w+" matched a literal "@w" instead of "@" + word characters.
    text = re.sub(r"@\w+|#", '', text)
    # Remove punctuation (everything that is neither a word character nor whitespace)
    text = re.sub(r"[^\w\s]", '', text)
    return " ".join(word for word in text.split() if word not in stop_words)

### Reduction of dimensionality by abstracting word to word stem and truncating text
stemmer = PorterStemmer()
def stem_words(text):
    """Split ``text`` on whitespace and return the list of stems, one per word.

    Note that the result is a list of strings, not a joined string.
    """
    return list(map(stemmer.stem, text.split()))

def truncate_text(text, max_length=512):
    """Return ``text`` reduced to its first ``max_length`` whitespace-separated words.

    The kept words are re-joined with single spaces, so runs of whitespace in the
    input are collapsed.
    """
    kept_words = text.split()[:max_length]
    return " ".join(kept_words)

### Computing and appending emotions to dataframe
def compute_emotions(text):
    """Score one cleaned text with the module-level ``classifier``.

    Args:
        text: Cleaned text. Non-string or blank input is not classified.

    Returns:
        dict mapping every entry of ``ekman_emotions`` to a float score. Labels that
        are missing from the classifier output are reported as 0.0. All scores are
        0.0 for blank input, for an unusable classifier result, or when the
        classifier raises (the error is printed, not propagated).
    """
    zero_scores = {emotion: 0.0 for emotion in ekman_emotions}

    if not isinstance(text, str) or text.strip() == "":
        print("[ComputeEmotions] Empty cell after data cleaning. Returning 0.0 for all emotions.")
        return zero_scores

    try:
        # Classify emotions using the Hugging Face pipeline and handle errors.
        # truncation=True lets the tokenizer cut the input at the model's maximum
        # length; word-based truncation alone cannot guarantee the token limit.
        output = classifier(text, truncation=True)

        # Accept both the nested shape [[{label, score}, ...]] and the flat
        # shape [{label, score}, ...].
        results = output[0] if output and isinstance(output[0], list) else output
        if (
            not results
            or not isinstance(results, list)
            or not isinstance(results[0], dict)
            or len(results[0]) == 0
        ):
            return zero_scores

        emotion_scores = {result['label']: result['score'] for result in results}
        return {emotion: emotion_scores.get(emotion, 0.0) for emotion in ekman_emotions}

    except Exception as e:
        # Deliberate best-effort: a single failing text must not abort the whole run
        print(f"[ComputeEmotions] Error while processing text: {text[:20]}... Error: {e}")
        return zero_scores

def append_emotions(df: pd.DataFrame, text_column: str) -> pd.DataFrame:
    """Clean ``text_column``, score it, and return a frame with the results inserted.

    The returned frame has, directly to the right of ``text_column``: the cleaned
    text (``<text_column>_cleaned``), one score column per entry of
    ``ekman_emotions`` and ``<text_column>_dominant_emotion`` (NaN when all scores
    of a row are 0.0). All remaining columns follow unchanged.

    Side effect: the cleaned column is also added to the passed ``df`` itself.

    Raises:
        ValueError: if ``text_column`` is not a column of ``df``.
    """
    if text_column not in df.columns:
        raise ValueError(f"[AppendEmotions] Column '{text_column}' not found in DataFrame.")

    print("[AppendEmotions] Computing emotions for column:", text_column)

    def _clean(raw):
        # Non-string or blank cells are mapped to an empty string
        if isinstance(raw, str) and raw.strip():
            return " ".join(stem_words(remove_noise(raw)))
        return ""

    cleaned_column = f"{text_column}_cleaned"
    df[cleaned_column] = df[text_column].apply(_clean)

    # Truncate text if cleaned text exceeds 512 tokens (whitespace-separated words)
    words_per_row = df[cleaned_column].str.split().str.len()
    if (words_per_row > 512).any():
        print("[AppendEmotions] At least one row with more than 512 tokens - truncating text ...")
        df[cleaned_column] = df[cleaned_column].apply(
            lambda cleaned: truncate_text(cleaned, max_length=512)
        )

    # One score dict per row, in row order
    emotion_scores = []
    for cleaned_text in tqdm(df[cleaned_column], desc="[AppendEmotions] Processing emotions"):
        emotion_scores.append(compute_emotions(cleaned_text))

    emotions_df = pd.DataFrame(emotion_scores)
    emotions_df.index = df.index
    emotions_df.columns = [f"{text_column}_{emotion}" for emotion in ekman_emotions]

    # Add dominant emotion column: highest-scoring label, NaN for rows without any score
    rows_without_score = emotions_df.eq(0.0).all(axis=1)
    dominant = emotions_df.idxmax(axis=1).apply(lambda column: column.rsplit('_', 1)[-1])
    dominant[rows_without_score] = np.nan
    emotions_df[f"{text_column}_dominant_emotion"] = dominant

    # Insert the new columns directly to the right of the input text_column:
    # split the DataFrame into the part up to text_column and the rest, then
    # put the pieces back together around the new columns
    split_position = df.columns.get_loc(text_column) + 1
    columns_before = df.iloc[:, :split_position]
    columns_after = df.iloc[:, split_position:].drop(columns=[cleaned_column], errors='ignore')
    pieces = [columns_before, df[[cleaned_column]], emotions_df, columns_after]

    return pd.concat(pieces, axis=1)

[Applying Functions]<br>
This part is only run once to create the new csv-file. Subsequently, the further analysis is performed on the new dataset.

In [None]:
# Run the emotion analysis for the selected text columns and save the result as a new csv-file
if RUN_TYPE == 1:
    columns_to_analyse = ['tweet_text']  # 'quoted_tweet_text' can be added here
    for col in columns_to_analyse:
        df_tweets_normalized = append_emotions(df_tweets_normalized, text_column=col)

    df_tweets_normalized.to_csv('../data/twitter/tweets_isTweet_emotions.csv', index=False)

# Emotion Analysis - Part 2
### <i>Formatting & Data Exploration</i>

In [None]:
# Formatting
## Set timezone
eastern = pytz.timezone("US/Eastern")

## Read necessary data and convert to the same timezone
### Tweet data: source column -> short name used in the rest of the notebook
### (the key order is also the column order of the resulting frame)
tweet_column_names = {
    'tweet_id': 'id',
    'createdAt': 'timestamp',
    'total_engagement': 'engagement',
    'tweet_text_anger': 'anger',
    'tweet_text_fear': 'fear',
    'tweet_text_joy': 'joy',
    'tweet_text_sadness': 'sadness',
    'tweet_text_disgust': 'disgust',
    'tweet_text_surprise': 'surprise',
    'tweet_text_dominant_emotion': 'tweet_emotion',
}
df_tweets_normalized = (
    pd.read_csv('../data/twitter/tweets_isTweet_emotions.csv')[list(tweet_column_names)]
    .dropna()  # rows with any missing value in the selected columns are discarded
    .rename(columns=tweet_column_names)
)
### Unparseable timestamps become NaT (errors="coerce"); afterwards convert to US/Eastern
parsed_timestamps = pd.to_datetime(
    df_tweets_normalized['timestamp'],
    format="%a %b %d %H:%M:%S %z %Y",
    errors="coerce",
)
df_tweets_normalized['timestamp'] = parsed_timestamps.dt.tz_convert(eastern)

### Stock data: the unnamed first csv column holds the timestamps
df_stock_data = (
    pd.read_csv('../data/stocks/tsla_intraday_202305_202504-1m.csv')
    .rename(columns={'Unnamed: 0': 'timestamp'})
)
df_stock_data['timestamp'] = pd.to_datetime(df_stock_data['timestamp']).dt.tz_localize(eastern)
### Log return of each row relative to the previous row's close (first row is NaN)
close_prices = df_stock_data['close']
df_stock_data['log_return'] = np.log(close_prices / close_prices.shift(1))

print("[Tweets]\n",df_tweets_normalized.head(), "\n")
print("[Stock]\n", df_stock_data.head())

In [None]:
# Data exploration
## Emotion Occurrence: number of tweets per dominant emotion
fig, ax = plt.subplots()
sns.countplot(x='tweet_emotion', data=df_tweets_normalized, ax=ax)
# Label the figure so it can be read on its own; the trailing ';' hides the repr output
ax.set(title='Dominant emotion per tweet', xlabel='Dominant emotion', ylabel='Number of tweets');

In [None]:
## Changes in closing price
fig, ax = plt.subplots()
ax.plot(df_stock_data['timestamp'], df_stock_data['close'])
# Label the figure so it can be read on its own; the trailing ';' hides the repr output
ax.set(title='TSLA closing price', xlabel='Timestamp', ylabel='Close');

In [None]:
## Changes in log_returns
fig, ax = plt.subplots()
ax.plot(df_stock_data['timestamp'], df_stock_data['log_return'])
# Label the figure so it can be read on its own; the trailing ';' hides the repr output
ax.set(title='TSLA log returns', xlabel='Timestamp', ylabel='Log return');

[Event Study]<br>
1. Data Preparation (Tweet Data)

In [None]:
# 1. Filter out tweets outside market hours (2h): keep 04:00:00 <= time of day < 18:00:00.
#    The upper bound is exclusive at 18:00; comparing with "<= 17:59" dropped every
#    tweet posted between 17:59:01 and 17:59:59.
tweet_time_of_day = df_tweets_normalized['timestamp'].dt.time
window_open = pd.to_datetime("04:00").time()
window_close = pd.to_datetime("18:00").time()
df_tweets_normalized = df_tweets_normalized[
    (tweet_time_of_day >= window_open) & (tweet_time_of_day < window_close)
]

# 2. Sort by time
df_tweets_normalized = df_tweets_normalized.sort_values("timestamp").reset_index(drop=True)

# 3. Compute time difference between consecutive tweets (NaT for the first tweet)
df_tweets_normalized['time_diff'] = df_tweets_normalized['timestamp'].diff()

# 4. Cluster tweets to events: a new event starts when time_diff > 7h
df_tweets_normalized['new_event'] = df_tweets_normalized['time_diff'] > pd.Timedelta(hours=7)

# 5. Cumulate events in event_id (the first tweet belongs to event 0)
df_tweets_normalized['event_id'] = df_tweets_normalized['new_event'].cumsum()

# 6. Compute the dominant emotion for each event and merge back to df
def compute_event_emotion(group):
    """Aggregate the tweet-level emotion scores of one event.

    Args:
        group: DataFrame with the tweets of a single event. Must contain an
            'engagement' column and one score column per entry of ``ekman_emotions``.

    Returns:
        pd.Series with the engagement-weighted mean score of every emotion plus
        'event_emotion', the label with the highest aggregated score.
    """
    total_engagement = group['engagement'].sum()

    if total_engagement > 0:
        weighted_emo_scores = {
            emotion: (group[emotion] * group['engagement']).sum() / total_engagement
            for emotion in ekman_emotions
        }
    else:
        # Without any engagement the weights are undefined (0/0 = NaN) and max()
        # over NaN values would silently return the first emotion. Fall back to
        # the unweighted mean instead.
        weighted_emo_scores = {emotion: group[emotion].mean() for emotion in ekman_emotions}

    event_emotion = max(weighted_emo_scores, key=weighted_emo_scores.get)
    return pd.Series({
        **weighted_emo_scores,
        'event_emotion': event_emotion
    })

# Guard: skip the aggregation when 'event_emotion' is already present
if 'event_emotion' not in df_tweets_normalized.columns:
    # One row per event with the aggregated scores and the dominant event emotion
    df_event_emotions = (
        df_tweets_normalized
        .drop(columns=['event_id'])
        .groupby(df_tweets_normalized['event_id'], group_keys=False)
        .apply(compute_event_emotion)
        .reset_index()
    )

    # Attach the event emotion to every tweet and drop the tweet-level helper columns
    tweet_level_columns = [
        'anger', 'fear', 'joy', 'sadness', 'disgust', 'surprise', 'tweet_emotion', 'new_event'
    ]
    df_tweets_normalized = df_tweets_normalized.merge(
        df_event_emotions[['event_id', 'event_emotion']],
        on='event_id',
        how='left',
    ).drop(columns=tweet_level_columns)

# 7. Set the event time to the timestamp of the earliest tweet of the event
#    (models from what point in time a reaction can occur)
df_tweets_normalized['event_time'] = (
    df_tweets_normalized.groupby('event_id')['timestamp'].transform('min')
)

print("[Info]\nNumber of events = ", df_tweets_normalized['event_id'].nunique(), "\n")
df_tweets_normalized.head()


2. Filtering Data

In [61]:
# 0. Create new dfs with necessary event data, each sorted chronologically
df_events = (
    df_tweets_normalized
    .loc[:, ['event_id', 'event_emotion', 'event_time']]
    .sort_values('event_time')
)
df_returns = df_stock_data.loc[:, ['timestamp', 'log_return']].sort_values('timestamp')

# 1. Define estimation and observation period in minutes (analogue to topic analysis)
est, obs = 300, 120
# Minimum distance in minutes between two events so that their windows do not overlap
event_gap = est + obs

# 2. Filter events (duplicates and timeframe overlaps)
def filter_overlaps(df, gap_minutes=None):
    """Drop events that follow their predecessor too closely.

    Events are sorted by 'event_time'. The first event is always kept; every other
    event is kept only if it starts at least ``gap_minutes`` after the event
    directly before it in the sorted order.

    Args:
        df: DataFrame with an 'event_time' column. It is not modified.
        gap_minutes: Minimum distance to the previous event in minutes. Defaults
            to the module-level ``event_gap``.

    Returns:
        DataFrame with the kept events, sorted by 'event_time'. Row labels are the
        positions in the sorted input. An empty input yields an empty result.
    """
    if gap_minutes is None:
        gap_minutes = event_gap

    init_count = len(df)

    # Sort by timestamp and get timeshift to the previous event (NaT for the first row)
    df = df.sort_values('event_time').reset_index(drop=True)
    df['timeshift'] = df['event_time'].diff()

    # Keep events whose distance to the predecessor is large enough ...
    keep = df['timeshift'] >= pd.Timedelta(minutes=gap_minutes)
    # ... and always keep the first event (its NaT timeshift compares as False)
    if init_count:
        keep.iloc[0] = True

    filtered_df = df[keep]

    # Info
    print(f"[FilterOverlaps]\nFiltered {init_count - len(filtered_df)} events out of {init_count}\nMin Timeshift is: {df['timeshift'].min()}\n")

    return filtered_df.drop(columns=['timeshift'])

# Reduce to one row per event first, then remove events that are too close together
df_events = filter_overlaps(df_events.drop_duplicates('event_id'))

print("[Info]\nNumber of events = ", df_events['event_id'].nunique(), "\n")
print("[Events]\n", df_events.head(), "\n")
print("[Returns]\n", df_returns.head(), "\n")

[FilterOverlaps]
Filtered 0 events out of 691
Min Timeshift is: 0 days 07:03:31

[Info]
Number of events =  691 

[Events]
    event_id event_emotion                event_time
0         0           joy 2023-05-01 12:35:21-04:00
1         1           joy 2023-05-02 11:26:02-04:00
2         2          fear 2023-05-04 13:56:08-04:00
3         3          fear 2023-05-05 14:24:48-04:00
4         4         anger 2023-05-07 06:48:44-04:00 

[Returns]
                   timestamp  log_return
0 2023-05-01 04:00:00-04:00         NaN
1 2023-05-01 04:01:00-04:00    0.000122
2 2023-05-01 04:03:00-04:00   -0.001946
3 2023-05-01 04:04:00-04:00   -0.000792
4 2023-05-01 04:05:00-04:00   -0.002745 



3. Align dataframes

In [62]:
# combine event_study results: one DataFrame of minute-level returns per event.
# Previously this list was created but never filled, so the returns of every event
# except the last one were lost.
df_results = []
skipped_events = 0
return_timestamps = df_returns['timestamp']

# Compute estimation and event window and map stock data to the timeframes
for event in df_events.itertuples(index=False):
    event_time = event.event_time

    # deduce timeframes for each event
    try:
        estimation_start = event_time - pd.Timedelta(minutes=est)
        estimation_end = event_time
        event_end = event_time + pd.Timedelta(minutes=obs)

        # get stock data for estimation window [start, event) and event window [event, end)
        estimation_window = df_returns[
            (return_timestamps >= estimation_start) & (return_timestamps < estimation_end)
        ].copy()
        event_window = df_returns[
            (return_timestamps >= event_time) & (return_timestamps < event_end)
        ].copy()

    except Exception as e:
        print(f"[AlignDataframes][0] Error while processing event at {event_time}: {e}")
        continue

    try:
        # average return of the stock in the estimation window per min (baseline to calculate abnormal return)
        event_window['estimated_returns'] = estimation_window['log_return'].mean()

        # return actually observed in the event window per min (thus they are the log_returns already computed)
        event_window['actual_returns'] = event_window['log_return']

        # abnormal returns are the difference between the actual returns and the estimated returns (baseline)
        event_window['abnormal_returns'] = event_window['actual_returns'] - event_window['estimated_returns']

    except Exception as e:
        print(f"[AlignDataframes][1] Error while processing event at {event_time}: {e}")
        continue

    # Events without any return data in the event window, or without a usable
    # baseline in the estimation window, cannot yield abnormal returns
    if event_window.empty or not estimation_window['log_return'].notna().any():
        skipped_events += 1
        continue

    # Store a copy tagged with the event it belongs to; event_window itself stays unchanged
    df_results.append(
        event_window.assign(event_id=event.event_id, event_emotion=event.event_emotion)
    )

print(f"[AlignDataframes] Collected {len(df_results)} event windows, skipped {skipped_events} events without usable return data")



In [63]:
# Inspect the event window left over from the last event processed by the alignment loop
event_window

Unnamed: 0,timestamp,log_return,estimated_returns,actual_returns,abnormal_returns
478264,2025-04-29 10:33:00-04:00,0.001834,-0.000089,0.001834,0.001923
478265,2025-04-29 10:34:00-04:00,-0.000231,-0.000089,-0.000231,-0.000142
478266,2025-04-29 10:35:00-04:00,0.003795,-0.000089,0.003795,0.003884
478267,2025-04-29 10:36:00-04:00,-0.000119,-0.000089,-0.000119,-0.000030
478268,2025-04-29 10:37:00-04:00,0.001364,-0.000089,0.001364,0.001453
...,...,...,...,...,...
478379,2025-04-29 12:28:00-04:00,-0.000247,-0.000089,-0.000247,-0.000158
478380,2025-04-29 12:29:00-04:00,-0.000566,-0.000089,-0.000566,-0.000477
478381,2025-04-29 12:30:00-04:00,0.000263,-0.000089,0.000263,0.000352
478382,2025-04-29 12:31:00-04:00,0.002997,-0.000089,0.002997,0.003086
