<h1>TikTok Analysis - Part 1</h1>
<h2><i>Data Preparation</i></h2>

In [None]:
### Imports ###
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import style
style.use('ggplot')
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
import torch
from transformers import pipeline
from tqdm import tqdm
import pytz
import re
from nltk.stem.porter import PorterStemmer
from src.TimeNormalizer import TimeNormalizer

### Set necessary workspace variables ###

# Set execution type (to avoid repeating resource intensive operations)
#RUN_TYPE = 0 # set to 0 to avoid file creation process
RUN_TYPE = 1 # set to 1 to perform emotion analysis file creation process
#RUN_TYPE = 2 # set to 2 to perform topic analysis file creation process

# Define Ekman's emotions 
ekman_emotions = ['anger', 'fear', 'joy', 'sadness', 'disgust', 'surprise']

# Timezones
eastern = pytz.timezone("US/Eastern")
european = pytz.timezone("Europe/Berlin")


In [None]:
### Read and transform data (total_engagement, combined_text, set_timezone) ###
# Read dataframe
if RUN_TYPE == 1:
    df_tiktok = pd.read_excel('../data/tiktok/tiktok_transcript.xlsx').drop(columns=['Hashtag', 'URL', 'Author']).dropna()

    # Extract total_engagement and combine text columns
    df_tiktok['total_engagement'] = df_tiktok['Likes'] + df_tiktok['Comments'] + df_tiktok['Shares']
    df_tiktok['combined_text'] = df_tiktok['CaptionCleaned'].fillna('') + ' ' + df_tiktok['Transcript'].fillna('')
    df_tiktok.drop(columns=['Likes', 'Comments', 'Shares', 'Caption', 'CaptionCleaned', 'Transcript'], inplace=True)

    # Convert 'Date' column to datetime format and set timezone
    df_tiktok = df_tiktok.rename(columns={'Created': 'timestamp'})
    df_tiktok['timestamp'] = pd.to_datetime(
        df_tiktok['timestamp'],
        format="%a %b %d %H:%M:%S %z %Y",
        errors='coerce'
    )
    df_tiktok['timestamp'] = df_tiktok['timestamp'].dt.tz_localize(eastern)

    print(f"Länge df: {len(df_tiktok)}")
    print(f"Columns: {df_tiktok.columns.tolist()}")

In [None]:
### Prepare data for analysis (normalize text) ###
if RUN_TYPE == 1:
    # 1. Set Classifier
    # Load Hugging Face's emotion classifier
    print("[Info]")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    classifier = pipeline("text-classification", model="bhadresh-savani/bert-base-uncased-emotion", top_k=None, device=0 if device == "cuda" else -1)

    # 2. Functions to clean and process text data
    # Removing noise from the text
    def remove_noise(text):
        text = text.lower()
        text = re.sub(r"https\S+|www\S+httpss\S+", '', text, flags=re.MULTILINE) # Remove Url
        text = re.sub(r"\@w+|\#", '', text) # remove @ and #
        text = re.sub(r"[^\w\s]", '', text) # remove punctuation
        text_tokens = text.split()
        filtered_text = [w for w in text_tokens if not w in stop_words]
        return " ".join(filtered_text)

    # Reduction of dimensionality by abstracting word to word stem and truncating text
    stemmer = PorterStemmer()
    def stem_words(text):
        words = text.split()
        stemmed_text = [stemmer.stem(word) for word in words]
        return stemmed_text

    def truncate_text(text, max_length=512):
        words = text.split()
        return " ".join(words[:max_length])

In [None]:
### Prepare data for analysis (append emotions) ###
if RUN_TYPE == 1:
    # 3. Computing and appending emotions to dataframe
    def compute_emotions(text):
        if not isinstance(text, str) or text.strip() == "":
            print("[ComputeEmotions] Empty cell after data cleaning. Returning 0.0 for all emotions.")
            return {emotion: 0.0 for emotion in ekman_emotions}

        try:
            # Classify emotions using the Hugging Face pipeline and handle errors
            results = classifier(text)[0]
            if not results or not isinstance(results, list) or len(results[0]) == 0:
                return {emotion: 0.0 for emotion in ekman_emotions}

            emotion_scores = {result['label']: result['score'] for result in results}
            return {emotion: emotion_scores.get(emotion, 0.0) for emotion in ekman_emotions}

        except Exception as e:
            print(f"[ComputeEmotions] Error while processing text: {text[:20]}... Error: {e}")
            return {emotion: 0.0 for emotion in ekman_emotions}

    def append_emotions(df: pd.DataFrame, text_column: str) -> pd.DataFrame:
        if text_column not in df.columns:
            raise ValueError(f"[AppendEmotions] Column '{text_column}' not found in DataFrame.")

        print("[AppendEmotions] Computing emotions for column:", text_column)

        cleaned_column = f"{text_column}_cleaned"
        df[cleaned_column] = df[text_column].apply(
            lambda x: " ".join(stem_words(remove_noise(x))) if isinstance(x, str) and x.strip() else ""
        )

        # Truncate text if cleaned text exceeds 512 tokens
        if (df[cleaned_column].str.split().str.len() > 512).any():
            print("[AppendEmotions] At least one row with more than 512 tokens - truncating text ...")
            df[cleaned_column] = df[cleaned_column].apply(lambda x: truncate_text(x, max_length=512))

        emotion_scores = [compute_emotions(text) for text in tqdm(df[cleaned_column], desc="[AppendEmotions] Processing emotions")]
        emotions_df = pd.DataFrame(emotion_scores)
        emotions_df.index = df.index
        emotions_df.columns = [f"{text_column}_{emotion}" for emotion in ekman_emotions]

        # Add dominant emotion column
        dominant = emotions_df.idxmax(axis=1).apply(lambda x: x.split('_')[-1])
        all_zero = emotions_df.eq(0.0).all(axis=1)
        dominant[all_zero] = np.nan
        emotions_df[f"{text_column}_dominant_emotion"] = dominant

        # Insert right hand of input text_column
        insert_at = df.columns.get_loc(text_column) + 1

        # DataFrame in drei Teile splitten und zusammenfügen
        left = df.iloc[:, :insert_at]
        right = df.iloc[:, insert_at:].drop(columns=[cleaned_column], errors='ignore')
        result_df = pd.concat([left, df[[cleaned_column]], emotions_df, right], axis=1)

        return result_df

In [None]:
### Create/Read dataset: Performing emotion analysis ###
if RUN_TYPE == 1:
    # Append emotions to TikTok DataFrame and safe to file
    df_tiktok = append_emotions(df_tiktok, 'combined_text')
    df_tiktok.to_csv('../data/tiktok/tiktok_emotions.csv', index=False)

df_tiktok_emotions = pd.read_csv('../data/tiktok/tiktok_emotions.csv')

# Count dominant emotions
emotion_counts = df_tiktok_emotions['combined_text_dominant_emotion'].value_counts()

# Plot emotions in barchart
plt.figure(figsize=(10, 6))
plt.bar(emotion_counts.index, emotion_counts.values)
plt.title('Anzahl der Emotionen in TikTok Videos')
plt.xlabel('Emotion')
plt.ylabel('Anzahl')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Zeige die Zahlen
print(f"Anzahl der Videos: {len(df_tiktok_emotions)}")
print(f"Anzahl Emotionen in Videos:\n{emotion_counts}")

<h1>TikTok Analysis - Part 2</h1>
<h2><i>Event Study</i></h2>
<h3>[2.1][Data Preparation]</h3>

In [None]:
### Read necessary data for event study (tiktok, us_stock_data, ger_stock_data) ###
if RUN_TYPE == 1:
    # TikTok data
    # see above

    # Stock data (US)
    df_us_stock_data = pd.read_csv('../legacy/data/tsla_intraday_202305_202504-1m.csv')
    df_us_stock_data = df_us_stock_data.rename(
        columns={'Unnamed: 0': 'timestamp'}
    )
    df_us_stock_data['timestamp'] = pd.to_datetime(
        df_us_stock_data['timestamp']
    ).dt.tz_localize(
        tz=eastern
    )
    df_us_stock_data['timestamp'].copy().sort_values(ascending=True, inplace=True)
    df_us_stock_data.set_index('timestamp').sort_index()

    # Stock data (GER)
    df_ger_stock_data = pd.read_csv('../legacy/data/TSL0_intraday_230501_250501-1m.csv').drop(columns=['Unnamed: 0'])
    df_ger_stock_data = df_ger_stock_data.rename(
        columns={'datetime': 'timestamp'}
    )

    df_ger_stock_data['timestamp'] = pd.to_datetime(
        df_ger_stock_data['timestamp']
    ).dt.tz_localize(
        tz=european
    )
    df_ger_stock_data.set_index('timestamp').sort_index()

    # Save to file
    df_tiktok_emotions.to_csv('../data/tiktok/tiktok_emotions.csv', index=False)

    df_ger_stock_data.head()

<h3>[2.2][Log Transformation]</h3>
<p>In the following, similarly to the twitter data analysis, we compute the log_return and log_volume for the fin data. Since in the tsl0_intraday data file we have some points in time without data, we also need to fill it. For this, we use the foreward fill ffill() method for the return and set the volume to 0 for the new tuple.</p>

In [None]:
### Foreward fill missing values in stock data ###
if RUN_TYPE == 1:
    def fill_missing_timestamps(df):
        df = df.copy()

        if df.index.name != 'timestamp':
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp').sort_index()

        start_time = df.index.min()
        end_time = df.index.max()

        # Filter trading hours
        full_index = pd.date_range(
            start=start_time.replace(hour=4, minute=0, second=0, microsecond=0),
            end=end_time.replace(hour=22, minute=59, second=0, microsecond=0),
            freq='1min'
        )

        # Filter business days
        business_minutes = full_index[full_index.dayofweek < 5]

        # Reindex and fill data
        df_filled = df.reindex(business_minutes)

        # Forward fill prices
        price_cols = ['open', 'high', 'low', 'close']
        df_filled[price_cols] = df_filled[price_cols].ffill()

        # Fill Volume with 0
        df_filled['volume'] = df_filled['volume'].fillna(0)

        return df_filled

    def compute_stock_measures(df, last_date='2025-04-30'):

        # Check if DataFrame is already processed
        required_columns = [
            'log_return', 'log_return_z', 'log_return_z_intraday',
            'log_volume', 'log_volume_z', 'log_volume_z_intraday',
            'minute_of_day'
        ]

        if all(col in df.columns for col in required_columns):
            print("[Info] Daten bereits vollständig verarbeitet")

            # Check date
            cutoff_timestamp = pd.Timestamp(f'{last_date} 22:59:00', tz=df.index.tz)
            if df.index.max() > cutoff_timestamp:
                print(f"[Info] Schneide Daten nach {last_date} ab")
                df = df[df.index <= cutoff_timestamp]

            return df

        # Fill missing timestamps
        df = fill_missing_timestamps(df)

        # Filter data until last_date
        cutoff_timestamp = pd.Timestamp(f'{last_date} 22:59:00', tz=df.index.tz)
        df = df[df.index <= cutoff_timestamp]

        # Compute missing column values
        if 'minute_of_day' not in df.columns:
            df['minute_of_day'] = df.index.hour * 60 + df.index.minute

        if 'log_return' not in df.columns:
            df['log_return'] = np.log(df['close'] / df['close'].shift(1))
            df['log_return_z'] = (df['log_return'] - df['log_return'].mean()) / df['log_return'].std()
            df['log_return_z_intraday'] = df.groupby('minute_of_day')['log_return'].transform(
                lambda x: (x - x.mean()) / x.std()
            )

        if 'log_volume' not in df.columns:
            df['log_volume'] = np.log1p(df['volume'])
            df['log_volume_z'] = (df['log_volume'] - df['log_volume'].mean()) / df['log_volume'].std()
            df['log_volume_z_intraday'] = df.groupby('minute_of_day')['log_volume'].transform(
                lambda x: (x - x.mean()) / x.std()
            )
        return df

In [None]:
### Compute stock measures (log_return/volume) for US and GER stock data + forward fill fin data ###
# Create new csv files
if RUN_TYPE == 1:
    df_us_stock_data = compute_stock_measures(df_us_stock_data)
    df_us_stock_data.to_csv(
        path_or_buf='../data/stocks/tesla_nyse_intraday_202305_202504-1m.csv',
        index=True
    )

    df_ger_stock_data = compute_stock_measures(df_ger_stock_data)
    df_ger_stock_data.to_csv(
        path_or_buf='../data/stocks/tesla_xetra_intraday_202305_202504-1m.csv',
        index=True
    )

<h3>[2.2][Create Event Study Dataframe]</h3>

In [111]:
### Read necessary dataframes, set index, and convert timezones ###
# Prep tweets
df_tesla = pd.read_csv('../data/tiktok/tiktok_emotions.csv').dropna()
df_tesla = df_tesla.drop(columns={'combined_text', 'combined_text_cleaned'}).rename(columns={
    'total_engagement': 'engagement',
    'combined_text_anger': 'anger',
    'combined_text_fear': 'fear',
    'combined_text_joy': 'joy',
    'combined_text_sadness': 'sadness',
    'combined_text_disgust': 'disgust',
    'combined_text_surprise': 'surprise',
    'combined_text_dominant_emotion': 'video_emotion'
})
df_tesla['timestamp'] = pd.to_datetime(df_tesla['timestamp'], format='%Y-%m-%d %H:%M:%S%z', utc=True)
# Prep nyse data
df_nyse = (pd.read_csv('../data/stocks/tesla_nyse_intraday_202305_202504-1m.csv')
    .rename(columns={'Unnamed: 0': 'timestamp'})
    .drop(columns={'open', 'high', 'low', 'close', 'volume'})
).set_index('timestamp').sort_index()

# Prep xetra data
df_xetra = (pd.read_csv('../data/stocks/tesla_xetra_intraday_202305_202504-1m.csv')
    .rename(columns={'Unnamed: 0': 'timestamp'})
    .drop(columns={'open', 'high', 'low', 'close', 'volume'})
).set_index('timestamp').sort_index()

In [107]:

#df_tesla['timestamp'] = pd.to_datetime(df_tesla['timestamp'], format='%Y-%m-%d %H:%M:%S%z', utc=True)
print("Datentyp der Spalte:", df_tesla['timestamp'].dtype)
print("Beispielwerte:", df_tesla['timestamp'].head(3).to_list())
print("Zeitzoneninfo:", df_tesla['timestamp'].dt.tz)
print("Tesla Columns:" , df_tesla.columns)

Datentyp der Spalte: datetime64[ns, UTC]
Beispielwerte: [Timestamp('2024-05-24 04:03:33+0000', tz='UTC'), Timestamp('2024-10-10 19:21:19+0000', tz='UTC'), Timestamp('2025-04-08 21:05:48+0000', tz='UTC')]
Zeitzoneninfo: UTC
Tesla Columns: Index(['timestamp', 'engagement', 'anger', 'fear', 'joy', 'sadness', 'disgust',
       'surprise', 'tweet_emotion'],
      dtype='object')


In [112]:
### Group tweets to events with dominant emotion ###

## Pre-process tweets for event study
# 1. Filter out tweets outside market hours (+-2h)
df_tesla = df_tesla[
    df_tesla['timestamp'].dt.time >= pd.to_datetime("04:00").time()
]
df_tesla = df_tesla[
    df_tesla['timestamp'].dt.time <= pd.to_datetime("17:59").time()
]

# 2. Sort by time
df_tesla = df_tesla.sort_values("timestamp").reset_index(drop=True)

# 3. Compute time difference between tweets
df_tesla['time_diff'] = df_tesla['timestamp'].diff()

# 4. Cluster Tweets to events with time_diff >= 7h
df_tesla['new_event'] = df_tesla['time_diff'] > pd.Timedelta(hours=7)

# 5. Cumulate events in event_id
df_tesla['event_id'] = df_tesla['new_event'].cumsum()

# 6. Compute the dominant emotion for each event and merge back to df
def compute_event_emotion(group):
    total_engagement = group['engagement'].sum()
    weighted_emo_scores = {
        emotion: (group[emotion] * group['engagement']).sum() / total_engagement
        for emotion in ekman_emotions
    }
    event_emotion = max(weighted_emo_scores, key=weighted_emo_scores.get)
    return pd.Series({
        **weighted_emo_scores,
        'event_emotion': event_emotion
    })

if not ('event_emotion' in df_tesla.columns):
    df_event_emotions = (df_tesla
        .drop(columns=['event_id'])
        .groupby(df_tesla['event_id'], group_keys=False)
        .apply(compute_event_emotion)
        .reset_index()
    )

    df_tesla = (df_tesla
        .merge(
            df_event_emotions[['event_id', 'event_emotion']],
            on='event_id',
            how='left'
        )
        .drop(columns={'anger', 'fear', 'joy', 'sadness', 'disgust', 'surprise', 'video_emotion', 'new_event'})
    )
df_tesla.head()

# 8. Set event from time to time of the earliest tweet (models from what point in time a reaction can occur)
df_tesla['event_time'] = df_tesla.groupby('event_id')['timestamp'].transform('min')

# 9. Filter duplicate events
df_tesla = df_tesla.drop_duplicates(subset=['event_id'])

print("[Info] Number of unique events = ", df_tesla['event_id'].nunique(), "\n")

[Info] Number of unique events =  114 



In [123]:
print("Index Type:", df_nyse.index.dtype)

Index Type: object


In [125]:
### Create/Read Event Study DataFrame (with actual, expected, abnormal return and volume) ###

# 1. Filter events based on overlaps, estimation and observation windows
# 2. Calculate expected and abnormal returns for each event
def filter_events(df, df_trades, get_pre = 0):
    # Define necessary vars
    est = 300
    obs = 120
    gap = est + obs
    results = []

    # 1. Filter by Overlaps
    valid_events = []
    last_event = None

    # Add valid events to the list
    for date in df.index:
        # First event always valid
        if last_event is None:
            valid_events.append(date)
            last_event = date
        else:
            # If timeshift > gaps, keep the current event and set as new last_event
            if (date - last_event) > pd.Timedelta(minutes=gap):
                valid_events.append(date)
                last_event = date

    # Only keep valid events
    df = df.loc[valid_events]
    print(f"[Overlap] New df_length = {len(df)}")

    # 2. Align event_time with the next available trading timestamp
    df.index = df.index.tz_convert(eastern)

    for i, date in enumerate(df.index):
        if date not in df_trades.index:
            future_times = df_trades.index[df_trades.index > date]
            if not future_times.empty:
                df.index.values[i] = pd.Timestamp(future_times[0])

    # 3. Filter events without sufficient estimation_window data
    # 4. Filter events without sufficient event_window data
    valid_events = []

    for date in df.index:
        day_data = df_trades[df_trades.index.date == date.date()]
        event_index = day_data.index.get_loc(date)

        remaining = len(day_data) - event_index

        if event_index >= est:
            if remaining >= obs:
                valid_events.append(date)

    df = df.loc[valid_events]
    print(f"[Sum Data] New df_length = {len(df)}")

    # 5. Event study variables
    for event_row in df.itertuples():
        event_id = event_row.event_id
        event_emo = event_row.event_emotion
        event_time = event_row.Index

        estimation_window = df_trades.loc[
            event_time - pd.Timedelta(minutes=est) : event_time - pd.Timedelta(minutes=1)
        ]
        event_window = df_trades.loc[
            event_time - pd.Timedelta(minutes=get_pre * est): event_time + pd.Timedelta(minutes=obs - 1)
        ]

        expected_return = estimation_window['log_return'].mean()
        expected_volume = estimation_window['log_volume'].median() # robust against outliers, see Bamber (1987)
        expected_volume_z_intraday = estimation_window['log_volume_z_intraday'].median()

        for i, fin_row in enumerate(event_window.itertuples()):
            if get_pre == 1:
                i = int((fin_row.Index - event_time).total_seconds() // 60)

            actual_return = fin_row.log_return
            abnormal_return = actual_return - expected_return

            actual_volume = fin_row.log_volume
            abnormal_volume = actual_volume - expected_volume

            actual_volume_z_intraday = fin_row.log_volume_z_intraday
            abnormal_volume_z_intraday = actual_volume_z_intraday - expected_volume_z_intraday

            results.append({
                'event_id': event_id,
                'event_fin_offset': i,
                'event_time': event_time,
                'fin_time': fin_row.Index,
                'event_emo': event_emo,
                'actual_return': actual_return,
                'expected_return': expected_return,
                'abnormal_return': abnormal_return,
                'actual_volume': actual_volume,
                'expected_volume': expected_volume,
                'abnormal_volume': abnormal_volume,
                'actual_volume_z_intraday': actual_volume_z_intraday,
                'expected_volume_z_intraday': expected_volume_z_intraday,
                'abnormal_volume_z_intraday': abnormal_volume_z_intraday
            })

    # Make a dataframe from the result list
    df_valid_events = pd.DataFrame(results)

    return df_valid_events

if RUN_TYPE == 1: # if not yet created, create the event study dataframes with abnormal returns and volumes

    #  Create new dfs with necessary event data and set time column as index
    df_events = (df_tesla[['event_id', 'event_emotion', 'event_time']]
                .set_index('event_time')
                .sort_index())

    df_nyse = df_nyse[['log_return', 'log_volume', 'log_volume_z_intraday']] # keeps index col ('timestamp')
    df_nyse.index = pd.to_datetime(
        df_nyse.index,
        format='%Y-%m-%d %H:%M:%S%z',
        utc=True,
        errors='coerce'
    )
    df_nyse = df_nyse.sort_index()

    df_xetra = df_xetra[['log_return', 'log_volume', 'log_volume_z_intraday']] # keeps index col ('timestamp')
    df_xetra.index = pd.to_datetime(
        df_xetra.index,
        format='%Y-%m-%d %H:%M:%S%z',
        utc=True,
        errors='coerce'
    )
    df_xetra = df_xetra.sort_index()

    for df_trades, market in zip([df_nyse, df_xetra], ['nyse', 'xetra']):
        df_event_study = filter_events(df_events, df_trades, get_pre = 0).to_csv(
            f'../data/twitter/tiktok_{market}_emotion_event_study.csv',
            index=False
        )
        df_event_study_pre_post = filter_events(df_events, df_trades, get_pre=1).to_csv(
            f'../data/twitter/tiktok_{market}emotion_event_study_pre_post.csv',
            index=False)

print("[Info] Dataframe Inspection")
print(f"Emotions: {df_event_study['event_emo'].unique()}")
emotion = df_event_study['event_emo'].unique()
for emo in emotion:
    print(f"  - {emo}: {len(df_event_study[df_event_study['event_emo'] == emo].groupby('event_id'))} events")

print(f"\n[Info] Event Study DataFrame has {len(df_event_study)} entries.")
print(f"[Info] Event Study Pre/Post DataFrame has {len(df_event_study_pre_post)} entries.\n")
if (df_event_study_pre_post.columns == df_event_study.columns).all():
    print("[Info] emotion_event_study(_pre_post).csv files have the following columns:")
    print(df_event_study.columns.tolist())

[Overlap] New df_length = 114


  df.index.values[i] = pd.Timestamp(future_times[0])


KeyError: Timestamp('2023-11-30 23:06:00-0500', tz='US/Eastern')