<h1>Twitter Topic Emotion Analysis - Part 1</h1>
<h2><i>Topic Modeling</i></h2>

In [17]:
### Imports ###
import pandas as pd
from matplotlib import style
from src.TextNormalizer import TextNormalizer
from src.TimeNormalizer import TimeNormalizer
style.use('ggplot')
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

RUN_TYPE = 0 # Normal Mode (without file creation)
# RUN_TYPE = 1 # Analysis Mode (with file creation)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aklei\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
### Read Data for Topic Modeling ###
if RUN_TYPE == 1:
    df = pd.read_csv('../data/twitter/tweets_isTweet_emotions.csv')
    df['combined_text'] = df['tweet_text'].fillna('') + '' + df['quoted_tweet_text'].fillna('')
    df['combined_text'] = df["combined_text"].apply(TextNormalizer.remove_noise)
    df = (df[df['combined_text'] != ''])
    df['combined_text'].head()

In [3]:
### Pre process text (embeddings) ###
if RUN_TYPE == 1:
    tweets = df['combined_text'].values.tolist()
    print(f"[Info] Embedding {len(tweets)} tweets ...")

    # 1. Embedding-Modell (vorher berechnen oder cachen)
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = embedder.encode(tweets, show_progress_bar=True)

In [4]:
### Fit BERTopic model and print topic info ###
if RUN_TYPE == 1:
    # 2. UMAP (Reduktion für semantisch klarere Cluster)
    umap_model = UMAP(n_neighbors=50, n_components=20, min_dist=0.05, metric="cosine", random_state=42)

    # 3. HDBSCAN (Cluster-Zahl steuern)
    hdbscan_model = HDBSCAN(min_cluster_size=20, cluster_selection_epsilon=0.3, metric="euclidean", cluster_selection_method="eom", prediction_data=True)

    # 4. CountVectorizer
    vectorizer_model = CountVectorizer(min_df=2, stop_words="english")

    # 5. Repräsentation (optional, für bessere Labels)
    representation_model = KeyBERTInspired()

    # 6.1 Tesla-related seed_words (use to find other topics and populate seed_topic_list)
    seed_words = ["tesla", "elon musk", "autopilot", "cybertruck", "model3", "gigafactory", "electric vehicle", "supercharger", "amp"]

    # 6.2 Tesla-related seed_topic_list (populated by finding broad topics with seed_words and wide clustering, now narrowing it down for accuracy)
    # --> Should not choose to many words as seed_list can become blurry
    seed_topic_list = [
        ["tesla", "elon musk", "autopilot", "cybertruck", "model3", "gigafactory", "electric vehicle", "supercharger"],
        ["president", "trump", "government", "election", "republican", "democrat", "vote", "ballot"],
        ["judge", "activist", "illegal"],
        ["doge", "dogefather"],
        ["spacex", "launch", "falcon", "orbit", "mars"],
        ["bitcoin", "dogecoin"],
        ["starlink", "broadband", "highspeed"],
        ["fertility", "birthrate", "population", "births", "demographic"],
        ["twitter", "tweet", "ban", "free speech", "grok", "grokai"],
        ["crypto", "bitcoin", "dogecoin"],
        ["white", "farmers", "south africa", "field", "genocide"],
        ["afd", "german", "coalition", "berlin"]
    ]

    # 7. Topic-Modell
    topic_model = BERTopic(
        embedding_model=embedder,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,
        calculate_probabilities=True,
        #seed_topic_list=seed_words, # used to populate seed_topic_list
        seed_topic_list=seed_topic_list,
        nr_topics="auto", # Automatically generate topic count
        verbose=True
    )

    # 8. Fitting
    topics, probs = topic_model.fit_transform(tweets, embeddings)

    # 9. Reduce Outliers
    #new_topics = topic_model.reduce_outliers(tweets, topics, strategy="embeddings") # Method to reduce outliers
    new_topics = topic_model.reduce_outliers(tweets, topics, probabilities=probs, strategy="probabilities") # Method to reduce outliers
    topic_model.update_topics(tweets, topics=new_topics)

    # 9. Show Topic Info
    topic_model.get_topic_info()

In [5]:
### Create Dataframe from Topic List and Filter Tesla Topic ###
if RUN_TYPE == 1:
    # 1. Get topic info
    topic_info = topic_model.get_topic_info()

    # 2. Create dataframe
    df_topics = pd.DataFrame(topic_info)[['Topic', 'Representation']]

    # 3. Get topic ids with tesla related topics
    tesla_key = "tesla"
    tesla_topic_ids = []

    for _, row in df_topics.iterrows():
        if tesla_key in row['Representation']:
            tesla_topic_ids.append(row['Topic'])

    print(f"Tesla-Topic-IDs: {tesla_topic_ids}")

    # 4. Filter df by documents
    doc_info = topic_model.get_document_info(tweets)
    df['topics'] = doc_info['Topic']
    df_tesla = df[df['topics'].isin(tesla_topic_ids)]

    # 5. Drop unnecessary columns and safe to csv
    df_tesla = df_tesla[['tweet_id', 'createdAt', 'topics', 'combined_text', 'tweet_text_dominant_emotion', 'quoted_tweet_id', 'quoted_tweet_text_dominant_emotion']]
    df_tesla.to_csv('../data/twitter/tweets_isTweet_emotions_tesla.csv', index=False)
    print(f"Found Tesla-Tweets: {len(df_tesla)}")

<h1>Twitter Topic Emotion Analysis - Part 1</h1>
<h2><i>Event Study</i></h2>
<p>
    In this section, the two event studies of the emotion and topic data will be combined in order to examine, if we can observe an effect of the topic data with certain emotions on the stock price / trading volume on the NYSE and Xetra.
</p>

In [18]:
### Read necessary dataframes, set index, and convert timezones ###
# Prep tweets
df_tesla = (pd.read_csv('../data/twitter/tweets_isTweet_emotions_tesla.csv')
    .dropna()
    .rename(columns={'createdAt': 'timestamp'})
)
df_tesla = TimeNormalizer.normalize_time(df_tesla, 'timestamp', 'US/Eastern').set_index('timestamp')

# Prep nyse data
df_nyse = pd.read_csv('../data/stocks/tsla_intraday_202305_202504-1m.csv')

# Prep xetra data
df_xetra = pd.read_csv('../data/stocks/TSL0_intraday_230501_250501-1m.csv')

df_tesla.head()


Unnamed: 0_level_0,tweet_id,topics,combined_text,tweet_text_dominant_emotion,quoted_tweet_id,quoted_tweet_text_dominant_emotion
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-04-24 18:30:51-04:00,1915533926749364264,5.0,worth itfsd supervised 99month effectively 333...,joy,1.915527e+18,joy
2025-04-24 12:39:13-04:00,1915445436116267382,5.0,act blue guilty widespread criminal identity t...,anger,1.915418e+18,joy
2025-04-24 12:37:58-04:00,1915445119509209426,5.0,wowso let get straight socalled maryland dad k...,joy,1.915423e+18,joy
2025-04-24 10:48:09-04:00,1915417484380803137,30.0,scam centurya new legal letter aimed openai la...,anger,1.915004e+18,joy
2025-04-22 15:47:24-04:00,1914768017738604728,5.0,starlink available 120 countriesstarlinks high...,joy,1.914728e+18,joy
