# Topic Modelling Notebook 

In [11]:
# Utilities
from joblib import Parallel, delayed
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)
from multiprocessing import Pool, cpu_count
import warnings 
warnings.filterwarnings("ignore")
import os
import pandas as pd
import numpy as np
import tqdm
from tqdm import tqdm
tqdm.pandas(desc="progress bar")
import gc
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

# Import time packages 
import time
import datetime

# Visualization Tools
import matplotlib.pyplot as plt

# Deep Learning Models 
import torch
import tensorflow as tf

# Topic Modelling Packages
from bertopic import BERTopic


INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [12]:
from utils import search_folder

# Project root = current working directory with the "thesis_code" part removed.
# `search_folder` (project utils) is then called with that root and the name of
# each data folder; the five results are bound in the same order as the names.
current_dir = os.getcwd()
thesis_folder_path = current_dir.replace("thesis_code", "")
(
    twitter_data_path,
    eikon_data_path,
    stock_data_path,
    modelling_data_path,
    google_trending_path,
) = (
    search_folder(thesis_folder_path, folder_name)
    for folder_name in (
        "twitter_data",
        "eikon_news",
        "stock_prices",
        "modelling_data",
        "google_search",
    )
)

In [3]:
# Tweet sentiment table (sentiment computed on the cleaned text): read the CSV,
# parse the "datetime" column, order rows chronologically and drop rows that
# are exact duplicates across all columns.
twitter_df = (
    pd.read_csv(twitter_data_path + "/twitter_senti_df.csv")
    .assign(datetime=lambda frame: pd.to_datetime(frame["datetime"]))
    .sort_values(by="datetime")
    .drop_duplicates()
)

In [4]:
# Eikon raw-news sentiment table: read the CSV, parse the "datetime" column
# and order rows chronologically.
eikon_df = (
    pd.read_csv(eikon_data_path + "/raw_eikon_senti_df.csv")
    .assign(datetime=lambda frame: pd.to_datetime(frame["datetime"]))
    .sort_values(by="datetime")
)

In [5]:
# Sanity check: (rows, columns) of each source, one per line.
print(twitter_df.shape, eikon_df.shape, sep="\n")

(2157988, 21)
(44423, 16)


In [6]:
from utils import fix_dates

# Per the original note: items dated Saturday or Sunday are moved to Monday so
# that weekend content is not lost. The actual shifting is done by `fix_dates`
# (project utils), applied element-wise with a progress bar to both sources.
twitter_df["datetime"] = twitter_df["datetime"].progress_apply(fix_dates)
eikon_df["datetime"] = eikon_df["datetime"].progress_apply(fix_dates)

progress bar: 100%|██████████| 2157988/2157988 [00:06<00:00, 349610.74it/s]
progress bar: 100%|██████████| 44423/44423 [00:00<00:00, 504870.77it/s]


In [7]:
# One document per (date, company): derive the calendar date, cast both text
# columns to str, then join all texts of the same day/company with single spaces.
# NOTE(review): astype(str) turns missing values into the literal "nan", which
# then becomes part of the joined documents — confirm this is intended.
twitter_topics_df = (
    twitter_df
    .assign(
        date=lambda frame: frame["datetime"].dt.date,
        text=lambda frame: frame["text"].astype(str),
        cleaned_text=lambda frame: frame["cleaned_text"].astype(str),
    )
    .loc[:, ["date", "company", "text", "cleaned_text"]]
    .groupby(["date", "company"])
    .agg({"text": " ".join, "cleaned_text": " ".join})
    .reset_index()
)


In [8]:
# One document per (date, company): derive the calendar date, cast both text
# columns to str, then join all news texts of the same day/company with single
# spaces.
# NOTE(review): astype(str) turns missing values into the literal "nan", which
# then becomes part of the joined documents — confirm this is intended.
eikon_topics_df = (
    eikon_df
    .assign(
        date=lambda frame: frame["datetime"].dt.date,
        text=lambda frame: frame["text"].astype(str),
        cleaned_text=lambda frame: frame["cleaned_text"].astype(str),
    )
    .loc[:, ["date", "company", "text", "cleaned_text"]]
    .groupby(["date", "company"])
    .agg({"text": " ".join, "cleaned_text": " ".join})
    .reset_index()
)


---
## Dynamic Topic Modelling Introduction

Dynamic topic modeling (DTM) is a collection of techniques aimed at analyzing the evolution of topics over time. These methods make it possible to understand how a topic is represented at different points in time. For example, people in 1995 may have talked about environmental awareness differently than people in 2015. Although the topic itself (environmental awareness) remains the same, its exact representation might differ.

In [28]:
# Keep only the rows whose company is "moderna".
# NOTE(review): the variable name says "twitter", but the rows are taken from
# `eikon_topics_df` (news), not `twitter_topics_df` — confirm which source is
# intended before interpreting the results below.
subest_twitter_topics_df = eikon_topics_df.loc[eikon_topics_df["company"].eq("moderna")]

In [29]:
# Parallel sequences for the topic model: one parsed timestamp and one cleaned
# document per row of the subset, in the same row order.
timestamps = pd.to_datetime(subest_twitter_topics_df["date"].to_list())
tweets = subest_twitter_topics_df["cleaned_text"].to_list()

Then, we need to extract the global topic representations by simply creating and training a BERTopic model:

In [30]:
# Human-readable topic names keyed "LABEL_1" … "LABEL_11" (1-based, in the
# order listed below).
topics = {
    f"LABEL_{position}": topic_name
    for position, topic_name in enumerate(
        (
            "Company and Product News",
            "Dividend & Earnings",
            "Energy",
            "Financials",
            "General News",
            "Pandemic",
            "Investments",
            "Macro",
            "Politics",
            "Stock Commentary",
            "Stock Movement",
        ),
        start=1,
    )
}

In [31]:
# BERTopic documents `seed_topic_list` as a list of seed-word lists (one inner
# list per guided topic). The label dictionary itself was passed here before;
# iterating a dict yields its keys ("LABEL_1", ...), so the actual topic names
# were never used as seeds. Convert each label into its word tokens instead.
# Tokens are lowercased so they can match a lowercased vocabulary (the default
# scikit-learn CountVectorizer lowercases); connector tokens such as "and"/"&"
# are kept as-is — prune them if hand-picked seed words are preferred.
# This cell must run while `topics` is still the label dict, i.e. before the
# fit cell rebinds `topics` to the per-document topic assignments.
seed_topic_list = [label.lower().split() for label in topics.values()]
topic_model = BERTopic(verbose=True, seed_topic_list=seed_topic_list)

In [32]:
# Fit the model on the documents and keep both returned values (topic
# assignments and probabilities).
# NOTE: this rebinds `topics`, which until this cell held the label dictionary.
topics, probs = topic_model.fit_transform(documents=tweets)

Batches:   0%|          | 0/25 [00:00<?, ?it/s]

2023-05-18 18:43:41,051 - BERTopic - Transformed documents to Embeddings


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2023-05-18 18:43:43,263 - BERTopic - Reduced dimensionality
2023-05-18 18:43:43,279 - BERTopic - Clustered reduced embeddings


From these topics, we are going to generate the topic representations at each timestamp for each topic. We do this by calling `topics_over_time` and passing the documents, their corresponding timestamps, and the number of time bins to group the timestamps into:

In [33]:
# Topic representations over time for the fitted model, with the timestamps
# grouped into 5 bins.
topics_over_time_twitter = topic_model.topics_over_time(
    docs=tweets,
    timestamps=timestamps,
    nr_bins=5,
)

5it [00:00, 40.36it/s]


In [34]:
# Plot the per-bin topic frequencies, limited to 5 topics; as the cell's last
# expression, the returned figure is rendered as the cell output.
topic_model.visualize_topics_over_time(
    topics_over_time=topics_over_time_twitter,
    top_n_topics=5,
)

In [36]:
# Show BERTopic's bar-chart figure for the fitted model using its default
# arguments; as the cell's last expression, the returned figure is rendered
# as the cell output.
topic_model.visualize_barchart()