# Topic Modelling Notebook 

In [1]:
# Utilities
from joblib import Parallel, delayed
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)
from multiprocessing import Pool, cpu_count
import warnings 
warnings.filterwarnings("ignore")
import os
import pandas as pd
import numpy as np
import tqdm
from tqdm import tqdm
tqdm.pandas(desc="progress bar")
import gc
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

# Import time packages 
import time
import datetime

# Visualization Tools
import matplotlib.pyplot as plt

# Deep Learning Models 
import torch
import tensorflow as tf

# Topic Modelling Packages
from bertopic import BERTopic


INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


2023-05-19 15:26:41.842430: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from utils import search_folder
current_dir = os.getcwd()
thesis_folder_path = current_dir.replace("thesis_code", "")
twitter_data_path = search_folder(thesis_folder_path,"twitter_data")
eikon_data_path = search_folder(thesis_folder_path,"eikon_news")
stock_data_path = search_folder(thesis_folder_path,"stock_prices")
modelling_data_path = search_folder(thesis_folder_path,"modelling_data")
google_trending_path = search_folder(thesis_folder_path,"google_search")

In [3]:
# Retrieving sentiment on cleaned text
twitter_df = pd.read_csv(twitter_data_path +"/twitter_senti_df.csv")
twitter_df["datetime"] = pd.to_datetime(twitter_df["datetime"])
twitter_df =twitter_df.sort_values(by = "datetime")
twitter_df = twitter_df.drop_duplicates()

In [4]:
# Retrieving Sentiment on Eikon Raw News 
eikon_df = pd.read_csv(eikon_data_path +"/raw_eikon_senti_df.csv")
eikon_df["datetime"] = pd.to_datetime(eikon_df["datetime"])
eikon_df =eikon_df.sort_values(by = "datetime")

In [5]:
print(twitter_df.shape)
print(eikon_df.shape)

(2157988, 21)
(44423, 16)


In [6]:
# Tweets on Saturday and Sunday are moved to Monday in order to not losse any relevance
from utils import fix_dates
twitter_df["datetime"] = twitter_df["datetime"].progress_apply(lambda x: fix_dates(x))
eikon_df["datetime"] = eikon_df["datetime"].progress_apply(lambda x: fix_dates(x))

progress bar: 100%|██████████| 2157988/2157988 [00:06<00:00, 346075.33it/s]
progress bar: 100%|██████████| 44423/44423 [00:00<00:00, 501376.57it/s]


---
#### Using Tweet News Dataset 

In [7]:
tweets_train = pd.read_csv(thesis_folder_path+ "/topic_modelling/train_data.csv")
tweets_valid = pd.read_csv(thesis_folder_path+ "/topic_modelling/valid_data.csv")

In [8]:
tweets_train

Unnamed: 0,text,label
0,Here are Thursday's biggest analyst calls: App...,0
1,Buy Las Vegas Sands as travel to Singapore bui...,0
2,"Piper Sandler downgrades DocuSign to sell, cit...",0
3,"Analysts react to Tesla's latest earnings, bre...",0
4,Netflix and its peers are set for a ‘return to...,0
...,...,...
16985,KfW credit line for Uniper could be raised to ...,3
16986,KfW credit line for Uniper could be raised to ...,3
16987,Russian https://t.co/R0iPhyo5p7 sells 1 bln r...,3
16988,Global ESG bond issuance posts H1 dip as supra...,3


In [9]:
twitter_topics_df = twitter_df.copy()
# Convert float values to strings
twitter_topics_df["date"] = twitter_topics_df["datetime"].dt.date
twitter_topics_df["text"] = twitter_topics_df["text"].astype(str)
twitter_topics_df["cleaned_text"] = twitter_topics_df["cleaned_text"].astype(str)

# Group the rows by "date" and "company" columns and concatenate the text values
twitter_topics_df = twitter_topics_df[["date", "company", "text", "cleaned_text"]]
twitter_topics_df = twitter_topics_df.groupby(["date", "company"]).agg({
    "text": ' '.join,
    "cleaned_text": ' '.join
}).reset_index()


In [10]:
eikon_topics_df = eikon_df.copy()
# Convert float values to strings
eikon_topics_df["date"] = eikon_topics_df["datetime"].dt.date
eikon_topics_df["text"] = eikon_topics_df["text"].astype(str)
eikon_topics_df["cleaned_text"] = eikon_topics_df["cleaned_text"].astype(str)

# Group the rows by "date" and "company" columns and concatenate the text values
eikon_topics_df = eikon_topics_df[["date", "company", "text", "cleaned_text"]]
eikon_topics_df = eikon_topics_df.groupby(["date", "company"]).agg({
    "text": ' '.join,
    "cleaned_text": ' '.join
}).reset_index()


---
## Dynamic Modelling Introduction 

Dynamic topic modeling (DTM) is a collection of techniques aimed at analyzing the evolution of topics over time. These methods allow to understand how a topic is represented across different times. For example, in 1995 people may talk differently about environmental awareness than those in 2015. Although the topic itself remains the same, environmental awareness, the exact representation of that topic might differ.

In [11]:
subest_twitter_topics_df = twitter_topics_df[twitter_topics_df["company"] == "tesla"]

In [12]:
timestamps = subest_twitter_topics_df.date.to_list()
timestamps = pd.to_datetime(timestamps)
tweets = subest_twitter_topics_df.text.to_list()

Then, we need to extract the global topic representations by simply creating and training a BERTopic model:

In [13]:
from bertopic import BERTopic
with tf.device('/device:GPU:0'):
    topic_model = BERTopic(verbose=True)
    topics, probs = topic_model.fit_transform(tweets)

2023-05-19 15:27:19.208053: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-05-19 15:27:19.208112: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Metal device set to: Apple M2 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



Batches:   0%|          | 0/52 [00:00<?, ?it/s]

2023-05-19 15:29:51,764 - BERTopic - Transformed documents to Embeddings
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2023-05-19 15:29:58,268 - BERTopic - Reduced dimensionality
2023-05-19 15:29:58,311 - BERTopic - Clustered reduced embeddings


From these topics, we are going to generate the topic representations at each timestamp for each topic. We do this by simply calling topics_over_time and passing the tweets, the corresponding timestamps, and the related topics:

In [17]:
with tf.device('/device:GPU:0'):
    topics_over_time_eikon = topic_model.topics_over_time(tweets, timestamps, nr_bins=10)

1it [00:01,  1.29s/it]

In [15]:
topic_model.visualize_topics_over_time(topics_over_time_eikon, top_n_topics=10)

In [16]:
topic_model.visualize_barchart()

---
### LDA Method: 

In [23]:
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [54]:
moderna_eikon_topics_df = eikon_topics_df[eikon_topics_df["company"] == "moderna"]

In [55]:
# Tokenize the tweets using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
moderna_eikon_topics_df['tokens'] = moderna_eikon_topics_df['text'].parallel_apply(lambda x: tokenizer.tokenize(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=77), Label(value='0 / 77'))), HBox…

In [57]:
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [60]:
model = BertModel.from_pretrained('bert-base-uncased')

def create_embeddings(text):
        encoded_tweets = []
        inputs = tokenizer.encode(text, add_special_tokens=True, truncation=True, padding='max_length', max_length=512)
        inputs = torch.tensor(inputs).unsqueeze(0)  # Convert to tensor
        outputs = model(inputs)
        encoded_tweets.append(outputs.pooler_output.tolist())
        return encoded_tweets


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [62]:
moderna_eikon_topics_df["bert_embeddings"] = moderna_eikon_topics_df['tokens'].parallel_apply(lambda x: create_embeddings(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=77), Label(value='0 / 77'))), HBox…

In [64]:
moderna_eikon_topics_df

Unnamed: 0,date,company,text,cleaned_text,tokens,bert_embeddings
946,2018-11-28,moderna,Biotech company Moderna expects to raise up to...,biotech company moderna expects to raise up to...,"[bio, ##tech, company, modern, ##a, expects, t...","[[[-0.15786303579807281, -0.5081111788749695, ..."
949,2018-11-29,moderna,UPDATE 1-Biotech company Moderna expects to ra...,update 1 biotech company moderna expects to ra...,"[update, 1, -, bio, ##tech, company, modern, #...","[[[-0.21495932340621948, -0.5278425812721252, ..."
960,2018-12-06,moderna,EXCLUSIVE-Moderna mulls increasing largest bio...,exclusive moderna mulls increasing largest bio...,"[exclusive, -, modern, ##a, mu, ##lls, increas...","[[[-0.12667618691921234, -0.4134136736392975, ..."
963,2018-12-07,moderna,Moderna braves market jitters with upsized IPO...,moderna braves market jitters with upsized ipo...,"[modern, ##a, braves, market, ji, ##tters, wit...","[[[-0.42688554525375366, -0.6361693739891052, ..."
966,2018-12-10,moderna,CORRECTED-BREAKINGVIEWS-Uber and Lyft race to ...,corrected uber and lyft race to get through op...,"[corrected, -, breaking, ##view, ##s, -, uber,...","[[[-0.1000707596540451, -0.4349295198917389, -..."
...,...,...,...,...,...,...
3917,2023-04-03,moderna,"Insider Trading: Chief Executive Officer, Banc...",insider trading chief executive officer sells ...,"[insider, trading, :, chief, executive, office...","[[[-0.4395850896835327, -0.6473029851913452, -..."
3920,2023-04-04,moderna,FDA poised to authorize second Omicron COVID b...,fda poised to authorize second omicron covid b...,"[fda, poised, to, author, ##ize, second, om, #...","[[[-0.18135827779769897, -0.502711832523346, -..."
3923,2023-04-05,moderna,"1,304 Shares in Moderna, Inc. (NASDAQ:MRNA) Ac...","1,304 shares in moderna inc nasdaq moderna acq...","[1, ,, 304, shares, in, modern, ##a, ,, inc, ....","[[[-0.22266535460948944, -0.539493203163147, -..."
3926,2023-04-06,moderna,Moderna (MRNA) Dips More Than Broader Markets:...,dips more than broader markets what you should...,"[modern, ##a, (, mrna, ), dip, ##s, more, than...","[[[-0.6368915438652039, -0.7150578498840332, -..."


In [66]:
# Apply BERT for topic modeling using LDA
num_topics = 5  # Number of topics to extract
num_words = 10  # Number of words per topic
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(moderna_eikon_topics_df['cleaned_text'])
feature_names = vectorizer.get_feature_names()
lda_model = LatentDirichletAllocation(n_components=num_topics)
lda_model.fit(X)

# Assign topics to each tweet
topic_columns = []
for i in range(num_topics):
    topic_column = f"topic_{i}"
    topic_columns.append(topic_column)
    moderna_eikon_topics_df[topic_column] = 0
topics = lda_model.transform(X)
moderna_eikon_topics_df[topic_columns] = topics

# Convert topics into features (binary values)
df_topics = moderna_eikon_topics_df[topic_columns].applymap(lambda x: 1 if x > 0.5 else 0)


AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names'