# Topic Modelling Notebook 

In [22]:
# Utilities
from joblib import Parallel, delayed
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)
from multiprocessing import Pool, cpu_count
import warnings 
warnings.filterwarnings("ignore")
import os
import pandas as pd
import numpy as np
import tqdm
from tqdm import tqdm
tqdm.pandas(desc="progress bar")
import gc
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

# Import time packages 
import time
import datetime

# Visualization Tools
import matplotlib.pyplot as plt

# Deep Learning Models 
import torch
import tensorflow as tf

# Topic Modelling Packages
from bertopic import BERTopic


INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
from utils import search_folder
current_dir = os.getcwd()
thesis_folder_path = current_dir.replace("thesis_code", "")
twitter_data_path = search_folder(thesis_folder_path,"twitter_data")
eikon_data_path = search_folder(thesis_folder_path,"eikon_news")
stock_data_path = search_folder(thesis_folder_path,"stock_prices")
modelling_data_path = search_folder(thesis_folder_path,"modelling_data")
google_trending_path = search_folder(thesis_folder_path,"google_search")

In [3]:
# Retrieving sentiment on cleaned text
twitter_df = pd.read_csv(twitter_data_path +"/twitter_senti_df.csv")
twitter_df["datetime"] = pd.to_datetime(twitter_df["datetime"])
twitter_df =twitter_df.sort_values(by = "datetime")
twitter_df = twitter_df.drop_duplicates()

In [4]:
# Retrieving Sentiment on Eikon Raw News 
eikon_df = pd.read_csv(eikon_data_path +"/raw_eikon_senti_df.csv")
eikon_df["datetime"] = pd.to_datetime(eikon_df["datetime"])
eikon_df =eikon_df.sort_values(by = "datetime")

In [5]:
print(twitter_df.shape)
print(eikon_df.shape)

(2157988, 21)
(44423, 16)


In [10]:
# Tweets on Saturday and Sunday are moved to Monday in order to not losse any relevance
from utils import fix_dates
twitter_df["datetime"] = twitter_df["datetime"].progress_apply(lambda x: fix_dates(x))
eikon_df["datetime"] = eikon_df["datetime"].progress_apply(lambda x: fix_dates(x))

progress bar: 100%|██████████| 2157988/2157988 [00:05<00:00, 430345.30it/s]
progress bar: 100%|██████████| 44423/44423 [00:00<00:00, 570006.02it/s]


In [11]:
twitter_topics_df = twitter_df.copy()
# Convert float values to strings
twitter_topics_df["date"] = twitter_topics_df["datetime"].dt.date
twitter_topics_df["text"] = twitter_topics_df["text"].astype(str)
twitter_topics_df["cleaned_text"] = twitter_topics_df["cleaned_text"].astype(str)

# Group the rows by "date" and "company" columns and concatenate the text values
twitter_topics_df = twitter_topics_df[["date", "company", "text", "cleaned_text"]]
twitter_topics_df = twitter_topics_df.groupby(["date", "company"]).agg({
    "text": ' '.join,
    "cleaned_text": ' '.join
}).reset_index()


In [12]:
eikon_topics_df = eikon_df.copy()
# Convert float values to strings
eikon_topics_df["date"] = eikon_topics_df["datetime"].dt.date
eikon_topics_df["text"] = eikon_topics_df["text"].astype(str)
eikon_topics_df["cleaned_text"] = eikon_topics_df["cleaned_text"].astype(str)

# Group the rows by "date" and "company" columns and concatenate the text values
eikon_topics_df = eikon_topics_df[["date", "company", "text", "cleaned_text"]]
eikon_topics_df = eikon_topics_df.groupby(["date", "company"]).agg({
    "text": ' '.join,
    "cleaned_text": ' '.join
}).reset_index()


---
## Dynamic Modelling Introduction 

Dynamic topic modeling (DTM) is a collection of techniques aimed at analyzing the evolution of topics over time. These methods allow to understand how a topic is represented across different times. For example, in 1995 people may talk differently about environmental awareness than those in 2015. Although the topic itself remains the same, environmental awareness, the exact representation of that topic might differ.

In [107]:
subest_twitter_topics_df = eikon_topics_df[eikon_topics_df["company"] == "apple"]

In [108]:
timestamps = subest_twitter_topics_df.date.to_list()
timestamps = pd.to_datetime(timestamps)
tweets = subest_twitter_topics_df.cleaned_text.to_list()

Then, we need to extract the global topic representations by simply creating and training a BERTopic model:

In [120]:
from bertopic import BERTopic

topic_model = BERTopic(verbose=True)
topics, probs = topic_model.fit_transform(tweets)

Batches:   0%|          | 0/51 [00:00<?, ?it/s]

2023-05-14 18:43:49,081 - BERTopic - Transformed documents to Embeddings
2023-05-14 18:43:52,611 - BERTopic - Reduced dimensionality
2023-05-14 18:43:52,650 - BERTopic - Clustered reduced embeddings


From these topics, we are going to generate the topic representations at each timestamp for each topic. We do this by simply calling topics_over_time and passing the tweets, the corresponding timestamps, and the related topics:

In [121]:
topics_over_time_eikon = topic_model.topics_over_time(tweets, timestamps, nr_bins=10)

10it [00:00, 24.98it/s]


In [97]:
topics_over_time

Unnamed: 0,Topic,Words,Frequency,Timestamp,Name
0,-1,"apple, to, the, in, for",101,2016-12-30 16:58:04.800,-1_apple_the_to_in
1,0,"apple, to, the, in, iphone",36,2016-12-30 16:58:04.800,0_apple_the_to_in
2,2,"apple, to, the, in, for",9,2016-12-30 16:58:04.800,2_apple_the_to_in
3,7,"apple, tesla, baba, spy, meta",2,2016-12-30 16:58:04.800,7_meta_tesla_cei_baba
4,8,"apple, to, the, in, of",2,2016-12-30 16:58:04.800,8_apple_the_to_and
...,...,...,...,...,...
147,14,"blk, coin, crm, cei, moderna",2,2022-08-27 16:48:00.000,14_apple_the_to_tesla
148,15,"apple, blk, meta, the, coin",2,2022-08-27 16:48:00.000,15_apple_the_to_in
149,17,"apple, the, iphone, 14, to",4,2022-08-27 16:48:00.000,17_apple_iphone_the_to
150,19,"apple, the, to, musk, and",2,2022-08-27 16:48:00.000,19_apple_the_to_and


In [111]:
topics_over_time_eikon

Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"apple, inc, rating, to, price",70,2016-12-30 17:08:09.600
1,0,"of, apple, update, fixing, regulator",9,2016-12-30 17:08:09.600
2,2,"dissolves, share, stake, inc, appaloosa",1,2016-12-30 17:08:09.600
3,3,"rating, apple, inc, updated, for",39,2016-12-30 17:08:09.600
4,4,"q1, assortment, sort, star, hidden",2,2016-12-30 17:08:09.600
...,...,...,...,...
112,4,"gigabit, to, apple, biden, earnings",3,2022-08-21 09:36:00.000
113,5,"the, stocks, apple, buzz, on",10,2022-08-21 09:36:00.000
114,6,"benefit, others, earnings, preview, lawsuit",1,2022-08-21 09:36:00.000
115,9,"microsoft, favourable, perseverance, platforme...",1,2022-08-21 09:36:00.000


In [96]:
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)

In [122]:
topic_model.visualize_topics_over_time(topics_over_time_eikon, top_n_topics=10)

In [124]:
topic_model.visualize_barchart()

In [116]:
topic_freq = topic_model.get_topic_freq()
print(topic_freq)

    Topic  Count
0      -1    695
1       0    320
2       1    151
3       2    140
4       3     49
5       4     32
6       5     30
7       6     30
8       7     28
9       8     27
10      9     25
11     10     22
12     11     16
13     12     12
14     13     11
15     14     11
16     15     10


---
### LDA Method: 

In [23]:
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [54]:
moderna_eikon_topics_df = eikon_topics_df[eikon_topics_df["company"] == "moderna"]

In [55]:
# Tokenize the tweets using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
moderna_eikon_topics_df['tokens'] = moderna_eikon_topics_df['text'].parallel_apply(lambda x: tokenizer.tokenize(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=77), Label(value='0 / 77'))), HBox…

In [57]:
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [60]:
model = BertModel.from_pretrained('bert-base-uncased')

def create_embeddings(text):
        encoded_tweets = []
        inputs = tokenizer.encode(text, add_special_tokens=True, truncation=True, padding='max_length', max_length=512)
        inputs = torch.tensor(inputs).unsqueeze(0)  # Convert to tensor
        outputs = model(inputs)
        encoded_tweets.append(outputs.pooler_output.tolist())
        return encoded_tweets


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [62]:
moderna_eikon_topics_df["bert_embeddings"] = moderna_eikon_topics_df['tokens'].parallel_apply(lambda x: create_embeddings(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=77), Label(value='0 / 77'))), HBox…

In [64]:
moderna_eikon_topics_df

Unnamed: 0,date,company,text,cleaned_text,tokens,bert_embeddings
946,2018-11-28,moderna,Biotech company Moderna expects to raise up to...,biotech company moderna expects to raise up to...,"[bio, ##tech, company, modern, ##a, expects, t...","[[[-0.15786303579807281, -0.5081111788749695, ..."
949,2018-11-29,moderna,UPDATE 1-Biotech company Moderna expects to ra...,update 1 biotech company moderna expects to ra...,"[update, 1, -, bio, ##tech, company, modern, #...","[[[-0.21495932340621948, -0.5278425812721252, ..."
960,2018-12-06,moderna,EXCLUSIVE-Moderna mulls increasing largest bio...,exclusive moderna mulls increasing largest bio...,"[exclusive, -, modern, ##a, mu, ##lls, increas...","[[[-0.12667618691921234, -0.4134136736392975, ..."
963,2018-12-07,moderna,Moderna braves market jitters with upsized IPO...,moderna braves market jitters with upsized ipo...,"[modern, ##a, braves, market, ji, ##tters, wit...","[[[-0.42688554525375366, -0.6361693739891052, ..."
966,2018-12-10,moderna,CORRECTED-BREAKINGVIEWS-Uber and Lyft race to ...,corrected uber and lyft race to get through op...,"[corrected, -, breaking, ##view, ##s, -, uber,...","[[[-0.1000707596540451, -0.4349295198917389, -..."
...,...,...,...,...,...,...
3917,2023-04-03,moderna,"Insider Trading: Chief Executive Officer, Banc...",insider trading chief executive officer sells ...,"[insider, trading, :, chief, executive, office...","[[[-0.4395850896835327, -0.6473029851913452, -..."
3920,2023-04-04,moderna,FDA poised to authorize second Omicron COVID b...,fda poised to authorize second omicron covid b...,"[fda, poised, to, author, ##ize, second, om, #...","[[[-0.18135827779769897, -0.502711832523346, -..."
3923,2023-04-05,moderna,"1,304 Shares in Moderna, Inc. (NASDAQ:MRNA) Ac...","1,304 shares in moderna inc nasdaq moderna acq...","[1, ,, 304, shares, in, modern, ##a, ,, inc, ....","[[[-0.22266535460948944, -0.539493203163147, -..."
3926,2023-04-06,moderna,Moderna (MRNA) Dips More Than Broader Markets:...,dips more than broader markets what you should...,"[modern, ##a, (, mrna, ), dip, ##s, more, than...","[[[-0.6368915438652039, -0.7150578498840332, -..."


In [66]:
# Apply BERT for topic modeling using LDA
num_topics = 5  # Number of topics to extract
num_words = 10  # Number of words per topic
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(moderna_eikon_topics_df['cleaned_text'])
feature_names = vectorizer.get_feature_names()
lda_model = LatentDirichletAllocation(n_components=num_topics)
lda_model.fit(X)

# Assign topics to each tweet
topic_columns = []
for i in range(num_topics):
    topic_column = f"topic_{i}"
    topic_columns.append(topic_column)
    moderna_eikon_topics_df[topic_column] = 0
topics = lda_model.transform(X)
moderna_eikon_topics_df[topic_columns] = topics

# Convert topics into features (binary values)
df_topics = moderna_eikon_topics_df[topic_columns].applymap(lambda x: 1 if x > 0.5 else 0)


AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names'