# Topic Modelling Notebook 

In [1]:
# Utilities
from joblib import Parallel, delayed
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)
from multiprocessing import Pool, cpu_count
import warnings 
warnings.filterwarnings("ignore")
import os
import pandas as pd
import numpy as np
import tqdm
from tqdm import tqdm
tqdm.pandas(desc="progress bar")
import gc
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

# Import time packages 
import time
import datetime

# Visualization Tools
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# Deep Learning Models 
import torch
import tensorflow as tf

# Topic Modelling Packages
from transformers import AutoTokenizer, AutoModelForSequenceClassification
#from bertopic import BERTopic


INFO: Pandarallel will run on 14 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [2]:
from utils import search_folder
current_dir = os.getcwd()
thesis_folder_path = current_dir.replace("\\pc_code\\thesis_code", "")
twitter_data_path = search_folder(thesis_folder_path,"twitter_data")
eikon_data_path = search_folder(thesis_folder_path,"eikon_news")
topic_modelling = search_folder(thesis_folder_path, "topic_modelling")

In [3]:
# Retrieving sentiment on cleaned text
twitter_df = pd.read_csv(twitter_data_path +"/twitter_senti_df.csv")
twitter_df["datetime"] = pd.to_datetime(twitter_df["datetime"])
twitter_df =twitter_df.sort_values(by = "datetime")
twitter_df = twitter_df.drop_duplicates()

In [4]:
# Retrieving Sentiment on Eikon Raw News 
eikon_df = pd.read_csv(eikon_data_path +"/raw_eikon_senti_df.csv")
eikon_df["datetime"] = pd.to_datetime(eikon_df["datetime"])
eikon_df = eikon_df.sort_values(by = "datetime")

In [5]:
print(twitter_df.shape)
print(eikon_df.shape)

(2157988, 21)
(44423, 16)


In [6]:
# Tweets on Saturday and Sunday are moved to Monday in order to not losse any relevance
from utils import fix_dates
twitter_df["datetime"] = twitter_df["datetime"].progress_apply(lambda x: fix_dates(x))
eikon_df["datetime"] = eikon_df["datetime"].progress_apply(lambda x: fix_dates(x))

progress bar: 100%|██████████| 2157988/2157988 [00:05<00:00, 425384.28it/s]
progress bar: 100%|██████████| 44423/44423 [00:00<00:00, 449906.96it/s]


---
## Finbert-tone-finetuned-finance-topic-classification
This model is a fine-tuned version of yiyanghkust/finbert-tone on Twitter Financial News Topic dataset.
Model determines the financial topic of given tweets over 20 various topics. Given the unbalanced distribution of the class labels, the weights were adjusted to pay attention to the less sampled labels which should increase overall performance.

In [7]:
# Instantiating the model 
tokenizer = AutoTokenizer.from_pretrained("nickmuchi/finbert-tone-finetuned-finance-topic-classification")
model = AutoModelForSequenceClassification.from_pretrained("nickmuchi/finbert-tone-finetuned-finance-topic-classification")

In [47]:
def topic_classifier(tweet: str):
    tokens = tokenizer.encode_plus(tweet, truncation=True, padding=True, return_tensors="pt", max_length= 512)
    outputs = model(**tokens)
    class_probs = outputs.logits.softmax(dim=1)
    output_class = class_probs.argmax().item()
    topic = topics_list[output_class]
    return topic

---
### Quick Test on Publicly Available Dataset 

In [48]:
tweets_train = pd.read_csv(thesis_folder_path+ "/topic_modelling/train_data.csv")
tweets_valid = pd.read_csv(thesis_folder_path+ "/topic_modelling/valid_data.csv")

In [24]:
topics = {
        "LABEL_0": "Analyst Update",
        "LABEL_1": "Fed & Central Banks",
        "LABEL_2": "Company & Product News",
        "LABEL_3": "Treasuries & Corporate Debt",
        "LABEL_4": "Dividend",
        "LABEL_5": "Earnings",
        "LABEL_6": "Energy & Oil",
        "LABEL_7": "Financials",
        "LABEL_8": "Currencies",
        "LABEL_9": "General News & Opinion",
        "LABEL_10": "Gold, Metals & Materials",
        "LABEL_11": "IPO",
        "LABEL_12": "Legal & Regulation",
        "LABEL_13": "M&A & Investments",
        "LABEL_14": "Macro",
        "LABEL_15": "Markets",
        "LABEL_16": "Politics",
        "LABEL_17": "Personnel Change",
        "LABEL_18": "Stock Commentary",
        "LABEL_19": "Stock Movement",
    }

topics_list = list(topics.values())

In [25]:
#tweets_valid["pred"] = tweets_valid["text"].progress_apply(lambda tweet: topic_classifier(tweet))

In [26]:
#accuracy_score(tweets_valid["label"].values,tweets_valid["pred"].values)*100

----
### Classifying Tweets into different Topics 

In [27]:
eikon_subset = eikon_df[["datetime", "company", "text", "cleaned_text"]]
twitter_subset = twitter_df[["datetime", "company", "text", "cleaned_text"]]

In [49]:
companies = list(twitter_subset["company"].unique())[:-1]
companies.sort(reverse=True)
companies = ["moderna","apple", "tesla", "google"]

In [50]:
companies

['moderna', 'apple', 'tesla', 'google']

In [51]:
twitter_subset["company"].value_counts()

company
tesla      1250193
apple       674679
google      184195
moderna      48780
Name: count, dtype: int64

In [52]:
def handle_topic_classifier(tweet):
    try:
        topic = topic_classifier(tweet)
        return topic
    except Exception as e:
        print(f"Error occurred for tweet: {tweet}")
        print(f"Error message: {str(e)}")
        return None


In [55]:
# Calculate the average tweet length in characters
average_length = twitter_subset_topics_df["text"].apply(len).mean()

# Print the result
print(f"The average tweet length is {average_length:.2f} characters.")


The average tweet length is 140.05 characters.


In [54]:
# Count the number of tweets exceeding 512 characters
count_exceeding_280 = sum(twitter_subset_topics_df["text"].apply(len) > 512)

# Count the number of tweets exceeding 800 characters
count_exceeding_800 = sum(twitter_subset_topics_df["text"].apply(len) > 800)

# Print the results
print(f"Number of tweets exceeding 512 characters: {count_exceeding_280}")
print(f"Number of tweets exceeding 800 characters: {count_exceeding_800}")


Number of tweets exceeding 512 characters: 7
Number of tweets exceeding 800 characters: 1


In [None]:
for comp in companies:
    twitter_subset_topics_df = twitter_subset[twitter_subset["company"] == comp]
    twitter_subset_topics_df["topic"] = twitter_subset_topics_df["text"].progress_apply(lambda tweet: handle_topic_classifier(tweet))
    #twitter_subset_topics_df["topic"] = twitter_subset_topics_df["text"].progress_apply(lambda tweet: topic_classifier(tweet))
    twitter_subset_topics_df["date"] = pd.to_datetime(twitter_subset_topics_df["datetime"].dt.date)
    twitter_subset_topics_df = twitter_subset_topics_df.set_index('date')
    twitter_subset_topics_df = twitter_subset_topics_df.pivot_table(index='date', columns='topic', aggfunc='size', fill_value=0)
    tw_daily = twitter_subset_topics_df.resample('D').size().reset_index(name='tweet_count')
    twitter_subset_topics_df = tw_daily.merge(twitter_subset_topics_df, on='date')
    twitter_subset_topics_df["company"] = comp
    topics_final_df = pd.read_csv(topic_modelling+ "/topic_over_time_companies.csv")
    topics_final_df = pd.concat([topics_final_df, twitter_subset_topics_df])
    topics_final_df = topics_final_df.fillna(0.0)
    topics_final_df.to_csv(topic_modelling+ "/topic_over_time_companies.csv", index = False)
    

In [58]:
test = pd.DataFrame(columns = list(topics_final_df.columns))

In [59]:
test.to_csv(topic_modelling+ "/topic_over_time_companies.csv", index = False)

In [60]:
topics_final_df = pd.read_csv(topic_modelling+ "/topic_over_time_companies.csv")
topics_final_df

Unnamed: 0,date,tweet_count,Analyst Update,Company & Product News,Earnings,Energy & Oil,Fed & Central Banks,General News & Opinion,Legal & Regulation,M&A & Investments,...,Stock Commentary,Stock Movement,Financials,Treasuries & Corporate Debt,Dividend,Personnel Change,IPO,company,Currencies,"Gold, Metals & Materials"


In [171]:
twitter_subset_final = pd.read_csv(topic_modelling + "/topic_over_time_moderna.csv", index_col = [0])
twitter_subset_final.iloc[:,1:21].head(15)

Unnamed: 0_level_0,Analyst Update,Company | Product News,Currencies,Dividend,Earnings,Energy | Oil,Fed | Central Banks,Financials,General News | Opinion,Gold | Metals | Materials,IPO,Legal | Regulation,M&A | Investments,Macro,Markets,Personnel Change,Politics,Stock Commentary,Stock Movement,Treasuries | Corporate Debt
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2017-01-03,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2017-01-04,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0
2017-01-05,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2017-01-09,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2017-01-10,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2017-01-12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2017-01-26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2017-01-30,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2017-02-02,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2017-02-03,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [147]:
# twitter_subset_final = twitter_subset_final.iloc[:,:21]

# for col_name in list(twitter_subset_final.columns): 
    
#     twitter_subset_final["r42" + col_name] = twitter_subset_final[col_name].rolling(42, min_periods = 1).mean()

In [87]:


# Specify the columns you want to plot
columns_to_plot = list(twitter_subset_final.columns[21:])

# Create the figure
fig = go.Figure()

# Iterate over the columns and add traces to the figure
for column in columns_to_plot:
    fig.add_trace(go.Scatter(x=twitter_subset_final.index, y=twitter_subset_final[column], name=column))

# Update the layout
fig.update_layout(title='Topics Over Time', xaxis_title='Datetime', yaxis_title='Value')

# Display the plot
fig.show()



---
### Eikon Topics for Moderna

In [31]:
eikon_subset["topic"] = eikon_subset["text"].progress_apply(lambda tweet: topic_classifier(tweet))

progress bar: 100%|██████████| 6944/6944 [02:37<00:00, 44.10it/s]


In [32]:
eikon_topics_df = eikon_subset.copy()
eikon_topics_df["date"] = pd.to_datetime(eikon_topics_df["datetime"].dt.date)
eikon_topics_df.head()

Unnamed: 0,datetime,company,text,cleaned_text,topic,date
6502,2018-11-28 16:48:54,moderna,Biotech company Moderna expects to raise up to...,biotech company moderna expects to raise up to...,IPO,2018-11-28
6510,2018-11-29 00:21:26,moderna,UPDATE 1-Biotech company Moderna expects to ra...,update 1 biotech company moderna expects to ra...,IPO,2018-11-29
6594,2018-12-06 19:36:56,moderna,EXCLUSIVE-Moderna mulls increasing largest bio...,exclusive moderna mulls increasing largest bio...,IPO,2018-12-06
6599,2018-12-07 00:24:29,moderna,Moderna braves market jitters with upsized IPO...,moderna braves market jitters with upsized ipo,IPO,2018-12-07
6602,2018-12-07 03:30:44,moderna,UPDATE 3-Moderna to raise $604 mln in upsized ...,update 3 moderna to raise $ 604 mln in upsized...,IPO,2018-12-07


In [33]:
eikon_topics_df.set_index('date', inplace=True)
df_eikon_daily = eikon_topics_df.resample('D').size().reset_index(name='tweet_count')
eikon_topics_df = eikon_topics_df.pivot_table(index='date', columns='topic', aggfunc='size', fill_value=0)
eikon_topics_df = df_eikon_daily.merge(eikon_topics_df, on='date')
eikon_topics_df = eikon_topics_df.set_index("date")

In [34]:
for col_name in list(eikon_topics_df.columns): 
    
    eikon_topics_df["r5_" + col_name] = eikon_topics_df[col_name].rolling(5).mean()

In [35]:
eikon_topics_df

Unnamed: 0_level_0,tweet_count,Analyst Update,Company | Product News,Currencies,Dividend,Earnings,Energy | Oil,Fed | Central Banks,Financials,General News | Opinion,...,r5_IPO,r5_Legal | Regulation,r5_M&A | Investments,r5_Macro,r5_Markets,r5_Personnel Change,r5_Politics,r5_Stock Commentary,r5_Stock Movement,r5_Treasuries | Corporate Debt
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-11-28,1,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
2018-11-29,1,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
2018-12-06,1,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
2018-12-07,7,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
2018-12-10,1,0,0,0,0,0,0,0,0,0,...,2.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-04-03,7,3,2,0,0,0,0,0,0,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0
2023-04-04,3,0,1,0,0,0,0,0,0,0,...,0.0,0.4,1.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0
2023-04-05,1,0,0,0,0,0,0,0,0,0,...,0.0,0.4,1.2,0.0,0.0,0.0,0.0,0.0,0.2,0.0
2023-04-06,8,0,1,0,0,0,0,0,0,0,...,0.0,0.4,2.4,0.0,0.0,0.0,0.0,0.0,0.2,0.0


In [36]:
# Specify the columns you want to plot
columns_to_plot = list(eikon_topics_df.columns[1:])

# Create the figure
fig = go.Figure()

# Iterate over the columns and add traces to the figure
for column in columns_to_plot:
    fig.add_trace(go.Scatter(x=eikon_topics_df.index, y=eikon_topics_df[column], name=column))

# Update the layout
fig.update_layout(title='Topics Over Time', xaxis_title='Datetime', yaxis_title='Value')

# Display the plot
fig.show()
