# Topic Modelling Notebook 

In [1]:
# Utilities
from joblib import Parallel, delayed
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)
from multiprocessing import Pool, cpu_count
import warnings 
warnings.filterwarnings("ignore")
import os
import pandas as pd
import numpy as np
import tqdm
from tqdm import tqdm
tqdm.pandas(desc="progress bar")
import gc
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

# Import time packages 
import time
import datetime

# Visualization Tools
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# Deep Learning Models 
import torch
import tensorflow as tf

# Topic Modelling Packages
from transformers import AutoTokenizer, AutoModelForSequenceClassification
#from bertopic import BERTopic


INFO: Pandarallel will run on 14 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [85]:
from utils import search_folder
current_dir = os.getcwd()
thesis_folder_path = current_dir.replace("\\pc_code\\thesis_code", "")
twitter_data_path = search_folder(thesis_folder_path,"twitter_data")
eikon_data_path = search_folder(thesis_folder_path,"eikon_news")
topic_modelling = search_folder(thesis_folder_path, "topic_modelling")

In [86]:
# Retrieving sentiment on cleaned text
twitter_df = pd.read_csv(twitter_data_path +"/twitter_senti_df.csv")
twitter_df["datetime"] = pd.to_datetime(twitter_df["datetime"])
twitter_df =twitter_df.sort_values(by = "datetime")
twitter_df = twitter_df.drop_duplicates()

In [87]:
# Retrieving Sentiment on Eikon Raw News 
eikon_df = pd.read_csv(eikon_data_path +"/raw_eikon_senti_df.csv")
eikon_df["datetime"] = pd.to_datetime(eikon_df["datetime"])
eikon_df = eikon_df.sort_values(by = "datetime")

In [88]:
print(twitter_df.shape)
print(eikon_df.shape)

(2157988, 21)
(44423, 16)


In [89]:
# Tweets on Saturday and Sunday are moved to Monday in order to not losse any relevance
from utils import fix_dates
twitter_df["datetime"] = twitter_df["datetime"].progress_apply(lambda x: fix_dates(x))
eikon_df["datetime"] = eikon_df["datetime"].progress_apply(lambda x: fix_dates(x))

progress bar: 100%|██████████| 2157988/2157988 [00:05<00:00, 400633.49it/s]
progress bar: 100%|██████████| 44423/44423 [00:00<00:00, 463243.70it/s]


---
## Finbert-tone-finetuned-finance-topic-classification
This model is a fine-tuned version of yiyanghkust/finbert-tone on Twitter Financial News Topic dataset.
Model determines the financial topic of given tweets over 20 various topics. Given the unbalanced distribution of the class labels, the weights were adjusted to pay attention to the less sampled labels which should increase overall performance.

In [7]:
# Instantiating the model 
tokenizer = AutoTokenizer.from_pretrained("nickmuchi/finbert-tone-finetuned-finance-topic-classification")
model = AutoModelForSequenceClassification.from_pretrained("nickmuchi/finbert-tone-finetuned-finance-topic-classification")

In [47]:
def topic_classifier(tweet: str):
    tokens = tokenizer.encode_plus(tweet, truncation=True, padding=True, return_tensors="pt", max_length= 512)
    outputs = model(**tokens)
    class_probs = outputs.logits.softmax(dim=1)
    output_class = class_probs.argmax().item()
    topic = topics_list[output_class]
    return topic

---
### Quick Test on Publicly Available Dataset 

In [48]:
tweets_train = pd.read_csv(thesis_folder_path+ "/topic_modelling/train_data.csv")
tweets_valid = pd.read_csv(thesis_folder_path+ "/topic_modelling/valid_data.csv")

In [24]:
topics = {
        "LABEL_0": "Analyst Update",
        "LABEL_1": "Fed & Central Banks",
        "LABEL_2": "Company & Product News",
        "LABEL_3": "Treasuries & Corporate Debt",
        "LABEL_4": "Dividend",
        "LABEL_5": "Earnings",
        "LABEL_6": "Energy & Oil",
        "LABEL_7": "Financials",
        "LABEL_8": "Currencies",
        "LABEL_9": "General News & Opinion",
        "LABEL_10": "Gold, Metals & Materials",
        "LABEL_11": "IPO",
        "LABEL_12": "Legal & Regulation",
        "LABEL_13": "M&A & Investments",
        "LABEL_14": "Macro",
        "LABEL_15": "Markets",
        "LABEL_16": "Politics",
        "LABEL_17": "Personnel Change",
        "LABEL_18": "Stock Commentary",
        "LABEL_19": "Stock Movement",
    }

topics_list = list(topics.values())

In [25]:
#tweets_valid["pred"] = tweets_valid["text"].progress_apply(lambda tweet: topic_classifier(tweet))

In [26]:
#accuracy_score(tweets_valid["label"].values,tweets_valid["pred"].values)*100

----
### Classifying Tweets into different Topics 

In [72]:
eikon_subset = eikon_df[["datetime", "company", "text", "cleaned_text"]]
twitter_subset = twitter_df[["datetime", "company", "text", "cleaned_text"]]

In [49]:
companies = list(twitter_subset["company"].unique())[:-1]
companies.sort(reverse=True)
companies = ["moderna","apple", "tesla", "google"]

In [50]:
companies

['moderna', 'apple', 'tesla', 'google']

In [51]:
twitter_subset["company"].value_counts()

company
tesla      1250193
apple       674679
google      184195
moderna      48780
Name: count, dtype: int64

In [52]:
def handle_topic_classifier(tweet):
    try:
        topic = topic_classifier(tweet)
        return topic
    except Exception as e:
        print(f"Error occurred for tweet: {tweet}")
        print(f"Error message: {str(e)}")
        return None


In [55]:
# Calculate the average tweet length in characters
average_length = twitter_subset_topics_df["text"].apply(len).mean()

# Print the result
print(f"The average tweet length is {average_length:.2f} characters.")


The average tweet length is 140.05 characters.


In [54]:
# Count the number of tweets exceeding 512 characters
count_exceeding_280 = sum(twitter_subset_topics_df["text"].apply(len) > 512)

# Count the number of tweets exceeding 800 characters
count_exceeding_800 = sum(twitter_subset_topics_df["text"].apply(len) > 800)

# Print the results
print(f"Number of tweets exceeding 512 characters: {count_exceeding_280}")
print(f"Number of tweets exceeding 800 characters: {count_exceeding_800}")


Number of tweets exceeding 512 characters: 7
Number of tweets exceeding 800 characters: 1


In [61]:
for comp in companies:
    twitter_subset_topics_df = twitter_subset[twitter_subset["company"] == comp]
    twitter_subset_topics_df["topic"] = twitter_subset_topics_df["text"].progress_apply(lambda tweet: handle_topic_classifier(tweet))
    #twitter_subset_topics_df["topic"] = twitter_subset_topics_df["text"].progress_apply(lambda tweet: topic_classifier(tweet))
    twitter_subset_topics_df["date"] = pd.to_datetime(twitter_subset_topics_df["datetime"].dt.date)
    twitter_subset_topics_df = twitter_subset_topics_df.set_index('date')
    twitter_subset_topics_df = twitter_subset_topics_df.pivot_table(index='date', columns='topic', aggfunc='size', fill_value=0)
    tw_daily = twitter_subset_topics_df.resample('D').size().reset_index(name='tweet_count')
    twitter_subset_topics_df = tw_daily.merge(twitter_subset_topics_df, on='date')
    twitter_subset_topics_df["company"] = comp
    topics_final_df = pd.read_csv(topic_modelling+ "/topic_over_time_companies.csv")
    topics_final_df = pd.concat([topics_final_df, twitter_subset_topics_df])
    topics_final_df = topics_final_df.fillna(0.0)
    topics_final_df.to_csv(topic_modelling+ "/topic_over_time_companies.csv", index = False)
    

progress bar: 100%|██████████| 48780/48780 [28:55<00:00, 28.11it/s]
progress bar: 100%|██████████| 674679/674679 [6:42:25<00:00, 27.94it/s]  
progress bar: 100%|██████████| 1250193/1250193 [12:01:45<00:00, 28.87it/s]  
progress bar: 100%|██████████| 184195/184195 [1:41:22<00:00, 30.28it/s]


In [73]:
test = pd.DataFrame(columns = list(topics_final_df.columns))

In [59]:
#test.to_csv(topic_modelling+ "/topic_over_time_companies.csv", index = False)

In [63]:
topics_final_df = pd.read_csv(topic_modelling+ "/topic_over_time_companies.csv")
topics_final_df

Unnamed: 0,date,tweet_count,Analyst Update,Company & Product News,Earnings,Energy & Oil,Fed & Central Banks,General News & Opinion,Legal & Regulation,M&A & Investments,...,Stock Commentary,Stock Movement,Financials,Treasuries & Corporate Debt,Dividend,Personnel Change,IPO,company,Currencies,"Gold, Metals & Materials"
0,2017-01-03,1,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,moderna,0,0
1,2017-01-04,1,0,3,0,0,0,0,0,0,...,3,0,0,0,0,0,0,moderna,0,0
2,2017-01-05,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,moderna,0,0
3,2017-01-09,1,0,8,0,0,0,0,0,0,...,1,0,0,0,0,0,0,moderna,0,0
4,2017-01-10,1,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,moderna,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6211,2023-04-10 00:00:00,1,2,24,1,0,0,13,0,0,...,20,2,0,0,0,0,0,google,0,0
6212,2023-04-11 00:00:00,1,0,43,0,0,0,3,8,2,...,12,7,0,0,0,0,0,google,0,0
6213,2023-04-12 00:00:00,1,1,10,0,0,0,1,2,1,...,9,0,0,1,0,1,0,google,0,0
6214,2023-04-13 00:00:00,1,0,13,0,0,0,4,8,0,...,9,2,0,0,0,0,0,google,0,0


In [69]:
twitter_subset_final = topics_final_df[topics_final_df["company"] == "tesla"]
twitter_subset_final = twitter_subset_final.set_index("date")

In [70]:
# twitter_subset_final = twitter_subset_final.iloc[:,:21]

# for col_name in list(twitter_subset_final.columns): 
    
#     twitter_subset_final["r42" + col_name] = twitter_subset_final[col_name].rolling(42, min_periods = 1).mean()

In [71]:


# Specify the columns you want to plot
columns_to_plot = list(twitter_subset_final.columns)

# Create the figure
fig = go.Figure()

# Iterate over the columns and add traces to the figure
for column in columns_to_plot:
    fig.add_trace(go.Scatter(x=twitter_subset_final.index, y=twitter_subset_final[column], name=column))

# Update the layout
fig.update_layout(title='Topics Over Time', xaxis_title='Datetime', yaxis_title='Value')

# Display the plot
fig.show()



---
### Eikon Topics for Moderna

In [81]:
eikon_subset.company.value_counts()
companies = ["apple", "tesla", "moderna"]

In [90]:
eikon_subset

Unnamed: 0,date,news_count,tweet_count,Analyst Update,Company & Product News,Currencies,Dividend,Earnings,Energy & Oil,Fed & Central Banks,...,Legal & Regulation,M&A & Investments,Macro,Markets,Personnel Change,Politics,Stock Commentary,Stock Movement,Treasuries & Corporate Debt,company


In [91]:
eikon_df

Unnamed: 0,datetime,source,stock,text,company,cleaned_text,Finbert_sentim,Finbert_pos,Finbert_neg,Finbert_neut,Textblob_senti,Textblob_obj,Vader_neg,Vader_neut,Vader_pos,Vader_sentim
0,2017-01-02 12:44:28,ZOLCOM,AAPL.OQ,Apple Inc. (AAPL) Sees Large Drop in Short Int...,apple,apple inc apple sees large drop in short interest,-1.0,0.019455,0.965669,0.014876,0.107143,0.364286,0.174,0.579,0.248,0.2263
1,2017-01-02 16:34:58,ZOLCOM,AAPL.OQ,"JLB & Associates Inc. Has $10,751,000 Position...",apple,"associates inc has $ 10,751,000 position in ap...",0.0,0.039715,0.015316,0.944969,0.000000,0.000000,0.000,1.000,0.000,0.0000
2,2017-01-02 09:24:17,RTRS,.GDAXI .IBEX,German and French share indexes start 2017 on ...,apple,german and french share indexes start 2017 on ...,-1.0,0.015887,0.972655,0.011458,0.000000,0.000000,0.206,0.638,0.156,-0.1779
3,2017-01-02 09:52:42,RTRS,AAPL.O DLGS.DE,BUZZ-Dialog Semi: Falls on report Apple plans ...,apple,buzz dialog semi falls on report apple plans i...,-1.0,0.012941,0.917182,0.069877,0.000000,0.000000,0.189,0.811,0.000,-0.2732
4,2017-01-02 22:54:59,AMEBAN,TSLA.OQ,"Tesla Motors, Inc. (TSLA) Downgraded by Vetr Inc.",tesla,tesla motors inc tesla downgraded by inc,-1.0,0.031287,0.625415,0.343297,0.000000,0.000000,0.000,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44418,2023-04-07 16:24:11,RTRS,TSLA.O,UPDATE 1-Tesla recalls 422 U.S. vehicles over ...,tesla,update 1 tesla recalls 422 vehicles over suspe...,-1.0,0.010826,0.962579,0.026595,0.000000,0.000000,0.000,1.000,0.000,0.0000
44419,2023-04-07 18:23:42,RTRS,TSLA.O GM.N,Used U.S. electric vehicle sales jump as price...,tesla,used electric vehicle sales jump as prices fall,-1.0,0.028715,0.933346,0.037939,0.000000,0.000000,0.000,1.000,0.000,0.0000
44420,2023-04-07 19:02:55,BUSINT,TSLA.O,Tesla just flashed a sell signal that could sp...,tesla,tesla just flashed a sell signal that could sp...,1.0,0.530141,0.445824,0.024035,0.500000,0.500000,0.127,0.775,0.098,-0.1440
44421,2023-04-07 20:17:22,SEECOM,IMG.TO PSMT.O,"Catalyst Watch: Eyes on inflation data, big ba...",moderna,catalyst watch eyes on inflation data big bank...,0.0,0.033585,0.044580,0.921835,0.000000,0.100000,0.000,1.000,0.000,0.0000


In [92]:
# eikon_subset["topic"] = eikon_subset["text"].progress_apply(lambda tweet: handle_topic_classifier(tweet))

for comp in companies:
    eikon_subset = eikon_df[eikon_df["company"] == comp]
    eikon_subset["topic"] = eikon_subset["text"].progress_apply(lambda tweet: handle_topic_classifier(tweet))
    eikon_subset["date"] = pd.to_datetime(eikon_subset["datetime"].dt.date)
    eikon_subset = eikon_subset.set_index('date')
    eikon_subset = eikon_subset.pivot_table(index='date', columns='topic', aggfunc='size', fill_value=0)
    tw_daily = eikon_subset.resample('D').size().reset_index(name='news_count')
    eikon_subset = tw_daily.merge(twitter_subset_topics_df, on='date')
    eikon_subset["company"] = comp
    topics_final_df = pd.read_csv(topic_modelling+ "/eikon_topic_over_time_companies.csv")
    topics_final_df = pd.concat([topics_final_df, eikon_subset])
    topics_final_df = topics_final_df.fillna(0.0)
    topics_final_df.to_csv(topic_modelling+ "/eikon_topic_over_time_companies.csv", index = False)

progress bar: 100%|██████████| 19142/19142 [06:45<00:00, 47.20it/s]
progress bar: 100%|██████████| 18337/18337 [06:34<00:00, 46.53it/s]
progress bar: 100%|██████████| 6944/6944 [02:34<00:00, 45.05it/s]


In [95]:
topics_final_df = pd.read_csv(topic_modelling+ "/eikon_topic_over_time_companies.csv")
eikon_subset_final = topics_final_df[topics_final_df["company"] == "tesla"]
eikon_subset_final = eikon_subset_final.set_index("date")

In [96]:
columns_to_plot = list(eikon_subset_final.columns)

# Create the figure
fig = go.Figure()

# Iterate over the columns and add traces to the figure
for column in columns_to_plot:
    fig.add_trace(go.Scatter(x=eikon_subset_final.index, y=eikon_subset_final[column], name=column))

# Update the layout
fig.update_layout(title='Topics Over Time', xaxis_title='Datetime', yaxis_title='Value')

# Display the plot
fig.show()