# Topic Modelling Notebook 

In [None]:
# Utilities
from joblib import Parallel, delayed
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)
from multiprocessing import Pool, cpu_count
import warnings 
warnings.filterwarnings("ignore")
import os
import pandas as pd
import numpy as np
import tqdm
from tqdm import tqdm
tqdm.pandas(desc="progress bar")
import gc
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

# Import time packages 
import time
import datetime

# Visualization Tools
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

# Deep Learning Models 
import torch
import tensorflow as tf

# Topic Modelling Packages
from transformers import AutoTokenizer, AutoModelForSequenceClassification
#from bertopic import BERTopic


In [4]:
from utils import search_folder
current_dir = os.getcwd()
thesis_folder_path = current_dir.replace("/laptop code/thesis_code", "")
twitter_data_path = search_folder(thesis_folder_path,"twitter_data")
eikon_data_path = search_folder(thesis_folder_path,"eikon_news")
topic_modelling = search_folder(thesis_folder_path, "topic_modelling")

In [5]:
# Retrieving sentiment on cleaned text
twitter_df = pd.read_csv(twitter_data_path +"/twitter_senti_df.csv")
twitter_df["datetime"] = pd.to_datetime(twitter_df["datetime"])
twitter_df =twitter_df.sort_values(by = "datetime")
twitter_df = twitter_df.drop_duplicates()

In [6]:
# Retrieving Sentiment on Eikon Raw News 
eikon_df = pd.read_csv(eikon_data_path +"/raw_eikon_senti_df.csv")
eikon_df["datetime"] = pd.to_datetime(eikon_df["datetime"])
eikon_df = eikon_df.sort_values(by = "datetime")

In [7]:
print(twitter_df.shape)
print(eikon_df.shape)

(2157988, 21)
(44423, 16)


In [8]:
# Tweets on Saturday and Sunday are moved to Monday in order to not losse any relevance
from utils import fix_dates
twitter_df["datetime"] = twitter_df["datetime"].progress_apply(lambda x: fix_dates(x))
eikon_df["datetime"] = eikon_df["datetime"].progress_apply(lambda x: fix_dates(x))

progress bar: 100%|██████████| 2157988/2157988 [00:06<00:00, 329765.93it/s]
progress bar: 100%|██████████| 44423/44423 [00:00<00:00, 468987.48it/s]


---
## Finbert-tone-finetuned-finance-topic-classification
This model is a fine-tuned version of yiyanghkust/finbert-tone on Twitter Financial News Topic dataset.
Model determines the financial topic of given tweets over 20 various topics. Given the unbalanced distribution of the class labels, the weights were adjusted to pay attention to the less sampled labels which should increase overall performance.

In [9]:
# Instantiating the model 
tokenizer = AutoTokenizer.from_pretrained("nickmuchi/finbert-tone-finetuned-finance-topic-classification")
model = AutoModelForSequenceClassification.from_pretrained("nickmuchi/finbert-tone-finetuned-finance-topic-classification")

In [37]:
def topic_classifier(tweet: str):
    with tf.device('/device:GPU:0'):
        tokens = tokenizer.encode_plus(tweet, truncation=True, padding=True, return_tensors="pt", max_length= 512)
        outputs = model(**tokens)
        class_probs = outputs.logits.softmax(dim=1)
        output_class = class_probs.argmax().item()
        topic = topics_list[output_class]
        return topic

---
### Quick Test on Publicly Available Dataset 

In [11]:
tweets_train = pd.read_csv(thesis_folder_path+ "/topic_modelling/train_data.csv")
tweets_valid = pd.read_csv(thesis_folder_path+ "/topic_modelling/valid_data.csv")

In [12]:
topics = {
        "LABEL_0": "Analyst Update",
        "LABEL_1": "Fed & Central Banks",
        "LABEL_2": "Company & Product News",
        "LABEL_3": "Treasuries & Corporate Debt",
        "LABEL_4": "Dividend",
        "LABEL_5": "Earnings",
        "LABEL_6": "Energy & Oil",
        "LABEL_7": "Financials",
        "LABEL_8": "Currencies",
        "LABEL_9": "General News & Opinion",
        "LABEL_10": "Gold, Metals & Materials",
        "LABEL_11": "IPO",
        "LABEL_12": "Legal & Regulation",
        "LABEL_13": "M&A & Investments",
        "LABEL_14": "Macro",
        "LABEL_15": "Markets",
        "LABEL_16": "Politics",
        "LABEL_17": "Personnel Change",
        "LABEL_18": "Stock Commentary",
        "LABEL_19": "Stock Movement",
    }

topics_list = list(topics.values())

In [25]:
#tweets_valid["pred"] = tweets_valid["text"].progress_apply(lambda tweet: topic_classifier(tweet))

In [26]:
#accuracy_score(tweets_valid["label"].values,tweets_valid["pred"].values)*100

----
### Classifying Tweets into different Topics 

In [22]:
eikon_subset = eikon_df[["datetime", "company", "text", "cleaned_text"]]
twitter_subset = twitter_df[["datetime", "company", "text", "cleaned_text"]]

In [24]:
companies = list(twitter_subset["company"].unique())[:-1]
companies.sort(reverse=True)
companies = ["moderna","apple", "tesla", "google"]

In [25]:
companies

['moderna', 'apple', 'tesla', 'google']

In [26]:
twitter_subset["company"].value_counts()

tesla      1250193
apple       674679
google      184195
moderna      48780
Name: company, dtype: int64

In [27]:
def handle_topic_classifier(tweet):
    try:
        topic = topic_classifier(tweet)
        return topic
    except Exception as e:
        print(f"Error occurred for tweet: {tweet}")
        print(f"Error message: {str(e)}")
        return None


In [95]:
tesla_subset = twitter_df.copy()
tesla_subset = tesla_subset[tesla_subset["company"] == "apple"]
tesla_subset = tesla_subset[["datetime", "company", "text", "cleaned_text", "Vader_sentim"]]

In [96]:
with tf.device('/device:GPU:0'):
    tesla_subset["topic"] = tesla_subset["text"].progress_apply(lambda tweet: handle_topic_classifier(tweet))

progress bar: 100%|██████████| 674679/674679 [32:11:45<00:00,  5.82it/s]     


In [97]:
tesla_subset

Unnamed: 0,datetime,company,text,cleaned_text,Vader_sentim,topic
0,2017-01-02 00:04:53+00:00,apple,Review: Bragi's Headphone offers a solid alter...,review 's headphone offers a solid alternative...,0.1531,Company & Product News
2,2017-01-02 00:14:25+00:00,apple,Roasted Persian Chicken and Cauliflower | Mark...,roasted persian chicken and cauliflower | mark...,0.0000,Company & Product News
3,2017-01-02 00:38:06+00:00,apple,Apple also integrated DTrace into macOS and wr...,apple also integrated into macos and wrote som...,0.4215,Company & Product News
10,2017-01-02 01:10:47+00:00,apple,adage: ICYMI: How Apple alienated Mac loyalist...,adage how apple alienated mac loyalists,0.2732,Company & Product News
15,2017-01-02 01:13:19+00:00,apple,$AAPL OI for maturity 01/06/2017. 104.00 Highe...,apple oi for maturity 104.00 highest put 119.0...,0.0000,Stock Commentary
...,...,...,...,...,...,...
2167949,2023-04-14 21:22:38+00:00,apple,Monopoly GO Takes the Classic Board Game in a ...,monopoly go takes the classic board game in a ...,0.5106,Stock Commentary
2167951,2023-04-14 21:28:23+00:00,apple,$VIX $SPY $DIA $QQQ $NDX $NVDA $META $CMG $AZO...,spy dia meta azo ulta elf apple,0.0000,Stock Commentary
2167952,2023-04-14 21:30:08+00:00,apple,$TSLA $AAPL $NVDA i wuz worried about our shor...,tesla apple i wuz worried about our shorts but...,0.5023,Stock Commentary
2167968,2023-04-14 22:32:33+00:00,apple,HomeKit Weekly: Matter support comes to smart ...,weekly matter support comes to smart plugs wit...,0.6705,Stock Commentary


In [98]:
tesla_subset["date"] = tesla_subset["datetime"].dt.strftime('%Y-%m-%d')
tesla_subset_grouped = pd.DataFrame(tesla_subset.groupby(by = ["date", "topic"])["Vader_sentim"].sum())
tesla_subset_grouped["date"] = pd.to_datetime(tesla_subset_grouped["date"])
tesla_subset_grouped = tesla_subset_grouped.reset_index()

In [89]:
tesla_subset_grouped.head(7)

Unnamed: 0,date,topic,Vader_sentim
0,2017-01-03,Company & Product News,0.0
1,2017-01-03,Stock Movement,0.5994
2,2017-01-04,Company & Product News,0.8481
3,2017-01-04,Stock Commentary,2.1985
4,2017-01-05,Stock Commentary,0.4215
5,2017-01-09,Company & Product News,2.3788
6,2017-01-09,Stock Commentary,0.5267


In [168]:
moderna_topics = pd.read_csv(topic_modelling+ "/moderna_topic_senti.csv")
apple_topics = pd.read_csv(topic_modelling+ "/apple_topic_senti.csv")
tesla_topics = pd.read_csv(topic_modelling+ "/tesla_topic_senti.csv")

In [112]:
moderna_topics.groupby(["topic"])["topic"].count().sort_values(ascending = False)

topic
Stock Commentary               1129
Company & Product News         1034
General News & Opinion          811
Stock Movement                  810
Macro                           416
Legal & Regulation              386
Politics                        292
Markets                         287
Analyst Update                  283
Earnings                        211
M&A & Investments               187
Financials                      103
Personnel Change                 94
Treasuries & Corporate Debt      74
IPO                              52
Energy & Oil                     45
Fed & Central Banks              30
Gold, Metals & Materials         29
Currencies                       23
Dividend                          1
Name: topic, dtype: int64

In [118]:
apple_topics.groupby(["topic"])["topic"].count().sort_values(ascending = False)

topic
Stock Commentary               1640
Company & Product News         1640
Stock Movement                 1634
General News & Opinion         1620
Markets                        1563
Legal & Regulation             1548
Analyst Update                 1523
M&A & Investments              1400
Earnings                       1238
Macro                          1092
Politics                       1008
Financials                      977
Treasuries & Corporate Debt     682
Fed & Central Banks             675
Personnel Change                589
Energy & Oil                    440
Dividend                        402
IPO                             298
Currencies                      181
Gold, Metals & Materials        167
Name: topic, dtype: int64

In [169]:
tesla_topics

Unnamed: 0,date,topic,Vader_sentim
0,2017-01-02,Analyst Update,2.6299
1,2017-01-02,Company & Product News,5.0529
2,2017-01-02,General News & Opinion,-0.0941
3,2017-01-02,Legal & Regulation,-2.3734
4,2017-01-02,Markets,0.5106
...,...,...,...
23199,2023-04-14,Markets,0.4576
23200,2023-04-14,Politics,0.0273
23201,2023-04-14,Stock Commentary,23.7870
23202,2023-04-14,Stock Movement,-0.2660


In [167]:
tesla_topics_count = pd.DataFrame(tesla_topics.groupby(["date","topic"])["topic"].count())
tesla_topics_count

Unnamed: 0_level_0,Unnamed: 1_level_0,topic
date,topic,Unnamed: 2_level_1
2017-01-02,Analyst Update,1
2017-01-02,Company & Product News,1
2017-01-02,General News & Opinion,1
2017-01-02,Legal & Regulation,1
2017-01-02,Markets,1
...,...,...
2023-04-14,Markets,1
2023-04-14,Politics,1
2023-04-14,Stock Commentary,1
2023-04-14,Stock Movement,1


In [172]:
tesla_topics_count = tesla_topics.groupby(["date", "topic"]).size().reset_index(name='topic_count')
tesla_topics_count.topic_count.value_counts()

1    23204
Name: topic_count, dtype: int64

In [159]:
tesla_topics_count = pd.DataFrame(tesla_topics.groupby(["date","topic"])["topic"].count())
tesla_topics_count = tesla_topics_count.rename(columns = {"topic": "topic_count"})
tesla_topics_count = tesla_topics_count.reset_index()
tesla_topics_count = tesla_topics_count.pivot(index = "date", columns = "topic", values = "topic_count")
tesla_topics_count = tesla_topics_count.fillna(0.0)

In [162]:
tesla_topics_count

topic,Analyst Update,Company & Product News,Currencies,Dividend,Earnings,Energy & Oil,Fed & Central Banks,Financials,General News & Opinion,"Gold, Metals & Materials",IPO,Legal & Regulation,M&A & Investments,Macro,Markets,Personnel Change,Politics,Stock Commentary,Stock Movement,Treasuries & Corporate Debt
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2017-01-02,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
2017-01-03,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
2017-01-04,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
2017-01-05,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
2017-01-06,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-04-10,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2023-04-11,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
2023-04-12,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
2023-04-13,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0


In [161]:
# Specify the columns you want to plot
columns_to_plot = list(tesla_topics_count.columns)

# Create the figure
fig = go.Figure()

# Iterate over the columns and add traces to the figure
for column in columns_to_plot:
    fig.add_trace(go.Scatter(x=tesla_topics_count.index, y=tesla_topics_count[column], name=column))

# Update the layout
fig.update_layout(title='Topics Over Time', xaxis_title='Datetime', yaxis_title='Value')

# Display the plot
fig.show()



In [136]:
tesla_topics

Unnamed: 0,date,topic,Vader_sentim
0,2017-01-02,Analyst Update,2.6299
1,2017-01-02,Company & Product News,5.0529
2,2017-01-02,General News & Opinion,-0.0941
3,2017-01-02,Legal & Regulation,-2.3734
4,2017-01-02,Markets,0.5106
...,...,...,...
23199,2023-04-14,Markets,0.4576
23200,2023-04-14,Politics,0.0273
23201,2023-04-14,Stock Commentary,23.7870
23202,2023-04-14,Stock Movement,-0.2660


In [141]:
tesla_topics_pivot = tesla_topics.pivot(index = "date", columns = "topic",values=["Vader_sentim"])
tesla_topics_pivot = tesla_topics_pivot.ffill()
tesla_topics_pivot = tesla_topics_pivot.fillna(0.0)

In [143]:
tesla_topics_pivot.corr()

Unnamed: 0_level_0,Unnamed: 1_level_0,Vader_sentim,Vader_sentim,Vader_sentim,Vader_sentim,Vader_sentim,Vader_sentim,Vader_sentim,Vader_sentim,Vader_sentim,Vader_sentim,Vader_sentim,Vader_sentim,Vader_sentim,Vader_sentim,Vader_sentim,Vader_sentim,Vader_sentim,Vader_sentim,Vader_sentim,Vader_sentim
Unnamed: 0_level_1,topic,Analyst Update,Company & Product News,Currencies,Dividend,Earnings,Energy & Oil,Fed & Central Banks,Financials,General News & Opinion,"Gold, Metals & Materials",IPO,Legal & Regulation,M&A & Investments,Macro,Markets,Personnel Change,Politics,Stock Commentary,Stock Movement,Treasuries & Corporate Debt
Unnamed: 0_level_2,topic,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Vader_sentim,Analyst Update,1.0,0.087883,-0.001399,-0.016891,0.098738,0.089279,-0.02365,0.17668,-0.008364,0.006361,0.016098,-0.038319,-0.003586,0.024842,0.126787,-0.006964,0.029428,0.177981,0.304633,0.066775
Vader_sentim,Company & Product News,0.087883,1.0,-0.040783,0.188999,0.253471,0.061414,0.018929,0.159981,0.596695,0.026721,0.034348,-0.237662,0.172236,0.08998,0.108353,0.036979,0.11598,0.62365,0.353356,0.036872
Vader_sentim,Currencies,-0.001399,-0.040783,1.0,-0.068071,-0.054556,0.028898,-0.015242,-0.01292,-0.016431,-0.021211,0.009539,-0.005165,-0.084111,0.001431,-0.014337,-0.010279,0.01899,-0.032193,-0.011751,0.007154
Vader_sentim,Dividend,-0.016891,0.188999,-0.068071,1.0,0.189893,-0.004606,0.019838,0.127785,0.127821,-0.032148,0.066511,-0.010387,0.05432,0.076941,0.083115,0.007562,0.058709,0.058142,0.081249,0.020578
Vader_sentim,Earnings,0.098738,0.253471,-0.054556,0.189893,1.0,0.003973,0.099417,0.503577,0.093224,0.06313,0.035451,-0.075499,0.059745,0.129761,0.115856,0.067471,0.093515,0.274997,0.281697,0.016123
Vader_sentim,Energy & Oil,0.089279,0.061414,0.028898,-0.004606,0.003973,1.0,-0.005701,0.01441,0.007026,-0.021989,0.053623,-0.020668,-0.04483,0.059784,0.031438,-0.018273,-0.028432,0.064906,0.047295,-0.027746
Vader_sentim,Fed & Central Banks,-0.02365,0.018929,-0.015242,0.019838,0.099417,-0.005701,1.0,0.073901,-0.028385,0.006844,0.038777,0.025185,0.061403,0.06012,0.092859,0.027542,-0.002112,0.077697,0.108945,-0.016873
Vader_sentim,Financials,0.17668,0.159981,-0.01292,0.127785,0.503577,0.01441,0.073901,1.0,0.000128,0.074919,0.081928,-0.059529,0.058392,0.093783,0.141686,-0.012514,0.062738,0.165617,0.374623,0.054411
Vader_sentim,General News & Opinion,-0.008364,0.596695,-0.016431,0.127821,0.093224,0.007026,-0.028385,0.000128,1.0,-0.042327,-0.042749,-0.185304,0.111856,0.105262,0.056023,0.021378,0.138075,0.446909,0.156293,0.046545
Vader_sentim,"Gold, Metals & Materials",0.006361,0.026721,-0.021211,-0.032148,0.06313,-0.021989,0.006844,0.074919,-0.042327,1.0,-0.019919,-0.016501,0.01583,0.080758,0.083016,-0.009579,0.038333,0.006802,0.006539,0.026927


In [113]:
fig = px.line(moderna_topics, x='date', y='Vader_sentim', color='topic', labels={'Vader_sentim': 'Sentiment'})

# Update the layout
fig.update_layout(
    title='Topic-Sentiment Pair Analysis',
    xaxis_title='Date',
    yaxis_title='Sentiment per Topic',
    legend_title='Topic'
)
fig.show()

In [125]:
fig = px.line(tesla_topics, x='date', y='Vader_sentim', color='topic', labels={'Vader_sentim': 'Sentiment'})

# Update the layout
fig.update_layout(
    title='Topic-Sentiment Pair Analysis - Tesla',
    xaxis_title='Date',
    yaxis_title='Sentiment per Topic',
    legend_title='Topic'
)
# Show the graph
fig.show()

In [124]:
fig = px.line(apple_topics, x='date', y='Vader_sentim', color='topic', labels={'Vader_sentim': 'Sentiment'})
# Update the layout
fig.update_layout(
    title='Topic-Sentiment Pair Analysis - Apple',
    xaxis_title='Date',
    yaxis_title='Sentiment per Topic',
    legend_title='Topic'
)
# Show the graph
fig.show()

In [None]:
# Calculate the average tweet length in characters
average_length = twitter_subset_topics_df["text"].apply(len).mean()

# Print the result
print(f"The average tweet length is {average_length:.2f} characters.")


In [54]:
# Count the number of tweets exceeding 512 characters
count_exceeding_280 = sum(twitter_subset_topics_df["text"].apply(len) > 512)
# Count the number of tweets exceeding 800 characters
count_exceeding_800 = sum(twitter_subset_topics_df["text"].apply(len) > 800)
# Print the results
print(f"Number of tweets exceeding 512 characters: {count_exceeding_280}")
print(f"Number of tweets exceeding 800 characters: {count_exceeding_800}")


Number of tweets exceeding 512 characters: 7
Number of tweets exceeding 800 characters: 1


In [61]:
for comp in companies:
    twitter_subset_topics_df = twitter_subset[twitter_subset["company"] == comp]
    twitter_subset_topics_df["topic"] = twitter_subset_topics_df["text"].progress_apply(lambda tweet: handle_topic_classifier(tweet))
    twitter_subset_topics_df["date"] = pd.to_datetime(twitter_subset_topics_df["datetime"].dt.date)
    twitter_subset_topics_df = twitter_subset_topics_df.set_index('date')
    twitter_subset_topics_df = twitter_subset_topics_df.pivot_table(index='date', columns='topic', aggfunc='size', fill_value=0)
    tw_daily = twitter_subset_topics_df.resample('D').size().reset_index(name='tweet_count')
    twitter_subset_topics_df = tw_daily.merge(twitter_subset_topics_df, on='date')
    twitter_subset_topics_df["company"] = comp
    topics_final_df = pd.read_csv(topic_modelling+ "/topic_over_time_companies.csv")
    topics_final_df = pd.concat([topics_final_df, twitter_subset_topics_df])
    topics_final_df = topics_final_df.fillna(0.0)
    topics_final_df.to_csv(topic_modelling+ "/topic_over_time_companies.csv", index = False)
    

progress bar: 100%|██████████| 48780/48780 [28:55<00:00, 28.11it/s]
progress bar: 100%|██████████| 674679/674679 [6:42:25<00:00, 27.94it/s]  
progress bar: 100%|██████████| 1250193/1250193 [12:01:45<00:00, 28.87it/s]  
progress bar: 100%|██████████| 184195/184195 [1:41:22<00:00, 30.28it/s]


In [73]:
test = pd.DataFrame(columns = list(topics_final_df.columns))

In [59]:
#test.to_csv(topic_modelling+ "/topic_over_time_companies.csv", index = False)

In [63]:
topics_final_df = pd.read_csv(topic_modelling+ "/topic_over_time_companies.csv")
topics_final_df

Unnamed: 0,date,tweet_count,Analyst Update,Company & Product News,Earnings,Energy & Oil,Fed & Central Banks,General News & Opinion,Legal & Regulation,M&A & Investments,...,Stock Commentary,Stock Movement,Financials,Treasuries & Corporate Debt,Dividend,Personnel Change,IPO,company,Currencies,"Gold, Metals & Materials"
0,2017-01-03,1,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,moderna,0,0
1,2017-01-04,1,0,3,0,0,0,0,0,0,...,3,0,0,0,0,0,0,moderna,0,0
2,2017-01-05,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,moderna,0,0
3,2017-01-09,1,0,8,0,0,0,0,0,0,...,1,0,0,0,0,0,0,moderna,0,0
4,2017-01-10,1,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,moderna,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6211,2023-04-10 00:00:00,1,2,24,1,0,0,13,0,0,...,20,2,0,0,0,0,0,google,0,0
6212,2023-04-11 00:00:00,1,0,43,0,0,0,3,8,2,...,12,7,0,0,0,0,0,google,0,0
6213,2023-04-12 00:00:00,1,1,10,0,0,0,1,2,1,...,9,0,0,1,0,1,0,google,0,0
6214,2023-04-13 00:00:00,1,0,13,0,0,0,4,8,0,...,9,2,0,0,0,0,0,google,0,0


In [69]:
twitter_subset_final = topics_final_df[topics_final_df["company"] == "tesla"]
twitter_subset_final = twitter_subset_final.set_index("date")

In [70]:
# twitter_subset_final = twitter_subset_final.iloc[:,:21]

# for col_name in list(twitter_subset_final.columns): 
    
#     twitter_subset_final["r42" + col_name] = twitter_subset_final[col_name].rolling(42, min_periods = 1).mean()

In [71]:


# Specify the columns you want to plot
columns_to_plot = list(twitter_subset_final.columns)

# Create the figure
fig = go.Figure()

# Iterate over the columns and add traces to the figure
for column in columns_to_plot:
    fig.add_trace(go.Scatter(x=twitter_subset_final.index, y=twitter_subset_final[column], name=column))

# Update the layout
fig.update_layout(title='Topics Over Time', xaxis_title='Datetime', yaxis_title='Value')

# Display the plot
fig.show()



---
### Eikon Topics for Moderna

In [81]:
eikon_subset.company.value_counts()
companies = ["apple", "tesla", "moderna"]

In [90]:
eikon_subset

Unnamed: 0,date,news_count,tweet_count,Analyst Update,Company & Product News,Currencies,Dividend,Earnings,Energy & Oil,Fed & Central Banks,...,Legal & Regulation,M&A & Investments,Macro,Markets,Personnel Change,Politics,Stock Commentary,Stock Movement,Treasuries & Corporate Debt,company


In [175]:
eikon_subset

Unnamed: 0,datetime,company,text,cleaned_text
0,2017-01-02 12:44:28,apple,Apple Inc. (AAPL) Sees Large Drop in Short Int...,apple inc apple sees large drop in short interest
1,2017-01-02 16:34:58,apple,"JLB & Associates Inc. Has $10,751,000 Position...","associates inc has $ 10,751,000 position in ap..."
2,2017-01-02 09:24:17,apple,German and French share indexes start 2017 on ...,german and french share indexes start 2017 on ...
3,2017-01-02 09:52:42,apple,BUZZ-Dialog Semi: Falls on report Apple plans ...,buzz dialog semi falls on report apple plans i...
4,2017-01-02 22:54:59,tesla,"Tesla Motors, Inc. (TSLA) Downgraded by Vetr Inc.",tesla motors inc tesla downgraded by inc
...,...,...,...,...
44418,2023-04-07 16:24:11,tesla,UPDATE 1-Tesla recalls 422 U.S. vehicles over ...,update 1 tesla recalls 422 vehicles over suspe...
44419,2023-04-07 18:23:42,tesla,Used U.S. electric vehicle sales jump as price...,used electric vehicle sales jump as prices fall
44420,2023-04-07 19:02:55,tesla,Tesla just flashed a sell signal that could sp...,tesla just flashed a sell signal that could sp...
44421,2023-04-07 20:17:22,moderna,"Catalyst Watch: Eyes on inflation data, big ba...",catalyst watch eyes on inflation data big bank...


In [177]:
with tf.device('/device:GPU:0'):
    eikon_df["topic"] = eikon_df["text"].progress_apply(lambda tweet: handle_topic_classifier(tweet))

progress bar: 100%|██████████| 44423/44423 [1:31:15<00:00,  8.11it/s]     


In [178]:
eikon_df

Unnamed: 0,datetime,source,stock,text,company,cleaned_text,Finbert_sentim,Finbert_pos,Finbert_neg,Finbert_neut,Textblob_senti,Textblob_obj,Vader_neg,Vader_neut,Vader_pos,Vader_sentim,topic
0,2017-01-02 12:44:28,ZOLCOM,AAPL.OQ,Apple Inc. (AAPL) Sees Large Drop in Short Int...,apple,apple inc apple sees large drop in short interest,-1.0,0.019455,0.965669,0.014876,0.107143,0.364286,0.174,0.579,0.248,0.2263,Stock Movement
1,2017-01-02 16:34:58,ZOLCOM,AAPL.OQ,"JLB & Associates Inc. Has $10,751,000 Position...",apple,"associates inc has $ 10,751,000 position in ap...",0.0,0.039715,0.015316,0.944969,0.000000,0.000000,0.000,1.000,0.000,0.0000,Stock Commentary
2,2017-01-02 09:24:17,RTRS,.GDAXI .IBEX,German and French share indexes start 2017 on ...,apple,german and french share indexes start 2017 on ...,-1.0,0.015887,0.972655,0.011458,0.000000,0.000000,0.206,0.638,0.156,-0.1779,Markets
3,2017-01-02 09:52:42,RTRS,AAPL.O DLGS.DE,BUZZ-Dialog Semi: Falls on report Apple plans ...,apple,buzz dialog semi falls on report apple plans i...,-1.0,0.012941,0.917182,0.069877,0.000000,0.000000,0.189,0.811,0.000,-0.2732,Company & Product News
4,2017-01-02 22:54:59,AMEBAN,TSLA.OQ,"Tesla Motors, Inc. (TSLA) Downgraded by Vetr Inc.",tesla,tesla motors inc tesla downgraded by inc,-1.0,0.031287,0.625415,0.343297,0.000000,0.000000,0.000,1.000,0.000,0.0000,Analyst Update
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44418,2023-04-07 16:24:11,RTRS,TSLA.O,UPDATE 1-Tesla recalls 422 U.S. vehicles over ...,tesla,update 1 tesla recalls 422 vehicles over suspe...,-1.0,0.010826,0.962579,0.026595,0.000000,0.000000,0.000,1.000,0.000,0.0000,Company & Product News
44419,2023-04-07 18:23:42,RTRS,TSLA.O GM.N,Used U.S. electric vehicle sales jump as price...,tesla,used electric vehicle sales jump as prices fall,-1.0,0.028715,0.933346,0.037939,0.000000,0.000000,0.000,1.000,0.000,0.0000,Macro
44420,2023-04-07 19:02:55,BUSINT,TSLA.O,Tesla just flashed a sell signal that could sp...,tesla,tesla just flashed a sell signal that could sp...,1.0,0.530141,0.445824,0.024035,0.500000,0.500000,0.127,0.775,0.098,-0.1440,Stock Movement
44421,2023-04-07 20:17:22,SEECOM,IMG.TO PSMT.O,"Catalyst Watch: Eyes on inflation data, big ba...",moderna,catalyst watch eyes on inflation data big bank...,0.0,0.033585,0.044580,0.921835,0.000000,0.100000,0.000,1.000,0.000,0.0000,Earnings


In [92]:
# eikon_subset["topic"] = eikon_subset["text"].progress_apply(lambda tweet: handle_topic_classifier(tweet))

for comp in companies:
    eikon_subset = eikon_df[eikon_df["company"] == comp]
    eikon_subset["topic"] = eikon_subset["text"].progress_apply(lambda tweet: handle_topic_classifier(tweet))
    eikon_subset["date"] = pd.to_datetime(eikon_subset["datetime"].dt.date)
    eikon_subset = eikon_subset.set_index('date')
    eikon_subset = eikon_subset.pivot_table(index='date', columns='topic', aggfunc='size', fill_value=0)
    tw_daily = eikon_subset.resample('D').size().reset_index(name='news_count')
    eikon_subset = tw_daily.merge(twitter_subset_topics_df, on='date')
    eikon_subset["company"] = comp
    topics_final_df = pd.read_csv(topic_modelling+ "/eikon_topic_over_time_companies.csv")
    topics_final_df = pd.concat([topics_final_df, eikon_subset])
    topics_final_df = topics_final_df.fillna(0.0)
    topics_final_df.to_csv(topic_modelling+ "/eikon_topic_over_time_companies.csv", index = False)

progress bar: 100%|██████████| 19142/19142 [06:45<00:00, 47.20it/s]
progress bar: 100%|██████████| 18337/18337 [06:34<00:00, 46.53it/s]
progress bar: 100%|██████████| 6944/6944 [02:34<00:00, 45.05it/s]


In [144]:
topics_final_df = pd.read_csv(topic_modelling+ "/eikon_topic_over_time_companies.csv")
eikon_subset_final = topics_final_df[topics_final_df["company"] == ""]
eikon_subset_final = eikon_subset_final.set_index("date")

In [145]:
columns_to_plot = list(eikon_subset_final.columns)
# Create the figure
fig = go.Figure()
# Iterate over the columns and add traces to the figure
for column in columns_to_plot:
    fig.add_trace(go.Scatter(x=eikon_subset_final.index, y=eikon_subset_final[column], name=column))
# Update the layout
fig.update_layout(title='Topics Count Over Time - Tesla', xaxis_title='Date', yaxis_title='News Count')
# Display the plot
fig.show()

In [None]:
topics_final_df = pd.read_csv(topic_modelling+ "/twitter_topic_over_time_companies.csv")
eikon_subset_final = topics_final_df[topics_final_df["company"] == ""]
eikon_subset_final = eikon_subset_final.set_index("date")