In [2]:
import pandas as pd
import numpy as np
# Read the return/headline dataset from CSV and keep the headline text
# column under its own name, since it is the input to the later cells.
dp = pd.read_csv('data/sp500_2023_ret_headline.csv')
headline = dp.loc[:, 'headline']

In [3]:
from sentence_transformers import SentenceTransformer
# Encode every headline once up front so the embeddings can be handed to
# BERTopic directly instead of being recomputed during fitting.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(
    headline,
    show_progress_bar=True,
)

Batches:   0%|          | 0/71639 [00:00<?, ?it/s]

In [4]:
# Persist the computed embeddings to disk as a .npy file.
np.save(file='data/Grid_Bert_23_embeddings.npy', arr=embeddings)

In [5]:
from umap import UMAP

# Dimensionality-reduction step of the pipeline: project the embeddings to
# 5 components using cosine distance. random_state is fixed for repeatability.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

2024-05-21 11:17:51.924393: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
from hdbscan import HDBSCAN

# Clustering step of the pipeline. prediction_data=True is requested so the
# fitted clusterer keeps the extra data needed for predictions later on.
hdbscan_model = HDBSCAN(
    min_cluster_size=150,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words step: English stop words removed, unigrams through trigrams,
# terms filtered by document-frequency bounds, vocabulary capped at 10000.
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=0.1,
    max_df=0.9,
    ngram_range=(1, 3),
    max_features=10000,
)

In [14]:
import os

import openai
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech

# KeyBERT
keybert_model = KeyBERTInspired()

# Part-of-Speech (needs the spaCy extra: pip install bertopic[spacy])
pos_model = PartOfSpeech("en_core_web_sm")

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# GPT-3.5
# The API key is read from the environment so that no secret is stored in the
# notebook. Any key that was ever pasted into this cell should be treated as
# compromised and revoked.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this cell "
        "to use the OpenAI representation model."
    )
client = openai.OpenAI(api_key=openai_api_key)
prompt = """
I have a topic that contains the following documents: 
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""
openai_model = OpenAI(client, model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt)

# All representation models, keyed by the aspect name under which each one's
# output is exposed on the fitted topic model.
representation_model = {
    "KeyBERT": keybert_model,
    "OpenAI": openai_model,  # Remove this entry to skip the OpenAI API calls
    "MMR": mmr_model,
    "POS": pos_model
}

ModuleNotFoundError: In order to use Part of Speech with Spacy you will need to install via;

pip install bertopic[spacy]



In [8]:
from bertopic import BERTopic

# Assemble the BERTopic pipeline from the components configured in the
# earlier cells.
# NOTE(review): representation_model is left disabled here, yet a later cell
# reads topic_model.topic_aspects_["OpenAI"]; presumably that lookup needs the
# representation models to be enabled -- confirm before running top to bottom.
topic_model = BERTopic(
    # Pipeline models
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    # representation_model=representation_model,
    calculate_probabilities=False,
    low_memory=True,
    # Hyperparameters
    top_n_words=10,
    verbose=True,
)

# Fit on the headlines, passing the precomputed embeddings.
topics, probs = topic_model.fit_transform(headline, embeddings)

# Display the per-topic summary table as the cell output.
topic_model.get_topic_info()

2024-05-21 11:21:00,446 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


MemoryError: Unable to allocate memory to append item

In [97]:
# Build one custom label per topic from the "OpenAI" aspect: take the first
# field of every entry for that topic and join them with " | ".
openai_aspect = topic_model.topic_aspects_["OpenAI"]
chatgpt_topic_labels = {}
for topic_id, aspect_entries in openai_aspect.items():
    first_fields = list(zip(*aspect_entries))[0]
    chatgpt_topic_labels[topic_id] = " | ".join(first_fields)

# Topic -1 gets a fixed label instead of a generated one.
chatgpt_topic_labels[-1] = "Outlier Topic"

topic_model.set_topic_labels(chatgpt_topic_labels)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,OpenAI,MMR,POS,Representative_Docs
0,-1,16691,-1_stock_outperforms_competitors_underperforms,Outlier Topic,"[stock, outperforms, competitors, underperform...","[stock outperforms, stock underperforms, outpe...",[Stock Performance Comparison in Market],"[stock, outperforms, competitors, underperform...","[stock, global, trading, options, energy, stoc...",[Adobe Inc. Stock Outperforms Competitors On S...
1,0,1954,0_healthcare conference_morgan healthcare_heal...,Healthcare Conference Presentations,"[healthcare conference, morgan healthcare, hea...","[morgan healthcare, therapeutics, healthcare c...",[Healthcare Conference Presentations],"[healthcare conference, morgan healthcare, hea...","[healthcare, annual, conference, present, 41st...",[Press Release: TG Therapeutics to Present at ...
2,1,1028,1_00_price target_target_00 share,Stock Price Target Changes,"[00, price target, target, 00 share, price, sh...","[price target, target raised, wells fargo, pri...",[Stock Price Target Changes],"[00, price target, target, 00 share, price, sh...","[target, price, shares, stock, practices, auto...",[RealReal Price Target Cut to $2.00/Share From...
3,2,930,2_ces_ces 2023_2023_new,CES 2023 Electric Vehicle Innovations,"[ces, ces 2023, 2023, new, electric, unveils, ...","[ces 2023, 2023 new, 2023 press, 2023 2032, 20...",[CES 2023 Electric Vehicle Innovations],"[ces, ces 2023, 2023, new, electric, unveils, ...","[new, electric, vehicle, industry, technology,...",[Press Release: Blink Charging Unveils Five Ne...
4,3,883,3_net asset_asset value_net_asset,Asset Value Reports,"[net asset, asset value, net, asset, value, ms...","[net asset, asset value, asset, value, investm...",[Asset Value Reports],"[net asset, asset value, net, asset, value, ms...","[dividend, payments, shareholders, , , , , , , ]",[iShares VII UTB 3-7 UCITS (ACC) $ Net Asset V...
5,4,696,4_quarter_fourth quarter_fourth_earnings,Earnings Conference Calls 2022,"[quarter, fourth quarter, fourth, earnings, re...","[earnings conference, conference press, confer...",[Earnings Conference Calls 2022],"[quarter, fourth quarter, fourth, earnings, re...","[quarter, fourth quarter, fourth, earnings, re...",[Press Release: Radware Schedules Conference C...
6,5,691,5_stocks_mw_fed_jobs,Stocks Slide After Fed Comments,"[stocks, mw, fed, jobs, gas, oil, jobs data, d...","[jobs data, wsj stocks, marketwatch mw, stocks...",[Stocks Slide After Fed Comments],"[stocks, mw, fed, jobs, gas, oil, jobs data, d...","[stocks, jobs, gas, oil, data, rate, futures, ...","[MW U.S. stocks slide on jobs data, hawkish Fe..."
7,6,664,6_registers_vp_surrenders_acquires,VP Registration Activity Inc,"[registers, vp, surrenders, acquires, officer,...","[vp, network corp, acquires, registers, pgr, i...",[VP Registration Activity Inc],"[registers, vp, surrenders, acquires, officer,...","[, , , , , , , , , ]","[VP Lacy Surrenders 242 Of UDR Inc >UDR, VP Ga..."
8,7,577,7_plc_form_voting_rights,Voting Rights Holdings Analysis,"[plc, form, voting, rights, holding, total, ho...","[plc announces, plc, trust plc, morgan stanley...",[Voting Rights Holdings Analysis],"[plc, form, voting, rights, holding, total, ho...","[voting, rights, total, company, shares, capit...",[Morgan Stanley & Co. Int'l plc Form 8.5 (EPT/...
9,8,552,8_sky news_news_newly released_released,Global News Updates,"[sky news, news, newly released, released, new...","[sky news, new book, news, book, faith, covid ...",[Global News Updates],"[sky news, news, newly released, released, new...","[book, life, story, new book, faith, new, powe...",[WHO 'Really Concerned' About COVID Situation ...


In [None]:
# Approximate the topic distribution of every headline. The call returns a
# pair; only the first element is used (the second is bound to `_`).
# The matrix has one row per document and one column per topic, which is the
# layout the aggregation cell relies on (topic count = shape[1], column-wise
# concat with the per-headline metadata).
doc_topic_matrix, _ = topic_model.approximate_distribution(headline)
topic_distr = pd.DataFrame(doc_topic_matrix)
topic_distr

In [100]:
# Aggregate each company's topic exposures per day.
#
# Inputs (from earlier cells): `dp` (per-headline rows with 'date', 'permno',
# 'ret') and `topic_distr` (one row per headline, one column per topic).
# Output: `agg_df` with columns date, company, ret, topic_1..topic_N, one row
# per (date, company, ret) combination, ordered by company then calendar date.
index = 0  # not read in this cell; kept in case another cell depends on it
days = dp['date']
companies = dp['permno']
topic_num = topic_distr.shape[1]

agg_df = pd.DataFrame({"date": days, "company": companies, "ret": dp['ret']})

# Attach the topic columns. pd.concat(axis=1) aligns on the index, so
# topic_distr must share dp's index for rows to line up.
agg_df = pd.concat([agg_df, topic_distr], axis=1)

# Rename columns: topic columns become topic_1 .. topic_N.
agg_df.columns = ["date", "company", "ret"] + ["topic_" + str(i + 1) for i in range(topic_num)]

# Sum the exposures of all headlines sharing the same date/company/return.
# Note: groupby drops rows whose key contains NaN by default (dropna=True),
# so headlines with a missing return do not reach the aggregated frame.
agg_df = agg_df.groupby(["date", "company", "ret"], sort=False).sum().reset_index()

# Sort by company and then chronologically. 'date' holds strings such as
# "2023/1/3"; sorting those directly is lexicographic ("2023/1/10" would come
# before "2023/1/3"), so sort on a temporary parsed-datetime key and drop it
# again to leave the column layout (and the 'date' values) unchanged.
agg_df = (
    agg_df
    .assign(_date_sort_key=pd.to_datetime(agg_df["date"]))
    .sort_values(by=["company", "_date_sort_key"], ignore_index=True)
    .drop(columns="_date_sort_key")
)
agg_df

# (disabled) cap outliers: clip every topic exposure at an upper bound of 50
# agg_df.iloc[:, 3:] = agg_df.iloc[:, 3:].clip(upper = 50)
# agg_df


Unnamed: 0,date,company,ret,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,...,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24,topic_25
0,2023/1/3,10104,0.024223,0.000000,0.000000,2.106822,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,5.699434,0.000000,0.056947,0.000000,1.000000,0.000000,0.000000
1,2023/1/4,10104,0.009078,1.436034,0.000000,3.961611,0.000000,1.674169,1.068066,0.000000,...,0.000000,1.866024,0.752970,9.316722,0.000000,1.493938,0.389043,1.703532,0.000000,1.682725
2,2023/1/5,10104,-0.002012,0.794239,2.000000,2.239412,0.000000,0.428667,0.711916,1.537805,...,1.000000,0.519467,0.676334,6.466521,0.000000,0.000000,1.089898,0.882205,0.000000,0.934168
3,2023/1/6,10104,0.016012,0.303172,0.000000,0.653039,0.000000,1.569746,0.000000,0.000000,...,0.000000,2.571093,0.239538,5.080517,0.000000,0.700622,0.153336,1.430320,0.000000,2.255549
4,2023/1/3,10107,-0.001001,83.412645,3.717476,47.344967,3.025995,44.012265,9.424005,4.835909,...,31.439623,0.662840,0.500128,21.433908,1.056901,26.519599,2.164499,0.156774,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1772,2023/1/6,93429,-0.002262,0.000000,0.000000,0.000000,0.000000,0.478594,0.586416,0.000000,...,0.000000,0.410530,1.030486,0.000000,0.000000,0.000000,0.079011,0.018359,1.000000,2.528101
1773,2023/1/3,93436,-0.122422,0.072187,3.278765,2.135957,0.000000,5.581603,12.249522,0.000000,...,1.168878,1.214592,0.964179,0.788651,4.617672,0.078229,20.180251,2.010494,1.042299,0.124055
1774,2023/1/4,93436,0.051249,0.000000,2.525622,3.319081,0.000000,1.609239,14.730206,0.316826,...,1.320130,1.815835,1.497449,1.509737,3.183657,0.337654,15.415663,1.593878,0.999922,1.247877
1775,2023/1/5,93436,-0.029039,0.133479,1.580263,3.245940,1.340552,2.563166,8.111830,0.000000,...,0.490946,0.383714,0.732782,1.367356,3.462956,0.000000,8.017667,0.601475,0.000000,0.834020


In [None]:
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Multiple regression of returns on the topic-exposure columns of agg_df
# (everything from the fourth column onward).
# NOTE: `topics` here rebinds the name that fit_transform assigned earlier.
returns = agg_df["ret"].values.reshape(-1, 1)
topics = agg_df.iloc[:, 3:].values

# scikit-learn fit; its coefficients feed the bar chart at the end of the cell.
model_sk = LinearRegression().fit(topics, returns)

# statsmodels fit on the same data with an added constant column for the
# intercept; its summary table is printed.
X = sm.add_constant(topics)
model_sm = sm.OLS(returns, X).fit()
print(model_sm.summary())
print("\n")

# Bar chart of the scikit-learn coefficients, one bar per topic column.
topic_coefs = model_sk.coef_[0]
bar_positions = range(len(topic_coefs))
bar_labels = [f"topic_{i + 1}" for i in range(topic_num)]

plt.figure(figsize=(10, 5))
plt.bar(bar_positions, topic_coefs)
plt.xticks(bar_positions, bar_labels, rotation=90)
plt.title("Coefficients of Topics")
plt.show()

