In [1]:
!nvidia-smi

Tue May 28 16:31:34 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:3B:00.0 Off |                    0 |
| N/A   64C    P0              68W /  70W |  12856MiB / 15360MiB |    100%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla T4                       Off | 00000000:5E:00.0 Off |  

In [2]:
# Choose GPUs with enough memories
import os 
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [6]:
from zipfile import ZipFile
import pandas as pd

with ZipFile("/user/wx2309/Topic-modeling-store/Processed data/past one year/contem_2023.csv.zip", "r") as unzipped_file:
  with unzipped_file.open("contem_2023.csv") as csv_file:
    contem_2023 = pd.read_csv(csv_file)

In [7]:
contem_2023.head()

Unnamed: 0,date,rp_entity_id,comnam,ret,headline
0,2023-01-03,B4C673,YUM BRANDS INC,-0.008745,Business Disruptions Wane As Some Industries S...
1,2023-01-03,B4C673,YUM BRANDS INC,-0.008745,Press Release: KFC(R) Offers A Massive Deal on...
2,2023-01-03,B4C673,YUM BRANDS INC,-0.008745,Business News: Pizza Chains Race to Hire Driv...
3,2023-01-03,B4C673,YUM BRANDS INC,-0.008745,KFC(R) Offers A Massive Deal on a Comfort Food...
4,2023-01-03,B4C673,YUM BRANDS INC,-0.008745,Business Disruptions Wane as Some Industries S...


In [8]:
contem_2023.shape

(1749770, 5)

In [9]:
from sentence_transformers import SentenceTransformer
from cuml.manifold import UMAP
from cuml.cluster import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# embeddings = embedding_model.encode(contem_2023.headline.tolist(),show_progress_bar=True)

In [None]:
# import numpy as np

# os.mkdir("/user/wx2309/Topic-modeling-store/Bertopic_sp500_2023_contem_model_folder")
# with open("/user/wx2309/Topic-modeling-store/Bertopic_sp500_2023_contem_model_folder/contem_2023_embeddings.npy", "wb") as f:
#     np.save(f, embeddings)
# headline_list = contem_2023.headline.tolist()

In [12]:
import numpy as np

with open("/user/wx2309/Topic-modeling-store/Bertopic_sp500_2023_contem_model_folder/contem_2023_embeddings.npy", "rb") as f:
    embeddings = np.load(f)
headline_list = contem_2023.headline.tolist()

In [13]:
# create vocabulary in advance to release memory
import collections
from tqdm import tqdm

vocab = collections.Counter()
tokenizer = CountVectorizer().build_tokenizer()
for headline in tqdm(headline_list):
    vocab.update(tokenizer(headline))
vocab = [word for word, count in vocab.items() if count > 20]
len(vocab)

100%|██████████████████████████████████| 1749770/1749770 [00:08<00:00, 208610.39it/s]


41353

In [14]:
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
umap_model = UMAP(n_neighbors = 50, n_components = 5, min_dist = 0, metric = 'cosine',random_state = 42,verbose = True)
# Upon several fittings, it was found that cluster size around 1000 to a 1.7M headlines create 50-100 topics
hdbscan_model = HDBSCAN(min_cluster_size =1000,  metric='euclidean', cluster_selection_method='eom',\
                        gen_min_span_tree=True,prediction_data=False,min_samples = 50,verbose = True)
vectorizer_model = CountVectorizer(vocabulary=vocab,stop_words="english")
Topic_model = BERTopic(embedding_model=embedding_model, umap_model=umap_model, hdbscan_model=hdbscan_model, vectorizer_model=vectorizer_model,
                       calculate_probabilities = False,verbose = True,low_memory = True)

In [None]:
# Topic_model.fit(headline_list,embeddings)
# os.mkdir("/user/wx2309/Topic-modeling-store/Bertopic_sp500_2023_contem_model_folder/Bertopic_sp500_2023_contem_model")
# Topic_model.save("/user/wx2309/Topic-modeling-store/Bertopic_sp500_2023_contem_model_folder/Bertopic_sp500_2023_contem_model",serialization = "safetensors", save_ctfidf = True, save_embedding_model = embedding_model)

In [16]:
Topic_model = BERTopic.load("/user/wx2309/Topic-modeling-store/Bertopic_sp500_2023_contem_model_folder/Bertopic_sp500_2023_contem_model",
                           embedding_model = embedding_model)

In [18]:
topic_dist, _ = Topic_model.approximate_distribution(headline_list)
contem_ret_topic_dist = pd.concat([contem_2023.drop(columns = ["rp_entity_id","headline"]),pd.DataFrame(topic_dist)],axis = 1)
grouped = contem_ret_topic_dist.groupby(['date',"comnam","ret"])
grouped_sum = grouped.sum()

100%|████████████████████████████████████████████| 1750/1750 [01:59<00:00, 14.61it/s]


In [19]:
X = np.array(grouped_sum)
ret = []
for ind in list(grouped_sum.index):
  ret.append(ind[2])
Y = np.array(ret).reshape(-1,1)
from sklearn.linear_model import LinearRegression
bert_model = LinearRegression(fit_intercept=True)
bert_model.fit(X,Y)
bert_model.score(X,Y)

0.008572583236138631