In [1]:
# Import necessary modules
import os 
import pandas as pd 
from sentence_transformers import SentenceTransformer
# Take PCA and KMeans as the example here
from cuml.decomposition import PCA
# from cuml.cluster import KMeans
from pycave.bayes import GaussianMixture as GMM
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import numpy as np
import collections
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Check available GPUs
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

In [3]:
df_folder = "/scratch/wx2309/Processed_data/one_year_window"
embeddings_folder = "/scratch/wx2309/embeddings"
df = pd.read_csv(df_folder+"/contem_2023.csv")
headlines = df.headline.tolist()
embeddings = np.load(embeddings_folder+"/contem_2023_embeddings.npy")

In [4]:
vocab = collections.Counter()
tokenizer = CountVectorizer().build_tokenizer()
for headline in tqdm(headlines):
    vocab.update(tokenizer(headline))
vocab = [word for word, count in vocab.items() if count > 20]
len(vocab)

100%|██████████| 1749770/1749770 [00:07<00:00, 233618.72it/s]


41353

In [6]:
class MyGMM:
    def __init__(self, num_components, trainer_params):
        self.gmm = GMM(num_components=num_components,trainer_params=trainer_params)
        self.labels_ = None
    
    def fit(self,data):
        self.gmm.fit(data)
        self.labels_ = np.array(self.gmm.predict(data))
        return self
    
    def predict(self,data):
        return np.array(self.gmm.predict(data))

In [8]:
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
umap_model = PCA(n_components = 10)
# Assume 62 clusters here
hdbscan_model = MyGMM(num_components=62,trainer_params={"accelerator":'gpu',"devices":1})
vectorizer_model = CountVectorizer(vocabulary=vocab,stop_words="english")
Topic_model = BERTopic(embedding_model=embedding_model, umap_model=umap_model, hdbscan_model=hdbscan_model, vectorizer_model=vectorizer_model,
                    calculate_probabilities = False,verbose = True,low_memory = True)

In [9]:
Topic_model.fit(headlines,embeddings)

2024-06-17 23:51:41,431 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-17 23:51:44,071 - BERTopic - Dimensionality - Completed ✓
2024-06-17 23:51:44,109 - BERTopic - Cluster - Start clustering the reduced embeddings
Fitting K-means estimator...
Running initialization...


Epoch 122: 100%|██████████| 1/1 [00:00<00:00, 106.76it/s]
Epoch 299: 100%|██████████| 1/1 [00:00<00:00, 109.59it/s]

Fitting K-Means...



Epoch 299: 100%|██████████| 1/1 [00:00<00:00, 41.05it/s, inertia=0.0527]

Running initialization...



Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 57.32it/s]

Fitting Gaussian mixture...



Epoch 23: 100%|██████████| 1/1 [00:00<00:00, 54.03it/s, nll=-8.54] 
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  6.98it/s] 


2024-06-17 23:51:57,233 - BERTopic - Cluster - Completed ✓
2024-06-17 23:51:57,415 - BERTopic - Representation - Extracting topics from clusters using representation models.
  idf = np.log((avg_nr_samples / df)+1)
2024-06-17 23:52:07,869 - BERTopic - Representation - Completed ✓


<bertopic._bertopic.BERTopic at 0x7fa728ce9810>

In [10]:
topic_dist, _ = Topic_model.approximate_distribution(headlines)
contem_ret_topic_dist = pd.concat([df.drop(columns = ["rp_entity_id","headline"]),pd.DataFrame(topic_dist)],axis = 1)
grouped = contem_ret_topic_dist.groupby(['date',"comnam","ret"])
grouped_sum = grouped.sum()

X = np.array(grouped_sum)
ret = [ind[2] for ind in list(grouped_sum.index)]
Y = np.array(ret).reshape(-1,1)
X_tr, X_te, Y_tr, Y_te = train_test_split(X,Y,test_size=0.2,random_state=66)
regression = LinearRegression(fit_intercept=True)
regression.fit(X_tr,Y_tr)
Y_tr_pred = regression.predict(X_tr)
Y_te_pred = regression.predict(X_te)
mse_tr = mean_squared_error(Y_tr,Y_tr_pred)
mse_te= mean_squared_error(Y_te,Y_te_pred)
regression.fit(X,Y)
R_square = regression.score(X,Y)

print(f"Training error is {mse_tr}")
print(f"Testing error is {mse_te}")
print(f"R square is {R_square}")

100%|██████████| 1750/1750 [01:45<00:00, 16.63it/s]


Training error is 0.00038033350059003433
Testing error is 0.0003901718257529071
R square is 0.007925715186710591
