In [1]:
!nvidia-smi

Thu Jul  4 17:45:27 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  |   00000000:07:00.0 Off |                    0 |
| N/A   57C    P0            270W /  400W |   14715MiB /  81920MiB |    100%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA A100-SXM4-80GB          On  |   00

In [2]:
# Pin this process to GPU 1: the nvidia-smi output above shows GPU 0 already at 100% utilisation.
# The variable is set here, before the GPU-backed libraries are imported in the next cell
# (presumably so that they only ever see this one device — confirm if the import order changes).
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [4]:
# Import necessary modules
import pandas as pd 
from sentence_transformers import SentenceTransformer
# Use PCA for dimensionality reduction and a Gaussian mixture (in place of KMeans) for clustering
from cuml.decomposition import PCA
# from cuml.cluster import KMeans
from pycave.bayes import GaussianMixture as GMM
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import numpy as np
import collections
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Directories holding the processed headline table and the matching precomputed embeddings.
df_folder = "/shared/share_tm-finance/Processed_df/One_year_window"
embeddings_folder = "/shared/share_tm-finance/Embeddings/One_year_window"

# Headline table; the `vocab_con_headline` column supplies the documents used for topic modelling.
df = pd.read_csv(f"{df_folder}/contem_2023.csv")
headlines = df["vocab_con_headline"].tolist()

# Precomputed embedding matrix.
# NOTE(review): presumably one row per headline, in the same order as `df` — confirm.
embeddings = np.load(f"{embeddings_folder}/contem_2023_embeddings.npy")

In [6]:
class MyGMM:
    """Thin adapter around a pycave Gaussian mixture for use as a clustering model.

    The wrapper offers ``fit``, ``predict`` and a ``labels_`` attribute holding
    the component index assigned to every training row, so that it can be
    handed to BERTopic through its ``hdbscan_model`` argument.

    Parameters
    ----------
    num_components : int
        Number of mixture components, forwarded to the underlying model.
    trainer_params : dict
        Trainer configuration forwarded unchanged to the underlying model
        (e.g. accelerator and device selection).
    """

    def __init__(self, num_components, trainer_params):
        self.gmm = GMM(num_components=num_components, trainer_params=trainer_params)
        # Populated by ``fit``; stays ``None`` until the model has been trained.
        self.labels_ = None

    def fit(self, data, y=None):
        """Fit the mixture on ``data`` and cache the training labels.

        Parameters
        ----------
        data : array-like
            Training samples, passed straight to the underlying model.
        y : ignored
            Accepted only for compatibility with the scikit-learn style
            ``fit(X, y=None)`` call used by pipeline code; it has no effect
            on this unsupervised model.

        Returns
        -------
        MyGMM
            ``self``, to allow call chaining.
        """
        self.gmm.fit(data)
        # Labels are obtained by predicting on the training data itself.
        self.labels_ = np.array(self.gmm.predict(data))
        return self

    def predict(self, data):
        """Return the predicted component index for each row of ``data`` as a NumPy array."""
        return np.array(self.gmm.predict(data))

In [7]:
# Sentence encoder handed to BERTopic as its embedding backend.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# BERTopic's dimensionality-reduction slot is called `umap_model`; a 10-component PCA is plugged in here.
umap_model = PCA(n_components=10)

# BERTopic's clustering slot is called `hdbscan_model`; the GMM wrapper is plugged in here.
# The number of clusters is assumed to be 62, and training runs on a single GPU.
hdbscan_model = MyGMM(
    num_components=62,
    trainer_params={"accelerator": "gpu", "devices": 1},
)

# Default bag-of-words vectorizer for the topic representations.
vectorizer_model = CountVectorizer()

Topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=False,
    verbose=True,
    low_memory=True,
)

In [8]:
# Fit on the headlines, supplying the precomputed embeddings loaded earlier.
Topic_model.fit(documents=headlines, embeddings=embeddings)

2024-07-04 17:47:19,037 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-04 17:47:24,078 - BERTopic - Dimensionality - Completed ✓
2024-07-04 17:47:24,137 - BERTopic - Cluster - Start clustering the reduced embeddings
Fitting K-means estimator...
Running initialization...


Epoch 122: 100%|██████████| 1/1 [00:00<00:00, 70.66it/s] 
Epoch 299: 100%|██████████| 1/1 [00:00<00:00, 96.84it/s] 


Fitting K-Means...


Epoch 299: 100%|██████████| 1/1 [00:00<00:00, 24.77it/s, inertia=0.0604]

Running initialization...



Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 11.98it/s]


Fitting Gaussian mixture...


Epoch 21: 100%|██████████| 1/1 [00:00<00:00, 52.32it/s, nll=-7.83] 
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  3.40it/s] 


2024-07-04 17:47:44,561 - BERTopic - Cluster - Completed ✓
2024-07-04 17:47:44,798 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-04 17:47:53,066 - BERTopic - Representation - Completed ✓


<bertopic._bertopic.BERTopic at 0x7fcf877ff400>

In [11]:
# Approximate topic distribution for every headline; the second return value is not used.
topic_dist, _ = Topic_model.approximate_distribution(headlines)

# Drop the identifier/text columns and append the topic-distribution columns side by side.
contem_ret_topic_dist = pd.concat(
    [
        df.drop(columns=["rp_entity_id", "headline", "vocab_con_headline"]),
        pd.DataFrame(topic_dist),
    ],
    axis=1,
)

# Sum the remaining columns within each (date, company name, return) group.
grouped = contem_ret_topic_dist.groupby(["date", "comnam", "ret"])
grouped_sum = grouped.sum()

# Features: the summed columns. Target: the `ret` level (position 2) of the group index.
X = np.array(grouped_sum)
ret = [key[2] for key in grouped_sum.index]
Y = np.array(ret).reshape(-1, 1)

# 80/20 split with a fixed seed, then an ordinary least-squares fit on the training part.
X_tr, X_te, Y_tr, Y_te = train_test_split(X, Y, test_size=0.2, random_state=66)
regression = LinearRegression(fit_intercept=True).fit(X_tr, Y_tr)
Y_tr_pred = regression.predict(X_tr)
Y_te_pred = regression.predict(X_te)
mse_tr = mean_squared_error(Y_tr, Y_tr_pred)
mse_te = mean_squared_error(Y_te, Y_te_pred)

# NOTE(review): the same estimator is refit on the full sample here, so the reported
# R square is in-sample on all data and `regression` no longer holds the train-only fit.
R_square = regression.fit(X, Y).score(X, Y)

print(f"Training error is {mse_tr}")
print(f"Testing error is {mse_te}")
print(f"R square is {R_square}")

100%|██████████| 1750/1750 [01:27<00:00, 19.89it/s]


Training error is 0.00038121179866213643
Testing error is 0.00039281251210995127
R square is 0.0047007730087390565
