In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "6"

In [2]:
# Import necessary modules
import pandas as pd 
from sentence_transformers import SentenceTransformer
# Take PCA and GMM as the example here
from cuml.decomposition import PCA
from pycave.bayes import GaussianMixture as GMM
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import torch
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df_folder = "/shared/share_tm-finance/Processed_df_Sentiment/One_year_window"
embeddings_folder = "/shared/share_tm-finance/Embeddings_with_Sentiment/One_year_window"
saved_model_folder = "/shared/share_tm-finance/Stored_model/pcagmm"
year = 2022
num_clusters = 60
df = pd.read_csv(df_folder+f"/contem_{year}_senti.csv")
headlines = df.vocab_con_headline.tolist()
embeddings = np.load(embeddings_folder+f"/contem_{year}_senti_embeddings.npy")

In [4]:
df.head()

Unnamed: 0,date,rp_entity_id,comnam,ret,headline,css,vocab_con_headline
0,2022-01-03,6284B5,IRON MOUNTAIN INC NEW,-0.019301,CFA High Yield:Insider Review For Week Ended 3...,0.0,cfa high yield insider review week ended 31 dec
1,2022-01-03,6284B5,IRON MOUNTAIN INC NEW,-0.019301,CFA Real Estate:Insider Review For Week Ended ...,0.0,cfa real estate insider review week ended 31 dec
2,2022-01-03,6284B5,IRON MOUNTAIN INC NEW,-0.019301,MW These are the best-performing S&P 500 and N...,0.1,best performing 500 nasdaq 100
3,2022-01-03,636639,DOVER CORP,-0.018007,PE Daily: Private Equity's Tech Bonanza | -2-,-0.06,pe daily private equity tech bonanza
4,2022-01-03,636639,DOVER CORP,-0.018007,Private Equity Daily: Private Equity's 2021 Te...,0.0,private equity daily private equity tech


In [5]:
class MyGMM:
    def __init__(self, num_components, trainer_params):
        self.gmm = GMM(num_components=num_components,trainer_params=trainer_params)
        self.labels_ = None
    
    def fit(self,data):
        self.gmm.fit(data)
        self.labels_ = np.array(self.gmm.predict(data))
        return self
    
    def predict(self,data):
        return np.array(self.gmm.predict(data))

In [6]:
insampler2_list = []
outsampler2_new_list = []
outsampler2_old_list = []
kfold = KFold(shuffle=True,random_state=66)
for i, (tr_ind,te_ind) in enumerate(kfold.split(range(len(headlines)))):
    torch.cuda.empty_cache()
    gc.collect()

    tr_df = df.iloc[tr_ind,:]
    te_df = df.iloc[te_ind,:]
    tr_headlines = [headlines[ind] for ind in tr_ind]
    te_headlines = [headlines[ind] for ind in te_ind]
    tr_embeddings = embeddings[tr_ind,:]

    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    umap_model = PCA(n_components = 10)
    # Assume 60 clusters here
    hdbscan_model = MyGMM(num_components=num_clusters,trainer_params={"accelerator":'gpu',"devices":1})
    vectorizer_model = CountVectorizer()
    Topic_model = BERTopic(embedding_model=embedding_model, umap_model=umap_model, hdbscan_model=hdbscan_model, vectorizer_model=vectorizer_model,
                        calculate_probabilities = False,verbose = True)
    
    Topic_model.fit(tr_headlines,tr_embeddings)

    os.makedirs(saved_model_folder+f"/{year}_{num_clusters}_{i+1}",exist_ok=True)
    Topic_model.save(saved_model_folder+f"/{year}_{num_clusters}_{i+1}",serialization="safetensors")

    tr_topic_dist, _ = Topic_model.approximate_distribution(tr_headlines)
    tr_contem_ret_topic_dist = pd.concat([tr_df.drop(columns = ["rp_entity_id","headline","vocab_con_headline","css"]),pd.DataFrame(tr_topic_dist)],axis = 1)
    tr_grouped = tr_contem_ret_topic_dist.groupby(['date',"comnam","ret"])
    tr_grouped_sum = tr_grouped.sum()
    tr_X = np.array(tr_grouped_sum)
    tr_ret = [ind[2] for ind in list(tr_grouped_sum.index)]
    tr_Y = np.array(tr_ret).reshape(-1,1)
    tr_regression = LinearRegression(fit_intercept=True)
    tr_regression.fit(tr_X,tr_Y)
    insampler2_list.append(tr_regression.score(tr_X,tr_Y))

    te_topic_dist, _ = Topic_model.approximate_distribution(te_headlines)
    te_contem_ret_topic_dist = pd.concat([te_df.drop(columns = ["rp_entity_id","headline","vocab_con_headline","css"]),pd.DataFrame(te_topic_dist)],axis = 1)
    te_grouped = te_contem_ret_topic_dist.groupby(['date',"comnam","ret"])
    te_grouped_sum = te_grouped.sum()
    te_X = np.array(te_grouped_sum)
    te_ret = [ind[2] for ind in list(te_grouped_sum.index)]
    te_Y = np.array(te_ret).reshape(-1,1)
    te_regression = LinearRegression(fit_intercept=True)
    te_regression.fit(te_X,te_Y)
    outsampler2_new_list.append(te_regression.score(te_X,te_Y))

    outsampler2_old_list.append(tr_regression.score(te_X,te_Y))
    


2024-07-10 12:49:20,347 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-10 12:49:24,418 - BERTopic - Dimensionality - Completed ✓
2024-07-10 12:49:24,455 - BERTopic - Cluster - Start clustering the reduced embeddings
Fitting K-means estimator...
Running initialization...


Epoch 118: 100%|██████████| 1/1 [00:00<00:00, 112.35it/s]
Epoch 299: 100%|██████████| 1/1 [00:00<00:00, 84.49it/s] 


Fitting K-Means...


Epoch 299: 100%|██████████| 1/1 [00:00<00:00, 41.84it/s, inertia=0.0603]

Running initialization...



Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 53.17it/s]


Fitting Gaussian mixture...


Epoch 21: 100%|██████████| 1/1 [00:00<00:00, 61.06it/s, nll=-7.85] 
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  5.84it/s] 


2024-07-10 12:49:38,897 - BERTopic - Cluster - Completed ✓
2024-07-10 12:49:39,035 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-10 12:49:44,069 - BERTopic - Representation - Completed ✓
100%|██████████| 1436/1436 [00:56<00:00, 25.49it/s]
100%|██████████| 359/359 [00:14<00:00, 25.64it/s]
2024-07-10 12:50:59,489 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-10 12:51:00,148 - BERTopic - Dimensionality - Completed ✓
2024-07-10 12:51:00,178 - BERTopic - Cluster - Start clustering the reduced embeddings
Fitting K-means estimator...
Running initialization...


Epoch 118: 100%|██████████| 1/1 [00:00<00:00, 126.90it/s]
Epoch 299: 100%|██████████| 1/1 [00:00<00:00, 157.85it/s]

Fitting K-Means...



Epoch 299: 100%|██████████| 1/1 [00:00<00:00, 24.81it/s, inertia=0.0602]


Running initialization...


Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 62.74it/s]


Fitting Gaussian mixture...


Epoch 27: 100%|██████████| 1/1 [00:00<00:00, 36.61it/s, nll=-7.86] 
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  5.43it/s] 


2024-07-10 12:51:12,737 - BERTopic - Cluster - Completed ✓
2024-07-10 12:51:12,868 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-10 12:51:17,847 - BERTopic - Representation - Completed ✓
100%|██████████| 1436/1436 [00:56<00:00, 25.62it/s]
100%|██████████| 359/359 [00:13<00:00, 25.91it/s]
2024-07-10 12:52:31,902 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-10 12:52:32,506 - BERTopic - Dimensionality - Completed ✓
2024-07-10 12:52:32,547 - BERTopic - Cluster - Start clustering the reduced embeddings
Fitting K-means estimator...
Running initialization...


Epoch 118: 100%|██████████| 1/1 [00:00<00:00, 77.14it/s] 
Epoch 299: 100%|██████████| 1/1 [00:00<00:00, 144.36it/s]

Fitting K-Means...



Epoch 44: 100%|██████████| 1/1 [00:00<00:00, 22.52it/s, inertia=0.0604]

Running initialization...



Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 102.15it/s]

Fitting Gaussian mixture...



Epoch 37: 100%|██████████| 1/1 [00:00<00:00, 40.93it/s, nll=-7.87] 
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  5.95it/s] 


2024-07-10 12:52:38,299 - BERTopic - Cluster - Completed ✓
2024-07-10 12:52:38,434 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-10 12:52:43,482 - BERTopic - Representation - Completed ✓
100%|██████████| 1436/1436 [00:55<00:00, 25.76it/s]
100%|██████████| 359/359 [00:13<00:00, 26.47it/s]
2024-07-10 12:53:57,313 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-10 12:53:57,969 - BERTopic - Dimensionality - Completed ✓
2024-07-10 12:53:57,999 - BERTopic - Cluster - Start clustering the reduced embeddings
Fitting K-means estimator...
Running initialization...


Epoch 118: 100%|██████████| 1/1 [00:00<00:00, 140.35it/s]
Epoch 299: 100%|██████████| 1/1 [00:00<00:00, 126.72it/s]

Fitting K-Means...



Epoch 299: 100%|██████████| 1/1 [00:00<00:00, 30.21it/s, inertia=0.0602]


Running initialization...


Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 56.84it/s]


Fitting Gaussian mixture...


Epoch 23: 100%|██████████| 1/1 [00:00<00:00, 40.02it/s, nll=-7.86] 
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  5.46it/s] 


2024-07-10 12:54:12,398 - BERTopic - Cluster - Completed ✓
2024-07-10 12:54:12,541 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-10 12:54:17,662 - BERTopic - Representation - Completed ✓
100%|██████████| 1436/1436 [00:56<00:00, 25.51it/s]
100%|██████████| 359/359 [00:13<00:00, 25.77it/s]
2024-07-10 12:55:32,018 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-10 12:55:32,668 - BERTopic - Dimensionality - Completed ✓
2024-07-10 12:55:32,699 - BERTopic - Cluster - Start clustering the reduced embeddings
Fitting K-means estimator...
Running initialization...


Epoch 118: 100%|██████████| 1/1 [00:00<00:00, 113.80it/s]
Epoch 299: 100%|██████████| 1/1 [00:00<00:00, 77.05it/s] 


Fitting K-Means...


Epoch 299: 100%|██████████| 1/1 [00:00<00:00, 20.89it/s, inertia=0.0602]

Running initialization...



Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 96.58it/s] 


Fitting Gaussian mixture...


Epoch 23: 100%|██████████| 1/1 [00:00<00:00, 39.50it/s, nll=-7.87] 
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  5.40it/s] 


2024-07-10 12:55:46,343 - BERTopic - Cluster - Completed ✓
2024-07-10 12:55:46,479 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-10 12:55:51,750 - BERTopic - Representation - Completed ✓
100%|██████████| 1436/1436 [00:55<00:00, 25.70it/s]
100%|██████████| 359/359 [00:14<00:00, 25.49it/s]


In [7]:
print(insampler2_list)
print(outsampler2_new_list)
print(outsampler2_old_list)
print(np.mean(insampler2_list))
print(np.mean(outsampler2_new_list))
print(np.mean(outsampler2_old_list))

[0.002074418024785585, 0.0017820044682698155, 0.001709449963743781, 0.001887064272858674, 0.001870062278081952]
[0.0016992174542274086, 0.0020058086507290795, 0.0017732304322696235, 0.0022051776698324144, 0.001428947114732071]
[-0.0002506909965014348, -0.00012987675045428304, -0.00020084067109116255, -0.00022111001433544608, -0.00016451984265297703]
0.0018645998015479615
0.0018224762643581193
-0.0001934076550070607
