In [1]:
# Import necessary modules
import os 
import pandas as pd 
from sentence_transformers import SentenceTransformer
# Take PCA and KMeans as the example here
from cuml.decomposition import PCA
from cuml.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import numpy as np
import collections
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Check available GPUs
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

In [3]:
df_folder = "/scratch/wx2309/Processed_data/one_year_window"
embeddings_folder = "/scratch/wx2309/embeddings"
df = pd.read_csv(df_folder+"/contem_2023.csv")
headlines = df.headline.tolist()
embeddings = np.load(embeddings_folder+"/contem_2023_embeddings.npy")

In [4]:
vocab = collections.Counter()
tokenizer = CountVectorizer().build_tokenizer()
for headline in tqdm(headlines):
    vocab.update(tokenizer(headline))
vocab = [word for word, count in vocab.items() if count > 20]
len(vocab)

100%|██████████| 1749770/1749770 [00:07<00:00, 249243.79it/s]


41353

In [5]:
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
umap_model = PCA(n_components = 10)
# Assume 62 clusters here
hdbscan_model = KMeans(n_clusters=62,verbose = True)
vectorizer_model = CountVectorizer(vocabulary=vocab,stop_words="english")
Topic_model = BERTopic(embedding_model=embedding_model, umap_model=umap_model, hdbscan_model=hdbscan_model, vectorizer_model=vectorizer_model,
                    calculate_probabilities = False,verbose = True,low_memory = True)

In [6]:
Topic_model.fit(headlines,embeddings)

2024-06-15 12:34:19,415 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-15 12:34:21,303 - BERTopic - Dimensionality - Completed ✓
2024-06-15 12:34:21,343 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-15 12:34:21,870 - BERTopic - Cluster - Completed ✓
2024-06-15 12:34:22,035 - BERTopic - Representation - Extracting topics from clusters using representation models.
  idf = np.log((avg_nr_samples / df)+1)
2024-06-15 12:34:31,980 - BERTopic - Representation - Completed ✓


<bertopic._bertopic.BERTopic at 0x7f882dd2a940>

In [7]:
topic_dist, _ = Topic_model.approximate_distribution(headlines)
contem_ret_topic_dist = pd.concat([df.drop(columns = ["rp_entity_id","headline"]),pd.DataFrame(topic_dist)],axis = 1)
grouped = contem_ret_topic_dist.groupby(['date',"comnam","ret"])
grouped_sum = grouped.sum()

X = np.array(grouped_sum)
ret = []
for ind in list(grouped_sum.index):
  ret.append(ind[2])
Y = np.array(ret).reshape(-1,1)
X_tr, X_te, Y_tr, Y_te = train_test_split(X,Y,test_size=0.2,random_state=66)
regression = LinearRegression(fit_intercept=True)
regression.fit(X_tr,Y_tr)
Y_tr_pred = regression.predict(X_tr)
Y_te_pred = regression.predict(X_te)

mse_tr = mean_squared_error(Y_tr,Y_tr_pred)
mse_te= mean_squared_error(Y_te,Y_te_pred)
print(f"Training R square is {regression.score(X_tr,Y_tr)}")
print(f"Training error is {mse_tr}")
print(f"Testing error is {mse_te}")

100%|██████████| 1750/1750 [01:41<00:00, 17.20it/s]


Training R square is 0.008919540573840479
Training error is 0.0003796797822123109
Testing error is 0.0003894050908241998
