In [1]:
import os 
import openpyxl
import pandas as pd 
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from bertopic.cluster import BaseCluster
import re
import collections
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from cuml.manifold import UMAP
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from cuml.cluster import HDBSCAN
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
# from pycave.bayes import GaussianMixture as GMM
from sklearn.feature_extraction.text import CountVectorizer
import collections
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
# import torch
# import gc

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!nvidia-smi
# Check available GPUs
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

Mon Jul 15 17:20:59 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A40                     Off | 00000000:01:00.0 Off |                    0 |
|  0%   41C    P0              77W / 300W |    272MiB / 46068MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A40                     Off | 00000000:25:00.0 Off |  

In [3]:
class ClusterModel:
    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        self.model.fit(X)
        self.labels_ = self.model.predict(X)
        return self

    def predict(self, X):
        predictions = self.model.predict(X)
        self.labels_ = predictions
        return predictions
    
# class MyGMM:
#     def __init__(self, num_components, trainer_params):
#         self.gmm = GMM(num_components=num_components,trainer_params=trainer_params)
#         self.labels_ = None
    
#     def fit(self,data):
#         self.gmm.fit(data)
#         self.labels_ = np.array(self.gmm.predict(data))
#         return self
    
#     def predict(self,data):
#         return np.array(self.gmm.predict(data))

class Dimensionality:
  """ Use this for pre-calculated reduced embeddings """
  def __init__(self, reduced_embeddings):
    self.reduced_embeddings = reduced_embeddings

  def fit(self, X):
    return self

  def transform(self, X):
    return self.reduced_embeddings

In [4]:
def model_setup(modeltype, cluster_num, hdb_min_cluster_size):    
    # Embeddings
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    # Reduce dimensionality
    PCA_model = PCA(n_components=10)
    SVD_model = TruncatedSVD(n_components=10, random_state=42, n_iter=10) 
    umap_model = UMAP(n_neighbors=10, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
    # Cluster embeddings
    gmm_model = ClusterModel(GaussianMixture(n_components=100, covariance_type='full', random_state = 42))
    # gmm_model2 = MyGMM(num_components=cluster_num,trainer_params={"accelerator":'gpu',"devices":1})
    KMeans_model = KMeans(n_clusters=cluster_num)
    hdbscan_model = HDBSCAN(min_cluster_size = hdb_min_cluster_size,  metric='euclidean', cluster_selection_method='eom',\
                             gen_min_span_tree=True,prediction_data=False,min_samples = 50,verbose = True)
    # Vectorize
    vectorizer_model = CountVectorizer(stop_words="english", min_df=1, max_df = 50, ngram_range=(1, 2))
    
    model_combinations = {
    'umaphdbscan': (umap_model, hdbscan_model),
    'pcakmeans': (PCA_model, KMeans_model),
    'umapgmm': (umap_model, gmm_model),
    'pcagmm': (PCA_model, gmm_model),
    'svdkeans': (SVD_model, KMeans_model)
    }

    dim_red_model, cluster_model = model_combinations.get(modeltype, (PCA_model, KMeans_model))

    Topic_model = BERTopic(embedding_model=embedding_model, umap_model=dim_red_model, hdbscan_model=cluster_model, vectorizer_model=vectorizer_model,
                        calculate_probabilities = False, verbose = True, low_memory = True)
    return Topic_model, embedding_model

def Standardization(css_sum_by_topic):
    css_sum_by_topic_df = css_sum_by_topic.reset_index()
    css_sum_by_topic_df.columns = ['topic', 'css']
    scaler = StandardScaler()
    css_sum_by_topic_df['css_standardized'] = scaler.fit_transform(css_sum_by_topic_df[['css']])
    css_standardized_series = css_sum_by_topic_df.set_index('topic')['css_standardized']
    return css_standardized_series

In [5]:
insampler2_dict = {}
outsampler2_dict = {}
cluster_num = 60
hdb_min_cluster_size = 800
df_folder = "/shared/share_tm-finance/Processed_df_Sentiment/One_year_window"
embeddings_folder = "/shared/share_tm-finance/Embeddings_with_Sentiment/One_year_window"
saved_model_folder = "/shared/share_tm-finance/Stored_model"
datatype = 'contem'
modeltype = 'pcakmeans'
sentiment_type = 'per_day_per_comp'
save_model = False
for i in range(2014, 2015):
    
    tar_year = i
    df = pd.read_csv(df_folder+'/{type}_{year}_senti.csv'.format(year = tar_year, type = datatype))
    red_headlines = df.vocab_con_headline.tolist()
    embeddings = np.load(embeddings_folder+"/{type}_{year}_senti_embeddings.npy".format(year = tar_year, type = datatype))

    Topic_model, embedding_model = model_setup(modeltype, cluster_num, hdb_min_cluster_size)
    
    insampler2_list = []
    outsampler2_list = []
    indices = np.arange(len(red_headlines))
    
    for i in range(1):
        # Perform the train-test split on indices
        tr_ind, te_ind = train_test_split(indices, test_size=0.2, shuffle= True, random_state=66)

        # torch.cuda.empty_cache()
        # gc.collect()
        tr_df = df.iloc[tr_ind,:]
        te_df = df.iloc[te_ind,:]
        tr_headlines = [red_headlines[ind] for ind in tr_ind]
        te_headlines = [red_headlines[ind] for ind in te_ind]
        tr_embeddings = embeddings[tr_ind,:]
        te_embeddings = embeddings[te_ind,:]
        
        topics, probs = Topic_model.fit_transform(tr_headlines)

        #save the topic model
        if save_model == True:
            Topic_model.save(saved_model_folder+'/{model}/{year}_{topic_num}_{count}'.format(year = tar_year, topic_num = cluster_num, model = modeltype, count = i+1),
            serialization = "safetensors", save_ctfidf = True, save_embedding_model = embedding_model)
        
        #calculate insample R2
        tr_topic_dist, _ = Topic_model.approximate_distribution(tr_headlines)
        tr_df = tr_df.reset_index(drop = True)
        tr_contem_ret_topic_dist = pd.concat([tr_df.drop(columns = ["rp_entity_id","headline","vocab_con_headline"]),pd.DataFrame(tr_topic_dist)],axis = 1)
        tr_grouped = tr_contem_ret_topic_dist.groupby(['date',"comnam","ret"])
        tr_grouped_sum = tr_grouped.sum()
        if sentiment_type == "per_topic":
            tr_df['topic'] = topics
            tr_css_sum_by_topic = tr_df.groupby('topic')['css'].sum()
            tr_df.drop(columns = ['topic'], inplace = True)
            tr_css_standardized_series = Standardization(tr_css_sum_by_topic)
            tr_grouped_sum.iloc[:, 1:] = tr_grouped_sum.iloc[:, 1:].mul(tr_css_standardized_series, axis=1)
        elif sentiment_type == "per_day_per_comp":
            tr_grouped_sum.iloc[:, 1:] = tr_grouped_sum.iloc[:, 1:].mul(tr_grouped_sum['css'], axis=0)
        
        tr_grouped_sum.drop(columns = ['css'], inplace = True)

        tr_X = np.array(tr_grouped_sum)
        tr_ret = [ind[2] for ind in list(tr_grouped_sum.index)]
        tr_Y = np.array(tr_ret).reshape(-1,1)
        tr_regression = LinearRegression(fit_intercept=True)
        tr_regression.fit(tr_X,tr_Y)
        tr_Y_pred = tr_regression.predict(tr_X)
        tr_Y_mean = np.mean(tr_Y)
        tr_SS_tot = np.sum((tr_Y - tr_Y_mean) ** 2)
        tr_SS_res = np.sum((tr_Y - tr_Y_pred) ** 2)
        tr_r2 = 1 - (tr_SS_res / tr_SS_tot)
        insampler2_list.append(tr_r2)
        

        #calculate outsample R2
        new_topics, new_probs = Topic_model.transform(te_headlines)
        te_topic_dist, _ = Topic_model.approximate_distribution(te_headlines)     
        te_df = te_df.reset_index(drop = True)
        te_contem_ret_topic_dist = pd.concat([te_df.drop(columns = ["rp_entity_id","headline","vocab_con_headline"]),pd.DataFrame(te_topic_dist)],axis = 1)
        te_grouped = te_contem_ret_topic_dist.groupby(['date',"comnam","ret"])
        te_grouped_sum = te_grouped.sum()
        if sentiment_type == "per_topic":
            te_df['topic'] = new_topics
            te_css_sum_by_topic = te_df.groupby('topic')['css'].sum()
            te_df.drop(columns = ['topic'], inplace = True)
            te_css_standardized_series = Standardization(te_css_sum_by_topic)
            te_grouped_sum.iloc[:, 1:] = te_grouped_sum.iloc[:, 1:].mul(te_css_standardized_series, axis=1)
        elif sentiment_type == "per_day_per_comp":
            te_grouped_sum.iloc[:, 1:] = te_grouped_sum.iloc[:, 1:].mul(te_grouped_sum['css'], axis=0)
        
        te_grouped_sum.drop(columns = ['css'], inplace = True)
        
        te_X = np.array(te_grouped_sum)
        te_ret = [ind[2] for ind in list(te_grouped_sum.index)]
        te_Y = np.array(te_ret).reshape(-1,1)
        te_regression = LinearRegression(fit_intercept=True)
        te_regression.fit(te_X,te_Y)
        te_Y_pred = tr_regression.predict(te_X)
        te_SS_tot = np.sum((te_Y - tr_Y_mean) ** 2)
        te_SS_res = np.sum((te_Y - te_Y_pred) ** 2)
        te_r2 = 1 - (te_SS_res / te_SS_tot)
        outsampler2_list.append(te_r2)

    insampler2_dict[tar_year] = insampler2_list
    outsampler2_dict[tar_year] = outsampler2_list

    print("Year {year} is done".format(year = tar_year))

print(insampler2_dict)
print(outsampler2_dict)



2024-07-15 17:21:33,936 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 34982/34982 [06:35<00:00, 88.50it/s] 
2024-07-15 17:28:19,212 - BERTopic - Embedding - Completed ✓
2024-07-15 17:28:19,213 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-15 17:28:21,721 - BERTopic - Dimensionality - Completed ✓
2024-07-15 17:28:21,751 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-15 17:28:44,036 - BERTopic - Cluster - Completed ✓
2024-07-15 17:28:44,144 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-15 17:29:01,049 - BERTopic - Representation - Completed ✓
100%|██████████| 1120/1120 [01:43<00:00, 10.84it/s]
Batches: 100%|██████████| 8746/8746 [01:30<00:00, 96.66it/s] 
2024-07-15 17:32:19,502 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-07-15 17:32:19,636 - BERTopic - Dimensionality - Completed ✓
2024-07-15 17:3

Year 2014 is done
{2014: [0.006511495023692193]}
{2014: [0.0004936368944844505]}


In [None]:
#Load Model
cluster_num = 60
hdb_min_cluster_size = 800
df_folder = "/shared/share_tm-finance/Processed_df_Sentiment/One_year_window"
embeddings_folder = "/shared/share_tm-finance/Embeddings_with_Sentiment/One_year_window"
saved_model_folder = "/shared/share_tm-finance/Stored_model"
datatype = 'contem'
modeltype = 'pcakmeans'
sentiment_type = 'per_day_per_comp'
save_model = False
count_num = 5
tar_year = 2022
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
df = pd.read_csv(df_folder+'/{type}_{year}_senti.csv'.format(year = tar_year, type = datatype))
red_headlines = df.vocab_con_headline.tolist()
# embeddings = np.load(embeddings_folder+"/{type}_{year}_senti_embeddings.npy".format(year = tar_year, type = datatype))

Topic_model = BERTopic.load(saved_model_folder+'/{model}/{year}_{topic_num}_{count}'.format(year = tar_year, topic_num = cluster_num, model = modeltype, count = count_num), embedding_model = embedding_model)
# topics, _ = Topic_model.fit_transform(red_headlines)

In [7]:
#Calculate Model Coherence Score

from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora

# Preprocess Documents
documents = pd.DataFrame({"Document": tr_headlines,
                          "ID": range(len(tr_headlines)),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = Topic_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = Topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation

tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [[words for words, _ in Topic_model.get_topic(topic)] 
               for topic in range(len(set(topics))-1)]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Coherence Score: 0.535476576998597


In [8]:
topic_info = Topic_model.get_topic_info()

topic_word_prob_matrix = []
topics_word_str_matrix = []
for topic in topic_info.Topic.unique():
    if topic != -1:  # Skip the outliers
        words, probabilities = zip(*Topic_model.get_topic(topic))
        topic_word_prob_matrix.append(probabilities)
        topics_word_str_matrix.append(words)

topic_word_matrix = np.array(topic_word_prob_matrix)
doc_topic_matrix = np.array(probs)
topics_matrix = np.array(topics_word_str_matrix)

model_output = {'topics': topics_matrix, 'doc_topic_matrix': doc_topic_matrix, 'topic-word-matrix': topic_word_matrix}

In [None]:
# from octis.evaluation_metrics.coherence_metrics import Coherence

# metric = Coherence(texts = topics_matrix, measure='c_v') # Initialize metric
# topic_coherence_score = metric.score(model_output) # Compute score of the metric
# topic_coherence_score

In [None]:
#Calculate Topic Coherence Score

from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora


# Preprocess Documents
documents = pd.DataFrame({"Document": red_headlines,
                          "ID": range(len(red_headlines)),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = Topic_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = Topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

# Initialize an empty list to store coherence scores for each topic
coherence_scores = []

# Compute coherence score for each topic
for topic in range(len(set(topics))-1):
    topic_words = [[words for words, _ in Topic_model.get_topic(topic)]]
    coherence_model = CoherenceModel(topics=topic_words, 
                                     texts=tokens, 
                                     corpus=corpus,
                                     dictionary=dictionary, 
                                     coherence='c_v')
    coherence = coherence_model.get_coherence()
    print(f"Topic {topic} Coherence Score: {coherence}")
    coherence_scores.append((topic, coherence))

# Optionally, you can convert it to a DataFrame for better visualization
coherence_df = pd.DataFrame(coherence_scores, columns=['Topic', 'Coherence_Score'])
print(coherence_df)


In [9]:
#Calculate Model Diversity Score

def compute_model_diversity(topics):
    unique_words = set()
    total_words = 0

    for topic in topics:
        total_words += len(topic)
        for words in topic:
            unique_words.add(words)
    diversity_score = len(unique_words) / total_words
    return diversity_score

model_diversity = compute_model_diversity(topic_words)
print(f"Model Diversity: {model_diversity}")


Model Diversity: 0.7949152542372881


In [None]:
#Calculate Model Significance Score

from octis.evaluation_metrics.topic_significance_metrics import KL_uniform

metric = KL_uniform() # Initialize metric
significance_score = metric.score(model_output) # Compute score of the metric

In [None]:
# Composite Score Calculation
composite_score = 0.2 * coherence_score + 0.2 * model_diversity + 0.2 * significance_score + 4 * insampler2_list[-1]
print(f"Composite Score: {composite_score}")

# Define Thresholds for Filtering Topics
coherence_threshold = 0.5
diversity_threshold = 0.5
significane_threshold = 0.8
impact_threshold = 0.01

# Filter Topics
if coherence_score > coherence_threshold and model_diversity > diversity_threshold and significance_score > significane_threshold and insampler2_list[-1] > impact_threshold and composite_score > 1:
    print("Topics are meaningful and relevant.")
else:
    print("Topics need further refinement.")


Topics are meaningful and relevant.
