In [None]:
import os 
import openpyxl
import pandas as pd 
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from bertopic.cluster import BaseCluster
import re
import collections
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from cuml.manifold import UMAP
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from cuml.cluster import HDBSCAN
from hdbscan.flat import (HDBSCAN_flat,
                          approximate_predict_flat,
                          membership_vector_flat,
                          all_points_membership_vectors_flat)
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
#from pycave.bayes import GaussianMixture as GMM
from sklearn.feature_extraction.text import CountVectorizer
import collections
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
#!nvidia-smi
# Check available GPUs
# os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [None]:
class ClusterModel:
    """Adapter that gives a fit/predict estimator a ``labels_`` attribute.

    The wrapped ``model`` must provide ``fit(X)`` and ``predict(X)``. After
    either ``fit`` or ``predict`` is called on this wrapper, ``labels_`` holds
    the most recent predictions.
    """

    def __init__(self, model):
        # Underlying estimator; every call is delegated to it.
        self.model = model

    def fit(self, X, y=None):
        """Fit the wrapped model on ``X`` and store its predictions for ``X``.

        ``y`` is accepted for signature compatibility and is ignored.
        Returns ``self``.
        """
        self.model.fit(X)
        # predict() records the labels on the wrapper as a side effect.
        self.predict(X)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X`` and cache them in ``labels_``."""
        self.labels_ = self.model.predict(X)
        return self.labels_
    
# class MyGMM:
#     def __init__(self, num_components, trainer_params):
#         self.gmm = GMM(num_components=num_components,trainer_params=trainer_params)
#         self.labels_ = None
    
#     def fit(self,data):
#         self.gmm.fit(data)
#         self.labels_ = np.array(self.gmm.predict(data))
#         return self
    
#     def predict(self,data):
#         return np.array(self.gmm.predict(data))

class Dimensionality:
    """Stand-in reducer that serves embeddings which were already reduced.

    It exposes the ``fit`` / ``transform`` pair of a dimensionality-reduction
    model but performs no computation of its own.
    """

    def __init__(self, reduced_embeddings):
        # Pre-computed low-dimensional embeddings returned by transform().
        self.reduced_embeddings = reduced_embeddings

    def fit(self, X):
        """No-op: nothing is learned from ``X``. Returns ``self``."""
        return self

    def transform(self, X):
        """Return the stored reduced embeddings; ``X`` is not inspected."""
        return self.reduced_embeddings

# Remove individual commenter first
def remove_individual(headline):
    """Cut a headline off at its trailing byline marker, if it has one.

    The marker is: a space, an optional run of dashes, another space, then
    "By" (matched case-insensitively). Everything from the start of the first
    such marker onward is dropped. Headlines without the marker are returned
    unchanged.
    """
    byline = re.search(r" (-+)? By", headline, flags=re.IGNORECASE)
    # No marker -> keep the full length.
    cut_at = len(headline) if byline is None else byline.start()
    return headline[:cut_at]
    
# Tokenize the headline into tokens which are alphanumeric words including period . 
def custom_tokenizer(headline):
    """Split a headline into lower-cased tokens of letters, digits and periods.

    A token is a run of at least two characters from ``[a-zA-Z0-9.]`` that is
    delimited by word boundaries.

    The character class previously used the range ``A-z``, which also covers
    the ASCII punctuation between ``Z`` and ``a`` (``[``, ``\\``, ``]``, ``^``,
    ``_`` and the backtick), so strings such as ``ab^cd`` were returned as a
    single token. The range is now ``A-Z`` plus ``a-z``. Note that ``_`` is a
    regex word character, so no word boundary exists next to it and runs that
    contain an underscore yield no token.

    Parameters
    ----------
    headline : str
        Raw headline text.

    Returns
    -------
    list of str
        Tokens in order of appearance.
    """
    return re.findall(r"\b[a-zA-Z0-9.]{2,}\b", headline.lower())

# Remove tokens according to the principles 
def custom_processor(headline, remove_words):
    """Tokenize ``headline`` and re-join it without the unwanted tokens.

    Tokens come from ``custom_tokenizer``; any token contained in
    ``remove_words`` is dropped and the rest are joined with single spaces.
    """
    kept = []
    for word in custom_tokenizer(headline):
        if word in remove_words:
            continue
        kept.append(word)
    return " ".join(kept)

In [None]:
# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
r2_dict = {}        # year -> list of full-sample R^2, one entry per run
r2_tr_dict = {}     # year -> list of train-split R^2, one entry per run
r2_te_dict = {}     # year -> list of test-split R^2, one entry per run
cluster_num = 60
hdb_min_cluster_size = 1400
df_folder = '/shared/share_tm-finance'
datatype = 'contem'
modeltype = 'pcakmeans'
save_model = True
n_runs = 1               # fit/regress repetitions per year
max_umap_attempts = 10   # upper bound on the 'umaphdbscan_2' search below

# The sentence encoder does not depend on the year, so it is loaded once.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

for tar_year in range(2023, 2024):

    df = pd.read_csv(df_folder+'/Processed_df/One_year_window/{type}_{year}.csv'.format(year = tar_year, type = datatype))

    # Load the headlines and their pre-computed embeddings
    red_headlines = df.vocab_con_headline.tolist()
    embeddings = np.load(df_folder+"/Embeddings/One_year_window/{type}_{year}_embeddings.npy".format(year = tar_year, type = datatype))

    # Reduce dimensionality
    PCA_model = PCA(n_components=10)
    SVD_model = TruncatedSVD(n_components=10, random_state=42, n_iter=10)
    umap_model = UMAP(n_neighbors=10, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
    # Cluster embeddings
    gmm_model = ClusterModel(GaussianMixture(n_components=100, covariance_type='full', random_state = 42))
    # gmm_model2 = MyGMM(num_components=cluster_num,trainer_params={"accelerator":'gpu',"devices":1})
    KMeans_model = KMeans(n_clusters=cluster_num)
    hdbscan_model = HDBSCAN(min_cluster_size = hdb_min_cluster_size,  metric='euclidean', cluster_selection_method='eom',
                             gen_min_span_tree=True,prediction_data=False,min_samples = 50,verbose = True)
    # Vectorize
    vectorizer_model = CountVectorizer(stop_words="english", min_df=1, max_df = 50, ngram_range=(1, 2))

    model_combinations = {
        'umaphdbscan': (umap_model, hdbscan_model),
        'pcakmeans': (PCA_model, KMeans_model),
        'umapgmm': (umap_model, gmm_model),
        'pcagmm': (PCA_model, gmm_model),
        # Correctly spelled key; the misspelled one is kept so existing
        # configurations that use it keep working.
        'svdkmeans': (SVD_model, KMeans_model),
        'svdkeans': (SVD_model, KMeans_model)
    }

    # Any modeltype without an entry (including 'umaphdbscan_1' and
    # 'umaphdbscan_2', handled below) starts from the PCA + KMeans pair.
    dim_red_model, cluster_model = model_combinations.get(modeltype, (PCA_model, KMeans_model))

    if modeltype == 'umaphdbscan_2':
        # Search for a UMAP projection on which HDBSCAN finds roughly
        # `cluster_num` labels. The seed changes on every attempt: retrying
        # with one fixed random_state would keep producing (nearly) the same
        # projection, so a first miss would never be recovered from.
        for attempt in range(max_umap_attempts):
            reducer = UMAP(n_neighbors=10, n_components=10, min_dist=0.0, metric='cosine',
                           random_state=42 + attempt, n_epochs=1000, learning_rate=0.5)
            reduced_embeddings = reducer.fit_transform(embeddings)
            clusterer = HDBSCAN(min_cluster_size = hdb_min_cluster_size,  metric='euclidean', cluster_selection_method='eom',
                                gen_min_span_tree=True,prediction_data=False,min_samples = 50,verbose = True).fit(reduced_embeddings)
            n_labels_found = len(set(clusterer.labels_))
            if cluster_num - 5 <= n_labels_found <= cluster_num + 5:
                break
        else:
            raise RuntimeError(
                "umaphdbscan_2: no UMAP/HDBSCAN fit produced between {lo} and {hi} labels in {n} attempts".format(
                    lo = cluster_num - 5, hi = cluster_num + 5, n = max_umap_attempts))
        # NOTE(review): only the reduced embeddings are reused here; `clusterer`
        # is discarded and `cluster_model` is still the fallback KMeans.
        # Confirm whether the HDBSCAN labels were meant to be used instead.
        dim_red_model = Dimensionality(reduced_embeddings)

    Topic_model = BERTopic(embedding_model=embedding_model, umap_model=dim_red_model, hdbscan_model=cluster_model, vectorizer_model=vectorizer_model,
                        calculate_probabilities = False, verbose = True, low_memory = True)

    r2 = []
    r2_tr = []
    r2_te = []
    # Index of the run whose model is persisted. It is drawn from the runs that
    # actually execute, so save_model=True always results in one saved model.
    m = np.random.randint(0, n_runs)
    for run_idx in range(n_runs):

        # Externally computed topic labels; stays None unless a branch below
        # clusters the documents itself.
        manual_labels = None

        if modeltype == 'umaphdbscan_1':
            reducer = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
            reduced_embeddings = reducer.fit_transform(embeddings)
            clusterer = HDBSCAN_flat(X = reduced_embeddings, cluster_selection_method='eom', metric='euclidean', n_clusters=60, min_cluster_size=500,
                            gen_min_span_tree=True, prediction_data=False, min_samples = 50)
            dim_red_model = Dimensionality(reduced_embeddings)
            # BaseCluster does no clustering of its own, so the flat-HDBSCAN
            # labels have to be handed to BERTopic through `y`.
            cluster_model = BaseCluster()
            manual_labels = clusterer.labels_
            Topic_model = BERTopic(embedding_model=embedding_model, umap_model=dim_red_model, hdbscan_model=cluster_model, vectorizer_model=vectorizer_model,
                        calculate_probabilities = False, verbose = True, low_memory = True)

        topics, _ = Topic_model.fit_transform(red_headlines, embeddings, y=manual_labels)
        topic_dist, _ = Topic_model.approximate_distribution(red_headlines)
        if save_model and run_idx == m:
            # Save the topic model of the selected run
            Topic_model.save(df_folder+'/Kevin/Bert_var/One_year_window/{model}/{year}_{topic_num}_{count}'.format(year = tar_year, topic_num = cluster_num, model = modeltype, count = m+1),
            serialization = "safetensors", save_ctfidf = True, save_embedding_model = embedding_model)

        # Attach each headline's topic distribution to its row, then sum the
        # distributions of all headlines sharing (date, comnam, ret).
        contem_ret_topic_dist = pd.concat([df.drop(columns = ["rp_entity_id","headline","vocab_con_headline"]),pd.DataFrame(topic_dist)],axis = 1)
        grouped = contem_ret_topic_dist.groupby(['date',"comnam","ret"])
        grouped_sum = grouped.sum()

        # Regress the group's `ret` (third level of the group index) on the
        # summed topic weights.
        X = np.array(grouped_sum)
        ret = [ind[2] for ind in list(grouped_sum.index)]
        Y = np.array(ret).reshape(-1,1)
        X_tr, X_te, Y_tr, Y_te = train_test_split(X,Y,test_size=0.2,random_state=66)

        sample_regression = LinearRegression(fit_intercept=True).fit(X_tr,Y_tr)
        R_square_tr = sample_regression.score(X_tr,Y_tr)
        R_square_te = sample_regression.score(X_te,Y_te)
        # Y_tr_pred = sample_regression.predict(X_tr)
        # Y_te_pred = sample_regression.predict(X_te)
        # mse_tr = mean_squared_error(Y_tr,Y_tr_pred)
        # mse_te= mean_squared_error(Y_te,Y_te_pred)
        full_regression = LinearRegression(fit_intercept=True).fit(X,Y)
        R_square = full_regression.score(X,Y)

        r2.append(R_square)
        r2_tr.append(R_square_tr)
        r2_te.append(R_square_te)
    r2_dict[tar_year] = r2
    r2_tr_dict[tar_year] = r2_tr
    r2_te_dict[tar_year] = r2_te
    print("Year {year} is done".format(year = tar_year))

print(r2_dict)
print(r2_tr_dict)
print(r2_te_dict)


In [None]:
# Compute the model-level coherence score
from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora

# topic_model = BERTopic(verbose=True, n_gram_range=(1, 3))

# Merge all headlines assigned to the same topic into one pseudo-document.
documents = pd.DataFrame({"Document": red_headlines,
                          "ID": range(len(red_headlines)),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = Topic_model._preprocess_text(documents_per_topic.Document.values)
vectorizer = Topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Tokenize with the topic model's own analyzer and build the gensim
# dictionary / bag-of-words corpus from those tokens.
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

# Score every real topic. The ids are enumerated explicitly rather than as
# range(n_topics - 1): that form only works when an outlier topic (-1) is
# present, and silently drops the last topic for clusterers that assign no
# outliers.
topic_ids = sorted(set(topics) - {-1})
topic_words = [[words for words, _ in Topic_model.get_topic(topic)]
               for topic in topic_ids]

coherence_model = CoherenceModel(topics=topic_words,
                                 texts=tokens,
                                 corpus=corpus,
                                 dictionary=dictionary,
                                 coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score}")


In [None]:
#compute topic coherence score
from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora
import pandas as pd

# Merge all headlines assigned to the same topic into one pseudo-document.
documents = pd.DataFrame({"Document": red_headlines,
                          "ID": range(len(red_headlines)),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = Topic_model._preprocess_text(documents_per_topic.Document.values)

# Tokenize with the topic model's own analyzer and build the gensim
# dictionary / bag-of-words corpus from those tokens.
vectorizer = Topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

coherence_scores = []

# Compute the coherence score of each real topic. The ids are enumerated
# explicitly rather than as range(n_topics - 1): that form only works when an
# outlier topic (-1) is present, and silently drops the last topic for
# clusterers that assign no outliers.
topic_ids = sorted(set(topics) - {-1})
for topic in topic_ids:
    topic_words = [[words for words, _ in Topic_model.get_topic(topic)]]
    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=tokens,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence='c_v')
    coherence = coherence_model.get_coherence()
    print(f"Topic {topic} Coherence Score: {coherence}")
    coherence_scores.append((topic, coherence))

coherence_df = pd.DataFrame(coherence_scores, columns=['Topic', 'Coherence_Score'])
print(coherence_df)


In [None]:
#compute model diversity score
def compute_topic_diversity(topics):
    """Return the share of distinct words among all topic words.

    Parameters
    ----------
    topics : iterable
        Items whose element at index 1 is that topic's collection of words,
        e.g. ``(topic_id, [word, ...])`` pairs.

    Returns
    -------
    float
        ``len(unique words) / total number of words`` across all topics, or
        ``0.0`` when there are no words at all (previously a
        ``ZeroDivisionError``).
    """
    unique_words = set()
    total_words = 0
    for topic in topics:
        words = topic[1]
        total_words += len(words)
        unique_words.update(words)
    if total_words == 0:
        # No topics, or only empty word lists: the ratio is undefined.
        return 0.0
    return len(unique_words) / total_words

# `topics` holds one integer topic id per document, which is not the
# (topic id, words) structure compute_topic_diversity indexes into. Build that
# structure from the fitted model's topic representations, leaving out the
# outlier topic (-1) if there is one.
topic_word_lists = [
    (topic_id, [word for word, _ in word_scores])
    for topic_id, word_scores in Topic_model.get_topics().items()
    if topic_id != -1
]
model_diversity = compute_topic_diversity(topic_word_lists)
print(f"Model Diversity: {model_diversity}")


In [None]:
# Blend the three quality measures into a single number
# (weights: 0.4 coherence, 0.3 diversity, 0.3 regression R^2).
composite_score = (
    0.4 * coherence_score
    + 0.3 * model_diversity
    + 0.3 * R_square
)

# Minimum value each individual measure has to exceed
coherence_threshold, diversity_threshold, impact_threshold = 0.5, 0.5, 0.1

# The topics pass only when every measure clears its own threshold
print(
    "Topics are meaningful and relevant."
    if (coherence_score > coherence_threshold
        and model_diversity > diversity_threshold
        and R_square > impact_threshold)
    else "Topics need further refinement."
)
