# Preparation
Importing packages, setting up the GPU, and defining helper functions for BERTopic.

In [57]:
import os 
import openpyxl
import pandas as pd 
import numpy as np
import pickle
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from bertopic.cluster import BaseCluster
import re
import collections
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from cuml.manifold import UMAP
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from cuml.cluster import HDBSCAN
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
# from pycave.bayes import GaussianMixture as GMM
from sklearn.feature_extraction.text import CountVectorizer
import collections
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_distances
from itertools import combinations
from scipy.spatial.distance import cosine

In [58]:
!nvidia-smi
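# The command above lists the available GPUs; the line below restricts this process to GPU index 1.
# NOTE(review): CUDA_VISIBLE_DEVICES is set after cuml / sentence_transformers were imported in the
# previous cell; it only takes effect if CUDA has not been initialised yet — confirm, or move this
# assignment above the imports.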
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Tue Sep 24 11:15:31 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A40                     Off | 00000000:01:00.0 Off |                    0 |
|  0%   40C    P0              77W / 300W |   2684MiB / 46068MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A40                     Off | 00000000:25:00.0 Off |  

In [59]:
class ClusterModel:
    """Adapter that gives a wrapped estimator a ``fit`` / ``predict`` / ``labels_`` interface.

    The wrapped ``model`` only needs ``fit(X)`` and ``predict(X)``. The most
    recent predictions are always mirrored on ``labels_``.
    """

    def __init__(self, model):
        # Underlying estimator that does the actual clustering work.
        self.model = model

    def fit(self, X, y=None):
        """Fit the wrapped model on ``X`` and store its predictions for ``X`` in ``labels_``.

        ``y`` is accepted for interface compatibility and ignored. Returns ``self``.
        """
        self.model.fit(X)
        # predict() records the labels as a side effect, so fit only has to call it.
        self.predict(X)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X`` and record them in ``labels_``."""
        self.labels_ = self.model.predict(X)
        return self.labels_

In [60]:
def model_setup(modeltype, cluster_num, hdb_min_cluster_size):
    """Build a BERTopic model for the requested reducer/clusterer combination.

    Parameters
    ----------
    modeltype : str
        One of 'umaphdbscan', 'pcakmeans', 'umapgmm', 'pcagmm', 'svdkmeans'
        or 'pcahdbscan'. The historical misspelling 'svdkeans' is still
        accepted. Any other value falls back to PCA + KMeans and emits a
        warning.
    cluster_num : int
        Number of components / clusters for the GMM and KMeans clusterers.
    hdb_min_cluster_size : int
        ``min_cluster_size`` for HDBSCAN.

    Returns
    -------
    (Topic_model, embedding_model)
        The configured (unfitted) BERTopic instance and the SentenceTransformer
        it uses.
    """
    import warnings

    # Embeddings
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    # Factories instead of instances: only the selected reducer and clusterer
    # are constructed, so unused (GPU-backed) models are never built.
    reducer_factories = {
        'pca': lambda: PCA(n_components=10),
        'svd': lambda: TruncatedSVD(n_components=10, random_state=42, n_iter=10),
        'umap': lambda: UMAP(n_neighbors=10, n_components=10, min_dist=0.0, metric='cosine', random_state=42),
    }
    # NOTE(review): GaussianMixture and KMeans are created without a random_state,
    # so repeated runs are not reproducible — confirm whether that is intended.
    clusterer_factories = {
        'gmm': lambda: ClusterModel(GaussianMixture(n_components=cluster_num, covariance_type='full')),
        'kmeans': lambda: KMeans(n_clusters=cluster_num),
        'hdbscan': lambda: HDBSCAN(min_cluster_size=hdb_min_cluster_size, metric="euclidean",
                                   cluster_selection_method="eom", gen_min_span_tree=True,
                                   prediction_data=True, min_samples=20, verbose=True),
    }

    model_combinations = {
        'umaphdbscan': ('umap', 'hdbscan'),
        'pcakmeans': ('pca', 'kmeans'),
        'umapgmm': ('umap', 'gmm'),
        'pcagmm': ('pca', 'gmm'),
        'svdkmeans': ('svd', 'kmeans'),
        'svdkeans': ('svd', 'kmeans'),  # legacy misspelling kept for backward compatibility
        'pcahdbscan': ('pca', 'hdbscan'),
    }

    if modeltype not in model_combinations:
        # Same fallback as before, but no longer silent.
        warnings.warn(
            f"Unknown modeltype {modeltype!r}; falling back to PCA + KMeans. "
            f"Valid options: {sorted(model_combinations)}"
        )
    reducer_key, clusterer_key = model_combinations.get(modeltype, ('pca', 'kmeans'))
    dim_red_model = reducer_factories[reducer_key]()
    cluster_model = clusterer_factories[clusterer_key]()

    # Vectorize
    # NOTE(review): max_df=50 is an absolute document count, not a fraction — confirm this is intended.
    vectorizer_model = CountVectorizer(stop_words="english", min_df=1, max_df=50, ngram_range=(1, 2))

    Topic_model = BERTopic(embedding_model=embedding_model, umap_model=dim_red_model, hdbscan_model=cluster_model,
                           vectorizer_model=vectorizer_model,
                           calculate_probabilities=False, verbose=True, low_memory=True)

    return Topic_model, embedding_model


def Standardization(css_sum_by_topic):
    """Z-score the per-topic css values with ``StandardScaler``.

    ``css_sum_by_topic`` is reset to a two-column frame (renamed to
    'topic' and 'css'); the result is a Series named 'css_standardized'
    indexed by 'topic'.
    """
    frame = css_sum_by_topic.reset_index()
    frame.columns = ['topic', 'css']
    # fit_transform returns a single-column 2-D array; take that column.
    scaled = StandardScaler().fit_transform(frame[['css']])
    frame['css_standardized'] = scaled[:, 0]
    return frame.set_index('topic')['css_standardized']
    

def _aggregate_topic_weights(split_df, topic_dist):
    """Sum per-headline topic weights within each (date, comnam, ret) group.

    The identifier/text columns are dropped, the topic distribution is
    attached column-wise, and the summed 'css' column is removed so only the
    topic-weight columns remain.
    """
    features = pd.concat(
        [split_df.reset_index(drop=True).drop(columns=["rp_entity_id", "headline", "vocab_con_headline"]),
         pd.DataFrame(topic_dist)],
        axis=1,
    )
    return features.groupby(['date', "comnam", "ret"]).sum().drop(columns=['css'])


def train_model(saved_model_folder, df, red_headlines, embeddings, modeltype, topic_num, cluster_num, hdb_min_cluster_size, tar_year, save_model, reduce_outliers, i):
    """Fit a BERTopic model on an 80/20 split and build regression features.

    Parameters
    ----------
    saved_model_folder : str
        Folder the fitted model is saved into when ``save_model`` is true.
    df, red_headlines, embeddings
        Row-aligned data frame, headline list and embedding matrix; all three
        are split with the same indices.
    modeltype, cluster_num, hdb_min_cluster_size
        Forwarded to ``model_setup``.
    topic_num, tar_year
        Only used to build the saved model's file name.
    save_model : bool
        Save the fitted model (before any outlier reduction).
    reduce_outliers : bool
        Reassign outlier documents of the training set and refresh the topic
        representations from the training set.
    i : int
        Seed of the train/test split; also part of the saved file name.

    Returns
    -------
    (topics, probs, tr_topic_dist, Topic_model, tr_grouped_sum, te_grouped_sum)
    """
    # Perform the train-test split on indices
    indices = np.arange(len(red_headlines))
    tr_ind, te_ind = train_test_split(indices, test_size=0.2, shuffle=True, random_state=i)

    tr_df = df.iloc[tr_ind, :]
    te_df = df.iloc[te_ind, :]
    tr_headlines = [red_headlines[ind] for ind in tr_ind]
    te_headlines = [red_headlines[ind] for ind in te_ind]
    tr_embeddings = embeddings[tr_ind, :]
    te_embeddings = embeddings[te_ind, :]

    Topic_model, embedding_model = model_setup(modeltype, cluster_num, hdb_min_cluster_size)

    topics, probs = Topic_model.fit_transform(tr_headlines, embeddings=tr_embeddings)

    # Save the topic model. NOTE: this happens before outlier reduction, so the
    # stored model carries the original (un-reduced) topic representations.
    if save_model:
        Topic_model.save(saved_model_folder+f'/{tar_year}_{topic_num}_{i}',
                         serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)

    # Reduce outliers on the training documents only
    if reduce_outliers:
        topics = Topic_model.reduce_outliers(tr_headlines, topics)
        Topic_model.update_topics(tr_headlines, topics=topics)

    # In-sample features
    tr_topic_dist, _ = Topic_model.approximate_distribution(tr_headlines)
    tr_grouped_sum = _aggregate_topic_weights(tr_df, tr_topic_dist)

    # Out-of-sample features.
    # The hard test assignments are not used for the regression features (those
    # come from approximate_distribution below); the call is kept so the default
    # path performs the same model calls as before.
    Topic_model.transform(te_headlines, embeddings=te_embeddings)

    # Previously, with reduce_outliers=True, update_topics() was also called on
    # the test headlines. That rebuilt the topic representations from test data
    # before the out-of-sample features were computed (and before the model was
    # returned for scoring), leaking test information. It is intentionally
    # not done any more: the model is only ever fitted/updated on training data.
    te_topic_dist, _ = Topic_model.approximate_distribution(te_headlines)
    te_grouped_sum = _aggregate_topic_weights(te_df, te_topic_dist)

    return topics, probs, tr_topic_dist, Topic_model, tr_grouped_sum, te_grouped_sum
    

def linear_regression(tr_grouped_sum, te_grouped_sum, insampler2_list, outsampler2_list):
    """Fit OLS on the training features and append in-/out-of-sample R² to the given lists.

    The regression target is the third element of each index entry of the
    grouped frames. Both R² values use the mean of the *training* target as
    the baseline. Nothing is returned; the two lists are mutated in place.
    """

    def features_and_target(grouped):
        # Features are the frame's values; the target lives in index position 2.
        X = np.array(grouped)
        Y = np.array([key[2] for key in grouped.index]).reshape(-1, 1)
        return X, Y

    def r_squared(model, X, Y, baseline):
        residual_ss = np.sum((Y - model.predict(X)) ** 2)
        total_ss = np.sum((Y - baseline) ** 2)
        return 1 - (residual_ss / total_ss)

    # In-sample fit and R²
    tr_X, tr_Y = features_and_target(tr_grouped_sum)
    fitted = LinearRegression(fit_intercept=True)
    fitted.fit(tr_X, tr_Y)
    train_mean = np.mean(tr_Y)
    insampler2_list.append(r_squared(fitted, tr_X, tr_Y, train_mean))

    # Out-of-sample R² with the model (and baseline mean) estimated on the training data
    te_X, te_Y = features_and_target(te_grouped_sum)
    outsampler2_list.append(r_squared(fitted, te_X, te_Y, train_mean))

    return

In [61]:
from sklearn.metrics.pairwise import cosine_distances
from itertools import combinations
from scipy.spatial.distance import cosine

# Load the model
def load_model(saved_model_folder, headlines, modeltype, cluster_num, tar_year, sentiment_type, count_num, reduce_outliers, embedding_model=None):
    """Load a saved BERTopic model and recover the training-split topic assignments.

    Parameters
    ----------
    saved_model_folder : str
        Folder containing the saved model.
    headlines : list
        Full headline list; the 80/20 split is reproduced with
        ``random_state=count_num`` and only the training part is used.
    modeltype, sentiment_type
        Accepted for interface compatibility; not used in this function.
    cluster_num, tar_year, count_num
        Used to build the model path ``{tar_year}_{cluster_num}_{count_num}``.
        NOTE(review): ``train_model`` saves under ``{tar_year}_{topic_num}_{i}``,
        so ``cluster_num`` here must be the value that was passed there as
        ``topic_num`` — verify against the caller.
    reduce_outliers : bool
        Reassign outlier documents and refresh the topic representations.
    embedding_model : optional
        Embedding model handed to ``BERTopic.load``. When omitted, a
        notebook-level ``embedding_model`` is used if one exists (the previous
        behaviour); otherwise "all-MiniLM-L6-v2" is loaded, so the function
        also works on a fresh kernel.

    Returns
    -------
    (Topic_model, topics)
    """
    if embedding_model is None:
        # Backward compatibility: earlier callers relied on a notebook global.
        embedding_model = globals().get("embedding_model")
    if embedding_model is None:
        embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    Topic_model = BERTopic.load(saved_model_folder+f'/{tar_year}_{cluster_num}_{count_num}', embedding_model=embedding_model)

    # Reproduce the training split used when the model was fitted.
    indices = np.arange(len(headlines))
    tr_ind, _ = train_test_split(indices, test_size=0.2, shuffle=True, random_state=count_num)
    tr_headlines = [headlines[ind] for ind in tr_ind]

    Topic_df = Topic_model.get_document_info(tr_headlines)
    topics = Topic_df["Topic"].to_numpy()

    if reduce_outliers:
        topics = Topic_model.reduce_outliers(tr_headlines, topics)
        Topic_model.update_topics(tr_headlines, topics=topics)

    return Topic_model, topics


# Calculate pairwise cosine distances
def calculate_pairwise_cosine_distances(embeddings):
    """Return the matrix of pairwise cosine distances between the rows of ``embeddings``."""
    distance_matrix = cosine_distances(embeddings)
    return distance_matrix


# Compute the model diversity score
def compute_model_diversity(topics):
    """Return the share of unique words among all topic words.

    Parameters
    ----------
    topics : iterable of sequences
        One sequence of words per topic.

    Returns
    -------
    float
        ``len(unique words) / total number of words``. Returns ``0.0`` when
        there are no words at all (no topics, or only empty topics) instead of
        raising ``ZeroDivisionError``.
    """
    # Materialise once so the input can be traversed twice even if it is a generator.
    topic_lists = list(topics)
    total_words = sum(len(topic) for topic in topic_lists)
    if total_words == 0:
        return 0.0

    unique_words = {word for topic in topic_lists for word in topic}
    return len(unique_words) / total_words


# Calculate all the scores
def calculate_score(Topic_model, embedding_model, topics):
    """Compute within-topic distance, between-topic similarity and diversity.

    Parameters
    ----------
    Topic_model
        Fitted BERTopic model; its ``get_topics()`` output supplies the topic words.
    embedding_model
        Model whose ``encode`` embeds the topic words.
    topics
        Kept for interface compatibility; the set of topics is now taken from
        the model itself (see the diversity note below).

    Returns
    -------
    (mean_score, ave_sim, model_diversity)
        * mean_score: average over topics of the mean of the pairwise
          cosine-distance matrix of the topic's word embeddings.
          NOTE(review): the mean is taken over the full matrix, so it includes
          the zero diagonal — confirm this is the intended definition.
        * ave_sim: mean cosine similarity between the word-embedding centroids
          of every pair of topics (``nan`` if there are fewer than two topics).
        * model_diversity: share of unique words across all non-outlier topics.
    """
    topic_info = Topic_model.get_topics()
    topic_words = {topic: [word for word, _ in words] for topic, words in topic_info.items()}

    # Embeddings of the words of each topic
    topic_embeddings = {topic: embedding_model.encode(words) for topic, words in topic_words.items()}

    # Within-topic score: mean pairwise cosine distance per topic, then averaged
    topic_scores = {
        topic: np.mean(calculate_pairwise_cosine_distances(word_vectors))
        for topic, word_vectors in topic_embeddings.items()
    }
    mean_score = np.mean(list(topic_scores.values()))

    # Between-topic score: centroids are computed once per topic rather than once per pair
    centroids = {topic: np.mean(word_vectors, axis=0) for topic, word_vectors in topic_embeddings.items()}
    pair_similarities = [
        1 - cosine(centroids[topic1], centroids[topic2])
        for topic1, topic2 in combinations(centroids, 2)
    ]
    # Guard against a model with fewer than two topics (previously a ZeroDivisionError).
    ave_sim = sum(pair_similarities) / len(pair_similarities) if pair_similarities else float("nan")

    # Model diversity over every real topic. The earlier
    # `range(len(set(topics)) - 1)` assumed an outlier topic -1 was present and
    # therefore dropped the last topic for clusterers without outliers
    # (e.g. GMM, KMeans). Only the outlier topic -1 is excluded now.
    topic_words_list = [words for topic, words in topic_words.items() if topic != -1]
    model_diversity = compute_model_diversity(topic_words_list)

    return mean_score, ave_sim, model_diversity

# Building Model and Calculating R-Square and Model Evaluation Score
Choose the model type, topic number, data type, and whether to save the model. \
Build the model combination and split the data into training and test sets. Train the model on the training set and generate topics. \
Use linear regression to calculate the in-sample and out-of-sample R-squared from the topic weights of the positive, negative, and neutral topic models.
Calculate the mean cosine distance within topics, the mean cosine similarity between topics, and the model diversity score.

In [62]:
# Per-year results, keyed by target year
insampler2_dict = {}
outsampler2_dict = {}
cos_in_topic_dict = {}
cos_bet_topic_dict = {}
model_diversity_dict = {}

# Experiment configuration
modeltype = 'pcagmm'
topic_num = 120
save_model = False
hdb_min_cluster_size = 60
datatype = 'contem'
df_folder = "/shared/share_tm-finance/Final/Processed_df/One_year_window"
embeddings_folder = "/shared/share_tm-finance/Final/Embeddings/One_year_window"
saved_model_folder = "/shared/share_tm-finance/Final/Stored_model/three_models"
reduce_outliers = False
combine = True
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Each year is repeated for every split seed in [first_count, last_count]
first_count = 1
last_count = 5
div = last_count - first_count + 1

# The year loop and the seed loop use distinct variable names (they used to share `i`).
for tar_year in range(2014, 2024):

    df = pd.read_csv(df_folder+'/{type}_{year}.csv'.format(year = tar_year, type = datatype))
    red_headlines = df.vocab_con_headline.tolist()
    embeddings = np.load(embeddings_folder+"/{type}_{year}_embeddings.npy".format(year = tar_year, type = datatype))
    pos_insampler2_list = []
    pos_outsampler2_list = []
    neg_insampler2_list = []
    neg_outsampler2_list = []
    neu_insampler2_list = []
    neu_outsampler2_list = []
    insampler2_list = []
    outsampler2_list = []

    # Split the data into 3 parts: positive css, negative css, neutral css.
    # The index labels are used as positions, which relies on the default
    # RangeIndex produced by read_csv.
    pos_indices = df[df['css'] > 0].index
    neg_indices = df[df['css'] < 0].index
    neu_indices = df[df['css'] == 0].index
    pos_df = df.iloc[pos_indices,:]
    neg_df = df.iloc[neg_indices,:]
    neu_df = df.iloc[neu_indices,:]
    pos_headlines = [red_headlines[ind] for ind in pos_indices]
    neg_headlines = [red_headlines[ind] for ind in neg_indices]
    neu_headlines = [red_headlines[ind] for ind in neu_indices]
    pos_embeddings = embeddings[pos_indices,:]
    neg_embeddings = embeddings[neg_indices,:]
    neu_embeddings = embeddings[neu_indices,:]

    # Sample counts used to weight the per-sentiment results
    n_total = len(embeddings)
    n_pos = len(pos_embeddings)
    n_neg = len(neg_embeddings)
    n_neu = len(neu_embeddings)

    cos_in_topic_pos_sum = 0
    cos_bet_topic_pos_sum = 0
    model_diversity_pos_sum = 0
    cos_in_topic_neg_sum = 0
    cos_bet_topic_neg_sum = 0
    model_diversity_neg_sum = 0
    cos_in_topic_neu_sum = 0
    cos_bet_topic_neu_sum = 0
    model_diversity_neu_sum = 0

    # Allocate topic_num clusters proportionally to the number of headlines per
    # sentiment; the rounding remainder goes to the strictly smallest of the
    # positive/negative groups, otherwise to the neutral group.
    pos_cluster_num = int(topic_num * n_pos / n_total)
    neg_cluster_num = int(topic_num * n_neg / n_total)
    neu_cluster_num = int(topic_num * n_neu / n_total)
    diff = topic_num - (pos_cluster_num + neg_cluster_num + neu_cluster_num)
    if pos_cluster_num < neg_cluster_num and pos_cluster_num < neu_cluster_num:
        pos_cluster_num += diff
    elif neg_cluster_num < pos_cluster_num and neg_cluster_num < neu_cluster_num:
        neg_cluster_num += diff
    else:
        neu_cluster_num += diff

    print("pos_cluster_num = ", pos_cluster_num)
    print("neg_cluster_num = ", neg_cluster_num)
    print("neu_cluster_num = ", neu_cluster_num)

    for run in range(first_count, last_count+1):
        saved_model_folder = f"/shared/share_tm-finance/Final/Stored_model/three_models/{modeltype}/pos"
        pos_topics, pos_probs, pos_tr_topic_dist, pos_Topic_model, pos_tr_grouped_sum, pos_te_grouped_sum = \
                train_model(saved_model_folder, pos_df, pos_headlines, pos_embeddings,
                            modeltype, topic_num, pos_cluster_num, hdb_min_cluster_size, tar_year, save_model, reduce_outliers, run)
        mean_score, ave_sim, model_diversity = calculate_score(pos_Topic_model, embedding_model, pos_topics)
        cos_in_topic_pos_sum += mean_score
        cos_bet_topic_pos_sum += ave_sim
        model_diversity_pos_sum += model_diversity

        saved_model_folder = f"/shared/share_tm-finance/Final/Stored_model/three_models/{modeltype}/neg"
        neg_topics, neg_probs, neg_tr_topic_dist, neg_Topic_model, neg_tr_grouped_sum, neg_te_grouped_sum = \
                train_model(saved_model_folder, neg_df, neg_headlines, neg_embeddings,
                            modeltype, topic_num, neg_cluster_num, hdb_min_cluster_size, tar_year, save_model, reduce_outliers, run)
        mean_score, ave_sim, model_diversity = calculate_score(neg_Topic_model, embedding_model, neg_topics)
        cos_in_topic_neg_sum += mean_score
        cos_bet_topic_neg_sum += ave_sim
        model_diversity_neg_sum += model_diversity

        saved_model_folder = f"/shared/share_tm-finance/Final/Stored_model/three_models/{modeltype}/neu"
        neu_topics, neu_probs, neu_tr_topic_dist, neu_Topic_model, neu_tr_grouped_sum, neu_te_grouped_sum = \
                train_model(saved_model_folder, neu_df, neu_headlines, neu_embeddings,
                            modeltype, topic_num, neu_cluster_num, hdb_min_cluster_size, tar_year, save_model, reduce_outliers, run)
        mean_score, ave_sim, model_diversity = calculate_score(neu_Topic_model, embedding_model, neu_topics)
        cos_in_topic_neu_sum += mean_score
        cos_bet_topic_neu_sum += ave_sim
        model_diversity_neu_sum += model_diversity

        if combine:
            # Stack the three feature sets side by side. `keys` gives every column a
            # unique (sentiment, topic) label; the previous manual renumbering was off
            # by one and produced duplicate column labels. Rows are aligned on the
            # (date, comnam, ret) index and groups missing from a sentiment get 0.
            sentiment_keys = ['pos', 'neg', 'neu']
            tr_grouped_sum = pd.concat([pos_tr_grouped_sum, neg_tr_grouped_sum, neu_tr_grouped_sum],
                                       axis = 1, keys = sentiment_keys).fillna(0)
            te_grouped_sum = pd.concat([pos_te_grouped_sum, neg_te_grouped_sum, neu_te_grouped_sum],
                                       axis = 1, keys = sentiment_keys).fillna(0)
            linear_regression(tr_grouped_sum, te_grouped_sum, insampler2_list, outsampler2_list)

        else:
            linear_regression(pos_tr_grouped_sum, pos_te_grouped_sum, pos_insampler2_list, pos_outsampler2_list)
            linear_regression(neg_tr_grouped_sum, neg_te_grouped_sum, neg_insampler2_list, neg_outsampler2_list)
            linear_regression(neu_tr_grouped_sum, neu_te_grouped_sum, neu_insampler2_list, neu_outsampler2_list)

    if combine:
        insampler2_dict[tar_year] = np.mean(insampler2_list)
        outsampler2_dict[tar_year] = np.mean(outsampler2_list)
    else:
        # Sample-weighted average of the per-sentiment R² values. Arrays are needed
        # here: multiplying the plain lists repeated them and then failed on division.
        sep_insampler2_list = (np.array(pos_insampler2_list) * n_pos / n_total
                               + np.array(neg_insampler2_list) * n_neg / n_total
                               + np.array(neu_insampler2_list) * n_neu / n_total)
        sep_outsampler2_list = (np.array(pos_outsampler2_list) * n_pos / n_total
                                + np.array(neg_outsampler2_list) * n_neg / n_total
                                + np.array(neu_outsampler2_list) * n_neu / n_total)
        insampler2_dict[tar_year] = np.mean(sep_insampler2_list)
        # Fixed: the out-of-sample mean used to overwrite the in-sample entry.
        outsampler2_dict[tar_year] = np.mean(sep_outsampler2_list)

    # Sample-weighted scores, averaged over the repeated runs
    cos_in_topic_dict[tar_year] = (cos_in_topic_pos_sum * n_pos / n_total + cos_in_topic_neg_sum * n_neg / n_total + cos_in_topic_neu_sum * n_neu / n_total) / div
    cos_bet_topic_dict[tar_year] = (cos_bet_topic_pos_sum * n_pos / n_total + cos_bet_topic_neg_sum * n_neg / n_total + cos_bet_topic_neu_sum * n_neu / n_total) / div
    model_diversity_dict[tar_year] = (model_diversity_pos_sum * n_pos / n_total + model_diversity_neg_sum * n_neg / n_total + model_diversity_neu_sum * n_neu / n_total) / div

    print("Year {year} is done".format(year = tar_year))

if not combine:
    print("sep_insample = ", insampler2_dict)
    # Fixed: this line used to print the in-sample dictionary a second time.
    print("sep_outsample = ", outsampler2_dict)
else:
    print("insample = ", insampler2_dict)
    print("outsample = ", outsampler2_dict)

print("cos_in_topic = ", cos_in_topic_dict)
print("cos_bet_topic = ", cos_bet_topic_dict)
print("model_diversity = ", model_diversity_dict)

pos_cluster_num =  43
neg_cluster_num =  18
neu_cluster_num =  59


2024-09-24 11:15:36,436 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-24 11:15:36,685 - BERTopic - Dimensionality - Completed ✓
2024-09-24 11:15:36,688 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-24 11:15:55,870 - BERTopic - Cluster - Completed ✓
2024-09-24 11:15:55,883 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-24 11:15:57,749 - BERTopic - Representation - Completed ✓
100%|██████████| 96/96 [00:05<00:00, 16.87it/s]
2024-09-24 11:16:03,852 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-09-24 11:16:03,864 - BERTopic - Dimensionality - Completed ✓
2024-09-24 11:16:03,864 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-09-24 11:16:03,931 - BERTopic - Cluster - Completed ✓
100%|██████████| 24/24 [00:01<00:00, 18.58it/s]
2024-09-24 11:16:05,951 - BERTopic - Dimensionality - Fitting the dimensionality reduction

Year 2014 is done
pos_cluster_num =  41
neg_cluster_num =  19
neu_cluster_num =  60


2024-09-24 11:22:45,816 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-24 11:22:46,043 - BERTopic - Dimensionality - Completed ✓
2024-09-24 11:22:46,045 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-24 11:23:02,767 - BERTopic - Cluster - Completed ✓
2024-09-24 11:23:02,778 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-24 11:23:04,366 - BERTopic - Representation - Completed ✓
100%|██████████| 88/88 [00:04<00:00, 18.75it/s]
2024-09-24 11:23:09,417 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-09-24 11:23:09,427 - BERTopic - Dimensionality - Completed ✓
2024-09-24 11:23:09,428 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-09-24 11:23:09,486 - BERTopic - Cluster - Completed ✓
100%|██████████| 22/22 [00:01<00:00, 20.44it/s]
2024-09-24 11:23:11,257 - BERTopic - Dimensionality - Fitting the dimensionality reduction

Year 2015 is done
pos_cluster_num =  41
neg_cluster_num =  19
neu_cluster_num =  60


2024-09-24 11:31:38,451 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-24 11:31:38,687 - BERTopic - Dimensionality - Completed ✓
2024-09-24 11:31:38,690 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-24 11:31:55,411 - BERTopic - Cluster - Completed ✓
2024-09-24 11:31:55,422 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-24 11:31:57,104 - BERTopic - Representation - Completed ✓
100%|██████████| 90/90 [00:05<00:00, 17.82it/s]
2024-09-24 11:32:02,542 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-09-24 11:32:02,553 - BERTopic - Dimensionality - Completed ✓
2024-09-24 11:32:02,554 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-09-24 11:32:02,617 - BERTopic - Cluster - Completed ✓
100%|██████████| 23/23 [00:01<00:00, 19.80it/s]
2024-09-24 11:32:04,525 - BERTopic - Dimensionality - Fitting the dimensionality reduction

Year 2016 is done
pos_cluster_num =  44
neg_cluster_num =  17
neu_cluster_num =  59


2024-09-24 11:38:30,155 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-24 11:38:30,408 - BERTopic - Dimensionality - Completed ✓
2024-09-24 11:38:30,412 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-24 11:38:53,546 - BERTopic - Cluster - Completed ✓
2024-09-24 11:38:53,558 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-24 11:38:55,327 - BERTopic - Representation - Completed ✓
100%|██████████| 99/99 [00:05<00:00, 18.34it/s]
2024-09-24 11:39:01,118 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-09-24 11:39:01,129 - BERTopic - Dimensionality - Completed ✓
2024-09-24 11:39:01,129 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-09-24 11:39:01,200 - BERTopic - Cluster - Completed ✓
100%|██████████| 25/25 [00:01<00:00, 20.17it/s]
2024-09-24 11:39:03,214 - BERTopic - Dimensionality - Fitting the dimensionality reduction

Year 2017 is done
pos_cluster_num =  45
neg_cluster_num =  18
neu_cluster_num =  57


2024-09-24 11:45:22,132 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-24 11:45:22,388 - BERTopic - Dimensionality - Completed ✓
2024-09-24 11:45:22,390 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-24 11:45:40,455 - BERTopic - Cluster - Completed ✓
2024-09-24 11:45:40,467 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-24 11:45:42,262 - BERTopic - Representation - Completed ✓
100%|██████████| 100/100 [00:05<00:00, 18.35it/s]
2024-09-24 11:45:48,115 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-09-24 11:45:48,127 - BERTopic - Dimensionality - Completed ✓
2024-09-24 11:45:48,127 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-09-24 11:45:48,200 - BERTopic - Cluster - Completed ✓
100%|██████████| 25/25 [00:01<00:00, 19.64it/s]
2024-09-24 11:45:50,204 - BERTopic - Dimensionality - Fitting the dimensionality reducti

Year 2018 is done
pos_cluster_num =  45
neg_cluster_num =  18
neu_cluster_num =  57


2024-09-24 11:51:49,699 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-24 11:51:49,970 - BERTopic - Dimensionality - Completed ✓
2024-09-24 11:51:49,972 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-24 11:52:11,429 - BERTopic - Cluster - Completed ✓
2024-09-24 11:52:11,443 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-24 11:52:13,375 - BERTopic - Representation - Completed ✓
100%|██████████| 106/106 [00:05<00:00, 18.75it/s]
2024-09-24 11:52:19,460 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-09-24 11:52:19,472 - BERTopic - Dimensionality - Completed ✓
2024-09-24 11:52:19,473 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-09-24 11:52:19,549 - BERTopic - Cluster - Completed ✓
100%|██████████| 27/27 [00:01<00:00, 20.66it/s]
2024-09-24 11:52:21,712 - BERTopic - Dimensionality - Fitting the dimensionality reducti

Year 2019 is done
pos_cluster_num =  44
neg_cluster_num =  21
neu_cluster_num =  55


2024-09-24 11:58:34,278 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-24 11:58:34,582 - BERTopic - Dimensionality - Completed ✓
2024-09-24 11:58:34,584 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-24 11:59:08,278 - BERTopic - Cluster - Completed ✓
2024-09-24 11:59:08,292 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-24 11:59:10,393 - BERTopic - Representation - Completed ✓
100%|██████████| 119/119 [00:06<00:00, 19.09it/s]
2024-09-24 11:59:17,040 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-09-24 11:59:17,055 - BERTopic - Dimensionality - Completed ✓
2024-09-24 11:59:17,056 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-09-24 11:59:17,139 - BERTopic - Cluster - Completed ✓
100%|██████████| 30/30 [00:01<00:00, 20.65it/s]
2024-09-24 11:59:19,421 - BERTopic - Dimensionality - Fitting the dimensionality reducti

Year 2020 is done
pos_cluster_num =  49
neg_cluster_num =  15
neu_cluster_num =  56


2024-09-24 12:07:40,621 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-24 12:07:40,948 - BERTopic - Dimensionality - Completed ✓
2024-09-24 12:07:40,951 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-24 12:08:03,637 - BERTopic - Cluster - Completed ✓
2024-09-24 12:08:03,652 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-24 12:08:05,992 - BERTopic - Representation - Completed ✓
100%|██████████| 129/129 [00:07<00:00, 17.45it/s]
2024-09-24 12:08:13,858 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-09-24 12:08:13,872 - BERTopic - Dimensionality - Completed ✓
2024-09-24 12:08:13,872 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-09-24 12:08:13,972 - BERTopic - Cluster - Completed ✓
100%|██████████| 33/33 [00:01<00:00, 19.60it/s]
2024-09-24 12:08:16,524 - BERTopic - Dimensionality - Fitting the dimensionality reducti

Year 2021 is done
pos_cluster_num =  44
neg_cluster_num =  21
neu_cluster_num =  55


2024-09-24 12:16:06,780 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-24 12:16:07,075 - BERTopic - Dimensionality - Completed ✓
2024-09-24 12:16:07,077 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-24 12:16:32,368 - BERTopic - Cluster - Completed ✓
2024-09-24 12:16:32,383 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-24 12:16:34,539 - BERTopic - Representation - Completed ✓
100%|██████████| 118/118 [00:06<00:00, 18.66it/s]
2024-09-24 12:16:41,287 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-09-24 12:16:41,300 - BERTopic - Dimensionality - Completed ✓
2024-09-24 12:16:41,300 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-09-24 12:16:41,384 - BERTopic - Cluster - Completed ✓
100%|██████████| 30/30 [00:01<00:00, 16.66it/s]
2024-09-24 12:16:44,017 - BERTopic - Dimensionality - Fitting the dimensionality reducti

Year 2022 is done
pos_cluster_num =  46
neg_cluster_num =  19
neu_cluster_num =  55


2024-09-24 12:23:37,884 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-24 12:23:38,207 - BERTopic - Dimensionality - Completed ✓
2024-09-24 12:23:38,210 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-24 12:24:01,516 - BERTopic - Cluster - Completed ✓
2024-09-24 12:24:01,533 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-24 12:24:03,817 - BERTopic - Representation - Completed ✓
100%|██████████| 129/129 [00:06<00:00, 19.13it/s]
2024-09-24 12:24:11,020 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-09-24 12:24:11,035 - BERTopic - Dimensionality - Completed ✓
2024-09-24 12:24:11,035 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-09-24 12:24:11,135 - BERTopic - Cluster - Completed ✓
100%|██████████| 33/33 [00:01<00:00, 21.28it/s]
2024-09-24 12:24:13,776 - BERTopic - Dimensionality - Fitting the dimensionality reducti

Year 2023 is done
insample =  {2014: 0.03689327291031637, 2015: 0.043791047771508926, 2016: 0.05735197835674559, 2017: 0.06718844734134102, 2018: 0.06440737403447125, 2019: 0.09248796483365047, 2020: 0.027448056494246465, 2021: 0.03505869734405691, 2022: 0.0699381898717106, 2023: 0.0880751023720991}
outsample =  {2014: 0.018370706196390453, 2015: 0.024473810690752475, 2016: 0.030574705745905394, 2017: 0.03479551266534171, 2018: 0.03270388590179858, 2019: 0.0506906866201573, 2020: 0.017831143408193428, 2021: 0.017844322480493502, 2022: 0.039581208363098044, 2023: 0.04555669223840655}
cos_in_topic =  {2014: 0.5995873513657399, 2015: 0.5986624116625243, 2016: 0.5970589176562049, 2017: 0.6010260203663165, 2018: 0.5966511843001344, 2019: 0.5945811978408713, 2020: 0.5990704629537117, 2021: 0.599390855507812, 2022: 0.5937319996176398, 2023: 0.5982299807220439}
cos_bet_topic =  {2014: 0.5301228224607887, 2015: 0.535797988299415, 2016: 0.5286174514300225, 2017: 0.5343596915442633, 2018: 0.53698

In [63]:
# Write this run's yearly results into the matching rows of Data.xlsx.
file_path = 'Data.xlsx'
wb = openpyxl.load_workbook(file_path)
program_list = [insampler2_dict, outsampler2_dict, cos_in_topic_dict, cos_bet_topic_dict, model_diversity_dict]
data_list = ['Insample_R2', 'Outsample_R2', 'Cos_in_topic', 'Cos_btn_topic', 'Diversity']

# Rows are identified by the label stored in column B.
target_label = f"{modeltype}_{topic_num}"

# One sheet per metric; sheet names and result dictionaries are paired by position.
for sheet_name, yearly_values in zip(data_list, program_list):
    ws = wb[sheet_name]
    for cell in ws['A']:
        if cell.value is None:
            continue
        # A non-empty cell in column A starts a group of three rows.
        for row_num in range(cell.row, cell.row + 3):
            if ws.cell(row=row_num, column=2).value != target_label:
                continue
            for year, value in yearly_values.items():
                # 2014 is written to column 3, 2015 to column 4, and so on.
                ws.cell(row=row_num, column=year - 2013 + 2, value=value)
wb.save(file_path)

In [64]:
# Plot, for every metric sheet in Data.xlsx, one figure per three-row group.
file_path = 'Data.xlsx'
wb = openpyxl.load_workbook(file_path)
data_list = ['Insample_R2', 'Outsample_R2', 'Cos_in_topic', 'Cos_btn_topic', 'Diversity']
years = range(2014, 2024)

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('graph', exist_ok=True)

for data_type in data_list:
    ws = wb[data_type]
    for cell in ws['A']:
        if cell.value is None:
            continue
        # A non-empty cell in column A starts a group of three rows; each row
        # becomes one line, labelled with the value in column B.
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.set_title(f'{cell.value} {data_type}')
        for row_num in range(cell.row, cell.row + 3):
            model_cell = ws.cell(row=row_num, column=2).value
            # 2014 is read from column 3, 2015 from column 4, and so on.
            data = [ws.cell(row=row_num, column=year - 2013 + 2).value for year in years]
            ax.plot(years, data, label = model_cell)
        ax.set_xlabel('Year')
        ax.set_ylabel(f'{data_type}')
        ax.legend()
        fig.savefig(f'graph/{cell.value}_{data_type}.png')
        # Close explicitly so figures do not accumulate in memory across the loop.
        plt.close(fig)
             