# Preparation
Imporint packages, setting up GPU and functions for BERTopic.

In [1]:
import os 
import openpyxl
import pandas as pd 
import numpy as np
import pickle
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from bertopic.cluster import BaseCluster
import re
import collections
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from cuml.manifold import UMAP
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from cuml.cluster import HDBSCAN
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
# from pycave.bayes import GaussianMixture as GMM
from sklearn.feature_extraction.text import CountVectorizer
import collections
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!nvidia-smi
# Check available GPUs
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

Tue Jul 30 10:07:16 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A40                     Off | 00000000:01:00.0 Off |                    0 |
|  0%   52C    P0              92W / 300W |  10786MiB / 46068MiB |     32%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A40                     Off | 00000000:25:00.0 Off |  

In [3]:
class ClusterModel:
    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        self.model.fit(X)
        self.labels_ = self.model.predict(X)
        return self

    def predict(self, X):
        predictions = self.model.predict(X)
        self.labels_ = predictions
        return predictions
    
# class MyGMM:
#     def __init__(self, num_components, trainer_params):
#         self.gmm = GMM(num_components=num_components,trainer_params=trainer_params)
#         self.labels_ = None
    
#     def fit(self,data):
#         self.gmm.fit(data)
#         self.labels_ = np.array(self.gmm.predict(data))
#         return self
    
#     def predict(self,data):
#         return np.array(self.gmm.predict(data))

class Dimensionality:
  """ Use this for pre-calculated reduced embeddings """
  def __init__(self, reduced_embeddings):
    self.reduced_embeddings = reduced_embeddings

  def fit(self, X):
    return self

  def transform(self, X):
    return self.reduced_embeddings

In [4]:
def model_setup(modeltype, cluster_num, hdb_min_cluster_size):    
    # Embeddings
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    # Reduce dimensionality
    PCA_model = PCA(n_components=10)
    SVD_model = TruncatedSVD(n_components=10, random_state=42, n_iter=10) 
    umap_model = UMAP(n_neighbors=10, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
    # Cluster embeddings
    gmm_model = ClusterModel(GaussianMixture(n_components=cluster_num, covariance_type='full'))
    # gmm_model2 = MyGMM(num_components=cluster_num,trainer_params={"accelerator":'gpu',"devices":1})
    KMeans_model = KMeans(n_clusters=cluster_num)
    hdbscan_model = HDBSCAN(min_cluster_size=hdb_min_cluster_size, metric = "euclidean", cluster_selection_method="eom",
                            gen_min_span_tree = True, prediction_data = True, min_samples = 20, verbose = True)
    # Vectorize
    vectorizer_model = CountVectorizer(stop_words="english", min_df=1, max_df = 50, ngram_range=(1, 2))
    
    model_combinations = {
    'umaphdbscan': (umap_model, hdbscan_model),
    'pcakmeans': (PCA_model, KMeans_model),
    'umapgmm': (umap_model, gmm_model),
    'pcagmm': (PCA_model, gmm_model),
    'svdkeans': (SVD_model, KMeans_model),
    'pcahdbscan': (PCA_model, hdbscan_model)
    }

    dim_red_model, cluster_model = model_combinations.get(modeltype, (PCA_model, KMeans_model))

    Topic_model = BERTopic(embedding_model=embedding_model, umap_model=dim_red_model, hdbscan_model=cluster_model, vectorizer_model=vectorizer_model,
                        calculate_probabilities = False, verbose = True, low_memory = True)
    return Topic_model, embedding_model

def Standardization(css_sum_by_topic):
    css_sum_by_topic_df = css_sum_by_topic.reset_index()
    css_sum_by_topic_df.columns = ['topic', 'css']
    scaler = StandardScaler()
    css_sum_by_topic_df['css_standardized'] = scaler.fit_transform(css_sum_by_topic_df[['css']])
    css_standardized_series = css_sum_by_topic_df.set_index('topic')['css_standardized']
    return css_standardized_series
    

def train_model(saved_model_folder, df, red_headlines, embeddings, modeltype, topic_num, cluster_num, hdb_min_cluster_size, tar_year, sentiment_type, save_model, reduce_outliers, i):
    # Perform the train-test split on indices
    indices = np.arange(len(red_headlines))
    tr_ind, te_ind = train_test_split(indices, test_size=0.2, shuffle= True, random_state=i)

    # torch.cuda.empty_cache()
    # gc.collect()
    tr_df = df.iloc[tr_ind,:]
    te_df = df.iloc[te_ind,:]
    tr_headlines = [red_headlines[ind] for ind in tr_ind]
    te_headlines = [red_headlines[ind] for ind in te_ind]
    tr_embeddings = embeddings[tr_ind,:]
    te_embeddings = embeddings[te_ind,:]
    
    Topic_model, embedding_model = model_setup(modeltype, cluster_num, hdb_min_cluster_size)
    indices = np.arange(len(red_headlines))

    if modeltype == 'pcahdbscan':
         # Perform the train-test split on indices
        tr_ind, te_ind = train_test_split(indices, test_size=0.2, shuffle= True, random_state=i)
        tr_headlines = [red_headlines[ind] for ind in tr_ind]
        tr_embeddings = embeddings[tr_ind,:]
        topics, probs = Topic_model.fit_transform(tr_headlines, embeddings = tr_embeddings)
        topic_number = len(set(topics))
        print('topic_number:', topic_number)
        print('hdb_min_cluster_size:', hdb_min_cluster_size)

        smaller_count = 0
        larger_count = 0
        last_small = False
        last_large = False
        while topic_number < cluster_num - 5 or topic_number > cluster_num + 5:
            if topic_number < cluster_num - 5:
                if last_small == True:
                    smaller_count -= 1
                smaller_count += 1
                last_small = True
                last_large = False
                hdb_min_cluster_size -= 5 - larger_count
                Topic_model, embedding_model = model_setup(modeltype, cluster_num, hdb_min_cluster_size)
                topics, probs = Topic_model.fit_transform(tr_headlines, embeddings = tr_embeddings)  
            elif topic_number > cluster_num + 5:
                if last_large == True:
                    larger_count -= 1
                larger_count += 1
                last_large = True
                last_small = False
                hdb_min_cluster_size += 5 - smaller_count
                Topic_model, embedding_model = model_setup(modeltype, cluster_num, hdb_min_cluster_size)
                topics, probs = Topic_model.fit_transform(tr_headlines, embeddings = tr_embeddings)
            topic_number = len(set(topics))
            print('topic_number:', topic_number)
            print('hdb_min_cluster_size:', hdb_min_cluster_size)
            print('smaller_count:', smaller_count)
            print('larger_count:', larger_count)
            if smaller_count == 5 and larger_count == 5:
                break
    
    topics, probs = Topic_model.fit_transform(tr_headlines, embeddings = tr_embeddings)

    #save the topic model
    if save_model:
        Topic_model.save(saved_model_folder+f'/{sentiment_type}/{modeltype}/{tar_year}_{topic_num}_{i}',
        serialization = "safetensors", save_ctfidf = True, save_embedding_model = embedding_model)

    if reduce_outliers:
        topics = Topic_model.reduce_outliers(tr_headlines, topics)
        Topic_model.update_topics(tr_headlines, topics=topics)
    
    #calculate insample R2
    tr_topic_dist, _ = Topic_model.approximate_distribution(tr_headlines)
    tr_df = tr_df.reset_index(drop = True)
    tr_contem_ret_topic_dist = pd.concat([tr_df.drop(columns = ["rp_entity_id","headline","vocab_con_headline"]),pd.DataFrame(tr_topic_dist)],axis = 1)
    tr_grouped = tr_contem_ret_topic_dist.groupby(['date',"comnam","ret"])
    tr_grouped_sum = tr_grouped.sum()
    tr_grouped_sum.drop(columns = ['css'], inplace = True)   

    #calculate outsample R2
    new_topics, new_probs = Topic_model.transform(te_headlines, embeddings = te_embeddings)

    if reduce_outliers:
        new_topics = Topic_model.reduce_outliers(te_headlines, new_topics)
        Topic_model.update_topics(te_headlines, topics=new_topics)
    
    te_topic_dist, _ = Topic_model.approximate_distribution(te_headlines)     
    te_df = te_df.reset_index(drop = True)
    te_contem_ret_topic_dist = pd.concat([te_df.drop(columns = ["rp_entity_id","headline","vocab_con_headline"]),pd.DataFrame(te_topic_dist)],axis = 1)
    te_grouped = te_contem_ret_topic_dist.groupby(['date',"comnam","ret"])
    te_grouped_sum = te_grouped.sum()
    te_grouped_sum.drop(columns = ['css'], inplace = True)
    
    
    return topics, probs, tr_topic_dist, Topic_model, tr_grouped_sum, te_grouped_sum
    
def linear_regression(tr_grouped_sum, te_grouped_sum, insampler2_list, outsampler2_list):
    tr_X = np.array(tr_grouped_sum)
    tr_ret = [ind[2] for ind in list(tr_grouped_sum.index)]
    tr_Y = np.array(tr_ret).reshape(-1,1)
    tr_regression = LinearRegression(fit_intercept=True)
    tr_regression.fit(tr_X,tr_Y)
    tr_Y_pred = tr_regression.predict(tr_X)
    tr_Y_mean = np.mean(tr_Y)
    tr_SS_tot = np.sum((tr_Y - tr_Y_mean) ** 2)
    tr_SS_res = np.sum((tr_Y - tr_Y_pred) ** 2)
    tr_r2 = 1 - (tr_SS_res / tr_SS_tot)
    insampler2_list.append(tr_r2)

    te_X = np.array(te_grouped_sum)
    te_ret = [ind[2] for ind in list(te_grouped_sum.index)]
    te_Y = np.array(te_ret).reshape(-1,1)
    te_Y_pred = tr_regression.predict(te_X)
    te_SS_tot = np.sum((te_Y - tr_Y_mean) ** 2)
    te_SS_res = np.sum((te_Y - te_Y_pred) ** 2)
    te_r2 = 1 - (te_SS_res / te_SS_tot)
    outsampler2_list.append(te_r2)
    return

# Building Model and Calculating R-Square
Choosing 'model type', topic number', data type', and whther save model or not. \
Building up model combination, split the train and test data. Training the model with dataset, generating topics. \
Using linear regression to calculate the in-sample and out-sample R-Square for the topic weight of positive, negative, and neutral topic models.

In [5]:
insampler2_dict = {}
outsampler2_dict = {}
sep_insampler2_dict = {}
sep_outsampler2_dict = {}

modeltype = 'pcagmm'
sentiment_type = 'no_senti'
topic_num = 120
cluster_num = 120
save_model = True
hdb_min_cluster_size = 60
datatype = 'contem'
df_folder = "/shared/share_tm-finance/Processed_df_Sentiment/One_year_window"
embeddings_folder = "/shared/share_tm-finance/Embeddings_with_Sentiment/One_year_window"
saved_model_folder = "/shared/share_tm-finance/Stored_model/new_data"
reduce_outliers = False
combine = True
for i in range(2014, 2024):
    
    tar_year = i
    df = pd.read_csv(df_folder+'/{type}_{year}_senti.csv'.format(year = tar_year, type = datatype))
    red_headlines = df.vocab_con_headline.tolist()
    embeddings = np.load(embeddings_folder+"/{type}_{year}_senti_embeddings.npy".format(year = tar_year, type = datatype))
    indices = np.arange(len(red_headlines))
    pos_insampler2_list = []
    pos_outsampler2_list = []
    neg_insampler2_list = []
    neg_outsampler2_list = []
    neu_insampler2_list = []
    neu_outsampler2_list = []
    insampler2_list = []
    outsampler2_list = []

    #split the data into 3 part, positive css, negative css, neutral css
    pos_indices = df[df['css'] > 0].index
    neg_indices = df[df['css'] < 0].index
    neu_indices = df[df['css'] == 0].index
    pos_df = df.iloc[pos_indices,:]
    neg_df = df.iloc[neg_indices,:]
    neu_df = df.iloc[neu_indices,:]
    pos_headlines = [red_headlines[ind] for ind in pos_indices]
    neg_headlines = [red_headlines[ind] for ind in neg_indices]
    neu_headlines = [red_headlines[ind] for ind in neu_indices]
    pos_embeddings = embeddings[pos_indices,:]
    neg_embeddings = embeddings[neg_indices,:]
    neu_embeddings = embeddings[neu_indices,:]
    
    #set pos_cluster_num, neg_cluster_num, neu_cluster_num based on the number of embeddings
    pos_cluster_num = int(cluster_num * len(pos_embeddings) / len(embeddings))
    neg_cluster_num = int(cluster_num * len(neg_embeddings) / len(embeddings))
    neu_cluster_num = int(cluster_num * len(neu_embeddings) / len(embeddings))
    diff = cluster_num - (pos_cluster_num + neg_cluster_num + neu_cluster_num)
    if pos_cluster_num < neg_cluster_num and pos_cluster_num < neu_cluster_num:
        pos_cluster_num += diff
    elif neg_cluster_num < pos_cluster_num and neg_cluster_num < neu_cluster_num:
        neg_cluster_num += diff
    else:
        neu_cluster_num += diff

    print("pos_cluster_num = ", pos_cluster_num)
    print("neg_cluster_num = ", neg_cluster_num)
    print("neu_cluster_num = ", neu_cluster_num)
    
    for i in range(1, 6):
        saved_model_folder = "/shared/share_tm-finance/Stored_model/three_models/pos_topic"
        pos_topics, pos_probs, pos_tr_topic_dist, pos_Topic_model, pos_tr_grouped_sum, pos_te_grouped_sum = \
                train_model(saved_model_folder, pos_df, pos_headlines, pos_embeddings, 
                            modeltype, topic_num, pos_cluster_num, hdb_min_cluster_size, tar_year, sentiment_type, save_model, reduce_outliers, i)
        saved_model_folder = "/shared/share_tm-finance/Stored_model/three_models/neg_topic"
        neg_topics, neg_probs, neg_tr_topic_dist, neg_Topic_model, neg_tr_grouped_sum, neg_te_grouped_sum = \
                train_model(saved_model_folder, neg_df, neg_headlines, neg_embeddings,
                            modeltype, topic_num, neg_cluster_num, hdb_min_cluster_size, tar_year, sentiment_type, save_model, reduce_outliers, i)
        saved_model_folder = "/shared/share_tm-finance/Stored_model/three_models/neu_topic"
        neu_topics, neu_probs, neu_tr_topic_dist, neu_Topic_model, neu_tr_grouped_sum, neu_te_grouped_sum = \
                train_model(saved_model_folder, neu_df, neu_headlines, neu_embeddings,
                            modeltype, topic_num, neu_cluster_num, hdb_min_cluster_size, tar_year, sentiment_type, save_model, reduce_outliers, i)
        if combine:   
            #get the last column name of the last column of pos_tr_grouped_sum
            pos_last_col = int(pos_tr_grouped_sum.columns[-1])
            neg_tr_grouped_sum.columns = [str(int(col) + pos_last_col) for col in neg_tr_grouped_sum.columns]
            neg_last_col = int(neg_tr_grouped_sum.columns[-1])
            neu_tr_grouped_sum.columns = [str(int(col) + neg_last_col) for col in neu_tr_grouped_sum.columns]
            
            pos_last_col = int(pos_te_grouped_sum.columns[-1])
            neg_te_grouped_sum.columns = [str(int(col) + pos_last_col) for col in neg_te_grouped_sum.columns]
            neg_last_col = int(neg_te_grouped_sum.columns[-1])
            neu_te_grouped_sum.columns = [str(int(col) + neg_last_col) for col in neu_te_grouped_sum.columns]

            tr_grouped_sum = pd.concat([pos_tr_grouped_sum, neg_tr_grouped_sum, neu_tr_grouped_sum], axis = 1)
            tr_grouped_sum.fillna(0, inplace = True)
            te_grouped_sum = pd.concat([pos_te_grouped_sum, neg_te_grouped_sum, neu_te_grouped_sum], axis = 1)
            te_grouped_sum.fillna(0, inplace = True)
            linear_regression(tr_grouped_sum, te_grouped_sum, insampler2_list, outsampler2_list)
        else:
            linear_regression(pos_tr_grouped_sum, pos_te_grouped_sum, pos_insampler2_list, pos_outsampler2_list)
            linear_regression(neg_tr_grouped_sum, neg_te_grouped_sum, neg_insampler2_list, neg_outsampler2_list)
            linear_regression(neu_tr_grouped_sum, neu_te_grouped_sum, neu_insampler2_list, neu_outsampler2_list)
            
                                                                                                                                                                    

    if combine:
        insampler2_dict[tar_year] = insampler2_list
        outsampler2_dict[tar_year] = outsampler2_list
    else:
        sep_insampler2_list = (pos_insampler2_list* len(pos_embeddings) / len(embeddings)) + (neg_insampler2_list* len(neg_embeddings) / len(embeddings)) + (neu_insampler2_list* len(neu_embeddings) / len(embeddings))
        sep_outsampler2_list = (pos_outsampler2_list* len(pos_embeddings) / len(embeddings)) + (neg_outsampler2_list* len(neg_embeddings) / len(embeddings)) + (neu_outsampler2_list* len(neu_embeddings) / len(embeddings))
        sep_insampler2_dict[tar_year] = sep_insampler2_list
        sep_outsampler2_dict[tar_year] = sep_outsampler2_list

    print("Year {year} is done".format(year = tar_year))
        
if not combine:
    print("sep_insample = ", sep_insampler2_dict)
    print("sep_outsample = ", sep_outsampler2_dict)
else:
    print("insample = ", insampler2_dict)
    print("outsample = ", outsampler2_dict)

pos_cluster_num =  43
neg_cluster_num =  18
neu_cluster_num =  59


2024-07-30 00:55:25,661 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-30 00:55:25,931 - BERTopic - Dimensionality - Completed ✓
2024-07-30 00:55:25,935 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-30 00:55:44,959 - BERTopic - Cluster - Completed ✓
2024-07-30 00:55:44,976 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-30 00:55:46,930 - BERTopic - Representation - Completed ✓
100%|██████████| 96/96 [00:06<00:00, 14.37it/s]
2024-07-30 00:55:54,553 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-07-30 00:55:54,570 - BERTopic - Dimensionality - Completed ✓
2024-07-30 00:55:54,570 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-07-30 00:55:54,638 - BERTopic - Cluster - Completed ✓
100%|██████████| 24/24 [00:01<00:00, 15.15it/s]
2024-07-30 00:55:56,703 - BERTopic - Dimensionality - Fitting the dimensionality reduction

Year 2014 is done
pos_cluster_num =  41
neg_cluster_num =  19
neu_cluster_num =  60


2024-07-30 01:03:03,128 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-30 01:03:03,384 - BERTopic - Dimensionality - Completed ✓
2024-07-30 01:03:03,387 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-30 01:03:20,868 - BERTopic - Cluster - Completed ✓
2024-07-30 01:03:20,881 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-30 01:03:22,411 - BERTopic - Representation - Completed ✓
100%|██████████| 88/88 [00:05<00:00, 16.99it/s]
2024-07-30 01:03:28,356 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-07-30 01:03:28,365 - BERTopic - Dimensionality - Completed ✓
2024-07-30 01:03:28,366 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-07-30 01:03:28,423 - BERTopic - Cluster - Completed ✓
100%|██████████| 22/22 [00:01<00:00, 20.59it/s]
2024-07-30 01:03:29,956 - BERTopic - Dimensionality - Fitting the dimensionality reduction

Year 2015 is done
pos_cluster_num =  41
neg_cluster_num =  19
neu_cluster_num =  60


2024-07-30 01:11:50,115 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-30 01:11:50,385 - BERTopic - Dimensionality - Completed ✓
2024-07-30 01:11:50,387 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-30 01:12:09,447 - BERTopic - Cluster - Completed ✓
2024-07-30 01:12:09,462 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-30 01:12:11,446 - BERTopic - Representation - Completed ✓
100%|██████████| 90/90 [00:05<00:00, 16.13it/s]
2024-07-30 01:12:17,916 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-07-30 01:12:17,934 - BERTopic - Dimensionality - Completed ✓
2024-07-30 01:12:17,934 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-07-30 01:12:17,994 - BERTopic - Cluster - Completed ✓
100%|██████████| 23/23 [00:01<00:00, 15.63it/s]
2024-07-30 01:12:19,910 - BERTopic - Dimensionality - Fitting the dimensionality reduction

Year 2016 is done
pos_cluster_num =  44
neg_cluster_num =  17
neu_cluster_num =  59


2024-07-30 01:18:30,416 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-30 01:18:30,677 - BERTopic - Dimensionality - Completed ✓
2024-07-30 01:18:30,679 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-30 01:18:55,246 - BERTopic - Cluster - Completed ✓
2024-07-30 01:18:55,259 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-30 01:18:56,977 - BERTopic - Representation - Completed ✓
100%|██████████| 99/99 [00:05<00:00, 19.25it/s]
2024-07-30 01:19:02,907 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-07-30 01:19:02,919 - BERTopic - Dimensionality - Completed ✓
2024-07-30 01:19:02,919 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-07-30 01:19:02,989 - BERTopic - Cluster - Completed ✓
100%|██████████| 25/25 [00:01<00:00, 20.59it/s]
2024-07-30 01:19:04,701 - BERTopic - Dimensionality - Fitting the dimensionality reduction

Year 2017 is done
pos_cluster_num =  45
neg_cluster_num =  18
neu_cluster_num =  57


2024-07-30 01:25:16,798 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-30 01:25:17,066 - BERTopic - Dimensionality - Completed ✓
2024-07-30 01:25:17,069 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-30 01:25:35,012 - BERTopic - Cluster - Completed ✓
2024-07-30 01:25:35,027 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-30 01:25:37,063 - BERTopic - Representation - Completed ✓
100%|██████████| 100/100 [00:06<00:00, 16.49it/s]
2024-07-30 01:25:44,051 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-07-30 01:25:44,063 - BERTopic - Dimensionality - Completed ✓
2024-07-30 01:25:44,063 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-07-30 01:25:44,139 - BERTopic - Cluster - Completed ✓
100%|██████████| 25/25 [00:01<00:00, 15.25it/s]
2024-07-30 01:25:46,329 - BERTopic - Dimensionality - Fitting the dimensionality reducti

Year 2018 is done
pos_cluster_num =  45
neg_cluster_num =  18
neu_cluster_num =  57


2024-07-30 01:31:53,912 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-30 01:31:54,195 - BERTopic - Dimensionality - Completed ✓
2024-07-30 01:31:54,197 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-30 01:32:21,631 - BERTopic - Cluster - Completed ✓
2024-07-30 01:32:21,645 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-30 01:32:23,957 - BERTopic - Representation - Completed ✓
100%|██████████| 106/106 [00:06<00:00, 17.65it/s]
2024-07-30 01:32:30,849 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-07-30 01:32:30,861 - BERTopic - Dimensionality - Completed ✓
2024-07-30 01:32:30,862 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-07-30 01:32:30,940 - BERTopic - Cluster - Completed ✓
100%|██████████| 27/27 [00:01<00:00, 18.93it/s]
2024-07-30 01:32:32,874 - BERTopic - Dimensionality - Fitting the dimensionality reducti

Year 2019 is done
pos_cluster_num =  44
neg_cluster_num =  21
neu_cluster_num =  55


2024-07-30 01:39:58,264 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-30 01:39:58,565 - BERTopic - Dimensionality - Completed ✓
2024-07-30 01:39:58,567 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-30 01:40:14,930 - BERTopic - Cluster - Completed ✓
2024-07-30 01:40:14,945 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-30 01:40:16,983 - BERTopic - Representation - Completed ✓
100%|██████████| 119/119 [00:05<00:00, 20.93it/s]
2024-07-30 01:40:23,516 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-07-30 01:40:23,530 - BERTopic - Dimensionality - Completed ✓
2024-07-30 01:40:23,530 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-07-30 01:40:23,611 - BERTopic - Cluster - Completed ✓
100%|██████████| 30/30 [00:01<00:00, 19.49it/s]
2024-07-30 01:40:25,669 - BERTopic - Dimensionality - Fitting the dimensionality reducti

Year 2020 is done
pos_cluster_num =  49
neg_cluster_num =  15
neu_cluster_num =  56


2024-07-30 01:48:33,824 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-30 01:48:34,150 - BERTopic - Dimensionality - Completed ✓
2024-07-30 01:48:34,153 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-30 01:49:00,379 - BERTopic - Cluster - Completed ✓
2024-07-30 01:49:00,394 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-30 01:49:02,677 - BERTopic - Representation - Completed ✓
100%|██████████| 129/129 [00:06<00:00, 19.67it/s]
2024-07-30 01:49:10,242 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-07-30 01:49:10,257 - BERTopic - Dimensionality - Completed ✓
2024-07-30 01:49:10,258 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-07-30 01:49:10,358 - BERTopic - Cluster - Completed ✓
100%|██████████| 33/33 [00:01<00:00, 18.35it/s]
2024-07-30 01:49:12,661 - BERTopic - Dimensionality - Fitting the dimensionality reducti

Year 2021 is done
pos_cluster_num =  44
neg_cluster_num =  21
neu_cluster_num =  55


2024-07-30 01:56:37,655 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-30 01:56:37,955 - BERTopic - Dimensionality - Completed ✓
2024-07-30 01:56:37,958 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-30 01:56:57,253 - BERTopic - Cluster - Completed ✓
2024-07-30 01:56:57,266 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-30 01:56:59,375 - BERTopic - Representation - Completed ✓
100%|██████████| 118/118 [00:06<00:00, 19.29it/s]
2024-07-30 01:57:06,375 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-07-30 01:57:06,387 - BERTopic - Dimensionality - Completed ✓
2024-07-30 01:57:06,388 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-07-30 01:57:06,473 - BERTopic - Cluster - Completed ✓
100%|██████████| 30/30 [00:01<00:00, 21.33it/s]
2024-07-30 01:57:08,394 - BERTopic - Dimensionality - Fitting the dimensionality reducti

Year 2022 is done
pos_cluster_num =  46
neg_cluster_num =  19
neu_cluster_num =  55


2024-07-30 02:03:39,753 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-30 02:03:40,082 - BERTopic - Dimensionality - Completed ✓
2024-07-30 02:03:40,085 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-30 02:04:06,691 - BERTopic - Cluster - Completed ✓
2024-07-30 02:04:06,705 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-30 02:04:08,804 - BERTopic - Representation - Completed ✓
100%|██████████| 129/129 [00:06<00:00, 20.44it/s]
2024-07-30 02:04:15,998 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-07-30 02:04:16,013 - BERTopic - Dimensionality - Completed ✓
2024-07-30 02:04:16,013 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-07-30 02:04:16,106 - BERTopic - Cluster - Completed ✓
100%|██████████| 33/33 [00:01<00:00, 19.08it/s]
2024-07-30 02:04:18,287 - BERTopic - Dimensionality - Fitting the dimensionality reducti

Year 2023 is done
insample =  {2014: [0.04258870554226124, 0.04382523060319299, 0.04674526498804754, 0.04082850684092221, 0.04053916224231324], 2015: [0.048482713183386816, 0.0484078503287213, 0.04932107918794737, 0.05214337432115568, 0.047369216043357865], 2016: [0.0553660626866298, 0.05531766153457118, 0.06199601898055862, 0.06307569731666007, 0.05169539243244614], 2017: [0.07314965245738714, 0.06692103360951396, 0.07860218157578491, 0.07471833218610746, 0.07160894151762731], 2018: [0.06624872795116943, 0.06821703963058712, 0.0718321547961045, 0.06483634305872732, 0.07953583212216198], 2019: [0.09739968737478966, 0.08679908200346997, 0.09460123229508588, 0.10129263293934099, 0.09913882789139994], 2020: [0.029337675499302107, 0.029267841133186145, 0.03111174592981003, 0.03200948772758727, 0.0312502963453396], 2021: [0.0343635610021763, 0.0406028114031024, 0.03457949447218356, 0.03730023252007253, 0.0349973130243274], 2022: [0.07888319696129686, 0.07136454422111127, 0.07263458728785521

In [None]:
#save the metrics to an excel file

if combine:
    file_path = 'combine_metrics.xlsx'
    wb = openpyxl.load_workbook(file_path)
    #create a new sheet
    ws = wb.create_sheet(f'{modeltype}_{datatype}')
    df = pd.DataFrame({'insample2014': insampler2_dict[2014], 'insample2015': insampler2_dict[2015], 'insample2016': insampler2_dict[2016], 'insample2017': insampler2_dict[2017], 'insample2018': insampler2_dict[2018], 'insample2019': insampler2_dict[2019], 'insample2020': insampler2_dict[2020], 'insample2021': insampler2_dict[2021], 'insample2022': insampler2_dict[2022], 'insample2023': insampler2_dict[2023], \
                       'outsample2014': outsampler2_dict[2014], 'outsample2015': outsampler2_dict[2015], 'outsample2016': outsampler2_dict[2016], 'outsample2017': outsampler2_dict[2017], 'outsample2018': outsampler2_dict[2018], 'outsample2019': outsampler2_dict[2019], 'outsample2020': outsampler2_dict[2020], 'outsample2021': outsampler2_dict[2021], 'outsample2022': outsampler2_dict[2022], 'outsample2023': outsampler2_dict[2023]})
    #write the dataframe to the excel file
    writer = pd.ExcelWriter(file_path, engine = 'openpyxl')
    writer.book = wb
    writer.sheets = ws
    df.to_excel(writer, index=False)

else:
    file_path = 'separate_metrics.xlsx'
    wb = openpyxl.load_workbook(file_path)
    #create a new sheet
    ws = wb.create_sheet(f'{modeltype}_{datatype}')
    df = pd.DataFrame({'insample2014': sep_insampler2_dict[2014], 'insample2015': sep_insampler2_dict[2015], 'insample2016': sep_insampler2_dict[2016], 'insample2017': sep_insampler2_dict[2017], 'insample2018': sep_insampler2_dict[2018], 'insample2019': sep_insampler2_dict[2019], 'insample2020': sep_insampler2_dict[2020], 'insample2021': sep_insampler2_dict[2021], 'insample2022': sep_insampler2_dict[2022], 'insample2023': sep_insampler2_dict[2023], \
                       'outsample2014': sep_outsampler2_dict[2014], 'outsample2015': sep_outsampler2_dict[2015], 'outsample2016': sep_outsampler2_dict[2016], 'outsample2017': sep_outsampler2_dict[2017], 'outsample2018': sep_outsampler2_dict[2018], 'outsample2019': sep_outsampler2_dict[2019], 'outsample2020': sep_outsampler2_dict[2020], 'outsample2021': sep_outsampler2_dict[2021], 'outsample2022': sep_outsampler2_dict[2022], 'outsample2023': sep_outsampler2_dict[2023]})
    #write the dataframe to the excel file
    writer = pd.ExcelWriter(file_path, engine = 'openpyxl')
    writer.book = wb
    writer.sheets = ws
    df.to_excel(writer, index=False)


# Computing Model Evaluation Score
Calculating cosine similarity within and between topics, and model diversity score.

In [5]:
from sklearn.metrics.pairwise import cosine_distances
from itertools import combinations
from scipy.spatial.distance import cosine

# Load the model
def load_model(saved_model_folder, headlines, modeltype, cluster_num, tar_year, sentiment_type, count_num, reduce_outliers):
    Topic_model = BERTopic.load(saved_model_folder+f'/{sentiment_type}/{modeltype}/{tar_year}_{cluster_num}_{count_num}', embedding_model = embedding_model)
    indices = np.arange(len(headlines))
    tr_ind, te_ind = train_test_split(indices, test_size=0.2, shuffle= True, random_state=count_num)
    # tr_df = df.iloc[tr_ind,:]
    # te_df = df.iloc[te_ind,:]
    tr_headlines = [headlines[ind] for ind in tr_ind]
    # te_headlines = [headlines[ind] for ind in te_ind]
    # tr_embeddings = embeddings[tr_ind,:]
    # te_embeddings = embeddings[te_ind,:]
    Topic_df = Topic_model.get_document_info(tr_headlines)
    topics = Topic_df["Topic"].to_numpy()

    if reduce_outliers:
        topics = Topic_model.reduce_outliers(tr_headlines, topics)
        Topic_model.update_topics(tr_headlines, topics=topics)
    
    return Topic_model, topics

# Calculate pairwise cosine distances
def calculate_pairwise_cosine_distances(embeddings):
    return cosine_distances(embeddings)

# Compute the model diversity score
def compute_model_diversity(topics):
        unique_words = set()
        total_words = 0

        for topic in topics:
            total_words += len(topic)
            for words in topic:
                unique_words.add(words)
        diversity_score = len(unique_words) / total_words
        return diversity_score

# Calculate all the scores
def calculate_score(Topic_model, embedding_model, topics):
    topic_info = Topic_model.get_topics()
    topic_words = {topic: [word for word, _ in words] for topic, words in topic_info.items()}

    # Get embeddings for the words in each topic
    topic_embeddings = {}
    for topic, words in topic_words.items():
        embeddings = embedding_model.encode(words)
        topic_embeddings[topic] = embeddings

    topic_distances = {}
    for topic, embeddings in topic_embeddings.items():
        distances = calculate_pairwise_cosine_distances(embeddings)
        topic_distances[topic] = distances

    # Aggregate the scores by taking the mean distance
    topic_scores = {}
    for topic, distances in topic_distances.items():
        mean_distance = np.mean(distances)
        topic_scores[topic] = mean_distance

    # # Display the scores for each topic
    # for topic, score in topic_scores.items():
    #     print(f"Topic {topic}: Mean Cosine Distance = {score}")

    #calculate the mean score
    mean_score = np.mean(list(topic_scores.values()))

    sim = 0
    count = 0
    for topic1, topic2 in combinations(topic_info, 2):
        centroid1 = np.mean(topic_embeddings[topic1], axis=0)
        centroid2 = np.mean(topic_embeddings[topic2], axis=0)
        sim += 1 - cosine(centroid1, centroid2)
        count += 1
        

    # Calculate Model Diversity Score
    topic_words_list = [[words for words, _ in Topic_model.get_topic(topic)] 
                for topic in range(len(set(topics))-1)]

    model_diversity = compute_model_diversity(topic_words_list)
    return mean_score, sim / count, model_diversity

In [6]:
#Combine calculation
modeltype = 'pcagmm'
sentiment_type = 'with_senti'
topic_num = 120
cluster_num = 120
sentiment_sign = True
reduce_outliers = False
hdb_min_cluster_size = 60
datatype = 'contem'
df_folder = "/shared/share_tm-finance/Processed_df_Sentiment/One_year_window"
embeddings_folder = "/shared/share_tm-finance/Embeddings_with_Sentiment/One_year_window"
saved_model_folder = "/shared/share_tm-finance/Stored_model/new_data"
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

cos_in_topic_pos_dic = {}
cos_bet_topic_pos_dic = {}
model_diversity_pos_dic = {}
cos_in_topic_neg_dic = {}
cos_bet_topic_neg_dic = {}
model_diversity_neg_dic = {}
cos_in_topic_neu_dic = {}
cos_bet_topic_neu_dic = {}
model_diversity_neu_dic = {}
cos_in_topic_dic = {}
cos_bet_topic_dic = {}
model_diversity_dic = {}

first_count = 1
last_count = 5
div = last_count - first_count + 1

for i in range(2014, 2024):
    tar_year = i
    df = pd.read_csv(df_folder+'/{type}_{year}_senti.csv'.format(year = tar_year, type = datatype))
    red_headlines = df.vocab_con_headline.tolist()
    embeddings = np.load(embeddings_folder+f"/{datatype}_{tar_year}_senti_embeddings.npy")

    if sentiment_sign:
        #split the data into 3 part, positive css, negative css, neutral css
        pos_indices = df[df['css'] > 0].index
        neg_indices = df[df['css'] < 0].index
        neu_indices = df[df['css'] == 0].index
        pos_df = df.iloc[pos_indices,:]
        neg_df = df.iloc[neg_indices,:]
        neu_df = df.iloc[neu_indices,:]
        pos_headlines = [red_headlines[ind] for ind in pos_indices]
        neg_headlines = [red_headlines[ind] for ind in neg_indices]
        neu_headlines = [red_headlines[ind] for ind in neu_indices]
        pos_embeddings = embeddings[pos_indices,:]
        neg_embeddings = embeddings[neg_indices,:]
        neu_embeddings = embeddings[neu_indices,:]
    
        cos_in_topic_pos_list = []
        cos_bet_topic_pos_list = []
        model_diversity_pos_list = []
        cos_in_topic_neg_list = []
        cos_bet_topic_neg_list = []
        model_diversity_neg_list = []
        cos_in_topic_neu_list = []
        cos_bet_topic_neu_list = []
        model_diversity_neu_list = []
        
        cos_in_topic_pos_sum = 0
        cos_bet_topic_pos_sum = 0
        model_diversity_pos_sum = 0
        cos_in_topic_neg_sum = 0
        cos_bet_topic_neg_sum = 0
        model_diversity_neg_sum = 0
        cos_in_topic_neu_sum = 0
        cos_bet_topic_neu_sum = 0
        model_diversity_neu_sum = 0
    else:
        cos_in_topic_list = []
        cos_bet_topic_list = []
        model_diversity_list = []
        
        cos_in_topic_sum = 0
        cos_bet_topic_sum = 0
        model_diversity_sum = 0

    
    for count_num in range(first_count, last_count+1):
        #embeddings = np.load(embeddings_folder+f"/{datatype}_{tar_year}_senti_embeddings.npy")
        if sentiment_sign:
            saved_model_folder = "/shared/share_tm-finance/Stored_model/three_models/pos_topic"
            Topic_model, pos_topics = load_model(saved_model_folder, pos_headlines, modeltype, topic_num, tar_year, sentiment_type, count_num, reduce_outliers)
            mean_score, ave_sim, model_diversity = calculate_score(Topic_model, embedding_model, pos_topics)
            cos_in_topic_pos_sum += mean_score
            cos_bet_topic_pos_sum += ave_sim
            model_diversity_pos_sum += model_diversity

            saved_model_folder = "/shared/share_tm-finance/Stored_model/three_models/neg_topic"
            Topic_model, neg_topics = load_model(saved_model_folder, neg_headlines, modeltype, topic_num, tar_year, sentiment_type, count_num, reduce_outliers)
            mean_score, ave_sim, model_diversity = calculate_score(Topic_model, embedding_model, neg_topics)
            cos_in_topic_neg_sum += mean_score
            cos_bet_topic_neg_sum += ave_sim
            model_diversity_neg_sum += model_diversity

            saved_model_folder = "/shared/share_tm-finance/Stored_model/three_models/neu_topic"
            Topic_model, neu_topics = load_model(saved_model_folder, neu_headlines, modeltype, topic_num, tar_year, sentiment_type, count_num, reduce_outliers)
            mean_score, ave_sim, model_diversity = calculate_score(Topic_model, embedding_model, neu_topics)
            cos_in_topic_neu_sum += mean_score
            cos_bet_topic_neu_sum += ave_sim
            model_diversity_neu_sum += model_diversity
        else:
            Topic_model, topics = load_model(saved_model_folder, red_headlines, modeltype, topic_num, tar_year, sentiment_type, count_num, reduce_outliers)
            mean_score, ave_sim, model_diversity = calculate_score(Topic_model, embedding_model, topics)
            cos_in_topic_sum += mean_score
            cos_bet_topic_sum += ave_sim
            model_diversity_sum += model_diversity

    if sentiment_sign:
        cos_in_topic_pos_list.append(cos_in_topic_pos_sum / div)
        cos_bet_topic_pos_list.append(cos_bet_topic_pos_sum / div)
        model_diversity_pos_list.append(model_diversity_pos_sum / div)
        cos_in_topic_neg_list.append(cos_in_topic_neg_sum / div)
        cos_bet_topic_neg_list.append(cos_bet_topic_neg_sum / div)
        model_diversity_neg_list.append(model_diversity_neg_sum / div)
        cos_in_topic_neu_list.append(cos_in_topic_neu_sum / div)
        cos_bet_topic_neu_list.append(cos_bet_topic_neu_sum / div)
        model_diversity_neu_list.append(model_diversity_neu_sum / div)

        cos_in_topic_pos_dic[tar_year] = cos_in_topic_pos_list
        cos_bet_topic_pos_dic[tar_year] = cos_bet_topic_pos_list
        model_diversity_pos_dic[tar_year] = model_diversity_pos_list
        cos_in_topic_neg_dic[tar_year] = cos_in_topic_neg_list
        cos_bet_topic_neg_dic[tar_year] = cos_bet_topic_neg_list
        model_diversity_neg_dic[tar_year] = model_diversity_neg_list
        cos_in_topic_neu_dic[tar_year] = cos_in_topic_neu_list
        cos_bet_topic_neu_dic[tar_year] = cos_bet_topic_neu_list
        model_diversity_neu_dic[tar_year] = model_diversity_neu_list
    else:
        cos_in_topic_list.append(cos_in_topic_sum / div)
        cos_bet_topic_list.append(cos_bet_topic_sum / div)
        model_diversity_list.append(model_diversity_sum / div)

        cos_in_topic_dic[tar_year] = cos_in_topic_list
        cos_bet_topic_dic[tar_year] = cos_bet_topic_list
        model_diversity_dic[tar_year] = model_diversity_list

    print("Year {year} is done".format(year = tar_year))

if sentiment_sign:
    print("cos_in_topic_pos_dic = ", cos_in_topic_pos_dic)
    print("cos_bet_topic_pos_dic = ", cos_bet_topic_pos_dic)
    print("model_diversity_pos_dic = ", model_diversity_pos_dic)
    print("cos_in_topic_neg_dic = ", cos_in_topic_neg_dic)
    print("cos_bet_topic_neg_dic = ", cos_bet_topic_neg_dic)
    print("model_diversity_neg_dic = ", model_diversity_neg_dic)
    print("cos_in_topic_neu_dic = ", cos_in_topic_neu_dic)
    print("cos_bet_topic_neu_dic = ", cos_bet_topic_neu_dic)
    print("model_diversity_neu_dic = ", model_diversity_neu_dic)
else:
    print("cos_in_topic_dic = ", cos_in_topic_dic)
    print("cos_bet_topic_dic = ", cos_bet_topic_dic)
    print("model_diversity_dic = ", model_diversity_dic)


Year 2014 is done
Year 2015 is done
Year 2016 is done
Year 2017 is done
Year 2018 is done
Year 2019 is done
Year 2020 is done
Year 2021 is done
Year 2022 is done
Year 2023 is done
cos_in_topic_pos_dic =  {2014: [0.5865281701087952], 2015: [0.5917509317398071], 2016: [0.5812535881996155], 2017: [0.5831337213516236], 2018: [0.5845444917678833], 2019: [0.5881577253341674], 2020: [0.5946476817131042], 2021: [0.5891137599945069], 2022: [0.5830946803092957], 2023: [0.592152190208435]}
cos_bet_topic_pos_dic =  {2014: [0.5313330498799544], 2015: [0.5353699041105628], 2016: [0.5189727343479507], 2017: [0.5270285278554849], 2018: [0.536076260959789], 2019: [0.5405996319745795], 2020: [0.5440749181463561], 2021: [0.5466436705122311], 2022: [0.5321845698065104], 2023: [0.5247200382915889]}
model_diversity_pos_dic =  {2014: [0.7185714285714286], 2015: [0.7595], 2016: [0.7639999999999999], 2017: [0.7539534883720929], 2018: [0.7395454545454545], 2019: [0.7531818181818182], 2020: [0.7697674418604652],

In [None]:
df_2014_list = [cos_in_topic_pos_dic[2014][0], cos_bet_topic_pos_dic[2014][0], model_diversity_pos_dic[2014][0], cos_in_topic_neg_dic[2014][0], cos_bet_topic_neg_dic[2014][0], model_diversity_neg_dic[2014][0], cos_in_topic_neu_dic[2014][0], cos_bet_topic_neu_dic[2014][0], model_diversity_neu_dic[2014][0]]
df_2015_list = [cos_in_topic_pos_dic[2015][0], cos_bet_topic_pos_dic[2015][0], model_diversity_pos_dic[2015][0], cos_in_topic_neg_dic[2015][0], cos_bet_topic_neg_dic[2015][0], model_diversity_neg_dic[2015][0], cos_in_topic_neu_dic[2015][0], cos_bet_topic_neu_dic[2015][0], model_diversity_neu_dic[2015][0]]
df_2016_list = [cos_in_topic_pos_dic[2016][0], cos_bet_topic_pos_dic[2016][0], model_diversity_pos_dic[2016][0], cos_in_topic_neg_dic[2016][0], cos_bet_topic_neg_dic[2016][0], model_diversity_neg_dic[2016][0], cos_in_topic_neu_dic[2016][0], cos_bet_topic_neu_dic[2016][0], model_diversity_neu_dic[2016][0]]
df_2017_list = [cos_in_topic_pos_dic[2017][0], cos_bet_topic_pos_dic[2017][0], model_diversity_pos_dic[2017][0], cos_in_topic_neg_dic[2017][0], cos_bet_topic_neg_dic[2017][0], model_diversity_neg_dic[2017][0], cos_in_topic_neu_dic[2017][0], cos_bet_topic_neu_dic[2017][0], model_diversity_neu_dic[2017][0]]
df_2018_list = [cos_in_topic_pos_dic[2018][0], cos_bet_topic_pos_dic[2018][0], model_diversity_pos_dic[2018][0], cos_in_topic_neg_dic[2018][0], cos_bet_topic_neg_dic[2018][0], model_diversity_neg_dic[2018][0], cos_in_topic_neu_dic[2018][0], cos_bet_topic_neu_dic[2018][0], model_diversity_neu_dic[2018][0]]
df_2019_list = [cos_in_topic_pos_dic[2019][0], cos_bet_topic_pos_dic[2019][0], model_diversity_pos_dic[2019][0], cos_in_topic_neg_dic[2019][0], cos_bet_topic_neg_dic[2019][0], model_diversity_neg_dic[2019][0], cos_in_topic_neu_dic[2019][0], cos_bet_topic_neu_dic[2019][0], model_diversity_neu_dic[2019][0]]
df_2020_list = [cos_in_topic_pos_dic[2020][0], cos_bet_topic_pos_dic[2020][0], model_diversity_pos_dic[2020][0], cos_in_topic_neg_dic[2020][0], cos_bet_topic_neg_dic[2020][0], model_diversity_neg_dic[2020][0], cos_in_topic_neu_dic[2020][0], cos_bet_topic_neu_dic[2020][0], model_diversity_neu_dic[2020][0]]
df_2021_list = [cos_in_topic_pos_dic[2021][0], cos_bet_topic_pos_dic[2021][0], model_diversity_pos_dic[2021][0], cos_in_topic_neg_dic[2021][0], cos_bet_topic_neg_dic[2021][0], model_diversity_neg_dic[2021][0], cos_in_topic_neu_dic[2021][0], cos_bet_topic_neu_dic[2021][0], model_diversity_neu_dic[2021][0]]
df_2022_list = [cos_in_topic_pos_dic[2022][0], cos_bet_topic_pos_dic[2022][0], model_diversity_pos_dic[2022][0], cos_in_topic_neg_dic[2022][0], cos_bet_topic_neg_dic[2022][0], model_diversity_neg_dic[2022][0], cos_in_topic_neu_dic[2022][0], cos_bet_topic_neu_dic[2022][0], model_diversity_neu_dic[2022][0]]
df_2023_list = [cos_in_topic_pos_dic[2023][0], cos_bet_topic_pos_dic[2023][0], model_diversity_pos_dic[2023][0], cos_in_topic_neg_dic[2023][0], cos_bet_topic_neg_dic[2023][0], model_diversity_neg_dic[2023][0], cos_in_topic_neu_dic[2023][0], cos_bet_topic_neu_dic[2023][0], model_diversity_neu_dic[2023][0]]

df = pd.DataFrame({'2014': df_2014_list, '2015': df_2015_list, '2016': df_2016_list, '2017': df_2017_list, '2018': df_2018_list, '2019': df_2019_list, '2020': df_2020_list, '2021': df_2021_list, '2022': df_2022_list, '2023': df_2023_list})

file_path = 'score_metrics.xlsx'
df.to_excel(file_path, index=False)