In [None]:
import os 
import openpyxl
import pandas as pd 
import numpy as np
from lda import LDA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
# import torch
# import gc

  from tqdm.autonotebook import tqdm, trange


In [None]:
# List the GPUs on this machine and their current utilisation (IPython shell escape).
!nvidia-smi
# Expose only the GPU with index 3 to this process.
# NOTE(review): this normally has to be set before any library initialises CUDA
# in this kernel - confirm nothing imported above has already done so.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [4]:
def Standardization(css_sum_by_topic):
    """Standardize per-topic CSS totals with a ``StandardScaler``.

    Args:
        css_sum_by_topic: pandas Series whose index identifies the topic and
            whose values are the CSS sums to rescale.

    Returns:
        pandas Series named ``css_standardized``, indexed by ``topic``,
        holding the scaled values in the input's row order.
    """
    # reset_index() turns the Series into a two-column frame: (index, value).
    frame = css_sum_by_topic.reset_index()
    frame.columns = ['topic', 'css']

    # The scaler expects a 2-D input, hence the double brackets.
    scaled_values = StandardScaler().fit_transform(frame[['css']])
    frame['css_standardized'] = scaled_values

    return frame.set_index('topic')['css_standardized']

In [None]:
def linear_regression(tr_grouped_sum, te_grouped_sum, insampler2_list, outsampler2_list):
    """Fit OLS on the training table and record in- and out-of-sample R^2.

    Both tables are expected to carry a MultiIndex whose third level holds
    the regression target; every column of the table is used as a feature.

    Args:
        tr_grouped_sum: training feature table (target in index level 2).
        te_grouped_sum: test feature table with the same layout.
        insampler2_list: list that receives the in-sample R^2 (appended).
        outsampler2_list: list that receives the out-of-sample R^2 (appended).

    Returns:
        None. Results are delivered by appending to the two lists.
    """
    def _features_and_target(grouped_sum):
        # Target is the third element of every index tuple.
        features = np.array(grouped_sum)
        target = np.array([key[2] for key in list(grouped_sum.index)]).reshape(-1, 1)
        return features, target

    # ---- in-sample fit ----
    train_features, train_target = _features_and_target(tr_grouped_sum)
    model = LinearRegression(fit_intercept=True)
    model.fit(train_features, train_target)
    train_fitted = model.predict(train_features)

    train_mean = np.mean(train_target)
    train_total_ss = np.sum((train_target - train_mean) ** 2)
    train_resid_ss = np.sum((train_target - train_fitted) ** 2)
    insampler2_list.append(1 - (train_resid_ss / train_total_ss))

    # ---- out-of-sample evaluation ----
    test_features, test_target = _features_and_target(te_grouped_sum)
    test_fitted = model.predict(test_features)

    # The benchmark for the test R^2 is the *training* mean, not the test mean.
    test_total_ss = np.sum((test_target - train_mean) ** 2)
    test_resid_ss = np.sum((test_target - test_fitted) ** 2)
    outsampler2_list.append(1 - (test_resid_ss / test_total_ss))

def _group_topic_weights(frame, topic_dist):
    """Sum document-topic weights per (date, comnam, ret) group.

    ``frame`` must have a fresh 0..n-1 index so that it lines up row-by-row
    with ``topic_dist``. The identifier/text columns and the ``css`` column
    are dropped, leaving only the summed topic weights.
    """
    merged = pd.concat(
        [frame.drop(columns=["rp_entity_id", "headline", "vocab_con_headline"]),
         pd.DataFrame(topic_dist)],
        axis=1,
    )
    grouped_sum = merged.groupby(['date', "comnam", "ret"]).sum()
    return grouped_sum.drop(columns=['css'])


def train_model(saved_model_folder, df, red_headlines, topic_num, cluster_num, tar_year, year_num, save_model, i):
    """Fit an LDA model on an 80/20 split and build regression feature tables.

    Args:
        saved_model_folder: directory under which the model is pickled when
            ``save_model`` is truthy.
        df: DataFrame aligned row-by-row with ``red_headlines``; must contain
            ``rp_entity_id``, ``headline``, ``vocab_con_headline``, ``date``,
            ``comnam``, ``ret`` and ``css``.
        red_headlines: list of preprocessed headline strings.
        topic_num: total topic budget; used only in the saved file name.
        cluster_num: number of topics for this LDA model.
        tar_year: last year of the data window (used in the saved file name).
        year_num: window length in years (used in the saved file name).
        save_model: when truthy, pickle the fitted model to disk.
        i: run id; seeds the train/test split and is part of the file name.

    Returns:
        (lda_model, tr_grouped_sum, te_grouped_sum): the fitted model and the
        per-(date, comnam, ret) sums of topic weights for the training and
        test documents.
    """
    # Perform the train-test split on indices
    indices = np.arange(len(red_headlines))
    tr_ind, te_ind = train_test_split(indices, test_size=0.2, shuffle=True, random_state=i)

    # Reset the index so rows align positionally with the topic matrices.
    tr_df = df.iloc[tr_ind, :].reset_index(drop=True)
    te_df = df.iloc[te_ind, :].reset_index(drop=True)
    tr_headlines = [red_headlines[ind] for ind in tr_ind]
    te_headlines = [red_headlines[ind] for ind in te_ind]

    vectorizer = CountVectorizer(ngram_range=(1, 2))
    tr_doc_term = vectorizer.fit_transform(tr_headlines)
    # Reuse the vocabulary learned on the training split. Refitting on the
    # test split would produce columns that do not correspond to the features
    # the LDA model was trained on.
    te_doc_term = vectorizer.transform(te_headlines)

    lda_model = LDA(n_topics=cluster_num, n_iter=100, random_state=66)
    lda_model.fit(tr_doc_term)

    # save the topic model
    if save_model:
        run_name = f'{tar_year - year_num + 1}_{tar_year}_{topic_num}_{i}'
        os.makedirs(saved_model_folder + f'/{run_name}', exist_ok=True)
        with open(saved_model_folder + f'/{run_name}/{run_name}_model', 'wb') as file:
            pickle.dump(lda_model, file)

    # Training features: doc-topic weights of the documents the model was fit on.
    tr_grouped_sum = _group_topic_weights(tr_df, lda_model.doc_topic_)

    # Test features: infer doc-topic weights for the held-out documents.
    te_grouped_sum = _group_topic_weights(te_df, lda_model.transform(te_doc_term))

    return lda_model, tr_grouped_sum, te_grouped_sum

In [None]:
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity

# Calculate Model Diversity Score
def compute_model_diversity(topics):
    """Return the share of distinct words among all topic top-words.

    Args:
        topics: iterable of word lists, one list per topic.

    Returns:
        ``(# unique words across all topics) / (total # of listed words)``.
        A value of 1.0 means no word is shared between topics.

    Raises:
        ZeroDivisionError: if ``topics`` contains no words at all.
    """
    all_words = [word for topic in topics for word in topic]
    return len(set(all_words)) / len(all_words)


def calculate_score(lda_model, vectorizer, embedding_model):
    """Score a fitted topic model on coherence, separation and diversity.

    The 10 highest-weight vocabulary entries of every topic are read from
    ``lda_model.topic_word_`` and embedded with ``embedding_model.encode``.

    Args:
        lda_model: fitted model exposing ``topic_word_`` (one row per topic,
            one column per vocabulary entry).
        vectorizer: fitted vectorizer whose ``get_feature_names_out()`` maps a
            column index to its term. It must be the vectorizer that produced
            the matrix the model was trained on.
        embedding_model: object with ``encode(list_of_str)`` returning one
            embedding row per string.

    Returns:
        Tuple ``(in_topic_similarity, btn_topic_similarity, model_diversity)``:
        * mean over topics of the average pairwise cosine similarity between
          a topic's top words (topics with fewer than two words are skipped);
        * average cosine similarity over all word pairs drawn from two
          different topics (``nan`` when there are fewer than two topics);
        * share of distinct words among all top words.

    Raises:
        ValueError: if the vectorizer's vocabulary size differs from the
            number of columns in ``topic_word_``.
    """
    top_n = 10
    vocab = vectorizer.get_feature_names_out()
    topic_word = np.asarray(lda_model.topic_word_)

    # A size mismatch means the vectorizer is not the one the model was
    # trained with; indexing `vocab` would then return unrelated words.
    if topic_word.shape[1] != len(vocab):
        raise ValueError(
            f"vectorizer vocabulary has {len(vocab)} terms but the topic model "
            f"was trained on {topic_word.shape[1]} features"
        )

    # Top words per topic, highest weight first.
    topic_words_list = []
    for topic_dist_data in topic_word:
        top_words_idx = np.argsort(topic_dist_data)[::-1][:top_n]
        topic_words_list.append([vocab[idx] for idx in top_words_idx])

    # One (words x embedding_dim) matrix per topic.
    topic_embeddings = [np.asarray(embedding_model.encode(words)) for words in topic_words_list]

    # Intra-topic similarity: average over the strict upper triangle of each
    # topic's word-by-word cosine similarity matrix (every unordered pair once).
    per_topic_similarity = []
    for embeddings in topic_embeddings:
        n_words = len(embeddings)
        if n_words < 2:
            continue  # no word pairs to compare
        similarity = cosine_similarity(embeddings)
        pair_idx = np.triu_indices(n_words, k=1)
        per_topic_similarity.append(float(similarity[pair_idx].mean()))
    in_topic_similarity = float(np.mean(per_topic_similarity)) if per_topic_similarity else float("nan")

    # Between-topic similarity: average over every (word in topic A, word in
    # topic B) pair for all unordered topic pairs. This is not a centroid distance.
    total_similarity = 0.0
    pair_count = 0
    for emb_a, emb_b in combinations(topic_embeddings, 2):
        cross = cosine_similarity(emb_a, emb_b)
        total_similarity += float(cross.sum())
        pair_count += cross.size
    btn_topic_similarity = total_similarity / pair_count if pair_count else float("nan")

    # Compute model diversity
    model_diversity = compute_model_diversity(topic_words_list)

    return in_topic_similarity, btn_topic_similarity, model_diversity

In [None]:
# ---- result containers, keyed by target year ----
insampler2_dict = {}
outsampler2_dict = {}
cos_in_topic_dict = {}
cos_bet_topic_dict = {}
model_diversity_dict = {}
cos_in_topic_dic = {}
cos_bet_topic_dic = {}
model_diversity_dic = {}

# ---- experiment configuration ----
modeltype = 'lda'
models = 'three_models'
combine = True            # True: one regression on pos+neg+neu features; False: one per sentiment
sentiment_type = 'no_senti'
topic_num = 120           # total topic budget, split across the three sentiment models
save_model = False
datatype = 'contem'
year_num = 1
year_list = range(2023, 2024)
df_folder = f"/shared/share_tm-finance/Final/Processed_df/{year_num}_year_window"
embeddings_folder = f"/shared/share_tm-finance/Final/Embeddings/{year_num}_year_window"
saved_model_folder = f"/shared/share_tm-finance/Final/Stored_model/{year_num}_year_window/one_model/{modeltype}"
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Run ids double as random seeds for the train/test split.
first_count = 1
last_count = 5
div = last_count - first_count + 1


def _training_vectorizer(headlines, seed):
    """Rebuild the vocabulary that ``train_model`` fits for this seed.

    ``train_model`` does not return its vectorizer, so the same 80/20 split
    (same ``random_state``) and the same bigram ``CountVectorizer`` settings
    are replayed here. The returned vocabulary therefore lines up with the
    columns of the fitted model's ``topic_word_``.
    """
    all_idx = np.arange(len(headlines))
    train_idx, _ = train_test_split(all_idx, test_size=0.2, shuffle=True, random_state=seed)
    fitted = CountVectorizer(ngram_range=(1, 2))
    fitted.fit([headlines[k] for k in train_idx])
    return fitted


def _combine_topic_features(pos_sum, neg_sum, neu_sum):
    """Join the three per-sentiment feature tables side by side.

    Topic columns are renumbered so labels are unique across the three
    tables; groups missing from a table get 0 for that table's topics.
    The inputs are not modified.
    """
    relabelled = []
    offset = 0
    for frame in (pos_sum, neg_sum, neu_sum):
        shifted = frame.copy()
        shifted.columns = [int(col) + offset for col in frame.columns]
        # Next table starts right after this table's last label.
        offset = int(shifted.columns[-1]) + 1
        relabelled.append(shifted)
    return pd.concat(relabelled, axis=1).fillna(0)


for tar_year in year_list:

    df = pd.read_csv(df_folder+f'/{datatype}_{tar_year - year_num + 1}_{tar_year}.csv')
    red_headlines = df.vocab_con_headline.tolist()
    embeddings = np.load(embeddings_folder+f"/{datatype}_{tar_year - year_num + 1}_{tar_year}_embeddings.npy")
    # Not used below; kept in case other cells rely on these names.
    indices = np.arange(len(red_headlines))
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(red_headlines)

    pos_insampler2_list = []
    pos_outsampler2_list = []
    neg_insampler2_list = []
    neg_outsampler2_list = []
    neu_insampler2_list = []
    neu_outsampler2_list = []
    insampler2_list = []
    outsampler2_list = []

    # Split the corpus by the sign of the sentiment score.
    pos_indices = df[df['css'] > 0].index
    neg_indices = df[df['css'] < 0].index
    neu_indices = df[df['css'] == 0].index
    pos_df = df.iloc[pos_indices,:]
    neg_df = df.iloc[neg_indices,:]
    neu_df = df.iloc[neu_indices,:]
    pos_headlines = [red_headlines[ind] for ind in pos_indices]
    neg_headlines = [red_headlines[ind] for ind in neg_indices]
    neu_headlines = [red_headlines[ind] for ind in neu_indices]
    pos_embeddings = embeddings[pos_indices,:]
    neg_embeddings = embeddings[neg_indices,:]
    neu_embeddings = embeddings[neu_indices,:]

    # Share of documents in each sentiment group; used to weight the averages.
    pos_weight = len(pos_embeddings) / len(embeddings)
    neg_weight = len(neg_embeddings) / len(embeddings)
    neu_weight = len(neu_embeddings) / len(embeddings)

    # Allocate the topic budget proportionally to group size; the rounding
    # remainder goes to the group with the strictly smallest allocation
    # (neutral when there is no strict minimum among pos/neg).
    pos_cluster_num = int(topic_num * len(pos_embeddings) / len(embeddings))
    neg_cluster_num = int(topic_num * len(neg_embeddings) / len(embeddings))
    neu_cluster_num = int(topic_num * len(neu_embeddings) / len(embeddings))
    diff = topic_num - (pos_cluster_num + neg_cluster_num + neu_cluster_num)
    if pos_cluster_num < neg_cluster_num and pos_cluster_num < neu_cluster_num:
        pos_cluster_num += diff
    elif neg_cluster_num < pos_cluster_num and neg_cluster_num < neu_cluster_num:
        neg_cluster_num += diff
    else:
        neu_cluster_num += diff

    # Running totals of the quality scores over the repeated runs.
    cos_in_topic_pos_sum = 0
    cos_bet_topic_pos_sum = 0
    model_diversity_pos_sum = 0
    cos_in_topic_neg_sum = 0
    cos_bet_topic_neg_sum = 0
    model_diversity_neg_sum = 0
    cos_in_topic_neu_sum = 0
    cos_bet_topic_neu_sum = 0
    model_diversity_neu_sum = 0

    for run in range(first_count, last_count + 1):

        saved_model_folder = f"/shared/share_tm-finance/Final/Stored_model/{year_num}_year_window/three_models/{modeltype}/pos"
        pos_lda_model, pos_tr_grouped_sum, pos_te_grouped_sum = \
                train_model(saved_model_folder, pos_df, pos_headlines, topic_num, pos_cluster_num,
                            tar_year, year_num, save_model, run)
        # Score with the vocabulary the model was trained on, not the global one.
        mean_score, ave_sim, model_diversity = calculate_score(
            pos_lda_model, _training_vectorizer(pos_headlines, run), embedding_model)
        cos_in_topic_pos_sum += mean_score
        cos_bet_topic_pos_sum += ave_sim
        model_diversity_pos_sum += model_diversity

        saved_model_folder = f"/shared/share_tm-finance/Final/Stored_model/{year_num}_year_window/three_models/{modeltype}/neg"
        neg_lda_model, neg_tr_grouped_sum, neg_te_grouped_sum = \
                train_model(saved_model_folder, neg_df, neg_headlines, topic_num, neg_cluster_num,
                            tar_year, year_num, save_model, run)
        mean_score, ave_sim, model_diversity = calculate_score(
            neg_lda_model, _training_vectorizer(neg_headlines, run), embedding_model)
        cos_in_topic_neg_sum += mean_score
        cos_bet_topic_neg_sum += ave_sim
        model_diversity_neg_sum += model_diversity

        saved_model_folder = f"/shared/share_tm-finance/Final/Stored_model/{year_num}_year_window/three_models/{modeltype}/neu"
        neu_lda_model, neu_tr_grouped_sum, neu_te_grouped_sum = \
                train_model(saved_model_folder, neu_df, neu_headlines, topic_num, neu_cluster_num,
                            tar_year, year_num, save_model, run)
        mean_score, ave_sim, model_diversity = calculate_score(
            neu_lda_model, _training_vectorizer(neu_headlines, run), embedding_model)
        cos_in_topic_neu_sum += mean_score
        cos_bet_topic_neu_sum += ave_sim
        model_diversity_neu_sum += model_diversity

        if combine:
            # One regression on the concatenated pos/neg/neu topic features.
            tr_grouped_sum = _combine_topic_features(pos_tr_grouped_sum, neg_tr_grouped_sum, neu_tr_grouped_sum)
            te_grouped_sum = _combine_topic_features(pos_te_grouped_sum, neg_te_grouped_sum, neu_te_grouped_sum)
            linear_regression(tr_grouped_sum, te_grouped_sum, insampler2_list, outsampler2_list)
        else:
            linear_regression(pos_tr_grouped_sum, pos_te_grouped_sum, pos_insampler2_list, pos_outsampler2_list)
            linear_regression(neg_tr_grouped_sum, neg_te_grouped_sum, neg_insampler2_list, neg_outsampler2_list)
            linear_regression(neu_tr_grouped_sum, neu_te_grouped_sum, neu_insampler2_list, neu_outsampler2_list)

    if combine:
        # The R^2 collections are plain lists, so average with np.mean.
        insampler2_dict[tar_year] = float(np.mean(insampler2_list))
        outsampler2_dict[tar_year] = float(np.mean(outsampler2_list))
    else:
        # Per-run R^2, weighted by each sentiment group's share of documents.
        sep_insampler2_list = (np.array(pos_insampler2_list) * pos_weight
                               + np.array(neg_insampler2_list) * neg_weight
                               + np.array(neu_insampler2_list) * neu_weight)
        sep_outsampler2_list = (np.array(pos_outsampler2_list) * pos_weight
                                + np.array(neg_outsampler2_list) * neg_weight
                                + np.array(neu_outsampler2_list) * neu_weight)
        insampler2_dict[tar_year] = float(sep_insampler2_list.mean())
        outsampler2_dict[tar_year] = float(sep_outsampler2_list.mean())

    # Size-weighted quality scores, averaged over the `div` runs.
    cos_in_topic_dict[tar_year] = (cos_in_topic_pos_sum * pos_weight + cos_in_topic_neg_sum * neg_weight + cos_in_topic_neu_sum * neu_weight) / div
    cos_bet_topic_dict[tar_year] = (cos_bet_topic_pos_sum * pos_weight + cos_bet_topic_neg_sum * neg_weight + cos_bet_topic_neu_sum * neu_weight) / div
    model_diversity_dict[tar_year] = (model_diversity_pos_sum * pos_weight + model_diversity_neg_sum * neg_weight + model_diversity_neu_sum * neu_weight) / div

    print("Year {year} is done".format(year = tar_year))

if not combine:
    print("sep_insample = ", insampler2_dict)
    print("sep_outsample = ", outsampler2_dict)
else:
    print("insample = ", insampler2_dict)
    print("outsample = ", outsampler2_dict)

print("cos_in_topic = ", cos_in_topic_dict)
print("cos_bet_topic = ", cos_bet_topic_dict)
print("model_diversity = ", model_diversity_dict)


In [None]:
# Write the per-year results into the existing summary workbook.
file_path = f'{topic_num}_Data.xlsx'
wb = openpyxl.load_workbook(file_path)
program_list = [insampler2_dict, outsampler2_dict, cos_in_topic_dict, cos_bet_topic_dict, model_diversity_dict]
data_list = ['Insample_R2', 'Outsample_R2', 'Cos_in_topic', 'Cos_btn_topic', 'Diversity']

# Each metric has its own sheet; sheet names and result dicts are paired by position.
for sheet_name, yearly_values in zip(data_list, program_list):
    ws = wb[sheet_name]
    for cell in ws['A']:
        # Column A: find the row that starts the block for this model family.
        if cell.value != models:
            continue
        # Column B: look for the sentiment type within the next 17 rows.
        for block_row in range(cell.row, cell.row + 17):
            if ws.cell(row=block_row, column=2).value != sentiment_type:
                continue
            # Column C: look for the model label within the next 4 rows.
            for target_row in range(block_row, block_row + 4):
                if ws.cell(row=target_row, column=3).value == f"{modeltype}_{topic_num}":
                    for year, value in yearly_values.items():
                        # Year columns are laid out as column = year - 2013 + 3.
                        ws.cell(row=target_row, column=year - 2013 + 3, value=value)
            # Only the first matching sentiment row in the block is used.
            break
wb.save(file_path)