In [2]:
import os 
import openpyxl
import pandas as pd 
import numpy as np
from lda import LDA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer
# import torch
# import gc

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Check available GPUs
!nvidia-smi
# Expose only GPU index 3 to this process via CUDA_VISIBLE_DEVICES.
# NOTE(review): the index is hard-coded; adjust it to a free device on the
# machine in use (see the nvidia-smi listing above).
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [4]:
def Standardization(css_sum_by_topic):
    """Z-score the per-topic sentiment totals.

    Parameters
    ----------
    css_sum_by_topic : pandas.Series
        Summed ``css`` values; the index holds the topic ids.

    Returns
    -------
    pandas.Series
        Series named ``css_standardized`` indexed by ``topic``, holding the
        output of ``StandardScaler().fit_transform`` on the input values.
    """
    frame = css_sum_by_topic.reset_index()
    frame.columns = ['topic', 'css']
    # The scaler expects a 2-D input, hence the double-bracket column selection.
    z_scores = StandardScaler().fit_transform(frame[['css']])
    frame['css_standardized'] = z_scores
    return frame.set_index('topic')['css_standardized']

In [12]:
def linear_regression(tr_grouped_sum, te_grouped_sum, insampler2_list, outsampler2_list):
    """Fit OLS on the training features and record in-/out-of-sample R².

    The regression target is read from the third level of each frame's index
    (position 2 of every index tuple). Both R² values use the *training*
    target mean as the baseline for the total sum of squares.

    Parameters
    ----------
    tr_grouped_sum, te_grouped_sum : pandas.DataFrame
        Feature matrices whose index tuples carry the target at position 2.
    insampler2_list, outsampler2_list : list
        Mutated in place: the in-sample and out-of-sample R² are appended.

    Returns
    -------
    None
    """
    def _features_and_target(grouped_sum):
        features = np.array(grouped_sum)
        target = np.array([key[2] for key in list(grouped_sum.index)]).reshape(-1, 1)
        return features, target

    def _r_squared(actual, predicted, baseline):
        total = np.sum((actual - baseline) ** 2)
        residual = np.sum((actual - predicted) ** 2)
        return 1 - (residual / total)

    train_X, train_y = _features_and_target(tr_grouped_sum)
    model = LinearRegression(fit_intercept=True)
    model.fit(train_X, train_y)
    train_pred = model.predict(train_X)
    train_mean = np.mean(train_y)
    insampler2_list.append(_r_squared(train_y, train_pred, train_mean))

    test_X, test_y = _features_and_target(te_grouped_sum)
    test_pred = model.predict(test_X)
    outsampler2_list.append(_r_squared(test_y, test_pred, train_mean))

    return

def _topic_features(split_df, topic_dist, sentiment_type, neutral):
    """Turn one split's document-topic matrix into grouped regression features.

    Parameters
    ----------
    split_df : pandas.DataFrame
        Rows of the headline frame belonging to this split, in the same order
        as the rows of ``topic_dist``.
    topic_dist : array-like
        One topic distribution per document.
    sentiment_type : str
        ``"per_topic"``, ``"per_ret"``, ``"only_senti"`` or ``"no_senti"``;
        any other value leaves the grouped sums untouched.
    neutral : bool
        For ``"per_ret"`` only: skip the multiplication by the summed ``css``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The features summed per ``(date, comnam, ret)`` group, and a frame
        with each document's dominant topic and full distribution.
    """
    # The split keeps the row labels of the frame it was sliced from, while the
    # topic matrix is numbered 0..n-1.  ``pd.concat(axis=1)`` aligns on labels,
    # so without this reset documents get paired with the wrong topic rows (or
    # with NaN).  Resetting also yields a fresh frame, leaving the caller's
    # data untouched.
    split_df = split_df.reset_index(drop=True)
    contem_ret_topic_dist = pd.concat(
        [split_df.drop(columns=["rp_entity_id", "headline", "vocab_con_headline"]),
         pd.DataFrame(topic_dist)],
        axis=1,
    )
    grouped_sum = contem_ret_topic_dist.groupby(['date', "comnam", "ret"]).sum()

    topic_df = pd.DataFrame([
        {'Document': doc_idx, 'Topic': doc_dist.argmax(), 'Topic Distribution': doc_dist}
        for doc_idx, doc_dist in enumerate(topic_dist)
    ])

    # NOTE(review): the ``iloc[:, 1:]`` slices below assume ``css`` is the first
    # column of ``grouped_sum`` -- confirm against the input CSV schema.
    if sentiment_type == "per_topic":
        topics = topic_df["Topic"].to_numpy()
        # ``assign`` builds a temporary frame instead of adding and dropping a
        # column on the split (which triggered SettingWithCopyWarning).
        css_sum_by_topic = split_df.assign(topic=topics).groupby('topic')['css'].sum()
        css_standardized_series = Standardization(css_sum_by_topic)
        grouped_sum.iloc[:, 1:] = grouped_sum.iloc[:, 1:].mul(css_standardized_series, axis=1)
    elif sentiment_type == "per_ret":
        if not neutral:
            grouped_sum.iloc[:, 1:] = grouped_sum.iloc[:, 1:].mul(grouped_sum['css'], axis=0)
        grouped_sum = grouped_sum.drop(columns=['css'])
    elif sentiment_type == "only_senti":
        grouped_sum = grouped_sum[['css']]
    elif sentiment_type == "no_senti":
        grouped_sum = grouped_sum.drop(columns=['css'])

    return grouped_sum, topic_df


def train_model(saved_model_folder, df, red_headlines, modeltype, topic_num, cluster_num, tar_year, sentiment_type, save_model, i, neutral = False):
    """Train an LDA topic model on an 80/20 split and build regression features.

    Parameters
    ----------
    saved_model_folder : str
        Root directory for pickled models (only used when ``save_model``).
    df : pandas.DataFrame
        One row per headline, positionally aligned with ``red_headlines``.
        Must contain ``rp_entity_id``, ``headline``, ``vocab_con_headline``,
        ``date``, ``comnam``, ``ret`` and ``css``.
    red_headlines : list of str
        Headline texts fed to the bag-of-words vectorizer.
    modeltype, topic_num, tar_year : 
        Only used to build the save path.
    cluster_num : int
        Number of LDA topics.
    sentiment_type : str
        How sentiment enters the features; see ``_topic_features``.
    save_model : bool
        Pickle the fitted model under ``saved_model_folder`` when truthy.
    i : int
        Run index: seeds the train/test split and is part of the save path.
    neutral : bool, default False
        Passed through to the ``"per_ret"`` feature construction.

    Returns
    -------
    tuple
        ``(lda_model, tr_topic_dist, tr_grouped_sum, te_grouped_sum, Topic_df)``
        where ``Topic_df`` describes the training documents.

    Notes
    -----
    The ``CountVectorizer`` is fitted on the training headlines only and is not
    returned. ``lda_model.topic_word_`` columns follow that training
    vocabulary, so a vectorizer fitted on other text must not be used to map
    them back to words.
    """
    # Perform the train-test split on indices
    indices = np.arange(len(red_headlines))
    tr_ind, te_ind = train_test_split(indices, test_size=0.2, shuffle=True, random_state=i)

    tr_df = df.iloc[tr_ind, :]
    te_df = df.iloc[te_ind, :]
    tr_headlines = [red_headlines[ind] for ind in tr_ind]
    te_headlines = [red_headlines[ind] for ind in te_ind]

    vectorizer = CountVectorizer()
    tr_doc_term = vectorizer.fit_transform(tr_headlines)
    # Re-use the training vocabulary.  Calling fit_transform here (as before)
    # learned a new vocabulary from the test set, so the columns handed to
    # ``lda_model.transform`` no longer matched the ones the model was fit on.
    te_doc_term = vectorizer.transform(te_headlines)

    lda_model = LDA(n_topics = cluster_num, n_iter = 100, random_state = 66)
    lda_model.fit(tr_doc_term)

    # Save the topic model
    if save_model:
        run_dir = saved_model_folder+f'/{sentiment_type}/{modeltype}/{tar_year}_{topic_num}_{i}'
        os.makedirs(run_dir, exist_ok = True)
        with open(saved_model_folder+f'/{sentiment_type}/{modeltype}/{tar_year}_{topic_num}_{i}/{tar_year}_{topic_num}_{i}_model', 'wb') as file:
            pickle.dump(lda_model, file)

    # In-sample features
    tr_topic_dist = lda_model.doc_topic_
    tr_grouped_sum, Topic_df = _topic_features(tr_df, tr_topic_dist, sentiment_type, neutral)

    # Out-of-sample features
    te_topic_dist = lda_model.transform(te_doc_term)
    te_grouped_sum, _ = _topic_features(te_df, te_topic_dist, sentiment_type, neutral)

    return lda_model, tr_topic_dist, tr_grouped_sum, te_grouped_sum, Topic_df

In [None]:
from sklearn.metrics.pairwise import cosine_distances
from itertools import combinations
from scipy.spatial.distance import cosine

def calculate_pairwise_cosine_distances(embeddings):
    """Return the pairwise cosine-distance matrix of ``embeddings``.

    Thin wrapper around ``sklearn.metrics.pairwise.cosine_distances``; the
    argument is forwarded unchanged and the result returned as-is.
    """
    distance_matrix = cosine_distances(embeddings)
    return distance_matrix

def compute_model_diversity(topics):
    """Compute the model diversity score of a set of topics.

    The score is the number of distinct words across all topics divided by
    the total number of words listed (1.0 means no word is shared or repeated).

    Parameters
    ----------
    topics : iterable of iterables
        One collection of (hashable) words per topic.

    Returns
    -------
    float
        ``unique / total``; ``0.0`` when no words are supplied at all, which
        previously raised ``ZeroDivisionError``.
    """
    topic_word_lists = [list(topic) for topic in topics]
    total_words = sum(len(words) for words in topic_word_lists)
    if total_words == 0:
        return 0.0
    unique_words = set().union(*topic_word_lists)
    return len(unique_words) / total_words

def load_model(saved_model_folder, modeltype, tar_year, cluster_num, count_num, sentiment_type):
    """Load a pickled topic model and its stored document-topic matrix.

    The file is read from
    ``{saved_model_folder}/{sentiment_type}/{modeltype}/{tar_year}_{cluster_num}_{count_num}/{tar_year}_{cluster_num}_{count_num}_model``.

    Returns
    -------
    tuple
        ``(model, model.doc_topic_)``.
    """
    model_path = f'{saved_model_folder}/{sentiment_type}/{modeltype}/{tar_year}_{cluster_num}_{count_num}/{tar_year}_{cluster_num}_{count_num}_model'
    # NOTE(review): pickle.load can execute arbitrary code -- only point this
    # at model files produced by this project.
    with open(model_path, 'rb') as model_file:
        restored_model = pickle.load(model_file)
    return restored_model, restored_model.doc_topic_

def calculate_score(lda_model, topic_dist, vectorizer, embedding_model):
    """Score a topic model on coherence, separation and diversity.

    For every topic the ten highest-weighted words are embedded; three numbers
    are derived from those embeddings and word lists.

    Parameters
    ----------
    lda_model :
        Fitted model exposing ``topic_word_`` (one row of word weights per topic).
    topic_dist :
        Unused; kept so existing callers keep working.
    vectorizer :
        Fitted vectorizer whose ``get_feature_names_out()`` maps the columns of
        ``topic_word_`` to words. It must be the vectorizer the model was
        trained with; a size mismatch triggers a ``UserWarning``.
    embedding_model :
        Object with ``encode(list_of_words)`` returning one vector per word.

    Returns
    -------
    tuple
        ``(mean_score, ave_sim, model_diversity)``:

        * ``mean_score`` -- mean over topics of the mean of the full pairwise
          cosine-distance matrix of the topic's word embeddings (the zero
          diagonal is included in that mean);
        * ``ave_sim`` -- mean cosine similarity between topic centroids over
          all topic pairs, ``nan`` when there are fewer than two topics
          (previously a ``ZeroDivisionError``);
        * ``model_diversity`` -- result of ``compute_model_diversity``.
    """
    import warnings

    vocab = vectorizer.get_feature_names_out()
    topic_word = np.asarray(lda_model.topic_word_)
    if topic_word.ndim == 2 and topic_word.shape[1] != len(vocab):
        # Column j of topic_word_ is only word j of *this* vocabulary if the
        # same vectorizer produced the training matrix.
        warnings.warn(
            f"vectorizer vocabulary size ({len(vocab)}) does not match the model's "
            f"word dimension ({topic_word.shape[1]}); topic words are likely wrong",
            stacklevel=2,
        )

    # Top-10 words per topic, highest weight first.
    topic_words = {}
    for topic_idx, word_weights in enumerate(topic_word):
        top_words_idx = np.argsort(word_weights)[::-1][:10]
        topic_words[topic_idx] = [vocab[word_idx] for word_idx in top_words_idx]

    # Embeddings for the words in each topic.
    topic_embeddings = {
        topic: embedding_model.encode(words) for topic, words in topic_words.items()
    }

    # In-topic score: mean pairwise cosine distance per topic, averaged over topics.
    topic_scores = {
        topic: np.mean(calculate_pairwise_cosine_distances(embeddings))
        for topic, embeddings in topic_embeddings.items()
    }
    mean_score = np.mean(list(topic_scores.values()))

    # Between-topic score: cosine similarity of topic centroids, averaged over
    # all pairs.  Centroids are computed once instead of once per pair.
    centroids = {
        topic: np.mean(embeddings, axis=0) for topic, embeddings in topic_embeddings.items()
    }
    sim = 0
    count = 0
    for topic1, topic2 in combinations(topic_words, 2):
        sim += 1 - cosine(centroids[topic1], centroids[topic2])
        count += 1
    ave_sim = sim / count if count else float("nan")

    model_diversity = compute_model_diversity(list(topic_words.values()))
    return mean_score, ave_sim, model_diversity


In [None]:
# ---- Result containers, keyed by year ----
insampler2_dict = {}
outsampler2_dict = {}
cos_in_topic_dict = {}
cos_bet_topic_dict = {}
model_diversity_dict = {}
# Not used in this cell; kept in case other cells still reference them.
cos_in_topic_dic = {}
cos_bet_topic_dic = {}
model_diversity_dic = {}

# ---- Experiment configuration ----
modeltype = 'lda'
sentiment_type = 'per_ret'
topic_num = 120          # total topic budget, split across pos/neg/neu models
save_model = False
sentiment_sign = False
combine = False          # True: one regression on the concatenated features
datatype = 'contem'
df_folder = "/shared/share_tm-finance/Processed_df_Sentiment/One_year_window"
embeddings_folder = "/shared/share_tm-finance/Embeddings_with_Sentiment/One_year_window"
saved_model_folder = "/shared/share_tm-finance/Stored_model/new_data"
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Run indices (also the train/test split seeds); ``div`` is the number of runs.
first_count = 1
last_count = 5
div = last_count - first_count + 1

for tar_year in range(2023, 2024):

    df = pd.read_csv(df_folder+'/{type}_{year}_senti.csv'.format(year = tar_year, type = datatype))
    red_headlines = df.vocab_con_headline.tolist()
    embeddings = np.load(embeddings_folder+"/{type}_{year}_senti_embeddings.npy".format(year = tar_year, type = datatype))
    indices = np.arange(len(red_headlines))
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(red_headlines)

    pos_insampler2_list = []
    pos_outsampler2_list = []
    neg_insampler2_list = []
    neg_outsampler2_list = []
    neu_insampler2_list = []
    neu_outsampler2_list = []
    insampler2_list = []
    outsampler2_list = []

    # Split the corpus by the sign of the sentiment score.
    # The index labels are used positionally below, which relies on the
    # default RangeIndex produced by read_csv.
    pos_indices = df[df['css'] > 0].index
    neg_indices = df[df['css'] < 0].index
    neu_indices = df[df['css'] == 0].index
    pos_df = df.iloc[pos_indices,:]
    neg_df = df.iloc[neg_indices,:]
    neu_df = df.iloc[neu_indices,:]
    pos_headlines = [red_headlines[ind] for ind in pos_indices]
    neg_headlines = [red_headlines[ind] for ind in neg_indices]
    neu_headlines = [red_headlines[ind] for ind in neu_indices]
    pos_embeddings = embeddings[pos_indices,:]
    neg_embeddings = embeddings[neg_indices,:]
    neu_embeddings = embeddings[neu_indices,:]

    # Share of documents in each sentiment group; used to weight every metric.
    pos_weight = len(pos_embeddings) / len(embeddings)
    neg_weight = len(neg_embeddings) / len(embeddings)
    neu_weight = len(neu_embeddings) / len(embeddings)

    # Set pos/neg/neu cluster counts in proportion to group size; the rounding
    # remainder goes to the strictly smallest group (neutral on ties).
    pos_cluster_num = int(topic_num * len(pos_embeddings) / len(embeddings))
    neg_cluster_num = int(topic_num * len(neg_embeddings) / len(embeddings))
    neu_cluster_num = int(topic_num * len(neu_embeddings) / len(embeddings))
    diff = topic_num - (pos_cluster_num + neg_cluster_num + neu_cluster_num)
    if pos_cluster_num < neg_cluster_num and pos_cluster_num < neu_cluster_num:
        pos_cluster_num += diff
    elif neg_cluster_num < pos_cluster_num and neg_cluster_num < neu_cluster_num:
        neg_cluster_num += diff
    else:
        neu_cluster_num += diff

    # Per-group metric accumulators over the runs.
    cos_in_topic_pos_sum = 0
    cos_bet_topic_pos_sum = 0
    model_diversity_pos_sum = 0
    cos_in_topic_neg_sum = 0
    cos_bet_topic_neg_sum = 0
    model_diversity_neg_sum = 0
    cos_in_topic_neu_sum = 0
    cos_bet_topic_neu_sum = 0
    model_diversity_neu_sum = 0

    # NOTE(review): ``vectorizer`` above is fitted on the whole year's headlines,
    # while train_model fits its own vectorizer on each training split.  The
    # vocabulary handed to calculate_score therefore does not index the
    # columns of the trained model's ``topic_word_``; the topic-word based
    # scores should be recomputed with the training vectorizer.
    for run_idx in range(first_count, last_count + 1):

        saved_model_folder = "/shared/share_tm-finance/Stored_model/three_models/pos_topic"
        pos_lda_model, pos_tr_topic_dist, pos_tr_grouped_sum, pos_te_grouped_sum, pos_Topic_df = \
                train_model(saved_model_folder, pos_df, pos_headlines, modeltype, topic_num, pos_cluster_num, tar_year, sentiment_type, save_model, run_idx)
        mean_score, ave_sim, model_diversity = calculate_score(pos_lda_model, pos_tr_topic_dist, vectorizer, embedding_model)
        cos_in_topic_pos_sum += mean_score
        cos_bet_topic_pos_sum += ave_sim
        model_diversity_pos_sum += model_diversity

        saved_model_folder = "/shared/share_tm-finance/Stored_model/three_models/neg_topic"
        neg_lda_model, neg_tr_topic_dist, neg_tr_grouped_sum, neg_te_grouped_sum, neg_Topic_df = \
                train_model(saved_model_folder, neg_df, neg_headlines, modeltype, topic_num, neg_cluster_num, tar_year, sentiment_type, save_model, run_idx)
        mean_score, ave_sim, model_diversity = calculate_score(neg_lda_model, neg_tr_topic_dist, vectorizer, embedding_model)
        cos_in_topic_neg_sum += mean_score
        cos_bet_topic_neg_sum += ave_sim
        model_diversity_neg_sum += model_diversity

        saved_model_folder = "/shared/share_tm-finance/Stored_model/three_models/neu_topic"
        neu_lda_model, neu_tr_topic_dist, neu_tr_grouped_sum, neu_te_grouped_sum, neu_Topic_df = \
                train_model(saved_model_folder, neu_df, neu_headlines, modeltype, topic_num, neu_cluster_num, tar_year, sentiment_type, save_model, run_idx, True)
        mean_score, ave_sim, model_diversity = calculate_score(neu_lda_model, neu_tr_topic_dist, vectorizer, embedding_model)
        cos_in_topic_neu_sum += mean_score
        cos_bet_topic_neu_sum += ave_sim
        model_diversity_neu_sum += model_diversity

        if combine:
            # Shift the neg/neu topic labels so they continue after the previous
            # block.  The "+ 1" keeps the first shifted label from repeating the
            # previous block's last label.
            pos_last_col = int(pos_tr_grouped_sum.columns[-1])
            neg_tr_grouped_sum.columns = [str(int(col) + pos_last_col + 1) for col in neg_tr_grouped_sum.columns]
            neg_last_col = int(neg_tr_grouped_sum.columns[-1])
            neu_tr_grouped_sum.columns = [str(int(col) + neg_last_col + 1) for col in neu_tr_grouped_sum.columns]

            pos_last_col = int(pos_te_grouped_sum.columns[-1])
            neg_te_grouped_sum.columns = [str(int(col) + pos_last_col + 1) for col in neg_te_grouped_sum.columns]
            neg_last_col = int(neg_te_grouped_sum.columns[-1])
            neu_te_grouped_sum.columns = [str(int(col) + neg_last_col + 1) for col in neu_te_grouped_sum.columns]

            # Groups missing from one sentiment model get zero loadings.
            tr_grouped_sum = pd.concat([pos_tr_grouped_sum, neg_tr_grouped_sum, neu_tr_grouped_sum], axis = 1)
            tr_grouped_sum = tr_grouped_sum.fillna(0)
            te_grouped_sum = pd.concat([pos_te_grouped_sum, neg_te_grouped_sum, neu_te_grouped_sum], axis = 1)
            te_grouped_sum = te_grouped_sum.fillna(0)
            linear_regression(tr_grouped_sum, te_grouped_sum, insampler2_list, outsampler2_list)
        else:
            linear_regression(pos_tr_grouped_sum, pos_te_grouped_sum, pos_insampler2_list, pos_outsampler2_list)
            linear_regression(neg_tr_grouped_sum, neg_te_grouped_sum, neg_insampler2_list, neg_outsampler2_list)
            linear_regression(neu_tr_grouped_sum, neu_te_grouped_sum, neu_insampler2_list, neu_outsampler2_list)

    if combine:
        # Plain lists have no .mean(); use numpy.
        insampler2_dict[tar_year] = np.mean(insampler2_list)
        outsampler2_dict[tar_year] = np.mean(outsampler2_list)
    else:
        # Convert to arrays first: ``list * int`` repeats the list and the
        # following division raises TypeError.
        sep_insampler2_list = (np.array(pos_insampler2_list) * pos_weight
                               + np.array(neg_insampler2_list) * neg_weight
                               + np.array(neu_insampler2_list) * neu_weight)
        sep_outsampler2_list = (np.array(pos_outsampler2_list) * pos_weight
                                + np.array(neg_outsampler2_list) * neg_weight
                                + np.array(neu_outsampler2_list) * neu_weight)
        insampler2_dict[tar_year] = sep_insampler2_list.mean()
        # Was written to insampler2_dict, overwriting the in-sample result.
        outsampler2_dict[tar_year] = sep_outsampler2_list.mean()

    cos_in_topic_dict[tar_year] = (cos_in_topic_pos_sum * pos_weight + cos_in_topic_neg_sum * neg_weight + cos_in_topic_neu_sum * neu_weight) / div
    cos_bet_topic_dict[tar_year] = (cos_bet_topic_pos_sum * pos_weight + cos_bet_topic_neg_sum * neg_weight + cos_bet_topic_neu_sum * neu_weight) / div
    model_diversity_dict[tar_year] = (model_diversity_pos_sum * pos_weight + model_diversity_neg_sum * neg_weight + model_diversity_neu_sum * neu_weight) / div

    print("Year {year} is done".format(year = tar_year))

if not combine:
    print("sep_insample = ", insampler2_dict)
    print("sep_outsample = ", outsampler2_dict)
else:
    print("insample = ", insampler2_dict)
    print("outsample = ", outsampler2_dict)

print("cos_in_topic = ", cos_in_topic_dict)
print("cos_bet_topic = ", cos_bet_topic_dict)
print("model_diversity = ", model_diversity_dict)


In [None]:
# Write the per-year metrics into the matching sheet of the results workbook.
file_path = 'Data.xlsx'
wb = openpyxl.load_workbook(file_path)
# ``program_list[k]`` holds the values for the sheet named ``data_list[k]``.
program_list = [insampler2_dict, outsampler2_dict, cos_in_topic_dict, cos_bet_topic_dict, model_diversity_dict]
data_list = ['Insample_R2', 'Outsample_R2', 'Cos_in_topic', 'Cos_btn_topic', 'Diversity']

# Was ``for i in len(data_list)``, which raises TypeError (int is not iterable).
for sheet_name, results_by_year in zip(data_list, program_list):
    ws = wb[sheet_name]
    for cell in ws['A']:
        if cell.value is not None:
            # Each non-empty cell in column A starts a 3-row section; column B
            # of those rows names the model variant.
            for row_num in range(cell.row, cell.row + 3):
                model_cell = ws.cell(row=row_num, column=2).value
                if model_cell == f"{modeltype}_{topic_num}":
                    for year, value in results_by_year.items():
                        # Year 2014 maps to column 3, 2015 to column 4, ...
                        ws.cell(row=row_num, column=year - 2013 + 2, value=value)
wb.save(file_path)

In [None]:
# Plot every metric section of the results workbook, one figure per section.
# ``plt`` was used without being imported anywhere in the notebook, so this
# cell failed with NameError on a fresh kernel.
import matplotlib.pyplot as plt

file_path = 'Data.xlsx'
wb = openpyxl.load_workbook(file_path)
data_list = ['Insample_R2', 'Outsample_R2', 'Cos_in_topic', 'Cos_btn_topic', 'Diversity']
years = range(2014, 2024)

# savefig does not create missing directories.
os.makedirs('graph', exist_ok=True)

for data_type in data_list:
    ws = wb[data_type]
    for cell in ws['A']:
        if cell.value is not None:
            plt.figure(figsize=(10, 6))
            plt.title(f'{cell.value} {data_type}')
            # Each non-empty cell in column A starts a 3-row section; every row
            # is one model variant, labelled by column B.
            for row_num in range(cell.row, cell.row + 3):
                model_cell = ws.cell(row=row_num, column=2).value
                # Year 2014 is stored in column 3, 2015 in column 4, ...
                data = [ws.cell(row=row_num, column=year - 2013 + 2).value for year in years]
                plt.plot(years, data, label = model_cell)
            plt.xlabel('Year')
            plt.ylabel(f'{data_type}')
            plt.legend()
            plt.savefig(f'graph/{cell.value}_{data_type}.png')
            plt.close()