# Preparation
Importing packages, setting up the GPU, and defining helper functions for BERTopic.

In [None]:
import os 
import openpyxl
import pandas as pd 
import numpy as np
import pickle
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from bertopic.cluster import BaseCluster
import re
import collections
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from cuml.manifold import UMAP
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from cuml.cluster import HDBSCAN
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.feature_extraction.text import CountVectorizer
import collections
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

In [None]:
# Check available GPUs (IPython shell escape; only works inside a notebook/IPython session)
!nvidia-smi
# Make only the GPU with index 1 visible to CUDA-based libraries in this process.
# NOTE(review): this assignment runs after the cuml imports in the previous cell;
# confirm the variable is still honoured at that point, or move it above the imports.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [59]:
class ClusterModel:
    """Adapter that exposes ``labels_`` for an estimator offering fit/predict.

    The wrapped ``model`` must provide ``fit(X)`` and ``predict(X)``. After
    either ``fit`` or ``predict`` is called on this wrapper, the most recent
    predictions are available on ``labels_``.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        """Fit the wrapped model on ``X``, store its labels for ``X``, return self.

        ``y`` is accepted for signature compatibility and is not used.
        """
        estimator = self.model
        estimator.fit(X)
        self.labels_ = estimator.predict(X)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X`` and cache them on ``labels_``."""
        self.labels_ = self.model.predict(X)
        return self.labels_

In [None]:
def model_setup(modeltype, cluster_num, hdb_min_cluster_size):
    """Build a BERTopic model for the requested reducer/clusterer combination.

    Args:
        modeltype: One of 'umaphdbscan', 'pcakmeans', 'umapgmm', 'pcagmm',
            'svdkmeans' or 'pcahdbscan'. The legacy misspelling 'svdkeans'
            is still accepted. Any other value falls back to PCA + KMeans
            and emits a warning.
        cluster_num: Number of clusters/components for KMeans and the
            Gaussian mixture.
        hdb_min_cluster_size: ``min_cluster_size`` passed to HDBSCAN.

    Returns:
        Tuple ``(Topic_model, embedding_model)``: the unfitted BERTopic
        instance and the SentenceTransformer it was built with.
    """
    import warnings

    # Embeddings
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    # Reduce dimensionality
    PCA_model = PCA(n_components=10)
    SVD_model = TruncatedSVD(n_components=10, random_state=42, n_iter=10)
    umap_model = UMAP(n_neighbors=10, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
    # Cluster embeddings
    # GaussianMixture is wrapped so that it exposes ``labels_`` after fitting.
    gmm_model = ClusterModel(GaussianMixture(n_components=cluster_num, covariance_type='full'))
    KMeans_model = KMeans(n_clusters=cluster_num)
    hdbscan_model = HDBSCAN(min_cluster_size=hdb_min_cluster_size, metric = "euclidean", cluster_selection_method="eom",
                            gen_min_span_tree = True, prediction_data = True, min_samples = 20, verbose = True)
    # Vectorize
    vectorizer_model = CountVectorizer(stop_words="english", min_df=1, max_df = 50, ngram_range=(1, 2))

    model_combinations = {
        'umaphdbscan': (umap_model, hdbscan_model),
        'pcakmeans': (PCA_model, KMeans_model),
        'umapgmm': (umap_model, gmm_model),
        'pcagmm': (PCA_model, gmm_model),
        'svdkmeans': (SVD_model, KMeans_model),
        # Legacy misspelling, kept so existing callers keep working.
        'svdkeans': (SVD_model, KMeans_model),
        'pcahdbscan': (PCA_model, hdbscan_model)
    }

    if modeltype not in model_combinations:
        # Keep the original fallback, but make it visible: a silent fallback
        # hides typos in ``modeltype``.
        warnings.warn(
            f"Unknown modeltype {modeltype!r}; falling back to 'pcakmeans'.",
            stacklevel=2,
        )

    dim_red_model, cluster_model = model_combinations.get(modeltype, (PCA_model, KMeans_model))

    # BERTopic's ``umap_model`` / ``hdbscan_model`` slots receive whichever
    # reducer / clusterer was selected above.
    Topic_model = BERTopic(embedding_model=embedding_model, umap_model=dim_red_model, hdbscan_model=cluster_model, vectorizer_model=vectorizer_model,
                        calculate_probabilities = False, verbose = True, low_memory = True)

    return Topic_model, embedding_model


def Standardization(css_sum_by_topic):
    """Standardize per-topic css sums with ``StandardScaler``.

    Args:
        css_sum_by_topic: Series of css sums whose index holds the topic ids.

    Returns:
        Series named 'css_standardized', indexed by 'topic', holding the
        scaled values.
    """
    table = css_sum_by_topic.reset_index()
    table.columns = ['topic', 'css']
    # The scaler expects a 2-D input, hence the double brackets.
    scaled = StandardScaler().fit_transform(table[['css']])
    table['css_standardized'] = scaled
    return table.set_index('topic')['css_standardized']
    

def _aggregate_topic_exposure(frame, topic_dist, topics, sentiment_type, neutral):
    """Sum topic distributions per (date, comnam, ret) and apply the sentiment scheme.

    Args:
        frame: Rows matching ``topic_dist`` one-to-one; must contain the
            columns 'rp_entity_id', 'headline', 'vocab_con_headline', 'date',
            'comnam', 'ret' and 'css'.
        topic_dist: Per-document topic distribution matrix.
        topics: Topic assignment per row of ``frame`` (used for 'per_topic').
        sentiment_type: 'per_topic', 'per_ret', 'only_senti' or 'no_senti'.
            Any other value leaves the grouped sums untouched.
        neutral: When True, 'per_ret' skips the multiplication by css.

    Returns:
        DataFrame indexed by (date, comnam, ret).
    """
    frame = frame.reset_index(drop=True)
    features = pd.concat(
        [frame.drop(columns=["rp_entity_id", "headline", "vocab_con_headline"]),
         pd.DataFrame(topic_dist)],
        axis=1,
    )
    grouped_sum = features.groupby(['date', "comnam", "ret"]).sum()

    # NOTE: ``iloc[:, 1:]`` below assumes 'css' is the first remaining column
    # and everything after it is a topic column -- verify against the input schema.
    if sentiment_type == "per_topic":
        css_sum_by_topic = frame.assign(topic=topics).groupby('topic')['css'].sum()
        css_standardized_series = Standardization(css_sum_by_topic)
        # Remove the outlier topic (-1) by label. The topic-distribution
        # columns start at 0, and a positional ``[1:]`` would drop topic 0
        # whenever no outlier topic exists (e.g. after outlier reduction).
        css_standardized_series = css_standardized_series.drop(index=-1, errors='ignore')
        grouped_sum.iloc[:, 1:] = grouped_sum.iloc[:, 1:].mul(css_standardized_series, axis=1)
        grouped_sum.drop(columns=['css'], inplace=True)
    elif sentiment_type == "per_ret":
        if not neutral:
            grouped_sum.iloc[:, 1:] = grouped_sum.iloc[:, 1:].mul(grouped_sum['css'], axis=0)
        grouped_sum.drop(columns=['css'], inplace=True)
    elif sentiment_type == "only_senti":
        grouped_sum = grouped_sum[['css']]
    elif sentiment_type == "no_senti":
        grouped_sum.drop(columns=['css'], inplace=True)

    return grouped_sum


def train_model(saved_model_folder, df, red_headlines, embeddings, modeltype, topic_num, cluster_num, hdb_min_cluster_size, tar_year, year_num, sentiment_type, save_model, reduce_outliers, i, neutral = False):
    """Fit a topic model on an 80/20 split and build train/test regression features.

    Args:
        saved_model_folder: Folder the fitted model is saved under when
            ``save_model`` is true.
        df: DataFrame aligned row-by-row with ``red_headlines`` and ``embeddings``.
        red_headlines: List of headline strings fed to the topic model.
        embeddings: Precomputed embedding matrix, one row per headline.
        modeltype: Reducer/clusterer key forwarded to ``model_setup``.
        topic_num: Only used in the saved model's file name.
        cluster_num: Cluster count forwarded to ``model_setup``.
        hdb_min_cluster_size: HDBSCAN setting forwarded to ``model_setup``.
        tar_year, year_num: Only used in the saved model's file name.
        sentiment_type: Feature scheme, see ``_aggregate_topic_exposure``.
        save_model: Whether to persist the fitted model.
        reduce_outliers: Whether to reassign outlier documents and refresh
            the topic representations.
        i: Random seed of the train/test split; also part of the saved name.
        neutral: Forwarded to ``_aggregate_topic_exposure``.

    Returns:
        Tuple ``(topics, Topic_model, tr_grouped_sum, te_grouped_sum)`` where
        ``topics`` are the training-set topic assignments.
    """
    # Perform the train-test split on indices
    indices = np.arange(len(red_headlines))
    tr_ind, te_ind = train_test_split(indices, test_size=0.2, shuffle= True, random_state=i)

    tr_df = df.iloc[tr_ind,:]
    te_df = df.iloc[te_ind,:]
    tr_headlines = [red_headlines[ind] for ind in tr_ind]
    te_headlines = [red_headlines[ind] for ind in te_ind]
    tr_embeddings = embeddings[tr_ind,:]
    te_embeddings = embeddings[te_ind,:]

    Topic_model, embedding_model = model_setup(modeltype, cluster_num, hdb_min_cluster_size)

    topics, _ = Topic_model.fit_transform(tr_headlines, embeddings = tr_embeddings)

    #save the topic model (before any outlier reduction is applied)
    if save_model:
        Topic_model.save(saved_model_folder+f'/{tar_year - year_num + 1}_{tar_year}_{topic_num}_{i}',
        serialization = "safetensors", save_ctfidf = True, save_embedding_model = embedding_model)

    #reduce outliers
    if reduce_outliers:
        topics = Topic_model.reduce_outliers(tr_headlines, topics)
        Topic_model.update_topics(tr_headlines, topics=topics)

    #features for insample R2
    tr_topic_dist, _ = Topic_model.approximate_distribution(tr_headlines)
    tr_grouped_sum = _aggregate_topic_exposure(tr_df, tr_topic_dist, topics, sentiment_type, neutral)

    #features for outsample R2
    new_topics, _ = Topic_model.transform(te_headlines, embeddings = te_embeddings)

    if reduce_outliers:
        new_topics = Topic_model.reduce_outliers(te_headlines, new_topics)
        # NOTE(review): this refreshes the topic representations from the
        # *test* headlines, so the returned model no longer reflects only the
        # training data -- confirm this is intended.
        Topic_model.update_topics(te_headlines, topics=new_topics)

    te_topic_dist, _ = Topic_model.approximate_distribution(te_headlines)
    te_grouped_sum = _aggregate_topic_exposure(te_df, te_topic_dist, new_topics, sentiment_type, neutral)

    return topics, Topic_model, tr_grouped_sum, te_grouped_sum
    

def linear_regression(tr_grouped_sum, te_grouped_sum, insampler2_list, outsampler2_list):
    """Fit OLS on the training features and append in-/out-of-sample R2.

    The regression target is the third element of each index tuple of the
    grouped frames. The out-of-sample R2 uses the *training* mean of the
    target as its baseline. Results are appended to ``insampler2_list`` and
    ``outsampler2_list`` in place; nothing is returned.
    """

    def _features_and_target(grouped_sum):
        # Columns are the regressors; the target is read from the index tuples.
        features = np.array(grouped_sum)
        target = np.array([key[2] for key in grouped_sum.index]).reshape(-1, 1)
        return features, target

    # Linear regression for insample R2
    tr_X, tr_Y = _features_and_target(tr_grouped_sum)
    tr_regression = LinearRegression(fit_intercept=True)
    tr_regression.fit(tr_X, tr_Y)
    tr_Y_mean = np.mean(tr_Y)
    tr_residual_ss = np.sum((tr_Y - tr_regression.predict(tr_X)) ** 2)
    tr_total_ss = np.sum((tr_Y - tr_Y_mean) ** 2)
    insampler2_list.append(1 - (tr_residual_ss / tr_total_ss))

    # Linear regression for outsample R2 (same fitted model, training mean as baseline)
    te_X, te_Y = _features_and_target(te_grouped_sum)
    te_residual_ss = np.sum((te_Y - tr_regression.predict(te_X)) ** 2)
    te_total_ss = np.sum((te_Y - tr_Y_mean) ** 2)
    outsampler2_list.append(1 - (te_residual_ss / te_total_ss))

    return

In [None]:
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity


# Compute the model diversity score
def compute_model_diversity(topics):
    """Return the share of unique words among all topic words.

    Args:
        topics: Iterable of topics, each a sequence of words.

    Returns:
        ``len(unique words) / total number of words``; ``0.0`` when there
        are no words at all (instead of raising ``ZeroDivisionError``).
    """
    unique_words = set()
    total_words = 0
    for topic in topics:
        total_words += len(topic)
        unique_words.update(topic)

    if total_words == 0:
        return 0.0

    return len(unique_words) / total_words


# Calculate all the scores
def calculate_score(Topic_model, embedding_model, topics):
    """Compute intra-topic similarity, between-topic similarity and diversity.

    Args:
        Topic_model: Fitted topic model exposing ``get_topics()``.
        embedding_model: Model exposing ``encode(list_of_words)``.
        topics: Kept for backward compatibility; the topic ids are now read
            from ``Topic_model.get_topics()`` instead.

    Returns:
        Tuple ``(in_topic_similarity, btn_topic_similarity, model_diversity)``:
        * mean over topics of the mean pairwise cosine similarity of the
          topic's word embeddings (topics with fewer than two words are
          skipped);
        * mean cosine similarity over all word pairs drawn from two
          different topics;
        * unique-word ratio over all topics except the outlier topic -1.
        The similarity scores are ``nan`` when no pair exists.
    """
    topic_info = Topic_model.get_topics()
    topic_words = {topic: [word for word, _ in words] for topic, words in topic_info.items()}

    # Get embeddings for the words in each topic
    topic_embeddings = {
        topic: np.asarray(embedding_model.encode(words))
        for topic, words in topic_words.items()
    }

    # Intra-topic similarity: one similarity matrix per topic, averaged over
    # the strict upper triangle (each unordered word pair exactly once).
    per_topic_similarity = []
    for vectors in topic_embeddings.values():
        n_words = len(vectors)
        if n_words < 2:
            # No word pair to compare; the original divided by zero here.
            continue
        similarity = cosine_similarity(vectors)
        per_topic_similarity.append(similarity[np.triu_indices(n_words, k=1)].mean())
    in_topic_similarity = np.mean(per_topic_similarity) if per_topic_similarity else float('nan')

    # Between-topic similarity: average over every cross-topic word pair.
    pair_similarity_sum = 0.0
    pair_count = 0
    for topic1, topic2 in combinations(topic_embeddings, 2):
        first, second = topic_embeddings[topic1], topic_embeddings[topic2]
        if len(first) == 0 or len(second) == 0:
            continue
        pair_similarity_sum += cosine_similarity(first, second).sum()
        pair_count += len(first) * len(second)
    btn_topic_similarity = pair_similarity_sum / pair_count if pair_count else float('nan')

    # Model diversity over all real topics. Selecting by label (skip -1)
    # instead of ``range(len(set(topics)) - 1)`` keeps the last topic for
    # models that never produce an outlier topic (e.g. KMeans, GMM).
    topic_words_list = [words for topic, words in topic_words.items() if topic != -1]
    model_diversity = compute_model_diversity(topic_words_list)

    return in_topic_similarity, btn_topic_similarity, model_diversity

# Building Model and Calculating R-Square and Model Evaluation Score
Choose the 'model type', 'topic number', 'data type', and whether to save the model or not. \
Build the model combination and split the data into train and test sets. Train the model on the dataset and generate topics. \
Use linear regression to calculate the in-sample and out-of-sample R-squared for the topic weights of the positive, negative, and neutral topic models.
Calculate the cosine similarity within and between topics, and the model diversity score.

In [None]:
# Per-year results, keyed by target year.
insampler2_dict = {}
outsampler2_dict = {}
cos_in_topic_dict = {}
cos_bet_topic_dict = {}
model_diversity_dict = {}

modeltype = 'pcakmeans' # umaphdbscan, pcakmeans, umapgmm, pcagmm, svdkmeans, pcahdbscan
models = 'three_models' # three_models or one_model
combine = True
sentiment_type = 'no_senti' # no_senti, with_senti, per_topic, per_ret, only_senti
topic_num = 120
save_model = False
reduce_outliers = False
datatype = 'contem' # contem or future
year_num = 1
year_list = range(2023, 2024)
df_folder = f"/shared/share_tm-finance/Final/Processed_df/{year_num}_year_window"
embeddings_folder = f"/shared/share_tm-finance/Final/Embeddings/{year_num}_year_window"
saved_model_folder = f"/shared/share_tm-finance/Final/Stored_model/{year_num}_year_window/one_model/{modeltype}"
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Each year is evaluated on the random splits first_count..last_count (inclusive).
first_count = 1
last_count = 5
div = last_count - first_count + 1

# Results from the hyperparameter tuning script
if topic_num == 120:
    min_cluster_size_list = [210, 230, 235, 215, 205, 220, 220, 205, 240, 230]
elif topic_num == 60:
    min_cluster_size_list = [350, 340, 345, 410, 365, 400, 360, 400, 410, 425]
else:
    raise ValueError(f"No tuned min_cluster_size values for topic_num={topic_num}")

pos_min_cluster_size_list = [205, 200, 210, 220, 200, 210, 210, 220, 200, 200]
neg_min_cluster_size_list = [200, 150, 200, 200, 220, 205, 180, 220, 225, 210]
neu_min_cluster_size_list = [200, 200, 200, 200, 200, 200, 190, 200, 195, 185]

for tar_year in year_list:

    df = pd.read_csv(df_folder+f'/{datatype}_{tar_year - year_num + 1}_{tar_year}.csv')
    red_headlines = df.vocab_con_headline.tolist()
    embeddings = np.load(embeddings_folder+f"/{datatype}_{tar_year - year_num + 1}_{tar_year}_embeddings.npy")
    indices = np.arange(len(red_headlines))
    # NOTE(review): the tuned lists are indexed by the position of the year
    # inside ``year_list``, not by the calendar year -- confirm this matches
    # the order of the tuning results when ``year_list`` is a sub-range.
    year_pos = year_list.index(tar_year)
    pos_min_cluster_size = pos_min_cluster_size_list[year_pos]
    neg_min_cluster_size = neg_min_cluster_size_list[year_pos]
    neu_min_cluster_size = neu_min_cluster_size_list[year_pos]
    hdb_min_cluster_size = min_cluster_size_list[year_pos]
    pos_insampler2_list = []
    pos_outsampler2_list = []
    neg_insampler2_list = []
    neg_outsampler2_list = []
    neu_insampler2_list = []
    neu_outsampler2_list = []
    insampler2_list = []
    outsampler2_list = []

    #split the data into 3 part, positive css, negative css, neutral css
    # (index labels are used positionally; read_csv yields a default 0..n-1 index)
    pos_indices = df[df['css'] > 0].index
    neg_indices = df[df['css'] < 0].index
    neu_indices = df[df['css'] == 0].index
    pos_df = df.iloc[pos_indices,:]
    neg_df = df.iloc[neg_indices,:]
    neu_df = df.iloc[neu_indices,:]
    pos_headlines = [red_headlines[ind] for ind in pos_indices]
    neg_headlines = [red_headlines[ind] for ind in neg_indices]
    neu_headlines = [red_headlines[ind] for ind in neu_indices]
    pos_embeddings = embeddings[pos_indices,:]
    neg_embeddings = embeddings[neg_indices,:]
    neu_embeddings = embeddings[neu_indices,:]

    # Share of documents per sentiment group, used to weight the group scores.
    pos_share = len(pos_embeddings) / len(embeddings)
    neg_share = len(neg_embeddings) / len(embeddings)
    neu_share = len(neu_embeddings) / len(embeddings)

    cos_in_topic_pos_sum = 0
    cos_bet_topic_pos_sum = 0
    model_diversity_pos_sum = 0
    cos_in_topic_neg_sum = 0
    cos_bet_topic_neg_sum = 0
    model_diversity_neg_sum = 0
    cos_in_topic_neu_sum = 0
    cos_bet_topic_neu_sum = 0
    model_diversity_neu_sum = 0

    #set pos_cluster_num, neg_cluster_num, neu_cluster_num based on the number of embeddings
    pos_cluster_num = int(topic_num * len(pos_embeddings) / len(embeddings))
    neg_cluster_num = int(topic_num * len(neg_embeddings) / len(embeddings))
    neu_cluster_num = int(topic_num * len(neu_embeddings) / len(embeddings))
    # Truncation can leave a few topics unassigned; give them to the smallest group.
    diff = topic_num - (pos_cluster_num + neg_cluster_num + neu_cluster_num)
    if pos_cluster_num < neg_cluster_num and pos_cluster_num < neu_cluster_num:
        pos_cluster_num += diff
    elif neg_cluster_num < pos_cluster_num and neg_cluster_num < neu_cluster_num:
        neg_cluster_num += diff
    else:
        neu_cluster_num += diff

    print("pos_cluster_num = ", pos_cluster_num)
    print("neg_cluster_num = ", neg_cluster_num)
    print("neu_cluster_num = ", neu_cluster_num)

    # ``run_id`` seeds the split; a separate name avoids shadowing the year loop.
    for run_id in range(first_count, last_count+1):
        # train_model requires year_num and sentiment_type positionally; the
        # earlier calls omitted both and therefore raised a TypeError.
        # NOTE(review): each sentiment group now receives its own tuned
        # min_cluster_size (previously computed but never passed) -- confirm.
        saved_model_folder = f"/shared/share_tm-finance/Final/Stored_model/{year_num}_year_window/three_models/{modeltype}/pos"
        pos_topics, pos_Topic_model, pos_tr_grouped_sum, pos_te_grouped_sum = \
                train_model(saved_model_folder, pos_df, pos_headlines, pos_embeddings,
                            modeltype, topic_num, pos_cluster_num, pos_min_cluster_size, tar_year,
                            year_num, sentiment_type, save_model, reduce_outliers, run_id)
        mean_score, ave_sim, model_diversity = calculate_score(pos_Topic_model, embedding_model, pos_topics)
        cos_in_topic_pos_sum += mean_score
        cos_bet_topic_pos_sum += ave_sim
        model_diversity_pos_sum += model_diversity

        saved_model_folder = f"/shared/share_tm-finance/Final/Stored_model/{year_num}_year_window/three_models/{modeltype}/neg"
        neg_topics, neg_Topic_model, neg_tr_grouped_sum, neg_te_grouped_sum = \
                train_model(saved_model_folder, neg_df, neg_headlines, neg_embeddings,
                            modeltype, topic_num, neg_cluster_num, neg_min_cluster_size, tar_year,
                            year_num, sentiment_type, save_model, reduce_outliers, run_id)
        mean_score, ave_sim, model_diversity = calculate_score(neg_Topic_model, embedding_model, neg_topics)
        cos_in_topic_neg_sum += mean_score
        cos_bet_topic_neg_sum += ave_sim
        model_diversity_neg_sum += model_diversity

        # css is exactly 0 for every neutral row, so 'per_ret' scaling would
        # zero out all features; ``neutral=True`` skips that multiplication.
        saved_model_folder = f"/shared/share_tm-finance/Final/Stored_model/{year_num}_year_window/three_models/{modeltype}/neu"
        neu_topics, neu_Topic_model, neu_tr_grouped_sum, neu_te_grouped_sum = \
                train_model(saved_model_folder, neu_df, neu_headlines, neu_embeddings,
                            modeltype, topic_num, neu_cluster_num, neu_min_cluster_size, tar_year,
                            year_num, sentiment_type, save_model, reduce_outliers, run_id,
                            neutral=True)
        mean_score, ave_sim, model_diversity = calculate_score(neu_Topic_model, embedding_model, neu_topics)
        cos_in_topic_neu_sum += mean_score
        cos_bet_topic_neu_sum += ave_sim
        model_diversity_neu_sum += model_diversity

        if combine:
            # Relabel the neg/neu topic columns so they continue after the
            # previous group's last column (offset = last label + 1; the
            # earlier offset reused the previous group's last label).
            # Requires integer-like column labels (i.e. topic columns only).
            pos_width = int(pos_tr_grouped_sum.columns[-1]) + 1
            neg_tr_grouped_sum.columns = [str(int(col) + pos_width) for col in neg_tr_grouped_sum.columns]
            neg_width = int(neg_tr_grouped_sum.columns[-1]) + 1
            neu_tr_grouped_sum.columns = [str(int(col) + neg_width) for col in neu_tr_grouped_sum.columns]

            pos_width = int(pos_te_grouped_sum.columns[-1]) + 1
            neg_te_grouped_sum.columns = [str(int(col) + pos_width) for col in neg_te_grouped_sum.columns]
            neg_width = int(neg_te_grouped_sum.columns[-1]) + 1
            neu_te_grouped_sum.columns = [str(int(col) + neg_width) for col in neu_te_grouped_sum.columns]

            # Rows missing from one sentiment group get zero exposure there.
            tr_grouped_sum = pd.concat([pos_tr_grouped_sum, neg_tr_grouped_sum, neu_tr_grouped_sum], axis = 1)
            tr_grouped_sum.fillna(0, inplace = True)
            te_grouped_sum = pd.concat([pos_te_grouped_sum, neg_te_grouped_sum, neu_te_grouped_sum], axis = 1)
            te_grouped_sum.fillna(0, inplace = True)
            linear_regression(tr_grouped_sum, te_grouped_sum, insampler2_list, outsampler2_list)

        else:
            linear_regression(pos_tr_grouped_sum, pos_te_grouped_sum, pos_insampler2_list, pos_outsampler2_list)
            linear_regression(neg_tr_grouped_sum, neg_te_grouped_sum, neg_insampler2_list, neg_outsampler2_list)
            linear_regression(neu_tr_grouped_sum, neu_te_grouped_sum, neu_insampler2_list, neu_outsampler2_list)

    if combine:
        insampler2_dict[tar_year] = np.mean(insampler2_list)
        outsampler2_dict[tar_year] = np.mean(outsampler2_list)
    else:
        # Convert to arrays first: multiplying a Python list by an int repeats
        # the list instead of scaling its values.
        sep_insampler2_list = (np.array(pos_insampler2_list) * pos_share
                               + np.array(neg_insampler2_list) * neg_share
                               + np.array(neu_insampler2_list) * neu_share)
        sep_outsampler2_list = (np.array(pos_outsampler2_list) * pos_share
                                + np.array(neg_outsampler2_list) * neg_share
                                + np.array(neu_outsampler2_list) * neu_share)
        insampler2_dict[tar_year] = np.mean(sep_insampler2_list)
        # Out-of-sample result goes into its own dict (it used to overwrite the in-sample entry).
        outsampler2_dict[tar_year] = np.mean(sep_outsampler2_list)

    # Document-share-weighted scores, averaged over the ``div`` runs.
    cos_in_topic_dict[tar_year] = (cos_in_topic_pos_sum * pos_share + cos_in_topic_neg_sum * neg_share + cos_in_topic_neu_sum * neu_share) / div
    cos_bet_topic_dict[tar_year] = (cos_bet_topic_pos_sum * pos_share + cos_bet_topic_neg_sum * neg_share + cos_bet_topic_neu_sum * neu_share) / div
    model_diversity_dict[tar_year] = (model_diversity_pos_sum * pos_share + model_diversity_neg_sum * neg_share + model_diversity_neu_sum * neu_share) / div

    print("Year {year} is done".format(year = tar_year))

if not combine:
    print("sep_insample = ", insampler2_dict)
    print("sep_outsample = ", outsampler2_dict)
else:
    print("insample = ", insampler2_dict)
    print("outsample = ", outsampler2_dict)

print("cos_in_topic = ", cos_in_topic_dict)
print("cos_bet_topic = ", cos_bet_topic_dict)
print("model_diversity = ", model_diversity_dict)

In [None]:
# Build a (topic x company) exposure table from the last combined training
# frame and attach the topic descriptions of the three sentiment models.
# Relies on ``tr_grouped_sum`` and the ``*_Topic_model`` variables left over
# from the final iteration of the experiment cell.
tr_grouped_sum2 = tr_grouped_sum.copy()
# Sum the exposures per company across all dates/returns.
tr_grouped_sum2 = tr_grouped_sum2.groupby('comnam').sum()
# Move the company name from the index into a regular column ...
tr_grouped_sum2.reset_index(level=0, inplace=True)
# ... so that after transposing it becomes the first row.
tr_grouped_sum2 = tr_grouped_sum2.T
tr_grouped_sum2.reset_index(drop = True, inplace = True)
# Promote that first row (company names) to the column header and drop it.
tr_grouped_sum2.columns = tr_grouped_sum2.iloc[0]
tr_grouped_sum2 = tr_grouped_sum2.drop(tr_grouped_sum2.index[0])
tr_grouped_sum2.reset_index(drop = True, inplace = True)

# NOTE: pos_df / neg_df / neu_df are rebound here to topic-info tables and no
# longer hold the sentiment-split data frames from the experiment cell.
pos_df = pos_Topic_model.get_topic_info()
pos_df.drop(columns = ['Topic'], inplace = True)
neg_df = neg_Topic_model.get_topic_info()
neg_df.drop(columns = ['Topic'], inplace = True)
neu_df = neu_Topic_model.get_topic_info()
neu_df.drop(columns = ['Topic'], inplace = True)
# Stack the topic descriptions in the same pos -> neg -> neu order as the exposure rows.
topic_info = pd.concat([pos_df, neg_df, neu_df], axis = 0)
topic_info.reset_index(drop = True, inplace = True)

# Rows are joined purely by position.
# NOTE(review): if a model reports an outlier topic (-1) in get_topic_info(),
# its row has no matching exposure row and the join is shifted -- verify.
topic_exposure = pd.concat([tr_grouped_sum2, topic_info], axis = 1)

# ``data`` is the global table read by get_topic_info / plot_company_exposure.
data = topic_exposure

In [None]:
import matplotlib.pyplot as plt

def get_topic_info(topic_number):
    """Print and plot one topic's representation and per-company exposure.

    Looks the topic up by row label in the global ``data`` table, prints its
    'Representation', lists every company whose exposure exceeds 100 and
    draws a bar chart over all companies. All columns except the last four
    of ``data`` are treated as company columns.

    Returns:
        A one-column ('Exposure') DataFrame indexed by company, or ``None``
        when the topic number is not present.
    """
    selected = data[data.index == topic_number]

    # Guard clause: nothing to show for an unknown topic.
    if selected.empty:
        print("Topic number not found.")
        return

    print(f"Representation of Topic {topic_number}: {selected['Representation'].values[0]}")

    # Company columns only; transpose so that companies become the rows.
    company_part = selected.iloc[:, :-4]
    exposures = company_part.T
    exposures.columns = ['Exposure']

    heavy = exposures[exposures['Exposure'] > 100]
    if heavy.empty:
        print("No companies with exposure greater than 100 for this topic.")
    else:
        print("\nCompanies with exposure greater than 100:")
        for company in heavy.index:
            print(f"- {company}: {heavy.loc[company, 'Exposure']}")

    # Large figure; tick labels are hidden because there is one bar per company.
    plt.figure(figsize=(14, 8))
    plt.bar(data.columns[:-4], company_part.values[0])
    plt.xticks([])
    plt.xlabel("Company Number")
    plt.ylabel("Exposure Level")
    plt.show()

    return exposures

def plot_company_exposure(company_name):
    """Plot one company's exposure to every topic and report its top topic.

    Reads the global ``data`` table. Prints the row label, exposure value
    and 'Representation' of the topic with the highest exposure, then draws
    a bar chart over all topics. Prints a message and returns ``None`` when
    the company is not a column of ``data``.
    """
    if company_name not in data.columns:
        print("Company not found.")
        return

    # Single-column frame holding this company's exposure per topic.
    company_exposure = data[[company_name]]
    company_exposure.columns = ['Exposure']

    exposure_values = company_exposure['Exposure']
    top_topic = exposure_values.idxmax()
    top_value = exposure_values.max()
    top_representation = data.loc[top_topic, 'Representation']

    print(f"Highest Exposure Topic for {company_name}: Topic {top_topic}")
    print(f"Exposure Level: {top_value}")
    print(f"Representation: {top_representation}")

    company_exposure.plot(kind='bar', legend=False, title=f"{company_name} Exposure to All Topics")
    plt.xlabel("Topic Number")
    plt.ylabel("Exposure Level")
    plt.show()

In [None]:
exposures = get_topic_info(17)  # Replace 17 with the desired topic number

In [None]:
plot_company_exposure("AMAZON COM INC")  # Replace "AMAZON COM INC" with the desired company name

In [None]:
# Write the per-year results into the matching rows of the results workbook.
file_path = f'{topic_num}_Data.xlsx'
wb = openpyxl.load_workbook(file_path)
program_list = [insampler2_dict, outsampler2_dict, cos_in_topic_dict, cos_bet_topic_dict, model_diversity_dict]
data_list = ['Insample_R2', 'Outsample_R2', 'Cos_in_topic', 'Cos_btn_topic', 'Diversity']

# One worksheet per metric; ``data_list`` and ``program_list`` are parallel.
for sheet_name, results in zip(data_list, program_list):
    ws = wb[sheet_name]
    for cell in ws['A']:
        # Column A marks the start of a section for one ``models`` setting.
        if cell.value != models:
            continue
        # Scan the 17 rows of that section for the sentiment type in column B.
        for section_row in range(cell.row, cell.row + 17):
            sentiment_type_cell = ws.cell(row=section_row, column=2)
            if sentiment_type_cell.value != sentiment_type:
                continue
            # Within the next 4 rows, column C names the model configuration.
            for row_num in range(sentiment_type_cell.row, sentiment_type_cell.row + 4):
                if ws.cell(row=row_num, column=3).value != f"{modeltype}_{topic_num}":
                    continue
                for year, value in results.items():
                    # The column is derived from the year: year - 2013 + 3.
                    ws.cell(row=row_num, column=year - 2013 + 3, value=value)
            # Only the first matching sentiment row of a section is handled.
            break
wb.save(file_path)