In [8]:
import os
import pandas as pd 
from sentence_transformers import SentenceTransformer
from cuml.manifold import UMAP
from cuml.cluster import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import torch
import gc
from scipy.linalg import block_diag

def uh(min_cluster_size, min_samples=40, n_components=10, random_state=42):
    """Build an unfitted BERTopic model backed by cuML UMAP and cuML HDBSCAN.

    Parameters
    ----------
    min_cluster_size : int
        Forwarded to ``HDBSCAN(min_cluster_size=...)``.
    min_samples : int, default 40
        Forwarded to ``HDBSCAN(min_samples=...)``.
    n_components : int, default 10
        Forwarded to ``UMAP(n_components=...)``.
    random_state : int, default 42
        Forwarded to ``UMAP(random_state=...)``.

    Returns
    -------
    BERTopic
        A new, unfitted topic model (``calculate_probabilities=False``,
        ``verbose=True``).

    Notes
    -----
    The defaults reproduce the previously hard-coded configuration, so
    ``uh(size)`` behaves exactly as before.
    NOTE(review): the runs recorded in this notebook show the number of clusters
    changing a lot between repeated fits on the same data even with the fixed
    ``random_state`` -- the cause is not visible here; confirm before relying on
    reproducibility.
    """
    # A fresh sentence-embedding model is created on every call, as before.
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    umap_model = UMAP(n_components=n_components, random_state=random_state)
    hdbscan_model = HDBSCAN(
        min_cluster_size=min_cluster_size,
        metric="euclidean",
        cluster_selection_method="eom",
        gen_min_span_tree=True,
        prediction_data=False,
        min_samples=min_samples,
    )
    vectorizer_model = CountVectorizer()
    Topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        calculate_probabilities=False,
        verbose=True,
    )
    return Topic_model

def tr_te_split(headlines,df,embeddings,k=1):
    """Split headlines, their data-frame rows and their embeddings 80/20.

    Row positions ``0..len(headlines)-1`` are shuffled and split by
    ``train_test_split`` with ``random_state=k``; the same positions are then used
    to slice ``df`` (positionally), ``headlines`` and ``embeddings``.

    Returns
    -------
    tuple
        ``(tr_df, te_df, tr_headlines, te_headlines, tr_embeddings)``. Test
        embeddings are not returned.
    """
    positions = np.arange(len(headlines))
    train_pos, test_pos = train_test_split(positions, test_size=0.2, shuffle=True, random_state=k)

    def pick_headlines(rows):
        # Plain-list selection so the result stays a Python list of headlines.
        return [headlines[row] for row in rows]

    return (
        df.iloc[train_pos, :],
        df.iloc[test_pos, :],
        pick_headlines(train_pos),
        pick_headlines(test_pos),
        embeddings[train_pos, :],
    )

def find_min_cluster_size(min_cluster_size,min_cluster_size_list,headlines,embeddings,target_num_cluster):
    """Walk a candidate grid of ``min_cluster_size`` values towards a target topic count.

    Starting from ``min_cluster_size``, a model built by ``uh`` is fitted and its
    number of topics (rows of ``get_topic_info()`` minus one) is compared with
    ``target_num_cluster`` +/- ``round(5 * target_num_cluster / 120)``.
    Too many topics moves to the next larger candidate, too few to the next
    smaller one. The walk stops when the count is inside the band, when no
    candidate is left in the required direction, or when it would revisit a
    value that was already tried (oscillation).

    Parameters
    ----------
    min_cluster_size : int
        Starting value; it does not have to be in ``min_cluster_size_list``.
    min_cluster_size_list : sequence of int
        Candidate values. They are sorted internally, so "next larger/smaller"
        is well defined even if the caller passes them unsorted.
    headlines, embeddings
        Passed unchanged to ``Topic_model.fit(headlines, embeddings)``.
    target_num_cluster : int
        Desired number of topics.

    Returns
    -------
    int
        The selected ``min_cluster_size`` (a NumPy integer when it was taken
        from the candidate grid).
    """
    def _count_clusters(size):
        # Fit a fresh model and count its topics (one get_topic_info row is not counted).
        Topic_model = uh(size)
        Topic_model.fit(headlines,embeddings)
        return Topic_model.get_topic_info().shape[0]-1

    # Sorting makes candidates[...][0] / [...][-1] the nearest neighbours regardless
    # of the order the caller supplied.
    candidates = np.sort(np.asarray(min_cluster_size_list))
    tolerance = round(5*target_num_cluster/120)
    upper_bound = target_num_cluster + tolerance
    lower_bound = target_num_cluster - tolerance

    num_cluster = _count_clusters(min_cluster_size)
    no_tuning = 0
    tried_sizes = set()
    while (num_cluster > upper_bound or num_cluster < lower_bound) and (min_cluster_size not in tried_sizes):
        no_tuning += 1
        tried_sizes.add(min_cluster_size)
        if num_cluster > upper_bound:
            # Too many topics -> larger clusters required.
            options = candidates[candidates > min_cluster_size]
            next_size = options[0] if options.size > 0 else None
        else:
            # Too few topics -> smaller clusters required.
            options = candidates[candidates < min_cluster_size]
            next_size = options[-1] if options.size > 0 else None
        if next_size is None:
            # Grid exhausted in the needed direction: keep the current value.
            print(f"after {no_tuning-1}th tuning, the final min_cluster_size is {min_cluster_size}")
            return min_cluster_size
        min_cluster_size = next_size
        num_cluster = _count_clusters(min_cluster_size)
        print(f"after {no_tuning}th tuning, the min_cluster_size is {min_cluster_size}")
    return min_cluster_size

def _aggregate_topic_features(df, topic_dist):
    """Sum per-headline topic weights within each (date, comnam, ret) group.

    Returns ``(X, Y)`` where ``X`` holds the summed remaining columns per group
    and ``Y`` is the group's ``ret`` value as a column vector.
    """
    # Both sides get a fresh 0..n-1 index: pd.concat(axis=1) aligns on index
    # labels, so a frame sliced from a larger one would otherwise be misaligned
    # with the row-ordered topic matrix.
    meta = df.drop(columns = ["rp_entity_id","headline","vocab_con_headline","css"]).reset_index(drop=True)
    topics = pd.DataFrame(topic_dist).reset_index(drop=True)
    if len(meta) != len(topics):
        raise ValueError(
            f"df has {len(meta)} rows but topic distribution has {len(topics)} rows"
        )
    # NOTE(review): every column left besides the three group keys is summed and
    # used as a regressor -- confirm the frame carries no other columns.
    grouped_sum = pd.concat([meta, topics], axis = 1).groupby(['date',"comnam","ret"]).sum()
    X = np.array(grouped_sum)
    Y = np.array(grouped_sum.index.get_level_values("ret")).reshape(-1,1)
    return X, Y


def linear_regression(tr_topic_dist,te_topic_dist,tr_df,te_df):
    """Regress returns on aggregated topic weights and report train/test R^2.

    Topic distributions are concatenated row-by-row with their data frame,
    summed per ``(date, comnam, ret)`` group, and an ordinary least squares
    model with intercept is fitted on the training groups.

    Parameters
    ----------
    tr_topic_dist, te_topic_dist : array-like, one row per row of the matching df
    tr_df, te_df : pandas.DataFrame
        Must contain ``rp_entity_id``, ``headline``, ``vocab_con_headline``,
        ``css`` (dropped) and ``date``, ``comnam``, ``ret`` (group keys).

    Returns
    -------
    tuple
        ``(tr_r2, te_r2)``. ``tr_r2`` is ``regression.score`` on the training
        groups. ``te_r2`` is ``1 - SSE/SST`` on the test groups, where SST is
        measured around the *training* mean of ``ret``.

    Raises
    ------
    ValueError
        If a topic distribution and its data frame have different row counts.
    """
    tr_X, tr_Y = _aggregate_topic_features(tr_df, tr_topic_dist)
    te_X, te_Y = _aggregate_topic_features(te_df, te_topic_dist)

    regression = LinearRegression(fit_intercept=True)
    regression.fit(tr_X,tr_Y)
    tr_r2 = regression.score(tr_X,tr_Y)

    # Out-of-sample R^2 benchmarked against the training-sample mean.
    tr_Y_mean = np.mean(tr_Y)
    te_Y_pred = regression.predict(te_X)
    te_sst = np.sum((te_Y-tr_Y_mean)**2)
    te_sse = np.sum((te_Y-te_Y_pred)**2)
    te_r2 = 1-te_sse/te_sst
    return tr_r2,te_r2


In [11]:
# Input locations: per-year processed headline frames and their precomputed embeddings.
df_folder = "/shared/share_tm-finance/Final/Processed_df/One_year_window"
embeddings_folder = "/shared/share_tm-finance/Final/Embeddings/One_year_window"

# Years to process and the overall target number of topics.
year_list = range(2014,2015)
num_clusters = 120

# Result containers, all keyed by year.
# "com" = combined regression over all sentiment groups, "sep" = per-group regressions.
com_tr_r2_dict = {}
com_te_r2_dict = {}
mean_com_tr_r2_dict = {}
mean_com_te_r2_dict = {}
sep_tr_r2_dict = {}
sep_te_r2_dict = {}
mean_sep_tr_r2_dict = {}
mean_sep_te_r2_dict = {}
num_clusters_dict = {}
mean_num_clusters_dict = {}

print("pre-computations finished")

# Per-year experiment: split headlines by sentiment sign (css), fit one topic model
# per sentiment group, and regress returns on topic weights -- once per group
# ("separate") and once on the block-diagonal stack of all groups ("combined").
for year in year_list:
    print(f"computation for {year} starts")
    df = pd.read_csv(df_folder+f"/contem_{year}.csv")
    headlines = df.vocab_con_headline.tolist()
    embeddings = np.load(embeddings_folder+f"/contem_{year}_embeddings.npy")
    # Rows of the CSV and rows of the embedding matrix are matched by position.
    if len(embeddings) != len(df):
        raise ValueError(
            f"{year}: {len(df)} rows in the data frame but {len(embeddings)} embeddings"
        )

    print(f"The df and embeddings in {year} finished")

    # Positional row numbers (not index labels) so they are valid for .iloc, the
    # headline list and the embedding matrix alike.
    css_values = df['css'].to_numpy()
    pos_indices = np.flatnonzero(css_values > 0)
    neg_indices = np.flatnonzero(css_values < 0)
    neu_indices = np.flatnonzero(css_values == 0)
    pos_df = df.iloc[pos_indices,:]
    neg_df = df.iloc[neg_indices,:]
    neu_df = df.iloc[neu_indices,:]
    pos_headlines = [headlines[ind] for ind in pos_indices]
    neg_headlines = [headlines[ind] for ind in neg_indices]
    neu_headlines = [headlines[ind] for ind in neu_indices]
    pos_embeddings = embeddings[pos_indices,:]
    neg_embeddings = embeddings[neg_indices,:]
    neu_embeddings = embeddings[neu_indices,:]

    #set pos_cluster_num, neg_cluster_num, neu_cluster_num based on the number of embeddings
    pos_cluster_num = int(num_clusters * len(pos_embeddings) / len(embeddings))
    neg_cluster_num = int(num_clusters * len(neg_embeddings) / len(embeddings))
    neu_cluster_num = int(num_clusters * len(neu_embeddings) / len(embeddings))
    # The rounding remainder goes to the strictly smallest group (neutral on ties).
    diff = num_clusters - (pos_cluster_num + neg_cluster_num + neu_cluster_num)
    if pos_cluster_num < neg_cluster_num and pos_cluster_num < neu_cluster_num:
        pos_cluster_num += diff
    elif neg_cluster_num < pos_cluster_num and neg_cluster_num < neu_cluster_num:
        neg_cluster_num += diff
    else:
        neu_cluster_num += diff

    print(f"pos_cluster_num in {year} is:", pos_cluster_num)
    print(f"neg_cluster_num in {year} is:", neg_cluster_num)
    print(f"neu_cluster_num in {year} is:", neu_cluster_num)

    # NOTE(review): the per-group targets above are only printed; the
    # min_cluster_size values below are fixed by hand rather than tuned against them.
    pos_min_cluster_size,neg_min_cluster_size,neu_min_cluster_size = 205,200,200
    # NOTE(review): the split is done once (default seed k=1), so the five
    # repetitions below all reuse the same train/test partition -- confirm intended.
    pos_tr_df, pos_te_df, pos_tr_headlines,pos_te_headlines,pos_tr_embeddings = tr_te_split(pos_headlines,pos_df,pos_embeddings)
    neg_tr_df, neg_te_df, neg_tr_headlines,neg_te_headlines,neg_tr_embeddings = tr_te_split(neg_headlines,neg_df,neg_embeddings)
    neu_tr_df, neu_te_df, neu_tr_headlines,neu_te_headlines,neu_tr_embeddings = tr_te_split(neu_headlines,neu_df,neu_embeddings)
    # Fresh 0..n-1 indices so the frames line up row-by-row with the topic matrices.
    pos_tr_df = pos_tr_df.reset_index(drop=True)
    pos_te_df = pos_te_df.reset_index(drop=True)
    neg_tr_df = neg_tr_df.reset_index(drop=True)
    neg_te_df = neg_te_df.reset_index(drop=True)
    neu_tr_df = neu_tr_df.reset_index(drop=True)
    neu_te_df = neu_te_df.reset_index(drop=True)

    com_tr_r2_list = []
    com_te_r2_list = []
    sep_tr_r2_list = []
    sep_te_r2_list = []
    num_clusters_list = []

    for i in range(1,6):
        print(f"The {i}th computation in {year} starts")
        torch.cuda.empty_cache()
        gc.collect()

        pos_topic_model = uh(pos_min_cluster_size)
        neg_topic_model = uh(neg_min_cluster_size)
        neu_topic_model = uh(neu_min_cluster_size)
        pos_topic_model.fit(pos_tr_headlines,pos_tr_embeddings)
        neg_topic_model.fit(neg_tr_headlines,neg_tr_embeddings)
        neu_topic_model.fit(neu_tr_headlines,neu_tr_embeddings)
        print(f"real pos num_cluster is {pos_topic_model.get_topic_info().shape[0]-1}")
        print(f"real neg num_cluster is {neg_topic_model.get_topic_info().shape[0]-1}")
        print(f"real neu num_cluster is {neu_topic_model.get_topic_info().shape[0]-1}")
        # Kept in its own variable: `num_clusters` is the configured target and is
        # read again at the top of the loop for every following year.
        fitted_num_clusters = pos_topic_model.get_topic_info().shape[0]+neg_topic_model.get_topic_info().shape[0]+neu_topic_model.get_topic_info().shape[0]-3
        print(f"The final number of clusters in {year} is {fitted_num_clusters}")
        num_clusters_list.append(fitted_num_clusters)
        print(F"Model fitting for {i}th computation in {year} finished")

        pos_tr_topic_dist, _ = pos_topic_model.approximate_distribution(pos_tr_headlines)
        neg_tr_topic_dist, _ = neg_topic_model.approximate_distribution(neg_tr_headlines)
        neu_tr_topic_dist, _ = neu_topic_model.approximate_distribution(neu_tr_headlines)
        pos_te_topic_dist, _ = pos_topic_model.approximate_distribution(pos_te_headlines)
        neg_te_topic_dist, _ = neg_topic_model.approximate_distribution(neg_te_headlines)
        neu_te_topic_dist, _ = neu_topic_model.approximate_distribution(neu_te_headlines)

        # This is separate version of R square
        # (per-group R^2 weighted by each group's share of all embeddings)
        pos_tr_r2, pos_te_r2 = linear_regression(pos_tr_topic_dist,pos_te_topic_dist,pos_tr_df,pos_te_df)
        neg_tr_r2, neg_te_r2 = linear_regression(neg_tr_topic_dist,neg_te_topic_dist,neg_tr_df,neg_te_df)
        neu_tr_r2, neu_te_r2 = linear_regression(neu_tr_topic_dist,neu_te_topic_dist,neu_tr_df,neu_te_df)
        sep_tr_r2 = (pos_tr_r2* len(pos_embeddings) / len(embeddings)) + (neg_tr_r2* len(neg_embeddings) / len(embeddings)) + (neu_tr_r2* len(neu_embeddings) / len(embeddings))
        sep_tr_r2_list.append(sep_tr_r2)
        sep_te_r2 = (pos_te_r2* len(pos_embeddings) / len(embeddings)) + (neg_te_r2* len(neg_embeddings) / len(embeddings)) + (neu_te_r2* len(neu_embeddings) / len(embeddings))
        sep_te_r2_list.append(sep_te_r2)

        # This is combine version of R square
        # (block_diag gives every sentiment group its own topic columns)
        combined_tr_df = pd.concat([pos_tr_df,neg_tr_df,neu_tr_df],axis = 0)
        combined_tr_df = combined_tr_df.reset_index(drop=True)
        combined_tr_topic_dist = block_diag(pos_tr_topic_dist,neg_tr_topic_dist,neu_tr_topic_dist)
        combined_te_df = pd.concat([pos_te_df,neg_te_df,neu_te_df],axis = 0)
        combined_te_df = combined_te_df.reset_index(drop=True)
        combined_te_topic_dist = block_diag(pos_te_topic_dist,neg_te_topic_dist,neu_te_topic_dist)
        com_tr_r2, com_te_r2 = linear_regression(combined_tr_topic_dist,combined_te_topic_dist,combined_tr_df,combined_te_df)
        com_tr_r2_list.append(com_tr_r2)
        com_te_r2_list.append(com_te_r2)

        print(F"Computations for {i}th computation in {year} finished")

    num_clusters_dict[year] = num_clusters_list
    mean_num_clusters_dict[year] = np.mean(num_clusters_list)

    mean_sep_tr_r2 = np.mean(sep_tr_r2_list)
    mean_sep_te_r2 = np.mean(sep_te_r2_list)
    sep_tr_r2_dict[year] = sep_tr_r2_list
    sep_te_r2_dict[year] = sep_te_r2_list
    mean_sep_tr_r2_dict[year] = mean_sep_tr_r2
    mean_sep_te_r2_dict[year] = mean_sep_te_r2

    mean_com_tr_r2 = np.mean(com_tr_r2_list)
    mean_com_te_r2 = np.mean(com_te_r2_list)
    com_tr_r2_dict[year] = com_tr_r2_list
    com_te_r2_dict[year] = com_te_r2_list
    mean_com_tr_r2_dict[year] = mean_com_tr_r2
    mean_com_te_r2_dict[year] = mean_com_te_r2
    print(f"comutation for {year} ends")



pre-computations finished
computation for 2014 starts
The df and embeddings in 2014 finished
pos_cluster_num in 2014 is: 43
neg_cluster_num in 2014 is: 18
neu_cluster_num in 2014 is: 59
The 1th computation in 2014 starts


2024-09-30 11:33:45,463 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-30 11:33:49,189 - BERTopic - Dimensionality - Completed ✓
2024-09-30 11:33:49,193 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-30 11:33:53,473 - BERTopic - Cluster - Completed ✓
2024-09-30 11:33:53,490 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-30 11:33:54,243 - BERTopic - Representation - Completed ✓
2024-09-30 11:33:54,550 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-30 11:33:55,328 - BERTopic - Dimensionality - Completed ✓
2024-09-30 11:33:55,330 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-30 11:33:57,152 - BERTopic - Cluster - Completed ✓
2024-09-30 11:33:57,160 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-30 11:33:57,377 - BERTopic - Representation - Completed ✓
2024-09-30 1

real pos num_cluster is 55
real neg num_cluster is 25
real neu num_cluster is 59
The final number of clusters in 2014 is 139
Model fitting for 1th computation in 2014 finished


100%|██████████| 96/96 [00:03<00:00, 25.78it/s]
100%|██████████| 37/37 [00:01<00:00, 31.31it/s]
100%|██████████| 132/132 [00:04<00:00, 31.54it/s]
100%|██████████| 24/24 [00:01<00:00, 23.78it/s]
100%|██████████| 10/10 [00:00<00:00, 47.05it/s]
100%|██████████| 33/33 [00:00<00:00, 35.30it/s]


Computations for 1th computation in 2014 finished
The 2th computation in 2014 starts


2024-09-30 11:34:26,495 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-30 11:34:30,238 - BERTopic - Dimensionality - Completed ✓
2024-09-30 11:34:30,242 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-30 11:34:33,920 - BERTopic - Cluster - Completed ✓
2024-09-30 11:34:33,936 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-30 11:34:34,471 - BERTopic - Representation - Completed ✓
2024-09-30 11:34:34,513 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-30 11:34:38,417 - BERTopic - Dimensionality - Completed ✓
2024-09-30 11:34:38,420 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-30 11:34:41,151 - BERTopic - Cluster - Completed ✓
2024-09-30 11:34:41,157 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-30 11:34:41,333 - BERTopic - Representation - Completed ✓
2024-09-30 1

real pos num_cluster is 2
real neg num_cluster is 3
real neu num_cluster is 59
The final number of clusters in 2014 is 64
Model fitting for 2th computation in 2014 finished


100%|██████████| 96/96 [00:03<00:00, 31.83it/s]
100%|██████████| 37/37 [00:00<00:00, 47.03it/s]
100%|██████████| 132/132 [00:04<00:00, 28.73it/s]
100%|██████████| 24/24 [00:00<00:00, 33.63it/s]
100%|██████████| 10/10 [00:00<00:00, 43.88it/s]
100%|██████████| 33/33 [00:01<00:00, 26.60it/s]


Computations for 2th computation in 2014 finished
The 3th computation in 2014 starts


2024-09-30 11:35:09,111 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-30 11:35:12,773 - BERTopic - Dimensionality - Completed ✓
2024-09-30 11:35:12,776 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-30 11:35:17,010 - BERTopic - Cluster - Completed ✓
2024-09-30 11:35:17,027 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-30 11:35:17,628 - BERTopic - Representation - Completed ✓
2024-09-30 11:35:17,853 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-30 11:35:18,589 - BERTopic - Dimensionality - Completed ✓
2024-09-30 11:35:18,591 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-30 11:35:20,566 - BERTopic - Cluster - Completed ✓
2024-09-30 11:35:20,576 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-30 11:35:20,787 - BERTopic - Representation - Completed ✓
2024-09-30 1

real pos num_cluster is 54
real neg num_cluster is 2
real neu num_cluster is 58
The final number of clusters in 2014 is 114
Model fitting for 3th computation in 2014 finished


100%|██████████| 96/96 [00:03<00:00, 25.23it/s]
100%|██████████| 37/37 [00:01<00:00, 36.72it/s]
100%|██████████| 132/132 [00:04<00:00, 29.40it/s]
100%|██████████| 24/24 [00:00<00:00, 28.20it/s]
100%|██████████| 10/10 [00:00<00:00, 44.17it/s]
100%|██████████| 33/33 [00:01<00:00, 25.44it/s]


Computations for 3th computation in 2014 finished
The 4th computation in 2014 starts


2024-09-30 11:35:49,872 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-30 11:35:53,530 - BERTopic - Dimensionality - Completed ✓
2024-09-30 11:35:53,534 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-30 11:35:57,195 - BERTopic - Cluster - Completed ✓
2024-09-30 11:35:57,212 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-30 11:35:57,716 - BERTopic - Representation - Completed ✓
2024-09-30 11:35:57,903 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-30 11:35:58,804 - BERTopic - Dimensionality - Completed ✓
2024-09-30 11:35:58,806 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-30 11:36:01,543 - BERTopic - Cluster - Completed ✓
2024-09-30 11:36:01,550 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-30 11:36:01,730 - BERTopic - Representation - Completed ✓
2024-09-30 1

real pos num_cluster is 56
real neg num_cluster is 3
real neu num_cluster is 58
The final number of clusters in 2014 is 117
Model fitting for 4th computation in 2014 finished


100%|██████████| 96/96 [00:03<00:00, 28.07it/s]
100%|██████████| 37/37 [00:00<00:00, 47.27it/s]
100%|██████████| 132/132 [00:04<00:00, 31.41it/s]
100%|██████████| 24/24 [00:01<00:00, 23.81it/s]
100%|██████████| 10/10 [00:00<00:00, 49.65it/s]
100%|██████████| 33/33 [00:00<00:00, 35.07it/s]


Computations for 4th computation in 2014 finished
The 5th computation in 2014 starts


2024-09-30 11:36:30,513 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-30 11:36:34,206 - BERTopic - Dimensionality - Completed ✓
2024-09-30 11:36:34,210 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-30 11:36:37,496 - BERTopic - Cluster - Completed ✓
2024-09-30 11:36:37,510 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-30 11:36:38,004 - BERTopic - Representation - Completed ✓
2024-09-30 11:36:38,170 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-30 11:36:38,933 - BERTopic - Dimensionality - Completed ✓
2024-09-30 11:36:38,935 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-30 11:36:40,473 - BERTopic - Cluster - Completed ✓
2024-09-30 11:36:40,480 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-30 11:36:40,655 - BERTopic - Representation - Completed ✓
2024-09-30 1

real pos num_cluster is 41
real neg num_cluster is 6
real neu num_cluster is 58
The final number of clusters in 2014 is 105
Model fitting for 5th computation in 2014 finished


100%|██████████| 96/96 [00:03<00:00, 28.69it/s]
100%|██████████| 37/37 [00:00<00:00, 46.22it/s]
100%|██████████| 132/132 [00:04<00:00, 31.38it/s]
100%|██████████| 24/24 [00:00<00:00, 24.48it/s]
100%|██████████| 10/10 [00:00<00:00, 49.51it/s]
100%|██████████| 33/33 [00:00<00:00, 35.23it/s]


Computations for 5th computation in 2014 finished
comutation for 2014 ends


In [12]:
# Report the per-year results collected by the experiment loop, one line per metric.
report_lines = (
    f"The number of clusters is {num_clusters_dict}",
    f"The mean number of clusters is {mean_num_clusters_dict}",
    f"The insample R square of combined version is {com_tr_r2_dict}",
    f"The outsample R square of combined version is {com_te_r2_dict}",
    f"The mean insample R square of combined version is {mean_com_tr_r2_dict}",
    f"THe mean outsample R square of combined version is {mean_com_te_r2_dict}",
    f"The insample R square of separate version is {sep_tr_r2_dict}",
    f"The outsample R square of separate version is {sep_te_r2_dict}",
    f"The mean insample R square of separate version is {mean_sep_tr_r2_dict}",
    f"The mean outsample R square of separate version is {mean_sep_te_r2_dict}",
)
for report_line in report_lines:
    print(report_line)

The number of clusters is {2014: [139, 64, 114, 117, 105]}
The mean number of clusters is {2014: 107.8}
The insample R square of combined version is {2014: [0.052501885010649785, 0.007803406859818063, 0.017249533375712667, 0.019380744659257765, 0.018269602424957898]}
The outsample R square of combined version is {2014: [0.027192164060876545, 0.0047592867115535675, 0.011239337100404256, 0.01249582103625213, 0.011557966121010343]}
The mean insample R square of combined version is {2014: 0.023041034466079236}
THe mean outsample R square of combined version is {2014: 0.013448915006019368}
The insample R square of separate version is {2014: [0.016757433799557842, 0.0028091665926831344, 0.006618498285481221, 0.007340049541884292, 0.006741373074125069]}
The outsample R square of separate version is {2014: [0.010016055131618162, 0.0012623930462520795, 0.0038650026114559724, 0.004572299403354408, 0.004039114019756721]}
The mean insample R square of separate version is {2014: 0.00805330425874631