In [36]:
import os
import re

import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

In [37]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [38]:
# Tokenizer and encoder come from the same pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
# The model is only used for inference here: place it on the selected device
# and switch it to evaluation mode in one expression (both calls return the module).
model = BertModel.from_pretrained('bert-large-cased').to(device).eval()

In [39]:
# Disabled snippet (not executed): reads a single report file into `text`.
# path = '../CSR_report_processed_v4_gemini_v0/NASDAQ_AAL_2007_v0_gemini_corrected.txt'
# try:
#     with open(path, 'r') as file:
#         text = file.read()
# except:
#     print(f"Error: The file '{path}' was not found.")
#     exit(1)
# file.close()

In [None]:
# Folder the report .txt files are read from, and the CSV the embeddings are appended to.
input_folder = "../CSR_report_processed_v4_gemini_v0/CSR_report_new_collect"
output_csv = "../output_dataset/CSR_report_new_collect/csr_embeddings.csv"

# Sequence length budget per model call; two positions are reserved for the
# [CLS] and [SEP] tokens that get_embedding adds around every chunk.
max_len = 512
chunk_size = max_len - 2

In [41]:
def get_embedding(text):
    """Embed a document as the mean of its BERT chunk embeddings.

    The text is split into paragraphs on blank lines (two newlines with
    optional whitespace between them). Each paragraph is tokenized and cut
    into windows of at most ``chunk_size`` tokens, so that a window still
    fits once ``[CLS]`` and ``[SEP]`` are added. Every window is embedded as
    the mean of its last-layer token vectors (special tokens excluded), and
    the document embedding is the unweighted mean of all window embeddings.

    Relies on the notebook-level ``tokenizer``, ``model``, ``device`` and
    ``chunk_size``.

    Args:
        text: Raw document text.

    Returns:
        A 1-D ``torch.Tensor`` on ``device`` with one value per hidden
        dimension of the model.

    Raises:
        ValueError: If ``text`` produces no tokens (empty or whitespace-only
            input), since there is nothing to average.
    """
    # A blank line marks the start of the next paragraph. ``str.split`` treats
    # its argument as a literal separator, so the pattern must go through
    # ``re.split`` to actually be interpreted as a regular expression.
    paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]
    embeddings = []
    for paragraph in paragraphs:
        tokens = tokenizer.tokenize(paragraph)
        chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]

        for chunk in chunks:
            tokens_chunk = ['[CLS]'] + chunk + ['[SEP]']
            input_ids = tokenizer.convert_tokens_to_ids(tokens_chunk)
            input_ids = torch.tensor([input_ids]).to(device)

            with torch.no_grad():
                outputs = model(input_ids)
                # Drop the batch dimension (batch size is always 1 here).
                last_hidden_states = outputs.last_hidden_state.squeeze(0)

            token_embeddings = last_hidden_states[1:-1]  # Exclude [CLS] and [SEP]
            embeddings.append(token_embeddings.mean(dim=0))

    if not embeddings:
        # torch.stack([]) would fail with an opaque RuntimeError; be explicit.
        raise ValueError("Cannot embed a document that contains no tokens.")

    return torch.stack(embeddings).mean(dim=0)

In [42]:
# On first run, create the output CSV with only its header row so the
# processing loop can append one row per report and check what is already done.
if not os.path.exists(output_csv):
    # to_csv does not create missing parent directories, so make sure the
    # target folder exists before writing the header.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df = pd.DataFrame(columns=["file_name"] + [f"dim_{i}" for i in range(1024)])
    df.to_csv(output_csv, index=False)

In [43]:
# Upper bound on how many not-yet-embedded files the processing loop handles in one run.
max_files = 5_000

In [44]:
# Number of reports newly embedded in this run (already-stored reports do not count).
processed_count = 0

# Load the names already stored in the output CSV once. Only the `file_name`
# column is needed; re-reading every embedding column for each input file
# made the resume check grow quadratically with the number of reports.
# A missing output CSV raises here instead of failing once per input file.
processed_names = set(pd.read_csv(output_csv, usecols=["file_name"])["file_name"])
embedding_columns = ["file_name"] + [f"dim_{i}" for i in range(1024)]

for fname in sorted(os.listdir(input_folder)):
    if not fname.endswith(".txt"):
        continue
    if processed_count >= max_files:
        # Quota for this run reached; nothing further would be processed.
        break

    try:
        # Report identifier: everything before the first '_v0' in the file name.
        file_name = fname.split('_v0')[0]

        # Skip reports that already have a row, before paying for the file read.
        if file_name in processed_names:
            print(f"✔️ Already processed: {file_name}")
            continue

        full_path = os.path.join(input_folder, fname)
        with open(full_path, 'r', encoding='utf-8') as f:
            text = f.read()

        print(f"🚀 Processing: {file_name}")
        emb = get_embedding(text)

        # Append the result immediately so an interrupted run can be resumed.
        row = [file_name] + emb.tolist()
        df_new = pd.DataFrame([row], columns=embedding_columns)
        df_new.to_csv(output_csv, mode='a', header=False, index=False)

        processed_names.add(file_name)
        processed_count += 1

    except Exception as e:
        # Best effort: report the failure and move on to the next file.
        print(f"❌ Error processing {fname}: {e}")

✔️ Already processed: NASDAQ_AAL_2007
✔️ Already processed: NASDAQ_AAL_2008
✔️ Already processed: NASDAQ_AAL_2009
✔️ Already processed: NASDAQ_AAL_2011
✔️ Already processed: NASDAQ_AAL_2012
✔️ Already processed: NASDAQ_AAL_2013
✔️ Already processed: NASDAQ_AAL_2014
✔️ Already processed: NASDAQ_AAL_2015
✔️ Already processed: NASDAQ_AAL_2016
✔️ Already processed: NASDAQ_AAL_2017
✔️ Already processed: NASDAQ_AAL_2018
✔️ Already processed: NASDAQ_AAL_2019
✔️ Already processed: NASDAQ_AAL_2020
✔️ Already processed: NASDAQ_AAL_2021
✔️ Already processed: NASDAQ_AAON_2018_2019
✔️ Already processed: NASDAQ_AAON_2019_2020
✔️ Already processed: NASDAQ_AAON_2020_2021
✔️ Already processed: NASDAQ_AAON_2021_2022
✔️ Already processed: NASDAQ_AAPL_2014
✔️ Already processed: NASDAQ_AAPL_2015
✔️ Already processed: NASDAQ_AAPL_2016
✔️ Already processed: NASDAQ_AAPL_2017
✔️ Already processed: NASDAQ_AAPL_2018
✔️ Already processed: NASDAQ_AAPL_2019
✔️ Already processed: NASDAQ_AAPL_2021
✔️ Already processe

# Merge CSR embeddings and scores

In [46]:
import pandas as pd

In [47]:
# Load the document embeddings and the two hand-made score tables.
# NOTE(review): the embedding loop above writes to
# "../output_dataset/CSR_report_new_collect/csr_embeddings.csv" (output_csv),
# while this cell reads "../output_dataset/csr_embeddings.csv" — confirm this
# is the intended file.
csr_embeddings, hm_non_overlapping_score, hm_overlapping_score = map(
    pd.read_csv,
    (
        "../output_dataset/csr_embeddings.csv",
        "../output_dataset/handmade_features_non_overlapping_score_v1.csv",
        "../output_dataset/handmade_features_overlapping_score_v1.csv",
    ),
)

In [48]:
# Left-join both score tables onto the embeddings by report name.
# The first join brings in the non-overlapping score and its metadata; the
# second brings in the overlapping score, and the metadata columns shared by
# both tables get the '_non_overlap' / '_overlap' suffixes.
csr_embeddings = (
    csr_embeddings
    .merge(
        hm_non_overlapping_score[['folder_name', 'exchange', 'ticker', 'year1', 'greenwash_score']],
        left_on='file_name', right_on='folder_name', how='left'
    )
    .drop(columns=['folder_name'])
    .merge(
        hm_overlapping_score[['folder_name', 'exchange', 'ticker', 'year1', 'year2', 'greenwash_score_mean']],
        left_on='file_name', right_on='folder_name', how='left', suffixes=('_non_overlap', '_overlap')
    )
    .drop(columns=['folder_name'])
)

# Coalesce each pair of columns: take the non-overlapping value when present,
# otherwise fall back to the overlapping one.
coalesce_plan = {
    'greenwash_score_combined': ('greenwash_score', 'greenwash_score_mean'),
    'exchange_combined': ('exchange_non_overlap', 'exchange_overlap'),
    'ticker_combined': ('ticker_non_overlap', 'ticker_overlap'),
    'year1_combined': ('year1_non_overlap', 'year1_overlap'),
}
for combined_name, (preferred, fallback) in coalesce_plan.items():
    csr_embeddings[combined_name] = csr_embeddings[preferred].combine_first(csr_embeddings[fallback])

# Keep the identifier, every embedding dimension, and the coalesced columns,
# then strip the '_combined' suffix from the final names.
columns_to_keep = [
    'file_name',
    *(col for col in csr_embeddings.columns if col.startswith('dim_')),
    'exchange_combined', 'ticker_combined', 'year1_combined', 'greenwash_score_combined',
]
csr_embeddings = csr_embeddings[columns_to_keep].rename(columns={
    'exchange_combined': 'exchange',
    'ticker_combined': 'ticker',
    'year1_combined': 'year1',
    'greenwash_score_combined': 'greenwash_score'
})

In [49]:
# Keep a single row per file_name (the first occurrence), modifying the frame in place.
# NOTE(review): duplicates presumably come from the left merges matching more
# than one score row for a report — confirm that keeping the first is intended.
csr_embeddings.drop_duplicates(subset=['file_name'], inplace=True)
# Show the de-duplicated frame as the cell output.
csr_embeddings

Unnamed: 0,file_name,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,...,dim_1018,dim_1019,dim_1020,dim_1021,dim_1022,dim_1023,exchange,ticker,year1,greenwash_score
0,NASDAQ_AAL_2007,-0.148248,0.231116,-0.150665,0.079551,0.004632,-0.085545,-0.026714,0.010149,0.026384,...,0.314554,-0.168807,-0.348401,0.077774,-0.112577,-0.033042,NASDAQ,AAL,2007.0,-32.270374
1,NASDAQ_AAL_2008,-0.090258,0.254854,-0.100899,0.023933,0.000505,0.038396,-0.049111,-0.015655,0.012061,...,0.390074,-0.166179,-0.289788,0.055627,-0.091702,0.010885,NASDAQ,AAL,2008.0,-42.852530
2,NASDAQ_AAL_2009,-0.049567,0.328302,-0.079704,-0.008522,0.106443,0.065740,-0.071306,-0.027917,0.015646,...,0.417550,-0.119864,-0.247691,0.059680,-0.079766,0.010840,NASDAQ,AAL,2009.0,-38.978609
3,NASDAQ_AAL_2011,-0.062906,0.314260,-0.081485,-0.017161,0.037950,0.094659,-0.069738,0.010951,-0.033939,...,0.394653,-0.171746,-0.281573,0.077768,-0.054243,0.010716,NASDAQ,AAL,2011.0,-51.990725
4,NASDAQ_AAL_2012,-0.066502,0.291016,-0.064844,-0.030483,0.111363,0.069522,-0.033351,-0.024867,0.013964,...,0.406766,-0.136174,-0.233754,-0.006370,-0.100836,0.022443,NASDAQ,AAL,2012.0,-55.278625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6096,NYSE_hpq_2017,-0.013833,0.245796,-0.150804,-0.019208,-0.064568,0.040931,-0.103157,-0.011840,0.135871,...,0.349479,-0.178389,-0.310765,-0.048159,-0.136025,0.180911,NYSE,hpq,2017.0,
6097,NYSE_hpq_2018,-0.023769,0.247830,-0.148104,-0.013313,-0.065983,0.037401,-0.111327,0.003065,0.133549,...,0.358458,-0.164955,-0.315862,-0.056981,-0.141862,0.172649,NYSE,hpq,2018.0,
6098,NYSE_hpq_2019,-0.001791,0.274772,-0.077549,-0.031943,-0.118519,0.081628,-0.053422,0.054658,0.013923,...,0.306478,-0.184107,-0.410896,-0.009220,-0.101751,0.125796,NYSE,hpq,2019.0,
6099,NYSE_hpq_2020,-0.030829,0.239682,-0.137343,-0.018286,-0.090202,0.049535,-0.088813,0.020725,0.124592,...,0.380528,-0.177449,-0.315273,-0.067815,-0.120835,0.168072,NYSE,hpq,2020.0,


In [53]:
# Manual year1 values for specific reports, applied one report at a time.
# NOTE(review): only year1 is patched here; if these rows also lack
# exchange/ticker after the merge, those stay missing — confirm.
year1_overrides = {
    'NASDAQ_AMD_2012_2013': 2012,
    'NYSE_CPS_2017': 2017,
    'NYSE_LLY_2020': 2020,
    'NYSE_R_2018': 2018,
}
for report_name, report_year in year1_overrides.items():
    csr_embeddings.loc[csr_embeddings['file_name'] == report_name, 'year1'] = report_year
# csr_embeddings.loc[
#     csr_embeddings['file_name'] == 'NYSE_CPS_2017_19acc6731a1542cca99dde3385095ced',
#     'file_name'
# ] = 'NYSE_CPS_2017'


In [54]:
# 篩選出 year1 欄位為 NaN 的資料列
invalid_year1 = csr_embeddings[csr_embeddings['year1'].isna()]

# 顯示這些報錯資料的 file_name 和 year1
invalid_year1[['file_name', 'year1']]


Unnamed: 0,file_name,year1


In [56]:
# Store year1 as a plain year string: go through int first so that a float
# such as 2007.0 is written as "2007" rather than "2007.0".
csr_embeddings['year1'] = csr_embeddings['year1'].astype(int).astype(str)

# Write the merged embeddings + scores table without the index column.
csr_embeddings.to_csv("../output_dataset/csr_embeddings_score.csv", index=False)