In [2]:
import os
import re

import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [4]:
# Load the pretrained cased BERT-large tokenizer and encoder.  The encoder is
# switched to evaluation mode and then moved onto the selected device.
tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
model = BertModel.from_pretrained('bert-large-cased').eval().to(device)

In [5]:
# path = '../CSR_report_processed_v4_gemini_v0/NASDAQ_AAL_2007_v0_gemini_corrected.txt'
# try:
#     with open(path, 'r') as file:
#         text = file.read()
# except:
#     print(f"Error: The file '{path}' was not found.")
#     exit(1)
# file.close()

In [6]:
# Folder that is scanned for report .txt files, and the CSV that collects one
# embedding row per report.
input_folder = "../CSR_report_processed_v4_gemini_v0/CSR_report_new_collect"
output_csv = "../output_dataset/CSR_report_new_collect/csr_embeddings.csv"

# Maximum sequence length fed to the model.  Two positions are kept free for
# the [CLS] and [SEP] tokens that get_embedding wraps around every chunk.
max_len = 512
chunk_size = max_len - 2

In [7]:
def get_embedding(text):
    """Embed a whole document as a single vector using the global BERT ``model``.

    The text is split into paragraphs on blank lines.  Each paragraph is
    tokenized with the global ``tokenizer`` and cut into chunks of at most
    ``chunk_size`` tokens.  Every chunk is wrapped in ``[CLS]`` / ``[SEP]``,
    run through the model, and reduced to the mean of its token vectors
    (the two special tokens are excluded).  The document embedding is the
    unweighted mean of all chunk embeddings.

    Args:
        text: Raw document text.

    Returns:
        A 1-D ``torch.Tensor`` with one entry per hidden dimension.

    Raises:
        ValueError: If ``text`` produces no tokens at all (for example an
            empty or whitespace-only string), since there is nothing to
            average.
    """
    # Two newlines (optionally with whitespace in between) mark the start of
    # the next paragraph.  This has to be a regex split: ``str.split`` would
    # search for the literal characters ``\n\s*\n`` and never match.
    paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]
    embeddings = []
    for paragraph in paragraphs:
        tokens = tokenizer.tokenize(paragraph)
        # Slice the paragraph into windows that still fit once the two
        # special tokens are added.
        chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]

        for chunk in chunks:
            tokens_chunk = ['[CLS]'] + chunk + ['[SEP]']
            input_ids = tokenizer.convert_tokens_to_ids(tokens_chunk)
            input_ids = torch.tensor([input_ids]).to(device)

            with torch.no_grad():
                outputs = model(input_ids)
                # Drop the batch dimension of size 1.
                last_hidden_states = outputs.last_hidden_state.squeeze(0)

            token_embeddings = last_hidden_states[1:-1]  # Exclude [CLS] and [SEP]
            embeddings.append(token_embeddings.mean(dim=0))

    if not embeddings:
        # torch.stack([]) would fail with an opaque RuntimeError.
        raise ValueError("text contains no tokens to embed")
    return torch.stack(embeddings).mean(dim=0)

In [8]:
# On the first run, create the output CSV containing only the header row:
# "file_name" followed by dim_0 .. dim_1023.
if not os.path.exists(output_csv):
    # to_csv does not create missing parent folders, so make sure the target
    # directory exists before writing.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df = pd.DataFrame(columns=["file_name"] + [f"dim_{i}" for i in range(1024)])
    df.to_csv(output_csv, index=False)

In [9]:
# Upper bound on how many new reports are embedded by the processing loop.
max_files = 5_000

In [10]:
# Embed every not-yet-processed .txt report in input_folder and append one row
# per report to output_csv.  Rows are written as soon as they are computed, so
# an interrupted run can be resumed without redoing finished reports.
processed_count = 0

# Load the names that are already in the CSV once, instead of re-reading the
# whole 1025-column file for every report.  Only the name column is needed.
processed_names = set(
    pd.read_csv(output_csv, usecols=["file_name"])["file_name"].astype(str)
)
embedding_columns = ["file_name"] + [f"dim_{i}" for i in range(1024)]

for fname in sorted(os.listdir(input_folder)):
    # Stop scanning the directory once the per-run limit is reached.
    if processed_count >= max_files:
        break
    if not fname.endswith(".txt"):
        continue

    # The report identifier is everything before the first "_v0" in the name.
    file_name = fname.split('_v0')[0]

    # Avoid processing the same report twice (checked before touching the file).
    if file_name in processed_names:
        print(f"✔️ Already processed: {file_name}")
        continue

    try:
        full_path = os.path.join(input_folder, fname)
        with open(full_path, 'r', encoding='utf-8') as f:
            text = f.read()

        print(f"🚀 Processing: {file_name}")
        emb = get_embedding(text)

        # Save the result by appending a single row to the CSV.
        row = [file_name] + emb.tolist()
        df_new = pd.DataFrame([row], columns=embedding_columns)
        df_new.to_csv(output_csv, mode='a', header=False, index=False)

        processed_names.add(file_name)
        processed_count += 1

    except Exception as e:
        # Best-effort batch run: report the failure and move on to the next file.
        print(f"❌ Error processing {fname}: {e}")

🚀 Processing: NASDAQ_AMZN_2019
🚀 Processing: NASDAQ_BRKS_2019
🚀 Processing: NASDAQ_GILD_2016
🚀 Processing: NASDAQ_GILD_2017
🚀 Processing: NASDAQ_GILD_2018
🚀 Processing: NASDAQ_GILD_2019
🚀 Processing: NASDAQ_MIDD_2019
🚀 Processing: NYSE_AES_2015
🚀 Processing: NYSE_AES_2016
🚀 Processing: NYSE_AES_2017
🚀 Processing: NYSE_AKO-B_2014
🚀 Processing: NYSE_AKO-B_2015
🚀 Processing: NYSE_AKO-B_2016
🚀 Processing: NYSE_AKO-B_2017
🚀 Processing: NYSE_AKO-B_2018
🚀 Processing: NYSE_AKO-B_2019
🚀 Processing: NYSE_AVY_2019
🚀 Processing: NYSE_CDP_2014
🚀 Processing: NYSE_CDP_2016
🚀 Processing: NYSE_CDP_2017
🚀 Processing: NYSE_CDP_2018
🚀 Processing: NYSE_CDP_2019
🚀 Processing: NYSE_CHU_2016
🚀 Processing: NYSE_CHU_2017
🚀 Processing: NYSE_CHU_2018
🚀 Processing: NYSE_CHU_2019
🚀 Processing: NYSE_COR_2016
🚀 Processing: NYSE_COR_2017
🚀 Processing: NYSE_COR_2018
🚀 Processing: NYSE_COR_2019
🚀 Processing: NYSE_CRM_2012
🚀 Processing: NYSE_CRM_2014
🚀 Processing: NYSE_CRM_2016
🚀 Processing: NYSE_CRM_2017
🚀 Processing: N