### ESG-BERT Classification

In [1]:
import os
import re
import json
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1️⃣ 定義 10 個模型名稱
model_names = [
    "ESGBERT/EnvironmentalBERT-forest",
    "ESGBERT/EnvironmentalBERT-action",
    "ESGBERT/EnvironmentalBERT-environmental",
    # "ESGBERT/SocRoBERTa-social",
    # "ESGBERT/GovRoBERTa-governance",
    "ESGBERT/GovernanceBERT-governance",
    # "ESGBERT/EnvRoBERTa-environmental",
    "ESGBERT/SocialBERT-social",
    "ESGBERT/EnvironmentalBERT-water",
    "ESGBERT/EnvironmentalBERT-biodiversity",
]

# 2️⃣ 預先載入所有 Tokenizer 和 Model
print("Loading all models... (This may take a while)")
tokenizers = {}
models = {}

for name in model_names:
    print(f"Loading {name}...")
    tokenizers[name] = AutoTokenizer.from_pretrained(name)
    models[name] = AutoModelForSequenceClassification.from_pretrained(name).to(device)

print("All models loaded successfully!\n")

Loading all models... (This may take a while)
Loading ESGBERT/EnvironmentalBERT-forest...
Loading ESGBERT/EnvironmentalBERT-action...
Loading ESGBERT/EnvironmentalBERT-environmental...
Loading ESGBERT/GovernanceBERT-governance...
Loading ESGBERT/SocialBERT-social...
Loading ESGBERT/EnvironmentalBERT-water...
Loading ESGBERT/EnvironmentalBERT-biodiversity...
All models loaded successfully!



In [3]:
def split_long_text(text, tokenizer, max_length=512, fixed_chunk_size=500):
    # 先用標點符號、換行切割文本
    sentences = re.split(r"(?<=[.?!])\s+|\n", text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        tokenized_sentence = tokenizer(sentence, add_special_tokens=False)
        token_count = len(tokenized_sentence["input_ids"])

        # 如果單句超過 max_length，則進一步切割
        if token_count > max_length - 2:
            sub_sentences = sentence.split(",")  # 用逗號切割
            for sub_sentence in sub_sentences:
                tokenized_sub_sentence = tokenizer(sub_sentence, add_special_tokens=False)
                sub_token_count = len(tokenized_sub_sentence["input_ids"])

                if sub_token_count > max_length - 2:
                    words = sub_sentence.split()  # 空格切割
                    temp_chunk = []
                    temp_length = 0
                    
                    for word in words:
                        word_tokens = tokenizer(word, add_special_tokens=False)["input_ids"]
                        word_count = len(word_tokens)
                        
                        if word_count > max_length - 2:
                            word_tokens = word_tokens[:max_length - 2]  # 截斷過長單詞
                        
                        if temp_length + word_count > max_length - 2:
                            chunks.append(" ".join(temp_chunk))
                            temp_chunk = []
                            temp_length = 0
                        
                        temp_chunk.append(word)
                        temp_length += word_count
                    
                    if temp_chunk:
                        chunks.append(" ".join(temp_chunk))  # 剩餘詞加入
                else:
                    chunks.append(sub_sentence)
        else:
            if current_length + token_count > max_length - 2:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_length = 0
            
            current_chunk.append(sentence)
            current_length += token_count
    
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    
    # 最後檢查是否仍有段落超過 max_length，進一步強制 500 字切割
    final_chunks = []
    for chunk in chunks:
        if len(chunk) > fixed_chunk_size:
            for i in range(0, len(chunk), fixed_chunk_size):
                final_chunks.append(chunk[i:i + fixed_chunk_size])
        else:
            final_chunks.append(chunk)
    
    return final_chunks

In [4]:
def esg_classification(csr_data):
    results = {name: [] for name in model_names}

    for page_num, text in csr_data.items():
        # print(f"\n📄 處理 Page {page_num}...")

        for model_name in model_names:
            tokenizer = tokenizers[model_name]
            model = models[model_name]

            # **使用強化版文本切割函數**
            text_chunks = split_long_text(text, tokenizer, max_length=512)

            page_results = []

            for chunk in text_chunks:
                tokenized_chunk = tokenizer(chunk, add_special_tokens=True, return_tensors="pt")

                # **🚨 在這裡再次確認 token 長度**
                chunk_length = len(tokenized_chunk["input_ids"][0])
                if chunk_length > 512:
                    print(f"\n❌ 警告！chunk 長度 = {chunk_length} > 512，強制截斷！")
                    tokenized_chunk = tokenizer(chunk, truncation=True, max_length=512, return_tensors="pt")

                tokenized_chunk = tokenized_chunk.to(device)

                with torch.no_grad():
                    outputs = model(**tokenized_chunk)

                logits = outputs.logits
                probs = torch.nn.functional.softmax(logits, dim=-1)  # 轉換為機率
                page_results.append(probs.cpu().numpy().tolist()[0])  # 轉成 list 存入

            results[model_name].extend(page_results)
    
    # 計算每個模型的分類結果平均值
    final_scores = {
        model_name: torch.tensor(results[model_name]).cpu().mean(dim=0).tolist()
        for model_name in model_names
    }

    return final_scores

### Perspective Taking

In [5]:
import json
import nltk
from nltk.tokenize import word_tokenize

In [6]:
# 確保 NLTK 需要的資源已下載
# nltk.download('punkt')
# nltk.data.path.append('/home/francia/anaconda3/envs/csr_env/nltk_data')

In [7]:
# 定義第一人稱與第二人稱代詞
first_person_pronouns = {
    "I", "i", "I'm", "i'm", "Im", "im", "I'am", "i'am",
    "Me", "me", "My", "my", "Mine", "mine",
    "We", "we", "Us", "us", "Our", "our", "Ours", "ours",
    "I've", "i've", "I'd", "i'd", "I'll", "i'll",
    "We're", "we're", "We've", "we've", "We'd", "we'd", "We'll", "we'll",
    "I'd've", "i'd've", "We'll've", "we'll've", "We've'll", "we've'll"
}

second_person_pronouns = {
    "You", "you", "Your", "your", "Yours", "yours",
    "You're", "you're", "You've", "you've", "You'll", "you'll", "You'd", "you'd"
}

In [8]:
# 計算 Perspective Taking 值
def calculate_perspective_taking(text):
    words = word_tokenize(text.lower())

    # 計算代詞數量
    first_person_count = sum(1 for word in words if word in first_person_pronouns)
    second_person_count = sum(1 for word in words if word in second_person_pronouns)

    # 公式計算
    perspective_taking = second_person_count / (first_person_count + second_person_count + 0.0001)
    
    return perspective_taking

### Readability

In [9]:
import textstat

def readability_metrics(text):
    fog = textstat.gunning_fog(text)
    flesch_reading_ease = textstat.flesch_reading_ease(text)
    smog = textstat.smog_index(text)
    automated_readability = textstat.automated_readability_index(text)
    
    return fog, flesch_reading_ease, smog, automated_readability

### Sentiment analysis

In [10]:
from textblob import TextBlob

In [11]:
def sentiment_analysis(csr_text):
    blob = TextBlob(csr_text)
    polarity = blob.sentiment.polarity  # 介於 -1.0 到 1.0 之間
    subjectivity = blob.sentiment.subjectivity  # 介於 0.0 到 1.0 之間
    return polarity, subjectivity

### Repoert Length

In [12]:
import math
# get the number of words by log
def report_length(csr_text):
    length = math.log(len(csr_text))
    return length

### Tense and Comparative

In [13]:
import spacy
import pandas as pd
import json
import os
from collections import Counter

In [14]:
def analyze_tense_and_comparison(csr_text):
    # 加载 spaCy 的英語模型
    nlp = spacy.load("en_core_web_sm")
    nlp.max_length = 2_000_000  # 增加到 2,000,000 字符

    # 定義時態關鍵詞
    past_tense_tags = {"VBD", "VBN"}
    present_tense_tags = {"VBZ", "VBP", "VBG"}
    future_tense_keywords = {"will", "shall", "going to"}
    comparison_tags = {"JJR"}  # 比較級
    superlative_tags = {"JJS"}  # 最高級

    # 解析文本
    doc = nlp(csr_text)

    # 計數器
    tense_counts = Counter()
    comparison_counts = Counter()

    # 遍歷 tokens
    for token in doc:
        # 時態標註
        if token.tag_ in past_tense_tags:
            tense_counts["past"] += 1
        elif token.tag_ in present_tense_tags:
            tense_counts["present"] += 1
        elif token.text.lower() in future_tense_keywords:
            tense_counts["future"] += 1

        # 比較級與最高級標註
        if token.tag_ in comparison_tags:
            comparison_counts["comparative"] += 1
        elif token.tag_ in superlative_tags:
            comparison_counts["superlative"] += 1

    return comparison_counts["comparative"], comparison_counts["superlative"], tense_counts["present"], tense_counts["past"], tense_counts["future"]

In [15]:
def process_all(base_path):
    output_dir = "output_dataset"
    os.makedirs(output_dir, exist_ok=True)  # 確保目錄存在
    output_path = os.path.join(output_dir, "handmade_features_NASDAQ.csv")
        
    df = pd.DataFrame()

    for folder_name in os.listdir(base_path):
        folder_path = os.path.join(base_path, folder_name)

        if not os.path.isdir(folder_path):
            continue

        json_path = os.path.join(folder_path, f"{folder_name}_v2_remove_punctuation.json")

        if os.path.exists(json_path):
            with open(json_path, "r", encoding="utf-8") as f:
                csr_data = json.load(f)

            csr_text = "".join(value for key, value in csr_data.items() if key not in (1, 2))

            esg_scores = esg_classification(csr_data)
            perspective_taking = calculate_perspective_taking(csr_text)
            fog, flesch_reading_ease, smog, automated_readability = readability_metrics(csr_text)
            polarity, subjectivity = sentiment_analysis(csr_text)
            length = report_length(csr_text)
            comparative, superlative, present, past, future = analyze_tense_and_comparison(csr_text)

            row_data = {"folder_name": folder_name, "perspective_taking": perspective_taking, "length": length, "readability_fog": fog, "readability_flesch_reading_ease": flesch_reading_ease, "readability_smog": smog, "readability_automated_readability": automated_readability, "sentiment_polarity": polarity, "sentiment_subjectivity": subjectivity, "comparative": comparative, "superlative": superlative, "present": present, "past": past, "future": future}

            for model_name, scores in esg_scores.items():
                for i, score in enumerate(scores):
                    row_data[f"{model_name}_class_{i}"] = score

            df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)

            print(f"Processed: {folder_name}")

    df.to_csv(output_path, index=False)

In [16]:
# 讀取 JSON 檔案
base_path = os.path.abspath("/home/francia/research_hub/csr_project/CSR_report_processed_v4/NASDAQ")
process_all(base_path)

Processed: NASDAQ_CEQP_2019
Processed: NASDAQ_ZEAL_2013
Processed: NASDAQ_LOGI_2018
Processed: NASDAQ_PTC_2018_2019
Processed: NASDAQ_EGLE_2019
Processed: NASDAQ_INFN_2020
Processed: NASDAQ_AMAT_2016
Processed: NASDAQ_PRGO_2020
Processed: NASDAQ_MSFT_2015
Processed: NASDAQ_FLIR_2018
Processed: NASDAQ_PRGS_2012


Token indices sequence length is longer than the specified maximum sequence length for this model (916 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (916 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (916 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (916 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (916 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

Processed: NASDAQ_COMM_2022
Processed: NASDAQ_BIIB_2020
Processed: NASDAQ_PFG_2017
Processed: NASDAQ_TRMB_2018_2020
Processed: NASDAQ_WYNN_2017
Processed: NASDAQ_UNFI_2014
Processed: NASDAQ_BPOP_2019
Processed: NASDAQ_KALU_2019
Processed: NASDAQ_COLM_2019
Processed: NASDAQ_JBLU_2019_2020
Processed: NASDAQ_SNBR_2019
Processed: NASDAQ_SP_2018
Processed: NASDAQ_AAWW_2018_2019
Processed: NASDAQ_CNSL_2019
Processed: NASDAQ_PETD_2018
Processed: NASDAQ_COST_2019
Processed: NASDAQ_TSLA_2018
Processed: NASDAQ_HA_2020
Processed: NASDAQ_CRTO_2018
Processed: NASDAQ_EGLE_2022
Processed: NASDAQ_MKTX_2020
Processed: NASDAQ_REGI_2019
Processed: NASDAQ_AMD_2014_2015
Processed: NASDAQ_ACIW_2021
Processed: NASDAQ_FB_2020
Processed: NASDAQ_EXLS_2020
Processed: NASDAQ_BCPC_2020
Processed: NASDAQ_BCPC_2019
Processed: NASDAQ_INTC_2007
Processed: NASDAQ_SBNY_2020
Processed: NASDAQ_VRSK_2019
Processed: NASDAQ_AAWW_2020
Processed: NASDAQ_ADBE_2016
Processed: NASDAQ_NTGR_2017
Processed: NASDAQ_NLOK_2020
Processe