### ESG-BERT Classification

In [13]:
import os
import re
import json
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Run inference on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7e773fbc4e10>>
Traceback (most recent call last):
  File "/home/francia/anaconda3/envs/csr_env/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


In [None]:
# 1. Names of the ESGBERT classifiers to load.
#    Seven entries are active; the three RoBERTa variants are commented out.
model_names = [
    "ESGBERT/EnvironmentalBERT-forest",
    "ESGBERT/EnvironmentalBERT-action",
    "ESGBERT/EnvironmentalBERT-environmental",
    # "ESGBERT/SocRoBERTa-social",
    # "ESGBERT/GovRoBERTa-governance",
    "ESGBERT/GovernanceBERT-governance",
    # "ESGBERT/EnvRoBERTa-environmental",
    "ESGBERT/SocialBERT-social",
    "ESGBERT/EnvironmentalBERT-water",
    "ESGBERT/EnvironmentalBERT-biodiversity",
]

# 2. Pre-load every tokenizer and model once, keyed by model name, so that
#    they can be reused for all reports instead of being reloaded per call.
print("Loading all models... (This may take a while)")
tokenizers = {}
models = {}

for name in model_names:
    print(f"Loading {name}...")
    tokenizers[name] = AutoTokenizer.from_pretrained(name)
    # Move each model to the selected device right after loading it.
    models[name] = AutoModelForSequenceClassification.from_pretrained(name).to(device)

print("All models loaded successfully!\n")

Loading all models... (This may take a while)
Loading ESGBERT/EnvironmentalBERT-forest...
Loading ESGBERT/EnvironmentalBERT-action...
Loading ESGBERT/EnvironmentalBERT-environmental...
Loading ESGBERT/GovernanceBERT-governance...
Loading ESGBERT/SocialBERT-social...
Loading ESGBERT/EnvironmentalBERT-water...
Loading ESGBERT/EnvironmentalBERT-biodiversity...
All models loaded successfully!



In [None]:
def split_long_text(text, tokenizer, max_length=512):
    """Split ``text`` into chunks that fit a model's token budget.

    The text is first split into paragraphs on blank lines; each paragraph is
    tokenized and cut into windows of at most ``max_length - 2`` tokens (two
    positions are reserved for the [CLS] and [SEP] special tokens). Each
    window is converted back to a string.

    Args:
        text: Raw text to split.
        tokenizer: Object exposing ``tokenize(str)`` and
            ``convert_tokens_to_string(list)``.
        max_length: Maximum sequence length including the two special tokens.

    Returns:
        List of chunk strings, in document order. Empty when ``text`` contains
        no non-blank paragraph.

    Raises:
        ValueError: If ``max_length`` leaves no room for content tokens.
    """
    chunk_size = max_length - 2  # reserve room for [CLS] and [SEP]
    if chunk_size < 1:
        # A zero step makes range() fail with an opaque message, and a negative
        # step silently yields no chunks (dropping all text); fail loudly instead.
        raise ValueError(
            f"max_length must be at least 3 to leave room for content tokens, got {max_length}"
        )

    chunks = []
    for raw_paragraph in re.split(r'\n\s*\n', text):
        paragraph = raw_paragraph.strip()
        if not paragraph:
            continue

        tokens = tokenizer.tokenize(paragraph)
        for start in range(0, len(tokens), chunk_size):
            window = tokens[start:start + chunk_size]
            chunks.append(tokenizer.convert_tokens_to_string(window))

    return chunks

In [None]:
def esg_classification(csr_data):
    """Score a report with every loaded ESG classifier.

    For each model in ``model_names`` the text of every page is split with
    ``split_long_text``, each chunk is classified, and the softmax
    probabilities of all chunks (across all pages) are averaged.

    Args:
        csr_data: Mapping of page identifier -> page text. The keys are not
            used for scoring.

    Returns:
        Dict mapping model name -> list of mean class probabilities, one entry
        per output class of that model. The list is empty when the report
        produced no chunks for that model (e.g. empty or whitespace-only text).
    """
    results = {name: [] for name in model_names}

    for _page_num, text in csr_data.items():
        for model_name in model_names:
            tokenizer = tokenizers[model_name]
            model = models[model_name]

            # Split the page into pieces that should fit the 512-token window.
            text_chunks = split_long_text(text, tokenizer, max_length=512)

            page_results = []

            for chunk in text_chunks:
                tokenized_chunk = tokenizer(chunk, add_special_tokens=True, return_tensors="pt")

                # Re-check the length: a chunk that was converted back to text
                # can re-tokenize to more than 512 tokens, so truncate if needed.
                chunk_length = len(tokenized_chunk["input_ids"][0])
                if chunk_length > 512:
                    print(f"\n❌ 警告！chunk 長度 = {chunk_length} > 512，強制截斷！")
                    tokenized_chunk = tokenizer(chunk, truncation=True, max_length=512, return_tensors="pt")

                tokenized_chunk = tokenized_chunk.to(device)

                with torch.no_grad():
                    outputs = model(**tokenized_chunk)

                # Convert logits to probabilities and keep them as a plain list.
                probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
                page_results.append(probs.cpu().numpy().tolist()[0])

            results[model_name].extend(page_results)

    # Average the per-chunk probabilities for each model.
    final_scores = {}
    for model_name in model_names:
        chunk_probs = results[model_name]
        if chunk_probs:
            final_scores[model_name] = torch.tensor(chunk_probs).mean(dim=0).tolist()
        else:
            # Without chunks the mean of an empty tensor is a bare NaN float,
            # which callers cannot iterate over; return an empty list instead.
            final_scores[model_name] = []

    return final_scores

### Perspective Taking

In [None]:
import json
import nltk
from nltk.tokenize import word_tokenize

In [None]:
# Make sure the NLTK resources needed below are downloaded (uncomment on first run)
# nltk.download('punkt')
# nltk.data.path.append('/home/francia/anaconda3/envs/csr_env/nltk_data')

In [None]:
# First- and second-person pronoun forms (including contractions and a few
# common misspellings). Every form is listed once in lowercase; the
# capitalised variant is generated alongside it.
first_person_pronouns = {
    form
    for base in (
        "i", "i'm", "im", "i'am",
        "me", "my", "mine",
        "we", "us", "our", "ours",
        "i've", "i'd", "i'll",
        "we're", "we've", "we'd", "we'll",
        "i'd've", "we'll've", "we've'll",
    )
    for form in (base, base.capitalize())
}

second_person_pronouns = {
    form
    for base in (
        "you", "your", "yours",
        "you're", "you've", "you'll", "you'd",
    )
    for form in (base, base.capitalize())
}

In [None]:
# Perspective-taking score
def calculate_perspective_taking(text):
    """Return the share of second-person pronouns among first+second-person pronouns.

    The text is lowercased and tokenized with ``word_tokenize``; tokens found
    in ``first_person_pronouns`` / ``second_person_pronouns`` are counted. A
    small constant (0.0001) in the denominator avoids division by zero when no
    pronoun is present, in which case the score is 0.0.

    NOTE(review): because the text is lowercased first, the acronym "US" is
    counted as the pronoun "us" - confirm this is acceptable.
    """
    tokens = word_tokenize(text.lower())

    n_first = len([tok for tok in tokens if tok in first_person_pronouns])
    n_second = len([tok for tok in tokens if tok in second_person_pronouns])

    return n_second / (n_first + n_second + 0.0001)

### Readability

In [None]:
import textstat

def readability_metrics(text):
    """Compute four textstat readability scores for ``text``.

    Returns:
        Tuple ``(gunning_fog, flesch_reading_ease, smog_index,
        automated_readability_index)``, in that order.
    """
    return (
        textstat.gunning_fog(text),
        textstat.flesch_reading_ease(text),
        textstat.smog_index(text),
        textstat.automated_readability_index(text),
    )

### Sentiment analysis

In [None]:
from textblob import TextBlob

In [None]:
def sentiment_analysis(csr_text):
    """Return TextBlob's ``(polarity, subjectivity)`` for ``csr_text``.

    Polarity lies between -1.0 and 1.0, subjectivity between 0.0 and 1.0.
    """
    sentiment = TextBlob(csr_text).sentiment
    return sentiment.polarity, sentiment.subjectivity

### Report Length

In [None]:
import math
# Report length on a log scale
def report_length(csr_text):
    """Return the natural logarithm of the number of characters in ``csr_text``.

    Note that ``len`` counts characters, not words. An empty string raises
    ``ValueError`` because the logarithm of zero is undefined.
    """
    return math.log(len(csr_text))

### Tense and Comparative

In [None]:
import spacy
import pandas as pd
import json
import os
from collections import Counter

In [None]:
_SPACY_PIPELINE = None  # spaCy pipeline, loaded lazily and shared across calls


def _get_spacy_pipeline():
    """Load the English spaCy model once and reuse it on later calls."""
    global _SPACY_PIPELINE
    if _SPACY_PIPELINE is None:
        pipeline = spacy.load("en_core_web_sm")
        pipeline.max_length = 2_000_000  # allow documents of up to 2,000,000 characters
        _SPACY_PIPELINE = pipeline
    return _SPACY_PIPELINE


def analyze_tense_and_comparison(csr_text):
    """Count tense markers and comparative/superlative adjectives in a text.

    Tokens are classified by their fine-grained tag (``token.tag_``):
    VBD/VBN count as past, VBZ/VBP/VBG as present, JJR as comparative and JJS
    as superlative. Future is counted for "will"/"shall" tokens that carry
    none of the verb tags above, and for "going" immediately followed by "to".
    The latter spans two tokens, so it is detected by looking at the next
    token, and such a "going" counts as future instead of present.

    Args:
        csr_text: Text to analyse.

    Returns:
        Tuple ``(comparative, superlative, present, past, future)`` of counts.
    """
    nlp = _get_spacy_pipeline()

    # Tag sets / keywords used for classification.
    past_tense_tags = {"VBD", "VBN"}
    present_tense_tags = {"VBZ", "VBP", "VBG"}
    future_tense_keywords = {"will", "shall"}
    comparison_tags = {"JJR"}   # comparative
    superlative_tags = {"JJS"}  # superlative

    doc = nlp(csr_text)
    n_tokens = len(doc)

    tense_counts = Counter()
    comparison_counts = Counter()

    for token in doc:
        word = token.text.lower()

        # "going to" future: checked first, because "going" on its own is
        # tagged VBG and would otherwise be counted as present.
        is_going_to = (
            word == "going"
            and token.i + 1 < n_tokens
            and doc[token.i + 1].text.lower() == "to"
        )

        # Tense
        if is_going_to:
            tense_counts["future"] += 1
        elif token.tag_ in past_tense_tags:
            tense_counts["past"] += 1
        elif token.tag_ in present_tense_tags:
            tense_counts["present"] += 1
        elif word in future_tense_keywords:
            tense_counts["future"] += 1

        # Comparative / superlative adjectives
        if token.tag_ in comparison_tags:
            comparison_counts["comparative"] += 1
        elif token.tag_ in superlative_tags:
            comparison_counts["superlative"] += 1

    return comparison_counts["comparative"], comparison_counts["superlative"], tense_counts["present"], tense_counts["past"], tense_counts["future"]

In [None]:
def process_all_txt(base_path, output_dir="output_dataset"):
    """Extract hand-crafted features for every ``.txt`` report in a directory.

    Each ``.txt`` file in ``base_path`` (processed in sorted order) is read as
    one continuous text and passed through the ESG classifiers, perspective
    taking, readability, sentiment, length and tense/comparison analyses. One
    row per report is written to
    ``<output_dir>/handmade_features_corrected_txt.csv``.

    A failure on one file is reported on stdout and that file is skipped, so a
    single bad report does not abort the whole run.

    Args:
        base_path: Directory containing the ``.txt`` reports.
        output_dir: Directory for the resulting CSV; created if missing.

    Returns:
        None. The result is written to disk.
    """
    os.makedirs(output_dir, exist_ok=True)  # make sure the output directory exists
    output_path = os.path.join(output_dir, "handmade_features_corrected_txt.csv")

    suffix = ".txt"
    # Collect plain dicts and build the frame once at the end; concatenating a
    # DataFrame inside the loop copies all previous rows on every iteration.
    records = []

    for fname in sorted(os.listdir(base_path)):
        if not fname.endswith(suffix):
            continue

        try:
            file_path = os.path.join(base_path, fname)
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()

            # Strip only the trailing extension (str.replace would also remove
            # ".txt" occurring in the middle of the name).
            folder_name = fname[:-len(suffix)]

            # esg_classification expects a {page: text} mapping; wrap the whole
            # report under a dummy key since it is one continuous text.
            csr_data = {0: text}
            csr_text = text

            # Run the analyses
            esg_scores = esg_classification(csr_data)
            perspective_taking = calculate_perspective_taking(csr_text)
            fog, flesch_reading_ease, smog, automated_readability = readability_metrics(csr_text)
            polarity, subjectivity = sentiment_analysis(csr_text)
            length = report_length(csr_text)
            comparative, superlative, present, past, future = analyze_tense_and_comparison(csr_text)

            row_data = {
                "folder_name": folder_name,
                "perspective_taking": perspective_taking,
                "length": length,
                "readability_fog": fog,
                "readability_flesch_reading_ease": flesch_reading_ease,
                "readability_smog": smog,
                "readability_automated_readability": automated_readability,
                "sentiment_polarity": polarity,
                "sentiment_subjectivity": subjectivity,
                "comparative": comparative,
                "superlative": superlative,
                "present": present,
                "past": past,
                "future": future
            }

            # One column per model and class index.
            for model_name, scores in esg_scores.items():
                for i, score in enumerate(scores):
                    row_data[f"{model_name}_class_{i}"] = score

            records.append(row_data)

            print(f"✅ Processed: {folder_name}")

        except Exception as e:
            print(f"❌ Error processing {fname}: {e}")

    pd.DataFrame(records).to_csv(output_path, index=False)


In [None]:
# Run the feature pipeline over every .txt report in the processed-reports directory.
# NOTE(review): this is an absolute, machine-specific path - consider moving it to a config constant.
base_path = os.path.abspath("/opt/hdd_1/research_hub/csr_project/CSR_report_processed_v4_gemini_v0")
process_all_txt(base_path)

Token indices sequence length is longer than the specified maximum sequence length for this model (678 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (678 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (678 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (678 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (678 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

✅ Processed: NASDAQ_AAL_2007_v0_gemini_corrected
✅ Processed: NASDAQ_AAL_2008_v0_gemini_corrected
✅ Processed: NASDAQ_AAL_2009_v0_gemini_corrected
✅ Processed: NASDAQ_AAL_2011_v0_gemini_corrected
✅ Processed: NASDAQ_AAL_2012_v0_gemini_corrected
✅ Processed: NASDAQ_AAL_2013_v0_gemini_corrected
✅ Processed: NASDAQ_AAL_2014_v0_gemini_corrected
✅ Processed: NASDAQ_AAL_2015_v0_gemini_corrected
✅ Processed: NASDAQ_AAL_2016_v0_gemini_corrected
✅ Processed: NASDAQ_AAL_2017_v0_gemini_corrected
✅ Processed: NASDAQ_AAL_2018_v0_gemini_corrected
✅ Processed: NASDAQ_AAL_2019_v0_gemini_corrected
✅ Processed: NASDAQ_AAL_2020_v0_gemini_corrected
✅ Processed: NASDAQ_AAL_2021_v0_gemini_corrected
✅ Processed: NASDAQ_AAON_2018_2019_v0_gemini_corrected
✅ Processed: NASDAQ_AAON_2019_2020_v0_gemini_corrected
✅ Processed: NASDAQ_AAON_2020_2021_v0_gemini_corrected
✅ Processed: NASDAQ_AAON_2021_2022_v0_gemini_corrected
✅ Processed: NASDAQ_AAPL_2014_v0_gemini_corrected
✅ Processed: NASDAQ_AAPL_2015_v0_gemini_corr

# Get greenwash (gw) scores

In [49]:
import pandas as pd

In [50]:
# Newly computed features (same file name as the CSV written by process_all_txt).
# NOTE(review): read from "../output_dataset" whereas process_all_txt writes to
# "output_dataset" relative to the working directory - confirm both resolve to the same file.
hm_v1 = pd.read_csv("../output_dataset/handmade_features_corrected_txt.csv")
# Previous (v0) feature tables, which also carry the greenwash score columns.
hm_v0_nonoverlapping = pd.read_csv("../output_dataset/handmade_features_non_overlapping_score_v0.csv")
hm_v0_overlapping = pd.read_csv("../output_dataset/handmade_features_overlapping_score_v0.csv")

In [51]:
hm_v0_overlapping

Unnamed: 0,folder_name,perspective_taking,length,readability_fog,readability_flesch_reading_ease,readability_smog,readability_automated_readability,sentiment_polarity,sentiment_subjectivity,comparative,...,ESGBERT/SocialBERT-social_class_1,ESGBERT/EnvironmentalBERT-water_class_0,ESGBERT/EnvironmentalBERT-water_class_1,ESGBERT/EnvironmentalBERT-biodiversity_class_0,ESGBERT/EnvironmentalBERT-biodiversity_class_1,exchange,ticker,year1,year2,greenwash_score_mean
0,NASDAQ_PTC_2018_2019,0.004717,10.512329,14.17,28.47,17.4,17.2,0.132992,0.378544,26,...,0.626154,0.976418,0.023582,0.999546,0.000454,NASDAQ,PTC,2018,2019,3.741212
1,NASDAQ_TRMB_2018_2020,0.006696,11.166215,13.49,20.72,17.8,18.5,0.131486,0.386574,45,...,0.451050,0.952490,0.047510,0.980519,0.019481,NASDAQ,TRMB,2018,2020,-5.594654
2,NASDAQ_JBLU_2019_2020,0.011429,11.672013,12.65,29.18,16.9,17.2,0.111325,0.394300,51,...,0.290642,0.976001,0.023999,0.993934,0.006066,NASDAQ,JBLU,2019,2020,-2.611591
3,NASDAQ_AAWW_2018_2019,0.008032,11.165281,12.93,21.53,17.5,17.8,0.126959,0.365044,37,...,0.440789,0.975139,0.024861,0.993737,0.006263,NASDAQ,AAWW,2018,2019,4.854945
4,NASDAQ_AMD_2014_2015,0.013514,12.480301,9.99,18.15,16.5,18.0,0.094548,0.378701,100,...,0.263421,0.950613,0.049387,0.999818,0.000182,NASDAQ,AMD,2014,2015,-14.358808
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,NYSE_CWT_2017_2018,0.001992,11.314962,15.28,23.80,18.4,20.0,0.113846,0.370221,53,...,0.237708,0.408850,0.591150,0.963896,0.036104,NYSE,CWT,2017,2018,-22.498283
178,NYSE_ACN_2020_2021,0.003686,11.544327,12.70,20.11,17.6,17.5,0.159957,0.397346,62,...,0.639192,0.974719,0.025281,0.995416,0.004584,NYSE,ACN,2020,2021,-24.990763
179,NYSE_JBL_2018_2019,0.021951,11.457635,13.27,27.56,17.8,17.8,0.054992,0.301659,45,...,0.397233,0.887914,0.112086,0.999795,0.000205,NYSE,JBL,2018,2019,-15.095687
180,NYSE_BMI_2020_2021,0.014458,11.265643,12.02,15.41,17.4,16.9,0.109998,0.423015,28,...,0.329437,0.638954,0.361046,0.988781,0.011219,NYSE,BMI,2020,2021,-5.722273


In [52]:
# Keep only the hm_v1 rows whose report also appears in hm_v0_nonoverlapping / hm_v0_overlapping.
# The rows written by process_all_txt are named after the input files
# (e.g. "NASDAQ_AAL_2007_v0_gemini_corrected"), while the v0 tables use the bare
# report id (e.g. "NASDAQ_PTC_2018_2019"). Strip the file suffix first so the keys
# are comparable; this is a no-op when the names are already bare.
hm_v1_keyed = hm_v1.assign(
    folder_name=hm_v1["folder_name"].astype(str).str.removesuffix("_v0_gemini_corrected")
)
hm_v1_nonoverlapping = hm_v1_keyed[hm_v1_keyed["folder_name"].isin(hm_v0_nonoverlapping["folder_name"])].copy()
hm_v1_overlapping = hm_v1_keyed[hm_v1_keyed["folder_name"].isin(hm_v0_overlapping["folder_name"])].copy()

In [53]:
# Make sure the join key is a string in all four tables before merging.
for frame in (
    hm_v1_nonoverlapping,
    hm_v0_nonoverlapping,
    hm_v1_overlapping,
    hm_v0_overlapping,
):
    frame["folder_name"] = frame["folder_name"].astype(str)

In [54]:
# Attach exchange, ticker, year1 and greenwash_score from hm_v0_nonoverlapping
# to the matching hm_v1_nonoverlapping rows.
hm_v1_nonoverlapping_score = pd.merge(
    hm_v1_nonoverlapping,
    hm_v0_nonoverlapping[["folder_name", "exchange", "ticker", "year1", "greenwash_score"]],
    how="left",
    on="folder_name",
)
# Attach exchange, ticker, year1, year2 and greenwash_score_mean from
# hm_v0_overlapping to the matching hm_v1_overlapping rows.
hm_v1_overlapping_score = pd.merge(
    hm_v1_overlapping,
    hm_v0_overlapping[["folder_name", "exchange", "ticker", "year1", "year2", "greenwash_score_mean"]],
    how="left",
    on="folder_name",
)

In [56]:
# Persist both scored v1 feature tables.
for scored_frame, csv_path in (
    (hm_v1_nonoverlapping_score, "../output_dataset/handmade_features_non_overlapping_score_v1.csv"),
    (hm_v1_overlapping_score, "../output_dataset/handmade_features_overlapping_score_v1.csv"),
):
    scored_frame.to_csv(csv_path, index=False)