In [20]:
import spacy
import pandas as pd
import json
import os
from collections import Counter

def analyze_tense_and_comparison(json_paths, output_csv="tense_comparison_analysis.csv", nlp=None):
    """Count tense markers and comparative/superlative adjectives per JSON file.

    Each file is expected to hold a JSON object whose values are strings; the
    values are joined with single spaces, tagged by the pipeline, and every
    token is classified from its fine-grained part-of-speech tag (``token.tag_``).

    Args:
        json_paths: Iterable of paths to the JSON files to analyze.
        output_csv: Path of the CSV file the per-file counts are written to.
        nlp: Optional callable that turns a text into an iterable of tokens
            exposing ``.text`` and ``.tag_`` (e.g. an already loaded spaCy
            pipeline). When omitted, ``en_core_web_sm`` is loaded, so passing
            a pipeline avoids reloading the model on every call.

    Returns:
        pandas.DataFrame with one row per successfully processed file and the
        columns ``file_name``, ``comparative``, ``superlative``, ``present``,
        ``past`` and ``future``. The columns are present even when no file
        could be processed.

    Notes:
        Processing is best-effort: any exception raised while handling a file
        is printed and that file is left out of the result.
    """
    # Load the spaCy English model only when the caller did not supply one.
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")

    # Tag sets used as tense markers.
    # NOTE(review): VBN (past participle) and VBG (gerund) also occur in
    # passives, perfects and nominal uses, so these counts are a tag-based
    # approximation of tense, not a full tense analysis.
    past_tense_tags = {"VBD", "VBN"}
    present_tense_tags = {"VBZ", "VBP", "VBG"}
    # Single-token future markers. The "going to <verb>" future spans several
    # tokens and is therefore detected separately in the loop below.
    future_modals = {"will", "shall"}
    comparison_tags = {"JJR"}  # comparative
    superlative_tags = {"JJS"}  # superlative

    # Fixed column order, so the CSV keeps its header even with zero rows.
    columns = ["file_name", "comparative", "superlative", "present", "past", "future"]

    # Per-file result rows.
    results = []

    for json_path in json_paths:
        try:
            # Read the JSON file.
            with open(json_path, "r", encoding="utf-8") as f:
                text_data = json.load(f)

            # Merge all text values into one string.
            text = " ".join(text_data.values())

            # Tag the text; materialize the tokens so neighbours can be inspected.
            tokens = list(nlp(text))
            n_tokens = len(tokens)

            tense_counts = Counter()
            comparison_counts = Counter()

            for i, token in enumerate(tokens):
                word = token.text.lower()

                # "going to" + base-form verb expresses the future. A single
                # token can never equal "going to", so look ahead two tokens.
                # Requiring VB keeps "going to the store" out of the count.
                is_going_to_future = (
                    word == "going"
                    and i + 2 < n_tokens
                    and tokens[i + 1].text.lower() == "to"
                    and tokens[i + 2].tag_ == "VB"
                )

                # Tense classification; the going-to check comes first because
                # "going" itself is tagged VBG and would otherwise count as present.
                if is_going_to_future:
                    tense_counts["future"] += 1
                elif token.tag_ in past_tense_tags:
                    tense_counts["past"] += 1
                elif token.tag_ in present_tense_tags:
                    tense_counts["present"] += 1
                elif word in future_modals:
                    tense_counts["future"] += 1

                # Comparative / superlative classification.
                if token.tag_ in comparison_tags:
                    comparison_counts["comparative"] += 1
                elif token.tag_ in superlative_tags:
                    comparison_counts["superlative"] += 1

            # Store the counts for this file (missing Counter keys read as 0).
            results.append({
                "file_name": os.path.basename(json_path),
                "comparative": comparison_counts["comparative"],
                "superlative": comparison_counts["superlative"],
                "present": tense_counts["present"],
                "past": tense_counts["past"],
                "future": tense_counts["future"],
            })

        except Exception as e:
            # Deliberate best-effort: report the failure and continue with the next file.
            print(f"處理 {json_path} 時發生錯誤: {e}")

    # Build the DataFrame with explicit columns so an empty result keeps its schema.
    df = pd.DataFrame(results, columns=columns)

    # Save as CSV.
    df.to_csv(output_csv, index=False)

    print(f"分析完成，結果已儲存至 {output_csv}")
    return df

In [21]:
# Usage example: analyze two processed report JSON files and print the
# per-file counts. No output path is passed, so the CSV is written to the
# function's default, "tense_comparison_analysis.csv", in the working directory.
json_paths = [
    "/home/francia/research_hub/csr_project/CSR_report_processed_v4/NYSE/NYSE_A_2004/NYSE_A_2004_v2_remove_punctuation.json",
    "/home/francia/research_hub/csr_project/CSR_report_processed_v4/NYSE/NYSE_A_2005/NYSE_A_2005_v2_remove_punctuation.json",
]
df_result = analyze_tense_and_comparison(json_paths)
print(df_result)

分析完成，結果已儲存至 tense_comparison_analysis.csv
                                file_name  comparative  superlative  present  \
0  NYSE_A_2004_v2_remove_punctuation.json           62           24     1802   
1  NYSE_A_2005_v2_remove_punctuation.json           35           14      830   

   past  future  
0  1549      24  
1   640      14  
