In [11]:
import json
import re
import os
import string
import ftfy
import numpy as np
import language_tool_python

In [12]:
tool = language_tool_python.LanguageTool('en-US')

In [13]:
def preprocess_text(json_path, output_path, tool):
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # 依據頁碼分類文字
        page_texts = {}
        for item in data:
            page = item.get("page")
            text = item.get("all_text", "").strip()

            if page is not None and text:
                # 先用 ftfy 修正亂碼
                fixed_text = ftfy.fix_text(text)
                if page not in page_texts:
                    page_texts[page] = []
                page_texts[page].append(fixed_text)

        # 將每頁的文字串接並先用 ftfy 修正，再用 tool.correct 矯正
        processed_texts = {
            str(page): tool.correct(ftfy.fix_text(" ".join(texts)))
            for page, texts in page_texts.items()
        }

        # 存成新的 JSON 檔案
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(processed_texts, f, ensure_ascii=False, indent=4)

        return f"Processed text saved to {output_path}"

    except FileNotFoundError:
        return "Error: File not found."
    except json.JSONDecodeError:
        return "Error: Invalid JSON format."

In [14]:
def process_all_nasdaq(base_path, tool):
    """ 遍歷 NASDAQ 目錄，對每個資料夾內的 dense_regions.json 進行改名並處理 """
    for folder_name in os.listdir(base_path):
        folder_path = os.path.join(base_path, folder_name)

        # 只處理 2019 或更早的年份
        match = re.search(r'(\d{4})$', folder_name)
        if match:
            year = int(match.group(1))
            if year > 2019:
                continue
        else:
            continue  # 無年份格式就跳過

        if os.path.isdir(folder_path):  # 確保是資料夾
            old_json_path = os.path.join(folder_path, "dense_regions.json")
            new_json_path = os.path.join(folder_path, f"{folder_name}_v0_extract_text.json")
            output_json_path = os.path.join(folder_path, f"{folder_name}_v1_preprocessed_tool.json")

            # 檢查是否已處理過
            if os.path.exists(output_json_path):
                print(f"Skipping {folder_name}, already processed.")
                continue

            if os.path.exists(old_json_path):
                # 重新命名 dense_regions.json
                os.rename(old_json_path, new_json_path)

                # 執行處理函數
                preprocess_text(new_json_path, output_json_path, tool)

                print(f"Processed: {folder_name}")

In [15]:
nasdaq_base_path = os.path.abspath("../CSR_report_processed_v4/CSR_report_new_collect/")
process_all_nasdaq(nasdaq_base_path, tool)

Skipping NYSE_KOF_2017, already processed.
Processed: NYSE_MFG_2009
Processed: NYSE_IHG_2015
Processed: NYSE_CDP_2017
Processed: NYSE_GE_2019
Processed: NYSE_HAL_2016
Processed: NYSE_MFG_2013
Processed: NYSE_KOF_2019
Processed: NYSE_G_2019
Processed: NYSE_MFG_2018
Processed: NYSE_CRM_2017
Processed: NYSE_DB_2005
Processed: NYSE_DB_2006
Processed: NASDAQ_GILD_2018
Processed: NYSE_COR_2018
Processed: NYSE_TTC_2019
Processed: NYSE_HSBC_2012
Processed: NYSE_DB_2014
Processed: NYSE_CTB_2014
Processed: NYSE_IHG_2018
Processed: NYSE_CX_2017
Processed: NYSE_LTM_2016
Processed: NYSE_LTM_2013
Processed: NYSE_CX_2006
Processed: NYSE_AVY_2019
Processed: NYSE_MFG_2017
Processed: NYSE_KDSKF_2019
Processed: NYSE_DB_2007
Processed: NYSE_CX_2010
Processed: NYSE_MDT_2014
Processed: NYSE_DB_2008
Processed: NYSE_DB_2010
Processed: NYSE_CHU_2017
Processed: NYSE_HSBC_2018
Processed: NYSE_AES_2017
Processed: NYSE_CX_2018
Processed: NASDAQ_GILD_2019
Processed: NASDAQ_BRKS_2019
Processed: NYSE_LTM_2018
Process

In [16]:
import os

def process_specific(folder_path, tool):
    """
    僅處理指定資料夾內的 dense_regions.json：
    - 將其重新命名為 *_v0_extract_text.json
    - 使用 preprocess_text 處理並產生 *_v1_preprocessed_tool.json
    """
    folder_name = os.path.basename(folder_path)
    new_json_path = os.path.join(folder_path, f"{folder_name}_v0_extract_text.json")
    output_json_path = os.path.join(folder_path, f"{folder_name}_v1_preprocessed_tool.json")
    print(f"Processing: {folder_name}")
    print(f"New JSON path: {new_json_path}")
    print(f"Output JSON path: {output_json_path}")

    if os.path.exists(output_json_path):
        print(f"Skipping {folder_name}, already processed.")
        return

    if os.path.exists(new_json_path):
        preprocess_text(new_json_path, output_json_path, tool)
        print(f"Processed: {folder_name}")

In [17]:
specific_path = os.path.abspath("/home/francia/research_hub/csr_project/CSR_report_processed_v4/NASDAQ/NASDAQ_AMD_2012_2013")
process_specific(specific_path, tool)

Processing: NASDAQ_AMD_2012_2013
New JSON path: /home/francia/research_hub/csr_project/CSR_report_processed_v4/NASDAQ/NASDAQ_AMD_2012_2013/NASDAQ_AMD_2012_2013_v0_extract_text.json
Output JSON path: /home/francia/research_hub/csr_project/CSR_report_processed_v4/NASDAQ/NASDAQ_AMD_2012_2013/NASDAQ_AMD_2012_2013_v1_preprocessed_tool.json
