In [18]:
import json
import re
import os
import string
import numpy as np

In [19]:
def clean_text(text):
    """Reduce ``text`` to a single line made only of whitelisted characters.

    The whitelist is ASCII letters, digits and the symbols ``. , ! ? % $ € £ -``.
    Every other character (including accented or non-Latin letters, line
    breaks, tabs and underscores) is replaced by a space, after which runs of
    whitespace are collapsed to one space and the ends are trimmed.

    Args:
        text (str): Raw text to normalize.

    Returns:
        str: The cleaned, single-line text (``""`` if nothing survives).
    """
    # Line breaks become spaces so words on adjacent lines stay separated.
    for line_break in ("\n", "\r"):
        text = text.replace(line_break, " ")

    # Blank out (rather than delete) anything outside the whitelist, so the
    # tokens on either side of a removed character do not get glued together.
    whitelisted = re.sub(r"[^a-zA-Z0-9.,!?%$€£-]", " ", text)

    # Squeeze the whitespace left behind and trim both ends.
    return re.sub(r"\s+", " ", whitelisted).strip()

In [20]:
def _process_folder(folder_path, folder_name):
    """Clean one folder's v1 JSON file into its v2 JSON file.

    Reads ``<folder_name>_v1_preprocessed_tool.json`` from ``folder_path``,
    applies ``clean_text`` to every value and writes the result to
    ``<folder_name>_v2_remove_punctuation.json`` in the same folder.

    Args:
        folder_path (str): Path of the folder to process.
        folder_name (str): Folder base name, used as the file-name prefix.

    Returns:
        str: ``"skipped"`` when the output file already exists, ``"missing"``
        when the input file does not exist, ``"processed"`` otherwise.
    """
    json_path = os.path.join(folder_path, f"{folder_name}_v1_preprocessed_tool.json")
    output_json_path = os.path.join(folder_path, f"{folder_name}_v2_remove_punctuation.json")

    # An existing output file is the marker for "already processed".
    if os.path.exists(output_json_path):
        return "skipped"
    if not os.path.exists(json_path):
        return "missing"

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # NOTE(review): assumes the JSON top level is an object whose values are
    # all strings; anything else makes clean_text / .items() raise - confirm.
    cleaned_data = {key: clean_text(value) for key, value in data.items()}

    # Write to a temporary sibling file and rename it into place. Because the
    # output's existence is the "already processed" marker, a half-written
    # file left by an interrupted run would otherwise be skipped forever.
    tmp_path = output_json_path + ".tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(cleaned_data, f, indent=4, ensure_ascii=False)
        os.replace(tmp_path, output_json_path)
    finally:
        # Only still present when the write or rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return "processed"


def process_all(base_path):
    """Run the text-cleaning step for every sub-folder of ``base_path``.

    Each direct sub-folder ``<name>`` is expected to hold
    ``<name>_v1_preprocessed_tool.json``; its cleaned copy is written next to
    it as ``<name>_v2_remove_punctuation.json``. Folders whose output already
    exists are skipped, so the function can be re-run to resume a batch.
    One status line is printed per sub-folder.

    Args:
        base_path (str): Directory whose sub-folders are processed. Entries
            that are not directories are ignored.

    Raises:
        OSError: If ``base_path`` cannot be listed or a file cannot be
            read/written.
        json.JSONDecodeError: If an input file is not valid JSON.
    """
    # Sorted so the processing order (and the log) is the same on every run;
    # os.listdir returns entries in arbitrary order.
    for folder_name in sorted(os.listdir(base_path)):
        folder_path = os.path.join(base_path, folder_name)

        # Only directories are candidates.
        if not os.path.isdir(folder_path):
            continue

        status = _process_folder(folder_path, folder_name)
        if status == "skipped":
            print(f"Skipping {folder_name}, already processed.")
        elif status == "missing":
            # Previously silent; report it so absent inputs are noticed.
            print(f"Skipping {folder_name}, input file not found.")
        else:
            print(f"Processed: {folder_name}")

In [None]:
# Root directory whose sub-folders are handed to process_all. The path below is
# the original machine-specific location; set the CSR_REPORT_BASE_PATH
# environment variable to run the notebook against a different directory
# without editing this cell.
DEFAULT_BASE_PATH = "/home/francia/research_hub/csr_project/CSR_report_processed_v4/CSR_report_new_collect"
base_path = os.path.abspath(os.environ.get("CSR_REPORT_BASE_PATH", DEFAULT_BASE_PATH))
process_all(base_path)

In [None]:
# NOTE: this whole cell is disabled (commented out). It is a single-folder
# variant of process_all: it takes one folder path, prints the input/output
# paths it derives, and then runs the same read -> clean_text -> write steps.
# It relies on json and clean_text from the earlier cells.

# import os

# def process_specific(folder_path):
#     folder_name = os.path.basename(folder_path)
#     json_path = os.path.join(folder_path, f"{folder_name}_v1_preprocessed_tool.json")
#     output_json_path = os.path.join(folder_path, f"{folder_name}_v2_remove_punctuation.json")
#     print(f"Processing: {folder_name}")
#     print(f"New JSON path: {json_path}")
#     print(f"Output JSON path: {output_json_path}")

#     if os.path.exists(output_json_path):
#         print(f"Skipping {folder_name}, already processed.")
#         return

#     if os.path.exists(json_path):
#         # Read the JSON file
#         with open(json_path, "r", encoding="utf-8") as f:
#             data = json.load(f)
#         cleaned_data = {key: clean_text(value) for key, value in data.items()}
#         with open(output_json_path, "w", encoding="utf-8") as f:
#             json.dump(cleaned_data, f, indent=4, ensure_ascii=False)

#             print(f"Processed: {folder_name}")

In [None]:
# specific_path = os.path.abspath("/home/francia/research_hub/csr_project/CSR_report_processed_v4/NASDAQ/NASDAQ_AMD_2012_2013")
# process_specific(specific_path)

Processing: NASDAQ_AMD_2012_2013
New JSON path: /home/francia/research_hub/csr_project/CSR_report_processed_v4/NASDAQ/NASDAQ_AMD_2012_2013/NASDAQ_AMD_2012_2013_v1_preprocessed_tool.json
Output JSON path: /home/francia/research_hub/csr_project/CSR_report_processed_v4/NASDAQ/NASDAQ_AMD_2012_2013/NASDAQ_AMD_2012_2013_v2_remove_punctuation.json
Processed: NASDAQ_AMD_2012_2013
