In [152]:
import json
import re
import string
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import Counter

In [153]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [154]:
# def density_threshold(data):
    # thresholds = []
    # threshold = 0.0
    # for item in data:
    #     if item.get("text_density") is not None:
    #         thresholds.append(item.get("text_density"))
    # thresholds.sort()
    # if len(thresholds) > 0:
    #     # get 75th percentile
    #     threshold = thresholds[int(len(thresholds) * 0.75)]
    # return threshold

def density_threshold(data):
    densities = [item.get("text_density", 0) for item in data if item.get("text_density") is not None]
    print(len(densities))
    return np.percentile(densities, 75) if densities else 0.0

In [155]:
def remove_punctuation(text):
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"  # 自動轉義所有標點
    return re.sub(punctuation_pattern, " ", text)  # 替換為空格

In [156]:
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    words = text.split()
    return ' '.join(word for word in words if word.lower() not in stop_words)

In [157]:
def apply_stemming(text):
    return ' '.join(stemmer.stem(word) for word in text.split())

In [158]:
def apply_lemmatization(text):
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split())

In [159]:
lemmatizer = WordNetLemmatizer()

def remove_frequent_words(valid_text):
    cnt = Counter()
    for word in valid_text.split():
        cnt[lemmatizer.lemmatize(word)] += 1  # 進行詞形還原
    return cnt.most_common(10)

In [160]:
def preprocess_text(json_path):
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    threshold = density_threshold(data)
    # print(threshold)

    # Filter out invalid text
    valid_text = []
    for item in data:
        text = item.get("all_text", "").strip().lower()
        # print(text)

        # filter out empty text
        if not text or item.get("text_num") == 0:
            continue
        # print(text)

        # filter out text with low text density
        # if item.get("text_density") < threshold:
        #     continue
        # print(text)

        # remove non-alphanumeric characters
        text = remove_punctuation(text)
        # print(text)

        # remove stopwords
        text = remove_stopwords(text)
        # print(text)

        # apply stemming
        text = apply_stemming(text)

        # apply lemmatization
        text = apply_lemmatization(text)

        valid_text.append(text)

    # remove frequent words
    if valid_text:
        most_common_words = remove_frequent_words(' '.join(valid_text))
        freqword = set([w for (w, wc) in most_common_words])
        valid_text = [' '.join([word for word in text.split() if word not in freqword]) for text in valid_text]

    return valid_text if valid_text else ["No valid text found."]


In [161]:
# 測試程式
json_path = "/home/francia/research_hub/csr_project/CSR_report_processed_v4/NASDAQ/NASDAQ_AAL_2007/dense_regions.json"
processed_regions = preprocess_text(json_path)

719


In [162]:
processed_regions

['r commit preserv wonder world 2007 respons prepar corpor',
 'pr sli bieta ohi fianc riist fel fot ‘er peed de ee pace seat ae pe oh sess ft tte',
 'dear friend',
 'continu evalu perform come run safe depend effici oper likewis mind respons respons err opportun u evalu live respons share find mani stakehold',
 'last year’ inaugur err focus primarili company’ carbon footprint year’ examin broader footprint ensur comprehens transpar err global initi gri level c requir sustain framework read gri requir appendix',
 'alway — inde entir industri — made import stride reduc impact activ environ much progress trace increas effici today u aviat industri consum 3 percent less 2000 carri 20 percent passeng cargo last year’ commit build progress date share aggress plan improv effici 20 percent 2020 recent month — along belong air transport associ — laid even ambiti goal increas effici 30 percent 2025 base year measur 2005 two year data collect iam plea announc track meet goal',
 '‘the less burn le