In [126]:
import os
import re
import pandas as pd
import numpy as np

## Preprocess

In [127]:
import re
import string
import spacy
from nltk.corpus import stopwords

In [128]:
# 載入 spaCy 英文模型
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])  # 只保留 tok/pos/lemma

# 載入停用詞表
stop_words = set(stopwords.words('english'))

In [None]:
def preprocess(text):
    # 1. Lowercasing
    text = text.lower()

    # 2. Removing Punctuation & Special Characters
    punctuation_pattern = r'[^\w\s]'
    text = re.sub(punctuation_pattern, '', text)

    # 3. Removing Stop-Words Removal
    word_tokens = text.split()
    text = [word for word in word_tokens if word not in stop_words]

    # 4. Removal of URLs
    url_pattern = r'http\S+|www\S+|https\S+'
    text = re.sub(url_pattern, '', ' '.join(text))

    # 5. 移除所有數字（包含詞中、連結、日期等）
    text = re.sub(r"\b\d+\b", " ", text)

    # 6. 移除所有標點符號
    text = text.translate(str.maketrans("", "", string.punctuation))

    # 7. 多餘空白壓縮為單一空白
    text = re.sub(r"\s+", " ", text)

    # 8. Tokenization + Lemmatization
    doc = nlp(text)
    tokens = [
        token.lemma_ for token in doc
    ]

    return " ".join(tokens)

## Load file

In [130]:
# read txt from CSR_report_processed_v4_gemini_v0
# Path to the folder containing the txt files
folder_path = "../CSR_report_processed_v4_gemini_v0"
xy_cpc = pd.read_csv("../output_dataset/csr_y_cpc.csv")
filled_flag = pd.read_csv("../output_dataset/csr_embeddings_leq2019_flag.csv")

# Filter filled_flag for is_imputed == False
valid_flags = filled_flag[filled_flag["is_imputed"] == False]

# Get the set of valid filenames
valid_filenames = set(valid_flags["file_name"]) & set(xy_cpc["file_name"])

In [131]:
# List to storprocessed = []
processed = []

for filename in list(valid_filenames):
    file_path = os.path.join(folder_path, filename + "_v0_gemini_corrected.txt")
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as f:
            raw = f.read()
            clean = preprocess(raw)
            row = xy_cpc[xy_cpc["file_name"] == filename].iloc[0]
            processed.append({
                "filename": filename,
                "ticker": row["ticker"],
                "year": row["year"],
                "clean_text": clean,
                "patent_count": row["patents_count"],
                "citation_count": row["total_5yr_forward_citations"]
            })
        print(f"Processed {filename} with {len(processed[-1]['clean_text'].split())} tokens.")
    else:
        print(f"File {file_path} does not exist.")

# 儲存為 CSV
df_processed = pd.DataFrame(processed)
df_processed.to_csv("../output_dataset/csr_text_processed_v0_preprocessed.csv", index=False)
print(f"Saved {len(df_processed)} processed documents.")

Processed NYSE_SCS_2019 with 10429 tokens.
Processed NYSE_INGR_2016 with 3776 tokens.
Processed NYSE_CMI_2019 with 15105 tokens.
Processed NASDAQ_INTC_2007 with 36429 tokens.
Processed NYSE_PRU_2018 with 12877 tokens.
Processed NYSE_WMT_2013 with 33058 tokens.
Processed NYSE_WLK_2017 with 2581 tokens.
Processed NYSE_VZ_2005 with 11134 tokens.
Processed NYSE_TSN_2009 with 28320 tokens.
Processed NYSE_APD_2019 with 24618 tokens.
Processed NYSE_APA_2015 with 4325 tokens.
Processed NYSE_ALB_2009 with 2668 tokens.
Processed NYSE_NEM_2019 with 58972 tokens.
Processed NASDAQ_IOSP_2018 with 10023 tokens.
Processed NYSE_DTE_2015 with 884 tokens.
Processed NYSE_WFC_2015 with 16397 tokens.
Processed NASDAQ_MSFT_2007 with 1378 tokens.
Processed NYSE_CF_2012 with 3099 tokens.
Processed NYSE_ICE_2016 with 5881 tokens.
Processed NASDAQ_AAPL_2016 with 11171 tokens.
Processed NYSE_ORCL_2010 with 10824 tokens.
Processed NYSE_AXP_2019 with 17684 tokens.
Processed NYSE_SLB_2017 with 20560 tokens.
Processe

In [117]:
len(valid_filenames)

1486

# TF-IDF

In [136]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import joblib  # 用來儲存 vectorizer

In [137]:
# 1. 載入處理後資料
df = pd.read_csv("../output_dataset/csr_text_processed_v0_preprocessed.csv")

# 2. 取出 clean_text 欄位
texts = df["clean_text"].fillna("")

# 3. 建立 TF-IDF vectorizer（你可依需求調整參數）
vectorizer = TfidfVectorizer(
    # max_features=10000,         # 保留所有特徵，後續再做選擇
    ngram_range=(1, 2),         # 一元與二元詞（unigram + bigram）
    min_df=15,                   # 10~30	出現在 0.7%~2% 文檔中，1500 篇文章的理想區間
    max_df=0.8,                 # 0.7–0.9	移除超過 70~90% 文檔都出現的詞彙， 1500 篇推薦範圍，可濾除常見無效語彙
    stop_words="english"        # 雙重保險停用詞（可與 spaCy 停用詞互補）
)

# 4. 向量化
X_tfidf = vectorizer.fit_transform(texts)  # 結果為 sparse matrix，shape = (n_samples, n_features)

# 5. 輸出 TF-IDF 基本統計
print(f"TF-IDF shape: {X_tfidf.shape}")
non_zero = X_tfidf.count_nonzero()
density = non_zero / (X_tfidf.shape[0] * X_tfidf.shape[1])
print(f"Non-zero entries: {non_zero}")
print(f"Matrix density: {density:.6f}")
# 檢查是否有全為 0 的向量（稀疏過頭）
empty_docs = (X_tfidf.sum(axis=1) == 0).A1  # 轉成 1D 陣列
print(f"Empty TF-IDF vectors: {np.sum(empty_docs)} / {X_tfidf.shape[0]}")

# 6. 儲存 TF-IDF 矩陣（可選）
from scipy import sparse
sparse.save_npz("../output_dataset/csr_text_processed_tf_idf.npz", X_tfidf)

# 7. 儲存 TF-IDF vectorizer 物件
joblib.dump(vectorizer, "../output_dataset/csr_text_processed_tf_idf.pkl")

# 8. 儲存所有 TF-IDF 特徵名稱（for feature selection、可視化等）
feature_names = vectorizer.get_feature_names_out()
pd.Series(feature_names).to_csv("../output_dataset/csr_text_tfidf_feature_names.csv", index=False)

print("✅ TF-IDF 向量化與儲存完成！")

TF-IDF shape: (1486, 140069)
Non-zero entries: 8109075
Matrix density: 0.038959
Empty TF-IDF vectors: 0 / 1486
✅ TF-IDF 向量化與儲存完成！


# Mutual Information

In [160]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectKBest
import joblib

## Select top k

In [161]:
def select_top_k_features_by_target(X_tfidf, y, feature_names, target_name, output_dir, k=1000):
    """
    根據指定的 y，進行 mutual information 特徵選擇，並儲存篩選後的矩陣與詞彙名稱。
    
    Args:
        X_tfidf: TF-IDF 特徵矩陣 (scipy.sparse.csr_matrix)
        y: 對應目標變數的一維 array
        feature_names: TF-IDF 對應詞彙陣列
        target_name: 任務名稱（如 "patent"）
        output_dir: 輸出資料夾
        k: 要選出的 top-k 特徵數
    """
    print("=" * 60)
    print(f"🔍 Selecting top {k} features for target: '{target_name}'")
    
    # 安全限制 k 不超過現有特徵數
    k = min(k, X_tfidf.shape[1])

    # 計算 MI 分數
    X_dense = X_tfidf.toarray()
    mi_scores = mutual_info_regression(X_dense, y, discrete_features=False)

    # 取得 top-k 特徵索引與詞彙
    top_k_idx = np.argsort(mi_scores)[-k:]
    X_selected = X_tfidf[:, top_k_idx]
    selected_features = feature_names[top_k_idx]

    # 儲存 TF-IDF 子矩陣
    sparse.save_npz(os.path.join(output_dir, f"tfidf_top{k}_{target_name}.npz"), X_selected)

    # 儲存詞彙名稱
    pd.Series(selected_features).to_csv(
        os.path.join(output_dir, f"tfidf_top{k}_{target_name}_features.csv"), index=False
    )

    # 儲存全部詞彙與對應 MI 分數（可用於可視化）
    mi_df = pd.DataFrame({
        "feature": feature_names,
        "mi_score": mi_scores
    }).sort_values(by="mi_score", ascending=False)
    mi_df.to_csv(os.path.join(output_dir, f"mi_scores_{target_name}.csv"), index=False)

    print(f"✅ Done! Saved TF-IDF matrix and feature list for '{target_name}' with k={k}")
    print("=" * 60 + "\n")

## Load file

In [162]:
# 1. 載入資料
X_tfidf = sparse.load_npz("../output_dataset/csr_text_processed_tf_idf.npz")
df = pd.read_csv("../output_dataset/csr_text_processed_v0_preprocessed.csv")
vectorizer = joblib.load("../output_dataset/csr_text_processed_tf_idf.pkl")
feature_names = vectorizer.get_feature_names_out()
output_dir = "../output_dataset"

In [163]:
print(f"TF-IDF shape: {X_tfidf.shape}")

TF-IDF shape: (1486, 140069)


## Process

In [None]:
# 2. 執行不同 k 值的特徵選擇（對兩個 y）
for k in [1000, 2000, 3000, 5000]:
    # patent_count
    select_top_k_features_by_target(
        X_tfidf=X_tfidf,
        y=df["patent_count"].values,
        feature_names=feature_names,
        target_name=f"count_k{k}",
        output_dir=output_dir,
        k=k
    )

    # citation_count
    select_top_k_features_by_target(
        X_tfidf=X_tfidf,
        y=df["citation_count"].values,
        feature_names=feature_names,
        target_name=f"citation_k{k}",
        output_dir=output_dir,
        k=k
    )

🔍 Selecting top 1000 features for target: 'count_k1000'
