## 結巴斷詞 + 計算詞頻
- Data: all twhg listings (JSON files in twhg_with_latlng_and_places/), title + description

In [1]:
import json
import os
from collections import Counter

import jieba
import pandas as pd

# Folder that holds the per-listing JSON files.
raw_data_folder_path: str = "../../../data/twhg_with_latlng_and_places/"
# Path of the custom jieba user dictionary (make sure the file exists).
dict_path: str = "real_estate_dict.txt"

# 1. Load the custom dictionary (key step). When the file is missing, only a
#    warning is printed and segmentation proceeds with jieba's default dictionary.
if os.path.exists(dict_path):
    jieba.load_userdict(dict_path)
    print(f"已載入自定義字典: {dict_path}")
else:
    print("警告: 找不到自定義字典檔，將使用預設字典。")

# Collect the path of every .json file in the data folder.
# os.listdir() returns names in arbitrary order, so sort them: this keeps the
# processing order (and the order of tied rows in later results) reproducible.
json_files: list[str] = [
    os.path.join(raw_data_folder_path, f)
    for f in sorted(os.listdir(raw_data_folder_path))
    if f.endswith('.json')
]

# 2. Extended stop words.
# According to the earlier CSV results these tokens are very frequent but do
# not help when analysing real-estate features, so they are filtered out.
stop_words = set().union(
    # Common Chinese function words.
    (
        "的", "了", "是", "和", "與", "及", "或", "在", "有", "就", "都", "而",
        "這", "那", "個", "之", "去", "來", "做", "上", "下", "裡", "後", "前",
        "亦", "也", "但", "並", "給", "對", "於", "把", "讓", "向", "往",
        "可", "較", "需", "被", "為", "已", "未", "將", "因", "又", "更",
    ),
    # Literal "Tags" / "Image" tokens in the casings listed.
    ("Tags", "tags", "TAGS", "Image"),
    # Single digits.
    ("1", "2", "3", "4", "5", "6", "7", "8", "9", "0"),
    # Whitespace characters.
    (" ", "\n", "\t", "\r"),
    # Half-width and full-width punctuation and symbols.
    (
        ",", "，", ".", "。", "!", "！", "?", "？",
        "、", "：", ":", "(", ")", "（", "）", "/", "「", "」", "【", "】",
        "-", "~", "+", "＋", "｜", "_", "→",
    ),
)

# Flat list of every kept token across all listings (input to the word count).
all_words_list = []

print(f"開始處理 {len(json_files)} 個檔案...")

for file_path in json_files:
    try:
        # Only parsing needs the file handle; it is closed before tokenizing.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        listing = data.get('listing')
        if not listing:
            # No listing payload in this file: nothing to tokenize.
            continue

        # `or ''` also covers keys that are present but hold null/empty values.
        title = listing.get('title', '') or ''
        description = listing.get('description', '') or ''

        # Combine both fields into one text and flatten line breaks.
        full_text = f"{title} {description}"
        full_text = full_text.replace('\n', ' ').replace('\r', '')

        # Expand listing abbreviations to their full terms.
        full_text = full_text.replace('平車', '平面車位')
        # NOTE: this may also hit genuine motorcycle ("機車") parking spots;
        # use as appropriate.
        full_text = full_text.replace('機車位', '機械車位')
        # Replacing a term with itself cannot influence segmentation, so terms
        # such as '近捷運' can only be kept whole through the jieba user dictionary.

        # 3. jieba segmentation. Keep tokens that are non-empty after
        #    stripping, are not stop words, and are not purely numeric.
        all_words_list.extend(
            word
            for word in map(str.strip, jieba.cut(full_text))
            if word and word not in stop_words and not word.isdigit()
        )

    except Exception as e:
        # Best-effort batch run: report the failing file and continue.
        print(f"處理檔案 {file_path} 時發生錯誤: {e}")

# Count how often each token occurs, then tabulate the counts with the most
# frequent tokens first and a fresh 0..n-1 index.
word_counts = Counter(all_words_list)
df_word_freq = (
    pd.DataFrame(word_counts.items(), columns=['詞彙', '次數'])
    .sort_values(by='次數', ascending=False)
    .reset_index(drop=True)
)

# Bare expression so the notebook renders the table as the cell output.
df_word_freq

  import pkg_resources
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.276 seconds.
Prefix dict has been built successfully.


已載入自定義字典: raw_data_analyze/real_estate_dict.txt
開始處理 66 個檔案...


Unnamed: 0,詞彙,次數
0,空間,672
1,採光,533
2,收納,346
3,牆面,319
4,通風,315
...,...,...
5107,宿舍,1
5108,覺,1
5109,縮,1
5110,齊全度,1


## TF-IDF 計算
- Data: all twhg listings (JSON files in twhg_with_latlng_and_places/), title + description

In [2]:
import os
import json
import jieba
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Folder that holds the per-listing JSON files.
raw_data_folder_path: str = "../../../data/twhg_with_latlng_and_places/"
# Path of the custom jieba user dictionary.
dict_path: str = "real_estate_dict.txt"

# 1. Load the custom dictionary. When the file is missing, only a warning is
#    printed and segmentation proceeds with jieba's default dictionary.
if os.path.exists(dict_path):
    jieba.load_userdict(dict_path)
    print(f"已載入自定義字典: {dict_path}")
else:
    print("警告: 找不到自定義字典檔，將使用預設字典。")

# Collect the path of every .json file in the data folder.
# os.listdir() returns names in arbitrary order, so sort them to keep the
# document order of the corpus reproducible between runs and machines.
json_files: list[str] = [
    os.path.join(raw_data_folder_path, f)
    for f in sorted(os.listdir(raw_data_folder_path))
    if f.endswith('.json')
]

# 2. Define the stop words, assembled as a union of themed groups.
stop_words = set().union(
    # Common Chinese function words.
    (
        "的", "了", "是", "和", "與", "及", "或", "在", "有", "就", "都", "而",
        "這", "那", "個", "之", "去", "來", "做", "上", "下", "裡", "後", "前",
        "亦", "也", "但", "並", "給", "對", "於", "把", "讓", "向", "往",
        "可", "較", "需", "被", "為", "已", "未", "將", "因", "又", "更",
    ),
    # Literal "Tags" / "Image" tokens in the casings listed.
    ("Tags", "tags", "TAGS", "Image"),
    # Single digits.
    ("1", "2", "3", "4", "5", "6", "7", "8", "9", "0"),
    # Whitespace characters.
    (" ", "\n", "\t", "\r"),
    # Half-width and full-width punctuation and symbols.
    (
        ",", "，", ".", "。", "!", "！", "?", "？",
        "、", "：", ":", "(", ")", "（", "）", "/", "「", "」", "【", "】",
        "-", "~", "+", "＋", "｜", "_", "→",
    ),
)

# Holds one processed string per document: the kept tokens joined by spaces.
corpus = []

print(f"開始處理 {len(json_files)} 個檔案...")

for file_path in json_files:
    try:
        # Only parsing needs the file handle; it is closed before tokenizing.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        listing = data.get('listing')
        if not listing:
            # No listing payload in this file: nothing to tokenize.
            continue

        # `or ''` also covers keys that are present but hold null/empty values.
        title = listing.get('title', '') or ''
        description = listing.get('description', '') or ''

        # Combine both fields into one text and flatten line breaks.
        full_text = f"{title} {description}"
        full_text = full_text.replace('\n', ' ').replace('\r', '')

        # Expand listing abbreviations to their full terms.
        full_text = full_text.replace('平車', '平面車位')
        # NOTE: this may also hit genuine motorcycle ("機車") parking spots;
        # use as appropriate.
        full_text = full_text.replace('機車位', '機械車位')
        # Replacing a term with itself cannot influence segmentation, so terms
        # such as '近捷運' can only be kept whole through the jieba user dictionary.

        # 3. jieba segmentation. Keep tokens that are non-empty after
        #    stripping, are not stop words, and are not purely numeric.
        filtered_words = [
            word
            for word in map(str.strip, jieba.cut(full_text))
            if word and word not in stop_words and not word.isdigit()
        ]

        # Only documents with at least one kept token join the corpus; tokens
        # are space-separated because that is what TfidfVectorizer is fed.
        if filtered_words:
            corpus.append(" ".join(filtered_words))

    except Exception as e:
        # Best-effort batch run: report the failing file and continue.
        print(f"處理檔案 {file_path} 時發生錯誤: {e}")

print(f"有效文件數量: {len(corpus)}")

# 4. Compute TF-IDF.
# min_df=2 drops any term that appears in fewer than 2 documents (removes
# extremely rare words). The token pattern accepts runs of one or more word
# characters, so single-character tokens are kept.
vectorizer = TfidfVectorizer(min_df=2, token_pattern=r"(?u)\b\w+\b")

# Matrix with one row per document and one column per term.
tfidf_matrix = vectorizer.fit_transform(corpus)

# Term names, indexed by matrix column.
feature_names = vectorizer.get_feature_names_out()

# 5. Aggregate score: add up each term's TF-IDF over all documents, giving
#    the term's total weight across the whole data set.
sum_tfidf = tfidf_matrix.sum(axis=0)

# Pair every term with its column total as (term, score).
data_tuples = [
    (term, sum_tfidf[0, col]) for col, term in enumerate(feature_names)
]

# 6. Tabulate, highest total score first, with a fresh 0..n-1 index.
df_tfidf = (
    pd.DataFrame(data_tuples, columns=['詞彙', 'TF_IDF_Score'])
    .sort_values(by='TF_IDF_Score', ascending=False)
    .reset_index(drop=True)
)

# Bare expression so the notebook renders the table as the cell output.
df_tfidf

已載入自定義字典: raw_data_analyze/real_estate_dict.txt
開始處理 66 個檔案...
有效文件數量: 66


Unnamed: 0,詞彙,TF_IDF_Score
0,空間,9.885249
1,採光,7.744079
2,客廳,5.133459
3,收納,5.074192
4,牆面,4.816285
...,...,...
2359,有磨損,0.089799
2360,雖老舊,0.089799
2361,潮痕,0.089799
2362,化妝區,0.089567
