In [None]:
import re

def extract_chinese_words(file_path):
    """Collect the distinct Chinese words found in a binary file.

    The whole file is read as bytes and decoded as UTF-16LE; byte
    sequences that do not decode are dropped. Every run of two or more
    characters in the range U+4E00..U+9FFF is treated as one word.

    Args:
        file_path: Path of the file to scan (opened in binary mode).

    Returns:
        A sorted list of the unique words that were found.
    """
    with open(file_path, mode='rb') as source:
        raw_bytes = source.read()

    # Undecodable input is skipped instead of raising.
    decoded = raw_bytes.decode('utf-16le', errors='ignore')

    # Two or more consecutive CJK ideographs form one candidate word.
    cjk_run = re.compile(r'[\u4e00-\u9fff]{2,}')
    distinct = {hit.group(0) for hit in cjk_run.finditer(decoded)}
    return sorted(distinct)

def save_to_txt(words, output_path):
    """Write each word on its own line to a UTF-8 text file.

    Args:
        words: Iterable of strings to write, in order.
        output_path: Destination file; opened in 'w' mode, so any
            existing content is replaced.
    """
    with open(output_path, 'w', encoding='utf-8') as sink:
        sink.writelines(entry + '\n' for entry in words)

if __name__ == "__main__":
    # Input .scel file and the plain-text word list to produce.
    scel_file, output_file = "汽车常用词_术语.scel", "汽车词汇词库.txt"

    # Extract the words and report how many were found.
    words = extract_chinese_words(scel_file)
    print(f"提取到 {len(words)} 个词语。")

    # Persist them, one word per line.
    save_to_txt(words, output_file)
    print(f"已保存为 UTF-8 格式的文本文件：{output_file}")


提取到 124 个词语。
已保存为 UTF-8 格式的文本文件：D:/汽车词汇词库.txt


In [5]:
# Print the first 16 bytes of the .scel file for inspection.
with open(scel_file, mode='rb') as scel_handle:
    head_bytes = scel_handle.read(16)
    print(head_bytes)


b'@\x15\x00\x00DCS\x01\x01\x00\x00\x00\x9c9\xfb\xac'


In [None]:
# quick_scel_debug.py
# Report the file size and a hex dump of the first 128 bytes.
with open("汽车词汇大全【官方推荐】.scel", mode='rb') as scel_handle:
    data = scel_handle.read()

# Two lowercase hex digits per byte, in file order.
hex_pairs = ['%02x' % byte for byte in data[:128]]

print("文件大小：", len(data))
print("前128字节（十六进制）：")
print(" ".join(hex_pairs))


文件大小： 93744
前128字节（十六进制）：
40 15 00 00 44 43 53 01 01 00 00 00 9c 39 fb ac 3b e0 7c b7 ed 4d 3e 0a 7c b0 10 75 31 00 35 00 31 00 35 00 33 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00


In [None]:
import jieba
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Input locations
custom_dict_path = r"汽车词汇词库.txt"
stopwords_path = r"stopped_words.txt"
excel_path = r"二手汽车口碑.xlsx"

# Register the custom vocabulary with jieba
jieba.load_userdict(custom_dict_path)

# Build the stopword set: one whitespace-trimmed entry per non-blank line
with open(stopwords_path, 'r', encoding='utf-8') as stop_file:
    trimmed_lines = (line.strip() for line in stop_file)
    stopwords = {entry for entry in trimmed_lines if entry}

# Names of the columns whose text will be segmented
columns_to_segment = [
    '空间', '驾驶感受', '操控', '续航', '舒适性',
    '外观', '内饰', '性价比', '智能化',
]

# Load the Excel sheet into a DataFrame
df = pd.read_excel(excel_path)

# 分词函数
def segment_text(text):
    """Split one cell of text into a filtered list of tokens.

    Args:
        text: The cell value. Anything that is not a `str` yields an
            empty list.

    Returns:
        The tokens produced by `jieba.lcut`, minus whitespace-only
        tokens and tokens present in the module-level `stopwords` set.
    """
    if isinstance(text, str):
        return [
            token
            for token in jieba.lcut(text)
            if token.strip() and token not in stopwords
        ]
    return []

# Segment each configured column into a list of tokens
for col in columns_to_segment:
    if col in df.columns:
        df[col + '_分词'] = df[col].apply(segment_text)
    else:
        print(f"警告：列 {col} 不存在于 Excel 文件中")

# Prepare TF-IDF input: join each token list into one space-separated string
for col in columns_to_segment:
    segmented_col = col + '_分词'
    if segmented_col in df.columns:
        df[segmented_col + '_str'] = df[segmented_col].apply(' '.join)

# Compute TF-IDF separately for every column; results are keyed by column name
tfidf_results = {}
for col in columns_to_segment:
    text_col = col + '_分词_str'
    if text_col not in df.columns:
        continue

    corpus = df[text_col].fillna('').tolist()

    # TfidfVectorizer's default token_pattern (r"(?u)\b\w\w+\b") only keeps
    # tokens of two or more characters, which silently discards every
    # one-character word that survived segmentation and stopword filtering.
    # Accept tokens of any length instead.
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
    try:
        tfidf_matrix = vectorizer.fit_transform(corpus)
    except ValueError as err:
        # fit_transform raises ValueError when no token is left to build a
        # vocabulary from (e.g. every row of the column is blank). Skip the
        # column instead of aborting the remaining ones.
        print(f"警告：列 {col} 无法计算 TF-IDF，已跳过：{err}")
        continue

    feature_names = vectorizer.get_feature_names_out()
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
    tfidf_results[col] = tfidf_df
    print(f"完成 TF-IDF 计算：{col}（词数：{len(feature_names)}）")

    # Optional: save this column's TF-IDF scores to its own Excel file
    tfidf_df.to_excel(rf"tfidf_{col}.xlsx", index=False)

# Save the segmentation results (including the _分词 and _分词_str columns)
output_path = r"分词结果.xlsx"
df.to_excel(output_path, index=False)

# Report the destination once; the previous message hard-coded the file
# name and then printed the path a second time.
print(f"分词完成，结果已保存到：{output_path}")


分词完成，结果已保存到：E:\桌面\20250513最终数据_分词结果.xlsx


In [None]:
import pandas as pd
from wordcloud import WordCloud
from collections import Counter
import ast

# plt.show() below needs pyplot, which no earlier cell imports; without this
# line the cell raises NameError on a fresh kernel.
import matplotlib.pyplot as plt

# Load the segmentation results
file_path = r"分词结果.xlsx"
df = pd.read_excel(file_path)

# Names of the token columns to merge
columns = ['空间_分词', '驾驶感受_分词', '操控_分词', '续航_分词', '舒适性_分词', '外观_分词', '内饰_分词', '性价比_分词', '智能化_分词']

# Gather every token from every available column into one flat list
all_words = []
for col in columns:
    if col in df.columns:
        for item in df[col].dropna():
            if isinstance(item, str):
                # Cells are expected to hold the text form of a Python list
                # (e.g. "['a', 'b']"); anything else is treated as
                # whitespace-separated tokens.
                try:
                    words = ast.literal_eval(item)
                except (ValueError, SyntaxError, TypeError):
                    # Narrowed from a bare `except:` so KeyboardInterrupt
                    # and SystemExit are no longer swallowed.
                    words = item.split()
                else:
                    if not isinstance(words, (list, tuple)):
                        # A cell such as "123" parses to a non-sequence
                        # literal, which extend() below cannot consume.
                        words = item.split()
            else:
                words = item
            all_words.extend(words)

# Count how often each token occurs
word_freq = Counter(all_words)

# Render a high-resolution word cloud
wc = WordCloud(
    font_path=r"C:\Windows\Fonts\simsun.ttc",  # SimSun font file
    width=3000,     # large canvas width for a high-resolution image
    height=2400,    # matching canvas height
    background_color='white',
    max_words=500
).generate_from_frequencies(word_freq)

# Save the image at the full canvas resolution
output_path = r"合并分词列词云_高清.png"
wc.to_file(output_path)

# Draw the cloud so that plt.show() actually has a figure to display;
# figsize keeps the 3000x2400 aspect ratio.
plt.figure(figsize=(10, 8))
plt.imshow(wc.to_array(), interpolation='bilinear')
plt.axis('off')
plt.show()

print(f"词云图已以最高清晰度保存至：{output_path}")


词云图已以最高清晰度保存至：E:\桌面\合并分词列词云_高清.png


In [None]:
import matplotlib.font_manager
# Name of every entry in the matplotlib font manager's `ttflist`.
[font.name for font in matplotlib.font_manager.fontManager.ttflist]


['DejaVu Sans',
 'STIXNonUnicode',
 'STIXNonUnicode',
 'cmtt10',
 'cmss10',
 'DejaVu Serif',
 'STIXNonUnicode',
 'cmr10',
 'STIXSizeThreeSym',
 'STIXSizeOneSym',
 'cmex10',
 'STIXSizeFourSym',
 'STIXGeneral',
 'DejaVu Serif',
 'STIXSizeTwoSym',
 'cmmi10',
 'DejaVu Sans Display',
 'DejaVu Serif Display',
 'cmb10',
 'DejaVu Sans Mono',
 'DejaVu Sans Mono',
 'DejaVu Serif',
 'DejaVu Sans',
 'DejaVu Sans',
 'DejaVu Sans',
 'STIXGeneral',
 'STIXGeneral',
 'STIXNonUnicode',
 'DejaVu Sans Mono',
 'DejaVu Sans Mono',
 'STIXGeneral',
 'DejaVu Serif',
 'STIXSizeOneSym',
 'STIXSizeFiveSym',
 'STIXSizeTwoSym',
 'cmsy10',
 'STIXSizeFourSym',
 'STIXSizeThreeSym',
 'Goudy Stout',
 'LiSu',
 'Bodoni MT',
 'KaiTi',
 'Candara',
 'Franklin Gothic Book',
 'Yu Gothic',
 'Segoe UI',
 'Constantia',
 'Arial',
 'Rockwell Condensed',
 'Lucida Fax',
 'Arial',
 'Dubai',
 'MS Outlook',
 'Segoe UI',
 'Book Antiqua',
 'Lucida Console',
 'Lucida Bright',
 'Tw Cen MT',
 'Segoe UI',
 'Mongolian Baiti',
 'Sylfaen',
 'Ari