In [3]:
# 词向量训练（Skip-Gram模式）
import pandas as pd
import jieba
from gensim.models.word2vec import Word2Vec
import logging  # 添加日志记录

# 配置日志输出
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# 1. 数据预处理
def preprocess_text(text):
    """文本清洗和分词处理"""
    # 去除标点符号（扩展更全的标点集合）
    punctuation = "，。！？、；：“”‘’【】（）《》~@#￥%……&*"
    for p in punctuation:
        text = text.replace(p, "")
    return jieba.lcut(text)

# 读入训练集文件
data = pd.read_csv('train.csv')
corpus = [preprocess_text(str(comment)) for comment in data['comment'].values]

# 2. Skip-Gram模型训练
model = Word2Vec(
    corpus,
    sg=1,  # 关键修改：sg=1表示使用Skip-Gram（默认CBOW是sg=0）
    vector_size=300,  # 词向量维度
    window=5,        # 上下文窗口大小（Skip-Gram通常用更大窗口）
    min_count=3,     # 忽略低频词
    workers=4,       # 并行线程数
    negative=5,      # 负采样数（Skip-Gram推荐5-20）
    hs=0,            # 禁用层次softmax（与negative采样二选一）
    alpha=0.025,     # 初始学习率
    min_alpha=0.0001 # 最小学习率
)

# 3. 模型保存与加载
model.save("word2vec_skipgram.model")  # 保存模型
# model = Word2Vec.load("word2vec_skipgram.model")  # 加载模型

# 4. 模型验证
print('\n===== 模型参数 =====')
print(f"模型架构: {'Skip-Gram' if model.sg else 'CBOW'}")
print(f"词表大小: {len(model.wv)}")
print(f"训练总词数: {model.corpus_total_words}\n")

2025-04-02 22:40:07,147 : INFO : collecting all words and their counts
2025-04-02 22:40:07,148 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-04-02 22:40:07,200 : INFO : collected 12099 word types from a corpus of 188848 raw words and 10000 sentences
2025-04-02 22:40:07,200 : INFO : Creating a fresh vocabulary
2025-04-02 22:40:07,222 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=3 retains 4028 unique words (33.29% of original 12099, drops 8071)', 'datetime': '2025-04-02T22:40:07.222343', 'gensim': '4.3.3', 'python': '3.9.21 (main, Dec 11 2024, 16:35:24) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'prepare_vocab'}
2025-04-02 22:40:07,223 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=3 leaves 179103 word corpus (94.84% of original 188848, drops 9745)', 'datetime': '2025-04-02T22:40:07.223466', 'gensim': '4.3.3', 'python': '3.9.21 (main, Dec 11 2024, 16:35:24) [MSC v.1929 64 bit (AMD64)]


===== 模型参数 =====
模型架构: Skip-Gram
词表大小: 4028
训练总词数: 188848



In [4]:
# 语义相似度查询
test_words = ['点赞', '不错', '难吃', '推荐', '地道']
for word in test_words:
    if word in model.wv:
        print(f"与'{word}'最相似的词：{model.wv.most_similar(word, topn=3)}")

# 向量获取示例
if '地道' in model.wv:
    print(f"\n'地道'的词向量（前10维）:\n{model.wv['地道'][:10]}")
else:
    print("\n警告：'地道'不在词表中")

与'点赞'最相似的词：[('传说', 0.989485502243042), ('大方', 0.9877336025238037), ('送货上门', 0.9873219132423401)]
与'不错'最相似的词：[('挺不错', 0.9070080518722534), ('很棒', 0.906531035900116), ('好极了', 0.9012317061424255)]
与'难吃'最相似的词：[('咸', 0.875076174736023), ('垃圾', 0.8659026026725769), ('实在', 0.8470757007598877)]
与'推荐'最相似的词：[('值得', 0.9029862880706787), ('一试', 0.8813337087631226), ('一去', 0.8808634877204895)]
与'地道'最相似的词：[('正', 0.976787269115448), ('很赞', 0.9732523560523987), ('大爱', 0.96816486120224)]

'地道'的词向量（前10维）:
[-0.01213648  0.10693915  0.00260235  0.06403901 -0.04777374 -0.11223552
  0.15519957  0.52974755 -0.07509932 -0.16252017]


In [5]:
# 检查并输出"环境"的词向量及形状
if '环境' in model.wv:
    env_vector = model.wv['环境']
    print(f"'环境'的词向量（前5维）:\n{env_vector[:5]}")
    print(f"词向量形状: {env_vector.shape}")  # 应输出 (300,)
else:
    print("警告：'环境'不在词表中")

'环境'的词向量（前5维）:
[-0.06844943  0.20632029 -0.07643649  0.04566484  0.08594491]
词向量形状: (300,)


In [6]:
# 输出与"好吃"最相似的3个词
if '好吃' in model.wv:
    print("\n与'好吃'最相似的3个词:")
    for word, similarity in model.wv.most_similar('好吃', topn=3):
        print(f"{word}: {similarity:.4f}")
else:
    print("警告：'好吃'不在词表中")

# 计算词语相似度
similarity_results = []
for word in ['美味', '蟑螂']:
    if '好吃' in model.wv and word in model.wv:
        sim = model.wv.similarity('好吃', word)
        similarity_results.append((word, sim))
    else:
        print(f"警告：'{word}'不在词表中")

print("\n词语相似度:")
for word, sim in similarity_results:
    print(f"'好吃' vs '{word}': {sim:.4f}")


与'好吃'最相似的3个词:
棒: 0.8207
入味: 0.8146
香: 0.8118

词语相似度:
'好吃' vs '美味': 0.7766
'好吃' vs '蟑螂': 0.3039


In [7]:
# 向量类比计算
if all(word in model.wv for word in ['餐厅', '聚会', '安静']):
    result = model.wv.most_similar(
        positive=['餐厅', '聚会'],
        negative=['安静'],
        topn=1
    )
    print(f"\n向量运算 '餐厅 + 聚会 - 安静' ≈ '{result[0][0]}' (相似度: {result[0][1]:.4f})")
else:
    print("警告：计算所需的词未全部存在于词表中")


向量运算 '餐厅 + 聚会 - 安静' ≈ '部门' (相似度: 0.9451)
