## 第一节 加载公开词向量

In [1]:
from gensim.models import KeyedVectors

# Location of the pretrained word-vector file (presumably bz2-compressed,
# judging by the extension).
model_path = './data/sgns.weibo.word.bz2'
# Read the file in word2vec format into a KeyedVectors object.
model = KeyedVectors.load_word2vec_format(model_path)

In [2]:
# 1. Dimensionality of each word vector
model.vector_size

300

In [4]:
# 2. Number of words in the vocabulary
len(model.index_to_key)

195202

In [5]:
# 3. Look up the vector stored for a single word
model['地铁']

array([ 2.92064e-01, -5.18680e-02, -2.13720e-01,  1.82131e-01,
        2.82900e-03,  4.14104e-01,  1.56440e-01, -1.27940e-02,
       -3.28332e-01, -8.25000e-02, -8.46890e-02, -2.14700e-02,
        1.18650e-01, -4.73659e-01, -1.97850e-02,  1.13939e-01,
        1.82734e-01, -6.46420e-02,  5.60832e-01, -6.65230e-02,
       -1.97960e-01,  1.26039e-01, -3.28720e-01, -3.09730e-02,
       -3.46580e-01, -1.53190e-01, -2.96226e-01, -5.75517e-01,
        1.10684e-01,  8.19220e-02, -1.04721e-01, -1.77477e-01,
       -1.21332e-01,  1.49816e-01,  2.86278e-01, -8.11200e-03,
        6.72540e-02,  6.92220e-02, -3.50973e-01, -5.49500e-02,
       -7.80250e-02, -1.92952e-01, -1.70920e-01, -1.28289e-01,
        1.08204e-01, -7.24913e-01, -1.11735e-01, -6.75000e-03,
        4.38086e-01, -8.75720e-02, -1.41320e-01, -1.91726e-01,
        1.68363e-01, -7.85700e-02, -1.79772e-01, -1.27950e-01,
        3.24675e-01,  2.70616e-01,  1.96330e-02, -3.09431e-01,
       -4.02670e-02,  5.80160e-02, -1.06603e-01,  2.480

In [7]:
# 4. Similarity score between two words
model.similarity('地铁', '图书馆')

0.2721027

In [9]:
# 5. Most similar words: an analogy-style query that uses '男人' and '女孩'
#    as positive terms and '男孩' as the negative term, keeping the top 5 hits
model.most_similar(positive=['男人', '女孩'], negative=['男孩'], topn=5)

[('女人', 0.6578881740570068),
 ('女孩子', 0.515068531036377),
 ('女生', 0.45194485783576965),
 ('女人真', 0.4420627951622009),
 ('女人们', 0.43698593974113464)]

## 第二节 训练自己的词向量

In [23]:
import pandas as pd
import jieba
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [6]:
# Load the review dataset and drop rows that contain missing values.
df = pd.read_csv('./data/online_shopping_10_cats.csv', encoding='utf-8').dropna()

In [9]:
# Preview the first rows of the loaded DataFrame.
df.head()

Unnamed: 0,cat,label,review
0,书籍,1,做父母一定要有刘墉这样的心态，不断地学习，不断地进步，不断地给自己补充新鲜血液，让自己保持一...
1,书籍,1,作者真有英国人严谨的风格，提出观点、进行论述论证，尽管本人对物理学了解不深，但是仍然能感受到...
2,书籍,1,作者长篇大论借用详细报告数据处理工作和计算结果支持其新观点。为什么荷兰曾经县有欧洲最高的生产...
3,书籍,1,作者在战几时之前用了＂拥抱＂令人叫绝．日本如果没有战败，就有会有美军的占领，没胡官僚主义的延...
4,书籍,1,作者在少年时即喜阅读，能看出他精读了无数经典，因而他有一个庞大的内心世界。他的作品最难能可贵...


In [19]:
# Tokenize every review with jieba and discard whitespace-only tokens,
# producing one list of tokens per review.
sentences = [
    [tok for tok in jieba.lcut(review) if tok.strip()]
    for review in df['review']
]

In [20]:
# Train a Word2Vec model on the tokenized reviews.
# NOTE: this rebinds `model`, which earlier in the notebook held the
# pretrained KeyedVectors.
model = Word2Vec(
    sentences,  # tokenized sentences (a sequence of token lists)
    vector_size=100,  # dimensionality of the word vectors
    window=5,  # context window size
    min_count=2,  # minimum word frequency (rarer words are ignored)
    sg=1,  # 1: Skip-Gram, 0: CBOW
    workers=4  # number of parallel training threads
)

In [21]:
# Export the trained word vectors (model.wv) in word2vec format.
model.wv.save_word2vec_format('./data/word2vec.txt')

In [25]:
# Sanity check: reload the exported file and look up one word's vector.
KeyedVectors.load_word2vec_format('./data/word2vec.txt')['地铁']

array([-3.10723305e-01,  5.37223220e-02, -6.77480828e-04, -1.34950206e-01,
        2.40192771e-01, -4.36220281e-02, -4.81436044e-01,  3.70887756e-01,
       -3.14808547e-01, -9.51673165e-02, -7.71107748e-02, -2.96264827e-01,
        2.97449112e-01,  7.55995959e-02,  3.34701180e-01, -1.02686949e-01,
        1.12877287e-01, -1.07090378e+00,  6.59387857e-02, -1.58875093e-01,
        4.37332690e-01,  1.31408870e-01,  3.28013033e-01, -2.07532555e-01,
       -3.32438946e-01, -5.33579625e-02,  5.77054799e-01, -8.39267671e-02,
       -3.13885123e-01, -1.96195140e-01,  4.87490684e-01,  6.82728961e-02,
       -1.28541678e-01, -3.55246395e-01,  7.21564412e-01,  1.26523197e-01,
        2.38266766e-01, -1.11640573e-01,  3.71553488e-02, -1.64254993e-01,
       -3.13329488e-01,  2.97708482e-01, -3.55815500e-01, -3.68661821e-01,
        5.76398134e-01,  5.92784658e-02,  6.20667934e-01,  5.90833187e-01,
        3.52375180e-01,  3.19252282e-01, -2.58486718e-01,  1.98057666e-01,
       -1.43978372e-01,  

## 第三节 词向量应用

In [33]:
from torch import nn
from gensim.models import KeyedVectors
import torch
import jieba

In [30]:
# 1. Load the word vectors
wv = KeyedVectors.load_word2vec_format('./data/word2vec.txt')

In [37]:
# 2. Handle out-of-vocabulary (OOV) words
unk_token = '<unk>'
# Index 0 is reserved for the unknown-word token; the loaded vocabulary
# follows in its original order.
index2word = [unk_token, *wv.index_to_key]
# Reverse mapping from word to its row index.
word2index = dict(zip(index2word, range(len(index2word))))

In [38]:
# 3. Build the embedding weight matrix
num_embeddings = len(index2word)
embedding_dim = wv.vector_size
# Random initialisation, so the row without a pretrained vector (the
# <unk> row at index 0) still holds usable values.
embedding_matrix = torch.randn(num_embeddings, embedding_dim)

# index2word is [unk_token] + wv.index_to_key, so rows 1.. line up one-to-one
# with wv.vectors. Copy them with a single slice assignment instead of a
# per-word Python loop (membership test + lookup + tensor allocation per word).
# A shape mismatch raises immediately if the vocabulary was built differently.
embedding_matrix[1:] = torch.from_numpy(wv.vectors)

In [39]:
# 4. Create the Embedding layer from the prepared weight matrix.
#    Per the PyTorch docs, from_pretrained freezes the weights by default
#    (freeze=True); pass freeze=False to fine-tune them.
embedding = nn.Embedding.from_pretrained(embedding_matrix)

In [41]:
# 5. Smoke test: tokenize a sentence, map tokens to indices, embed them
text = "我喜欢乘坐宇宙飞船"
tokens = jieba.lcut(text)
# Tokens missing from the vocabulary fall back to the <unk> index.
unk_id = word2index[unk_token]
input_ids = [word2index.get(tok, unk_id) for tok in tokens]
input_tensor = torch.tensor(input_ids)
# Displayed result: shape of the embedded sequence.
embedding(input_tensor).shape

torch.Size([4, 100])