# 导包并定义模型参数

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as tud

from collections import Counter # 计算频率
import numpy as np
import random

import scipy
from sklearn.metrics.pairwise import cosine_similarity # 计算相似度

# Skip-gram / negative-sampling hyperparameters.
C = 3  # context window: number of words taken on each side of the center word
K = 15  # negative samples drawn per positive (context) word
epochs = 1  # number of passes over the corpus
MAX_VOCAB_SIZE = 10000  # the 9999 most frequent words plus one <UNK> entry
EMBEDDING_SIZE = 100  # dimensionality of each word vector
batch_size = 32  # samples per training batch

# Seed every RNG in use (Python, NumPy, PyTorch) so runs are reproducible.
torch.manual_seed(1)
np.random.seed(1)
random.seed(1)

# Tensor type alias, and the device that the model and batches are moved to.
dtype = torch.FloatTensor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 读取文本数据并处理

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Load the raw corpus into memory as a single string.
with open('/content/drive//MyDrive/data/text8/text8.test.txt') as f:
    text = f.read()

# Lower-case the corpus and split it on whitespace into a list of words.
text = text.lower().split()

# Keep the MAX_VOCAB_SIZE - 1 most frequent words (word -> count); every other
# word is folded into a single '<UNK>' entry that holds the leftover count.
vocab_dict = dict(Counter(text).most_common(MAX_VOCAB_SIZE - 1))
vocab_dict['<UNK>'] = len(text) - np.sum(list(vocab_dict.values()))

# Word <-> index lookup tables; indices follow vocab_dict's insertion order.
word2idx = {word: i for i, word in enumerate(vocab_dict)}
idx2word = dict(enumerate(vocab_dict))

# Per-index counts as a float32 array, then relative frequencies raised to the
# 3/4 power (left unnormalized) for use as negative-sampling weights.
word_counts = np.array(list(vocab_dict.values()), dtype=np.float32)
word_freqs = (word_counts / word_counts.sum()) ** (3. / 4.)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 实现 DataLoader
- 把所有 word 编码成数字
- 保存 vocabulary，单词 count、normalized word frequency
- 每个 iteration sample 一个中心词
- 根据当前的中心词返回 context 单词
- 根据中心词 sample 一些 negative 单词
- 返回 sample 出的所有数据

In [None]:
class WordEmbeddingDataset(tud.Dataset):
    """Skip-gram training samples with negative sampling.

    Every corpus position yields one item: the center word, the context words
    around it, and negative words sampled from ``word_freqs``. All three are
    returned as tensors of word indices.
    """

    def __init__(self, text, word2idx, word_freqs, window_size=None, num_negatives=None):
        """Encode the corpus and store the sampling weights.

        Args:
            text: sequence of word tokens (the corpus).
            word2idx: mapping word -> index; must contain the key '<UNK>',
                which is used for every word missing from the mapping.
            word_freqs: per-index sampling weights for negative words
                (converted with ``torch.Tensor``; need not sum to 1).
            window_size: number of context words taken on each side of the
                center word. ``None`` (default) means "use the module-level
                ``C`` at sampling time".
            num_negatives: negatives drawn per context word. ``None``
                (default) means "use the module-level ``K`` at sampling time".
        """
        super().__init__()
        unk_idx = word2idx['<UNK>']
        # nn.Embedding expects integer (Long) indices.
        self.text_encoded = torch.LongTensor([word2idx.get(word, unk_idx) for word in text])
        self.word2idx = word2idx
        self.word_freqs = torch.Tensor(word_freqs)
        self.window_size = window_size
        self.num_negatives = num_negatives

    def __len__(self):
        """Return the number of tokens in the corpus (one item per token)."""
        return len(self.text_encoded)

    def __getitem__(self, idx):
        """Return ``(center_word, pos_words, neg_words)`` for position ``idx``.

        - center_word: 0-dim LongTensor, the word at ``idx``.
        - pos_words: LongTensor of the ``2 * window`` neighbouring words;
          positions past either end of the corpus wrap around.
        - neg_words: LongTensor of ``num_negatives * len(pos_words)`` indices
          sampled with replacement in proportion to ``word_freqs``, never
          containing a word that appears in ``pos_words``.

        Raises:
            RuntimeError: from ``torch.multinomial`` if no word outside
                ``pos_words`` has a positive weight.
        """
        # Resolve the globals lazily so the original late-binding behaviour
        # (changing C / K after construction) is preserved.
        window = C if self.window_size is None else self.window_size
        num_neg = K if self.num_negatives is None else self.num_negatives

        center_words = self.text_encoded[idx]

        corpus_len = len(self.text_encoded)
        # Neighbouring positions on both sides; the modulo wraps out-of-range
        # positions around instead of raising an IndexError.
        pos_indices = [
            i % corpus_len
            for i in range(idx - window, idx + window + 1)
            if i != idx
        ]
        pos_words = self.text_encoded[pos_indices]

        # Exclude the context words by zeroing their weights on a copy and
        # sampling once. Because draws are i.i.d. with replacement, this is
        # the same distribution as redrawing the whole batch until it contains
        # no context word, but it cannot spin (or hang) when a context word is
        # very frequent.
        neg_weights = self.word_freqs.clone()
        neg_weights[pos_words] = 0
        neg_words = torch.multinomial(neg_weights, num_neg * pos_words.shape[0], True)

        return center_words, pos_words, neg_words

# Wrap the encoded corpus in a Dataset and serve it in shuffled mini-batches.
dataset = WordEmbeddingDataset(text, word2idx, word_freqs)
dataloader = tud.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# 定义 PyTorch 模型

In [None]:
class EmbeddingModel(nn.Module):
    """Skip-gram word-embedding model trained with negative sampling.

    Holds two embedding tables of shape ``[vocab_size, embed_size]``:
    ``in_embed`` is looked up for center words, ``out_embed`` for context
    (positive) and negative words.
    """

    def __init__(self, vocab_size, embed_size):
        """Create the two embedding tables.

        Args:
            vocab_size: number of rows (words) in each table.
            embed_size: dimensionality of each word vector.
        """
        super().__init__()

        self.vocab_size = vocab_size
        self.embed_size = embed_size

        self.in_embed = nn.Embedding(self.vocab_size, self.embed_size)
        self.out_embed = nn.Embedding(self.vocab_size, self.embed_size)

    def forward(self, input_labels, pos_labels, neg_labels):
        """Compute the per-sample negative-sampling loss.

        Args:
            input_labels: center word indices, ``[batch_size]``.
            pos_labels: context word indices, ``[batch_size, n_pos]``.
            neg_labels: negative word indices, ``[batch_size, n_neg]``.

        Returns:
            Tensor of shape ``[batch_size]``: the negated sum of
            ``logsigmoid(pos . center)`` over context words and
            ``logsigmoid(-neg . center)`` over negative words.
        """
        center_vec = self.in_embed(input_labels)  # [batch_size, embed_size]
        pos_embedding = self.out_embed(pos_labels)  # [batch_size, n_pos, embed_size]
        neg_embedding = self.out_embed(neg_labels)  # [batch_size, n_neg, embed_size]

        # Add a trailing axis so bmm computes one dot product per row:
        # [batch, n, embed] x [batch, embed, 1] -> [batch, n, 1].
        center_vec = center_vec.unsqueeze(2)  # [batch_size, embed_size, 1]

        pos_dot = torch.bmm(pos_embedding, center_vec).squeeze(2)  # [batch_size, n_pos]
        # Negating the center vector gives logsigmoid(-score) for negatives.
        neg_dot = torch.bmm(neg_embedding, -center_vec).squeeze(2)  # [batch_size, n_neg]

        # sum(1) reduces over the words of each sample, keeping the batch axis.
        log_pos = F.logsigmoid(pos_dot).sum(1)
        log_neg = F.logsigmoid(neg_dot).sum(1)

        loss = log_pos + log_neg

        return -loss

    def input_embedding(self):
        """Return the center-word embedding table as a NumPy array.

        The weights are detached and copied to host memory first, so this
        also works while the model lives on a CUDA device.
        """
        return self.in_embed.weight.detach().cpu().numpy()

# Build the model, place it on the selected device, and optimize all of its
# parameters with Adam.
model = EmbeddingModel(vocab_size=MAX_VOCAB_SIZE, embed_size=EMBEDDING_SIZE)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 训练模型

In [None]:
for epoch in range(epochs):
    for step, batch in enumerate(dataloader):
        # Each batch holds (center, context, negative) index tensors; cast
        # them to long and move them to the device the model lives on.
        centers, contexts, negatives = (part.long().to(device) for part in batch)

        # Standard step: clear old gradients, average the per-sample loss,
        # back-propagate, update the parameters.
        optimizer.zero_grad()
        loss = model(centers, contexts, negatives).mean()
        loss.backward()
        optimizer.step()

        # Report the current batch loss every 1000 iterations.
        if step % 1000 == 0:
            print('epoch', epoch, 'iteration', step, loss.item())

epoch 0 iteration 0 397.1173095703125
epoch 0 iteration 1000 261.72503662109375
epoch 0 iteration 2000 228.12376403808594
epoch 0 iteration 3000 159.30996704101562
epoch 0 iteration 4000 131.16302490234375
epoch 0 iteration 5000 88.49677276611328
epoch 0 iteration 6000 128.16195678710938
epoch 0 iteration 7000 109.7065200805664
epoch 0 iteration 8000 69.03596496582031
epoch 0 iteration 9000 72.1278305053711
epoch 0 iteration 10000 62.440711975097656
epoch 0 iteration 11000 69.9999008178711
epoch 0 iteration 12000 39.46931838989258
epoch 0 iteration 13000 40.16891860961914
epoch 0 iteration 14000 46.689842224121094
epoch 0 iteration 15000 37.453643798828125
epoch 0 iteration 16000 32.9650764465332
epoch 0 iteration 17000 29.186105728149414
epoch 0 iteration 18000 45.67950439453125
epoch 0 iteration 19000 29.1345157623291
epoch 0 iteration 20000 28.938091278076172
epoch 0 iteration 21000 31.526216506958008
epoch 0 iteration 22000 29.683401107788086
epoch 0 iteration 23000 27.853820800781

In [None]:
# Move the model to the CPU (in place), then export the center-word embedding
# table as a NumPy array for the nearest-neighbour lookups.
model.cpu()
embedding_weights = model.input_embedding()
# Persist the trained parameters; the file name encodes the embedding size.
torch.save(model.state_dict(), "embedding-{}.th".format(EMBEDDING_SIZE))

# 词向量应用

In [None]:
def find_nearest(word, topn=10):
    """Return the ``topn`` vocabulary words closest to ``word``.

    Closeness is cosine distance between rows of the module-level
    ``embedding_weights`` matrix; the query word itself is normally the first
    entry because its distance to itself is 0.

    Args:
        word: query word; must be a key of the module-level ``word2idx``.
        topn: number of words to return (default 10).

    Returns:
        List of words (looked up in ``idx2word``) ordered from nearest to
        farthest.

    Raises:
        KeyError: if ``word`` is not in ``word2idx``.
    """
    query = embedding_weights[word2idx[word]]
    # Cosine distance to every row in one vectorized pass:
    # 1 - (w . q) / (|w| * |q|).
    norms = np.linalg.norm(embedding_weights, axis=1) * np.linalg.norm(query)
    # An all-zero vector yields NaN (as with a per-row cosine); argsort places
    # NaN last, so silence the warning rather than fail.
    with np.errstate(divide="ignore", invalid="ignore"):
        cos_dis = 1.0 - (embedding_weights @ query) / norms
    return [idx2word[i] for i in cos_dis.argsort()[:topn]]

In [None]:
# Show the nearest vocabulary words for a few probe words.
for word in ("two", "america", "computer"):
    print(word, find_nearest(word))

two ['two', 'three', 'five', 'four', 'zero', 'six', 'one', 'seven', 'eight', 'nine']
america ['america', 'world', 'into', 'also', 'first', 'area', 'from', 'mutation', 'binary', 'younger']
computer ['computer', 'has', 'often', 'windows', 'popular', 'system', 'not', 'phillips', 'sultan', 'have']
