In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [4]:
class SkipGramModel(nn.Module):
    """Skip-gram word-embedding model trained with negative sampling.

    Two embedding tables of shape ``(embed_size, embed_dimension)`` are kept:
    ``w_embeddings`` holds the center-word vectors and ``v_embeddings`` holds
    the vectors used for surrounding (context) words and negative samples.

    Args:
        embed_size: Vocabulary size, i.e. number of rows in each table.
        embed_dimension: Dimensionality of every word vector.
    """

    def __init__(self, embed_size, embed_dimension):
        super(SkipGramModel, self).__init__()
        self.embed_size = embed_size
        self.embed_dimension = embed_dimension
        # sparse=True makes the gradients of these tables sparse tensors.
        self.w_embeddings = nn.Embedding(embed_size, embed_dimension, sparse=True)
        self.v_embeddings = nn.Embedding(embed_size, embed_dimension, sparse=True)
        self._init_emb()

    def _init_emb(self):
        """Initialise both tables (scheme taken from the official code, per the original note).

        Center-word vectors are drawn from a *uniform* distribution on
        ``[-0.5 / embed_dimension, 0.5 / embed_dimension]``; context-word
        vectors start at exactly zero.
        """
        initrange = 0.5 / self.embed_dimension
        nn.init.uniform_(self.w_embeddings.weight, -initrange, initrange)  # center words
        nn.init.zeros_(self.v_embeddings.weight)  # surrounding words

    def forward(self, pos_w, pos_v, neg_v):
        """Return the summed negative-sampling loss for one mini-batch.

        Args:
            pos_w: Center-word ids, one per training pair (length ``batch``).
            pos_v: Context-word ids paired with ``pos_w`` (length ``batch``).
            neg_v: Negative-sample ids, shape ``(batch, num_negatives)``.

            Each argument may be a (nested) list of ints or an integer tensor.

        Returns:
            Scalar tensor
            ``-sum(log sigmoid(w . v)) - sum(log sigmoid(-(w . v_neg)))``.
        """
        # Build the index tensors on whatever device the weights live on, so
        # the model works on CPU as well as after ``model.cuda()``.
        device = self.w_embeddings.weight.device
        pos_w = torch.as_tensor(pos_w, dtype=torch.long, device=device)
        pos_v = torch.as_tensor(pos_v, dtype=torch.long, device=device)
        neg_v = torch.as_tensor(neg_v, dtype=torch.long, device=device)

        emb_w = self.w_embeddings(pos_w)      # center words:   [batch, dim]
        emb_v = self.v_embeddings(pos_v)      # context words:  [batch, dim]
        neg_emb_v = self.v_embeddings(neg_v)  # negatives:      [batch, num_negatives, dim]

        # Dot product of each (center, context) pair. No blanket squeeze()
        # here: it would drop the batch axis when batch == 1 and make the
        # reduction over dim=1 fail.
        score = torch.sum(emb_w * emb_v, dim=1)  # [batch]
        score = torch.clamp(score, max=10, min=-10)
        score = F.logsigmoid(score)

        # emb_w gets a trailing axis so bmm yields one dot product per
        # negative sample; only that trailing axis is squeezed away again.
        neg_score = torch.bmm(neg_emb_v, emb_w.unsqueeze(2)).squeeze(2)  # [batch, num_negatives]
        neg_score = torch.clamp(neg_score, max=10, min=-10)
        neg_score = F.logsigmoid(-neg_score)

        # L = log sigmoid(Xw.T * theta_v) + sum over neg(v) of log sigmoid(-Xw.T * theta_neg(v))
        loss = -torch.sum(score) - torch.sum(neg_score)
        return loss

    def save_embedding(self, id2word, file_name):
        """Write the averaged word vectors to ``file_name`` as plain text.

        The vector stored for a word is the element-wise mean of its rows in
        ``w_embeddings`` and ``v_embeddings``. The first line of the file is
        ``"<len(id2word)> <embed_dimension>"``; every following line is a word
        followed by its space-separated vector components.

        Args:
            id2word: Mapping from word id (row index) to the word string.
            file_name: Path of the output file; it is overwritten if present.
        """
        embedding_1 = self.w_embeddings.weight.detach().cpu().numpy()
        embedding_2 = self.v_embeddings.weight.detach().cpu().numpy()
        embedding = (embedding_1 + embedding_2) / 2
        # The context manager guarantees the file is flushed and closed;
        # UTF-8 keeps non-ASCII words independent of the platform locale.
        with open(file_name, 'w', encoding='utf-8') as fout:
            fout.write('%d %d\n' % (len(id2word), self.embed_dimension))
            for wid, w in id2word.items():
                e = ' '.join(map(str, embedding[wid]))
                fout.write('%s %s\n' % (w, e))

In [6]:
# Smoke test: a tiny model (100-word vocabulary, 10-dimensional vectors)
# evaluated on five (center, context) pairs with three negatives each.
model = SkipGramModel(100, 10)
id2word = {i: str(i) for i in range(100)}
pos_w = [0, 0, 1, 1, 1]
pos_v = [1, 2, 0, 2, 3]
neg_v = [[23, 42, 32], [32, 24, 53], [32, 24, 53], [32, 24, 53], [32, 24, 53]]
# Call the module itself rather than .forward() so registered hooks are run.
model(pos_w, pos_v, neg_v)


tensor(13.8629, grad_fn=<MulBackward0>)

In [9]:
# Shape of the center-word table, read straight from the parameter
# (no host copy or NumPy conversion is needed just to inspect a shape).
tuple(model.w_embeddings.weight.shape)

(100, 10)