In [1]:
import collections
import d2lzh as d2l
import math
from mxnet import autograd, gluon, nd
from mxnet.gluon import data as gdata, loss as gloss, nn
import random
import sys
import time
import zipfile

In [2]:
# Extract ptb.zip into the ../data/ directory.
with zipfile.ZipFile('../data/ptb.zip', 'r') as zin:
    zin.extractall('../data/')
    
# Read ptb.train.txt; every line becomes a list of whitespace-separated tokens.
with open('../data/ptb/ptb.train.txt', 'r') as f:
    lines = f.readlines()
    raw_dataset = [st.split() for st in lines]

# Display how many sentences (lines) were read.
'# sentences: %d' % len(raw_dataset)

'# sentences: 42068'

In [3]:
# The end-of-sentence marker is "<eos>", rare words are all written as "<unk>",
# and numbers have been replaced by "N".
# Print the token count and the first five tokens of the first three sentences.
for st in raw_dataset[:3]:
    print('# tokens:', len(st), st[:5])

# tokens: 24 ['aer', 'banknote', 'berlitz', 'calloway', 'centrust']
# tokens: 15 ['pierre', '<unk>', 'N', 'years', 'old']
# tokens: 11 ['mr.', '<unk>', 'is', 'chairman', 'of']


In [4]:
# Count how many times each token occurs across all sentences.
counter = collections.Counter([tk for st in raw_dataset for tk in st])
# Keep only the tokens that occur at least 5 times (the filter is `>= 5`).
counter = dict(filter(lambda x: x[1] >= 5, counter.items()))

In [5]:
# List of all retained tokens; a token's position in this list is its index.
idx_to_token = [tk for tk, _ in counter.items()]
# Dictionary mapping each token to its index.
token_to_idx = {tk: idx for idx, tk in enumerate(idx_to_token)}
# Convert every sentence to token indices; tokens that are not in
# token_to_idx (i.e. were filtered out of `counter`) are dropped.
dataset = [[token_to_idx[tk] for tk in st if tk in token_to_idx]
          for st in raw_dataset]
# Total number of tokens left after the conversion.
num_tokens = sum([len(st) for st in dataset])
'# tokens: %d' % num_tokens

'# tokens: 887100'

In [6]:
def discard(idx):
    """Randomly decide whether one occurrence of token index ``idx`` is dropped.

    Reads the notebook globals ``counter``, ``idx_to_token`` and ``num_tokens``.
    The occurrence is dropped when a uniform draw from [0, 1] falls below
    ``1 - sqrt(1e-4 / count * num_tokens)``, where ``count`` is the token's
    count in ``counter``. When the square root is 1 or more the threshold is
    not positive, so the token is never dropped.

    Returns:
        bool: True if this occurrence should be discarded.
    """
    draw = random.uniform(0, 1)
    keep_term = math.sqrt(1e-4 / counter[idx_to_token[idx]] * num_tokens)
    return draw < 1 - keep_term

# The more frequent a word is, the more likely each occurrence is discarded.
subsampled_dataset = [[tk for tk in st if not discard(tk)] for st in dataset]
# Display how many tokens remain after subsampling.
'# tokens: %d' % sum([len(st) for st in subsampled_dataset])

'# tokens: 375925'

In [7]:
def compare_counts(token):
    """Report how often ``token`` occurs before and after subsampling.

    Reads the notebook globals ``token_to_idx``, ``dataset`` and
    ``subsampled_dataset``.

    Args:
        token: the token string to look up in ``token_to_idx``.

    Returns:
        str: ``'# <token>: before=<n>, after=<m>'`` where ``n`` and ``m`` are
        the occurrence counts in ``dataset`` and ``subsampled_dataset``.
    """
    def occurrences(sentences):
        # Sum the per-sentence counts of this token's index.
        return sum([st.count(token_to_idx[token]) for st in sentences])

    before = occurrences(dataset)
    after = occurrences(subsampled_dataset)
    return '# %s: before=%d, after=%d' % (token, before, after)

# Occurrences of the token 'a' before and after subsampling.
compare_counts('a')

'# a: before=21196, after=1297'

In [8]:
# Occurrences of the token 'join' before and after subsampling.
compare_counts('join')

'# join: before=45, after=45'

In [9]:
# For every center word, an integer is sampled uniformly between 1 and
# max_window_size and used as that word's context window size.
def get_centers_and_contexts(dataset, max_window_size):
    """Extract every center word and its context words from ``dataset``.

    Args:
        dataset: iterable of sentences, each a list of token indices.
        max_window_size: upper bound (inclusive) of the randomly drawn
            context window size.

    Returns:
        tuple: ``(centers, contexts)`` where ``centers`` is a flat list of
        center words and ``contexts[i]`` is the list of context words for
        ``centers[i]``. Sentences with fewer than 2 words are skipped.
    """
    centers, contexts = [], []
    for sentence in dataset:
        length = len(sentence)
        # A sentence needs at least 2 words to form a center/context pair.
        if length >= 2:
            centers.extend(sentence)
            for pos in range(length):
                radius = random.randint(1, max_window_size)
                first = max(0, pos - radius)
                stop = min(length, pos + 1 + radius)
                # Every position in the window except the center itself.
                contexts.append(
                    [sentence[j] for j in range(first, stop) if j != pos])
    return centers, contexts

In [10]:
# Toy dataset with two sentences of 7 and 3 token indices, used to print each
# center word with the context words sampled for it (max window size 2).
tiny_dataset = [list(range(7)), list(range(7, 10))]
print('dataset', tiny_dataset)
for center, context in zip(*get_centers_and_contexts(tiny_dataset, 2)):
    print('center', center, 'has contexts', context)

dataset [[0, 1, 2, 3, 4, 5, 6], [7, 8, 9]]
center 0 has contexts [1]
center 1 has contexts [0, 2, 3]
center 2 has contexts [0, 1, 3, 4]
center 3 has contexts [2, 4]
center 4 has contexts [2, 3, 5, 6]
center 5 has contexts [3, 4, 6]
center 6 has contexts [5]
center 7 has contexts [8, 9]
center 8 has contexts [7, 9]
center 9 has contexts [8]


In [11]:
# Set the maximum context window size to 5 and extract all center words and
# their context words from the subsampled dataset.
all_centers, all_contexts = get_centers_and_contexts(subsampled_dataset, 5)

In [12]:
def get_negatives(all_contexts, sampling_weights, K):
    """Sample noise (negative) words for every list of context words.

    Args:
        all_contexts: list of context-word lists, one per center word.
        sampling_weights: relative sampling weight of each token index;
            index ``i`` of this list is the weight of token ``i``.
        K: number of noise words to draw per context word.

    Returns:
        list: ``all_negatives`` where ``all_negatives[i]`` holds
        ``len(all_contexts[i]) * K`` token indices, none of which appears in
        ``all_contexts[i]``.

    Note:
        The rejection loop does not terminate if noise words are needed but
        every token with a positive weight is already in the context list.
    """
    all_negatives, neg_candidates, i = [], [], 0
    population = list(range(len(sampling_weights)))
    for contexts in all_contexts:
        # Build the membership set once per center word instead of once per
        # candidate draw; the set of context words does not change in the loop.
        context_set = set(contexts)
        num_needed = len(contexts) * K
        negatives = []
        while len(negatives) < num_needed:
            if i == len(neg_candidates):
                # The candidate cache is exhausted: draw a fresh batch of 1e5
                # token indices at once, according to the sampling weights.
                i, neg_candidates = 0, random.choices(
                    population, sampling_weights, k=int(1e5))
            neg, i = neg_candidates[i], i + 1
            # A candidate is usable only if it is not one of the context words.
            if neg not in context_set:
                negatives.append(neg)
        all_negatives.append(negatives)
    return all_negatives

# Following the suggestion of the word2vec paper, the noise-word sampling
# probability P(w) is proportional to the frequency of w raised to the power
# 0.75. The values below are unnormalized (count ** 0.75); get_negatives
# passes them to random.choices as relative weights.
sampling_weights = [counter[w]**0.75 for w in idx_to_token]
# Draw K=5 noise words for every context word.
all_negatives = get_negatives(all_contexts, sampling_weights, 5)

In [13]:
# Each mini-batch is returned as four arrays: center words, context words
# joined with noise words, the padding mask, and the labels.
def batchify(data):
    """Collate ``(center, context, negative)`` examples into padded arrays.

    Args:
        data: sequence of ``(center, context, negative)`` triples, where
            ``context`` and ``negative`` are lists of token indices.

    Returns:
        tuple: ``(centers, contexts_negatives, masks, labels)`` built with
        ``nd.array``. ``centers`` is reshaped to one column. Each row of
        ``contexts_negatives`` is ``context + negative`` padded with 0 up to
        the longest row in the batch; ``masks`` is 1 on real entries and 0 on
        padding; ``labels`` is 1 on context words and 0 elsewhere.
    """
    max_len = max(len(context) + len(negative) for _, context, negative in data)
    centers, contexts_negatives, masks, labels = [], [], [], []
    for center, context, negative in data:
        num_context = len(context)
        num_real = num_context + len(negative)
        num_pad = max_len - num_real
        centers.append(center)
        # Pad every row to the same length with token index 0.
        contexts_negatives.append(context + negative + [0] * num_pad)
        masks.append([1] * num_real + [0] * num_pad)
        labels.append([1] * num_context + [0] * (max_len - num_context))
    centers_column = nd.array(centers).reshape((-1, 1))
    return (centers_column, nd.array(contexts_negatives),
            nd.array(masks), nd.array(labels))

In [51]:
batch_size = 1024
# Use no worker processes when sys.platform starts with 'win32', else 4.
num_workers = 0 if sys.platform.startswith('win32') else 4
# NOTE(review): this rebinds the global `dataset`, which until this cell held
# the token-index sentences read by compare_counts and by the subsampling
# cell. Re-running those cells after this one will see the ArrayDataset
# instead; consider a distinct name here.
dataset = gdata.ArrayDataset(all_centers, all_contexts, all_negatives)
# batchify pads and stacks the examples of each mini-batch.
data_iter = gdata.DataLoader(dataset, batch_size, shuffle=True,
                             batchify_fn=batchify, num_workers=num_workers)
# Print the shape of every array in the first mini-batch only.
for batch in data_iter:
    for name, data in zip(['centers', 'contexts_negatives', 'masks',
                           'labels'], batch):
        print(name, 'shape:', data.shape)
    break

centers shape: (1024, 1)
contexts_negatives shape: (1024, 60)
masks shape: (1024, 60)
labels shape: (1024, 60)


In [55]:
# Demo embedding layer: vocabulary size 20, embedding dimension 4.
embed = nn.Embedding(input_dim=20, output_dim=4,sparse_grad=True)
embed.initialize()
# Display the layer's weight parameter.
embed.weight

Parameter embedding9_weight (shape=(20, 4), dtype=float32)

In [56]:
# Look up the embeddings of a 2x3 array of token indices with the demo layer.
x = nd.array([[1, 2, 3], [4, 5, 6]])
embed(x)


[[[-0.00847371  0.03706941 -0.02670899 -0.04258007]
  [ 0.01846186  0.04419332 -0.05181816  0.05025811]
  [ 0.02387923  0.02199347 -0.00506344  0.00273282]]

 [[-0.0315154   0.01798582  0.03464216  0.01915742]
  [-0.04369262  0.04885338 -0.04868669 -0.0378929 ]
  [-0.06785017  0.05838076  0.05058205  0.06298339]]]
<NDArray 2x3x4 @cpu(0)>

In [57]:
# Given two NDArrays of shapes (n, a, b) and (n, b, c), the mini-batch
# matrix multiplication returns an output of shape (n, a, c).
X = nd.ones((2, 1, 4))
Y = nd.ones((2, 4, 6))
nd.batch_dot(X, Y).shape

(2, 1, 6)

In [58]:
def skip_gram(center, contexts_and_negatives, embed_v, embed_u):
    """Forward pass of the skip-gram model.

    Args:
        center: indices of the center words.
        contexts_and_negatives: indices of the context and noise words.
        embed_v: embedding layer applied to ``center``.
        embed_u: embedding layer applied to ``contexts_and_negatives``.

    Returns:
        The batched dot product of each center-word vector with the vectors
        of its context and noise words (``nd.batch_dot`` of the center
        embeddings with the other embeddings after swapping axes 1 and 2).
    """
    center_vectors = embed_v(center)
    other_vectors = embed_u(contexts_and_negatives)
    # Swap the last two axes so the embedding dimension is contracted.
    return nd.batch_dot(center_vectors, other_vectors.swapaxes(1, 2))

In [59]:
# Gluon's sigmoid binary cross-entropy loss; used by train() below.
loss = gloss.SigmoidBinaryCrossEntropyLoss()

In [60]:
pred = nd.array([[1.5, 0.3, -1, 2], [1.1, -0.6, 2.2, 0.4]])
# In the label variable, 1 marks a context word and 0 marks a noise word.
label = nd.array([[1, 0, 0, 0], [1, 1, 0, 0]])
# The mask selects which predictions and labels in the mini-batch take part
# in the loss computation; it also keeps padding entries from affecting
# the loss.
mask = nd.array([[1, 1, 1, 1], [1, 1, 1, 0]])  # mask variable
# Rescale each row by (row length / number of unmasked entries).
loss(pred, label, mask) * mask.shape[1] / mask.sum(axis=1)


[0.8739896 1.2099689]
<NDArray 2 @cpu(0)>

In [61]:
def sigmd(x):
    """Return ``-log(sigmoid(x))``, the per-element binary cross-entropy term.

    Despite the name, this is not the sigmoid itself but its negative log,
    i.e. ``log(1 + exp(-x))``.

    The value is computed as ``max(-x, 0) + log1p(exp(-|x|))``, which is
    mathematically equal to ``-log(1 / (1 + exp(-x)))`` but never evaluates
    ``exp`` on a positive argument, so it cannot overflow for large
    negative ``x``.

    Args:
        x: a real number (the prediction, with its sign already chosen by the
            caller according to the label).

    Returns:
        float: ``-log(sigmoid(x))``.
    """
    return max(-x, 0.0) + math.log1p(math.exp(-abs(x)))
# Binary cross-entropy loss computed from scratch with sigmd.
# The label decides the sign of each prediction (negated where the label is
# 0); the mask decides whether an element is included at all.
print('%.7f' % ((sigmd(1.5) + sigmd(-0.3) + sigmd(1) + sigmd(-2)) / 4))
print('%.7f' % ((sigmd(1.1) + sigmd(-0.6) + sigmd(-2.2)) / 3))

0.8739896
1.2099689


In [69]:
embed_size = 250
# Two embedding layers of identical size (vocabulary size x embed_size).
# train() passes net[0] to skip_gram as embed_v (center words) and net[1] as
# embed_u (context and noise words).
net = nn.Sequential()
net.add(nn.Embedding(input_dim=len(idx_to_token), output_dim=embed_size, sparse_grad = True),
        nn.Embedding(input_dim=len(idx_to_token), output_dim=embed_size, sparse_grad = True))

In [70]:
def train(net, lr, num_epochs):
    """Train the skip-gram embeddings in ``net`` and print per-epoch stats.

    Reads the notebook globals ``data_iter``, ``loss``, ``batch_size`` and
    ``skip_gram``. The parameters of ``net`` are re-initialized on every call
    (``force_reinit=True``).

    Args:
        net: container whose items 0 and 1 are the center-word and
            context/noise-word embedding layers.
        lr: learning rate for the Adam trainer.
        num_epochs: number of passes over ``data_iter``.
    """
    ctx = d2l.try_gpu()
    net.initialize(ctx=ctx, force_reinit=True)
    optimizer_params = {'learning_rate': lr}
    trainer = gluon.Trainer(net.collect_params(), 'adam', optimizer_params)
    for epoch in range(num_epochs):
        start = time.time()
        loss_total, num_rows = 0.0, 0
        for batch in data_iter:
            # Move the four arrays of the batch to the training device.
            center, context_negative, mask, label = (
                part.as_in_context(ctx) for part in batch)
            with autograd.record():
                pred = skip_gram(center, context_negative, net[0], net[1])
                # The mask keeps padding entries out of the loss; the
                # rescaling divides by the number of unmasked entries per row
                # instead of the padded row length.
                masked_loss = loss(pred.reshape(label.shape), label, mask)
                batch_loss = masked_loss * mask.shape[1] / mask.sum(axis=1)
            batch_loss.backward()
            trainer.step(batch_size)
            loss_total += batch_loss.sum().asscalar()
            num_rows += batch_loss.size
        print('epoch %d, loss %.2f, time %.2fs'
              % (epoch + 1, loss_total / num_rows, time.time() - start))

In [71]:
# Train with learning rate 0.02 for 5 epochs.
train(net, 0.02, 5)

epoch 1, loss 0.48, time 25.16s
epoch 2, loss 0.39, time 25.00s
epoch 3, loss 0.33, time 24.85s
epoch 4, loss 0.29, time 25.12s
epoch 5, loss 0.27, time 25.05s


In [65]:
def get_similar_tokens(query_token, k, embed):
    """Print the tokens whose embeddings are most similar to ``query_token``.

    Reads the notebook globals ``token_to_idx`` and ``idx_to_token``.
    Similarity is the cosine between rows of ``embed``'s weight matrix. The
    top ``k + 1`` indices are taken and the first one is skipped, on the
    expectation that it is the query token itself.

    Args:
        query_token: token string to look up.
        k: number of similar tokens to print.
        embed: embedding layer whose weights are compared.
    """
    weights = embed.weight.data()
    query_vec = weights[token_to_idx[query_token]]
    dots = nd.dot(weights, query_vec)
    squared_norms = nd.sum(weights * weights, axis=1) * nd.sum(query_vec * query_vec)
    # The added 1e-9 is for numerical stability.
    cos = dots / (squared_norms + 1e-9).sqrt()
    nearest = nd.topk(cos, k=k+1, ret_typ='indices').asnumpy().astype('int32')
    for i in nearest[1:]:  # skip the first hit (the query word)
        print('cosine sim=%.3f: %s' % (cos[i].asscalar(), (idx_to_token[i])))

# Print the 3 tokens most similar to 'chip' using the center-word embeddings (net[0]).
get_similar_tokens('chip', 3, net[0])

cosine sim=0.581: microprocessor
cosine sim=0.562: intel
cosine sim=0.550: hard-disk


In [47]:
# Print the 3 tokens most similar to 'love' using the center-word embeddings (net[0]).
get_similar_tokens('love', 3, net[0])

cosine sim=0.667: thieves
cosine sim=0.657: gut
cosine sim=0.654: catching


In [49]:
# Print the 3 tokens most similar to 'the' using the center-word embeddings (net[0]).
get_similar_tokens('the', 3, net[0])

cosine sim=0.495: pravda
cosine sim=0.489: avoided
cosine sim=0.458: grossly
