# 数据准备

In [1]:
# Read the whole text8 corpus in one go and tokenize it on whitespace.
with open('../dataset/text8', mode='r', encoding='utf-8') as corpus_file:
    words = corpus_file.read().split()
    
# words=words[:200]

In [2]:
from functools import reduce
from collections import Counter

voc_size = 50000    # vocabulary size, including the slot reserved for UNK

# Keep the (voc_size - 1) most frequent words; the remaining slot is for UNK.
word_cnt = Counter(words).most_common(voc_size - 1)

# Lookup tables. Ids start at 1 because id 0 is reserved for UNK.
word2int = {word: rank for rank, (word, _count) in enumerate(word_cnt, start=1)}
int2word = {rank: word for word, rank in word2int.items()}

# Encode the corpus as ids; anything outside the vocabulary becomes 0 (UNK).
data = [word2int.get(word, 0) for word in words]

# Every token that is not covered by a kept word counts towards UNK.
unk_cnt = len(words) - sum(count for _word, count in word_cnt)
word_cnt.insert(0, ('UNK', unk_cnt))

# Register UNK in both lookup tables.
word2int['UNK'] = 0
int2word[0] = 'UNK'

# The raw token list is no longer needed once `data` exists.
del words

使用滑动窗口生成分批数据。

In [3]:
from collections import deque
import numpy as np


class WordData:
    """Generate skip-gram (center word, context word) training batches.

    A window of `cent_offset` positions on each side slides over `words`;
    every position inside the window except the center position itself
    yields one (center, context) sample.

    Args:
        words: Indexable sequence of word ids (supports `len` and `[]`).
        batch_size: Number of samples per yielded batch.
        cent_offset: Window radius, i.e. how many neighbours are taken on
            each side of the center word.
        cont_per_cent: Nominal number of context samples per center word.
            It is only validated against `batch_size` and `cent_offset`;
            the generator always emits every neighbour inside the window.

    Raises:
        AssertionError: If `batch_size` is not a multiple of `cont_per_cent`
            or `cont_per_cent` exceeds the number of window neighbours.
    """

    def __init__(self, words, batch_size=32, cent_offset=2, cont_per_cent=4):
        self.words = words
        self.batch_size = batch_size
        self.cent_offset = cent_offset    # index of the center word in the window == window radius
        self.cont_per_cent = cont_per_cent    # nominal samples per center word

        # batch_size must be a whole multiple of the per-center sample count.
        assert self.batch_size % self.cont_per_cent == 0
        # A center word cannot have more contexts than the window holds.
        assert self.cont_per_cent <= self.cent_offset*2

        self.sample_cnt = 0

    def next_batch(self):
        """Yield one epoch of batches as `(data, label)` numpy arrays.

        Yields:
            Tuples `(data, label)` where `data` has shape `(n,)` and holds
            center word ids, and `label` has shape `(n, 1)` and holds the
            matching context word ids. `n == batch_size` for every batch
            except possibly the last one, which carries the leftover samples.
            No batch is yielded when there is nothing left over.
        """
        self.data = list()
        self.label = list()
        # Start every epoch from a clean counter; otherwise the leftover count
        # of the previous epoch would shorten the first batch of this one.
        self.sample_cnt = 0

        last_idx = len(self.words) - 1
        for idx, center_word in enumerate(self.words):
            window_start = max(0, idx - self.cent_offset)
            window_end = min(idx + self.cent_offset, last_idx)
            for context_idx in range(window_start, window_end + 1):
                # Skip the center *position*, not every word equal to the
                # center word: a repeated neighbour is a legitimate context.
                if context_idx == idx:
                    continue

                self.data.append(center_word)
                self.label.append(self.words[context_idx])
                self.sample_cnt += 1    # count every generated sample

                if self.sample_cnt == self.batch_size:    # a full batch is ready
                    self.sample_cnt = 0
                    yield np.array(self.data), np.array(self.label).reshape((-1, 1))
                    self.data = list()
                    self.label = list()

        # Emit the leftover samples, but never an empty batch.
        if self.data:
            yield np.array(self.data), np.array(self.label).reshape((-1, 1))

In [4]:
batch_size = 64

# Pass batch_size explicitly; otherwise WordData silently falls back to its
# own default (32) and the value configured above is never used.
train_data = WordData(data, batch_size=batch_size)

# 网络设计

In [5]:
# Each word is represented by a single integer id on both input and output.
unit_I = 1
emb_size = 128
unit_O = 1

# Number of negative classes sampled per batch.
neg_samples = 10

# Validation words used to inspect nearest neighbours during training.
valid_size = 16       # how many words to evaluate similarity on
valid_window = 100    # ids are drawn from the range [0, valid_window)
valid_examples = np.random.choice(valid_window, size=valid_size, replace=False)

# 搭建网络

In [6]:
import tensorflow as tf
import math

# Graph inputs: X is a 1-D batch of word ids, Y a column vector of word ids.
# The training cell feeds the center words into X and the context words into Y.
X = tf.placeholder(tf.int32, shape=[None])
Y = tf.placeholder(tf.int32, shape=[None, 1])
# Fixed set of word ids whose nearest neighbours are inspected during training.
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

with tf.name_scope('Emb'):
    # Embedding matrix, one row of emb_size values per vocabulary entry,
    # initialised uniformly in [-1, 1).
    emb = tf.Variable(tf.random_uniform([voc_size, emb_size], -1, 1))
    # Rows of `emb` selected by the ids in X.
    embed = tf.nn.embedding_lookup(emb, X)

with tf.name_scope('Eval'):
    # Output-side weights and biases used by the NCE loss.
    nce_weights = tf.Variable(
        tf.truncated_normal(
            [voc_size, emb_size],
            stddev=1.0 / math.sqrt(emb_size)))
    nce_biases = tf.Variable(tf.zeros([voc_size]))
    # Mean NCE loss over the batch, drawing `neg_samples` negative classes.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(
            weights=nce_weights,
            biases=nce_biases,
            labels=Y,
            inputs=embed,
            num_sampled=neg_samples,
            num_classes=voc_size))

with tf.name_scope('train_op'):
    # Plain SGD with a learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

with tf.name_scope('Valid'):
    # L2-normalise every embedding row so the dot product below is the
    # cosine similarity.
    norm = tf.sqrt(tf.reduce_sum(tf.square(emb), 1, keepdims=True))
    norm_emb = emb / norm
    valid_emb = tf.nn.embedding_lookup(norm_emb, valid_dataset)
    # Row i holds the similarity of validation word i to every vocabulary word.
    similarity = tf.matmul(valid_emb, norm_emb, transpose_b=True)

init = tf.global_variables_initializer()    # initialise all variables
config = tf.ConfigProto()
config.gpu_options.allow_growth = True    # allocate GPU memory on demand

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


# 训练网络

In [8]:
# Train the embeddings, printing the batch loss and the nearest neighbours of
# the validation words at fixed batch intervals.
with tf.Session(config=config) as sess:
    sess.run(init)
    epochs = 20
    log_every = 1000     # print the batch loss every N batches
    eval_every = 5000    # print nearest neighbours every N batches
    top_k = 5            # number of neighbours shown per validation word

    batch_cnt = 0
    for epoch in range(epochs):
        for batch_data, batch_labels in train_data.next_batch():
            # Never feed an empty batch: the mean loss over zero samples is NaN.
            if len(batch_data) == 0:
                continue

            batch_cnt += 1
            _, loss_val = sess.run(
                [optimizer, loss],
                feed_dict={X: batch_data, Y: batch_labels})

            # batch_cnt is already 1-based here, so test it directly; adding 1
            # again would fire at batches 999, 1999, ... instead of 1000, 2000.
            if batch_cnt % log_every == 0:
                print('epoch: {}, batch_loss: {}'.format(
                    epoch+1, loss_val))

            if batch_cnt % eval_every == 0:
                sim = sess.run(similarity)
                for i in range(valid_size):
                    valid_word = int2word[valid_examples[i]]
                    # Sort by descending similarity; position 0 is skipped
                    # because it is expected to be the word itself.
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    neighbours = ''.join(
                        ' {}'.format(int2word[word_id]) for word_id in nearest)
                    print('Nearest to {}:'.format(valid_word) + neighbours)

NameError: name 'average_loss' is not defined