# 数据准备

In [6]:
with open('../dataset/text8', 'r', encoding='utf-8') as fd:
    words = fd.read().split()
    
words=words[:1000000]

In [7]:
from functools import reduce
from collections import Counter

voc_size = 50000    # 词典大小

word_cnt = list()
word_cnt.extend(Counter(words).most_common(voc_size-1))    # -1为未在记录的词UNK预留一个位置

# 映射表，记得把0预留给UNK
word2int = {item[0]: idx+1 for idx, item in enumerate(word_cnt)}
int2word = {idx+1: item[0] for idx, item in enumerate(word_cnt)}

data = list(map(lambda x: word2int.get(x, 0), words))    # 将所有word转成int

unk_cnt = len(words)-reduce(lambda x, y: x+y, map(lambda x: x[1], word_cnt))
word_cnt.insert(0, ('UNK', unk_cnt))    # 未在记录的词

# 为映射表添加UNK
word2int['UNK'] = 0
int2word[0] = 'UNK'

del words

使用滑动窗口生成分批数据。

In [8]:
from collections import deque
import numpy as np


class WordData:
    def __init__(self, words, batch_size=32, cent_offset=2, cont_per_cent=4):
        self.words = words
        self.batch_size = batch_size
        self.cent_offset = cent_offset    # 中心词的在窗口中的idx，即左右窗口的大小
        self.cont_per_cent = cont_per_cent    # 每个中心词产生4个上下文，即4个样本

        # batch_size是每个中心词产生样本数量的整数倍，这样就保证了每生成一个batch就会改变中心词
        assert self.batch_size % self.cont_per_cent == 0
        assert self.cont_per_cent <= self.cent_offset*2    # 每个中心词生成样本数应小于等于窗口内的上下文单词数

        self.sample_cnt = 0

    def next_batch(self):
        self.data = list()
        self.label = list()

        for idx, center_word in enumerate(self.words):
            for context_word in self.words[max(0, idx-self.cent_offset):min(idx+self.cent_offset, len(self.words))+1]:
                if context_word != center_word:
                    self.data.append(center_word)
                    self.label.append(context_word)

                    self.sample_cnt += 1    # 每生成一个样本进行计数

                    if self.sample_cnt == self.batch_size:    # 样本数达到一个batch时抛出
                        self.sample_cnt = 0
                        yield np.array(self.data), np.array(self.label).reshape((-1, 1))
                        self.data = list()
                        self.label = list()

        # 抛出不足以成批的数据
        yield np.array(self.data), np.array(self.label).reshape((-1, 1))

In [9]:
batch_size=128

train_data=WordData(data)
len(data)/batch_size

7812.5

# 网络设计

In [10]:
unit_I = 1    # 单个数字表示的word
emb_size = 128
unit_O = 1

neg_samples = 10    # 负采样参数
valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

# 搭建网络

In [11]:
import tensorflow as tf
import math

X = tf.placeholder(tf.int32, shape=[None])
Y = tf.placeholder(tf.int32, shape=[None, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

with tf.name_scope('Emb'):
    emb = tf.Variable(tf.random_uniform([voc_size, emb_size], -1, 1))
    embed = tf.nn.embedding_lookup(emb, X)

with tf.name_scope('Eval'):
    nce_weights = tf.Variable(tf.truncated_normal([voc_size, emb_size],
                                                  stddev=1.0 / math.sqrt(emb_size)))
    nce_biases = tf.Variable(tf.zeros([voc_size]))
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                         biases=nce_biases,
                                         labels=Y,
                                         inputs=embed,
                                         num_sampled=neg_samples,
                                         num_classes=voc_size))

with tf.name_scope('train_op'):
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

with tf.name_scope('Valid'):
    norm = tf.sqrt(tf.reduce_sum(tf.square(emb), 1, keepdims=True))
    norm_emb = emb / norm
    valid_emb = tf.nn.embedding_lookup(norm_emb, valid_dataset)
    similarity = tf.matmul(valid_emb, norm_emb, transpose_b=True)

init = tf.global_variables_initializer()    # 所有变量初始化
config = tf.ConfigProto()
config.gpu_options.allow_growth = True    # 按需使用显存

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


# 训练网络

In [12]:
with tf.Session(config=config) as sess:
    sess.run(init)
    epochs = 20

    batch_cnt = 0
    for epoch in range(epochs):
        for batch_data, batch_labels in train_data.next_batch():
            batch_cnt += 1
            _, loss_val = sess.run(
                [optimizer, loss],
                feed_dict={X: batch_data, Y: batch_labels})

            # 每5000batch输出一次信息
            if (batch_cnt+1) % 10000 == 0:
                print('epoch: {}, batch_loss: {}'.format(
                    epoch+1, loss_val))

            if (batch_cnt+1) % 50000 == 0:
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_word = int2word[valid_examples[i]]
                    top_k = 5
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log = 'Nearest to {}:'.format(valid_word)
                    for k in range(top_k):
                        sim_word = int2word[nearest[k]]
                        log += ' {}'.format(sim_word)
                    print(log)

epoch: 1, batch_loss: 13.46476936340332
epoch: 1, batch_loss: 24.948165893554688
epoch: 1, batch_loss: 35.9627799987793
epoch: 1, batch_loss: 4.226629257202148
epoch: 1, batch_loss: 15.40259075164795
Nearest to had: was became he has into
Nearest to for: which as of by such
Nearest to being: are has been since often
Nearest to new: national history reportedly the states
Nearest to and: the was such zero study
Nearest to two: five six three zero eight
Nearest to i: when s not about was
Nearest to UNK: australia its this eight such
Nearest to up: linemen pacific children dedicated t
Nearest to were: are people he had was
Nearest to them: not fixed flight any central
Nearest to use: history suez find tend possible
Nearest to called: such two language sanctioned known
Nearest to th: three seven eight six five
Nearest to six: three five eight seven four
Nearest to was: had where and became were
epoch: 1, batch_loss: 4.229766368865967
epoch: 1, batch_loss: 7.380623817443848
epoch: 1, batch_l

epoch: 4, batch_loss: 3.6077609062194824
epoch: 4, batch_loss: 1.0354843139648438
epoch: 4, batch_loss: 3.894710063934326
epoch: 4, batch_loss: 4.2480974197387695
epoch: 5, batch_loss: 2.977494478225708
Nearest to had: has have were since thus
Nearest to for: example on within next machine
Nearest to being: both while identify people him
Nearest to new: york canada england australian usa
Nearest to and: beyond towards galt world throughout
Nearest to two: five three four seven six
Nearest to i: you me ii just t
Nearest to UNK: story pole bulleri institutions writing
Nearest to up: while germany sacrifice out he
Nearest to were: was are had himself been
Nearest to them: individual him up people individuals
Nearest to use: against addition or account name
Nearest to called: essentially held subpoenaed formed means
Nearest to th: rd seven eight zero births
Nearest to six: four three five seven two
Nearest to was: were translation himself are been
epoch: 5, batch_loss: 3.642193078994751
ep

epoch: 8, batch_loss: 3.0253043174743652
epoch: 8, batch_loss: 2.8150570392608643
epoch: 8, batch_loss: 2.015658140182495
epoch: 8, batch_loss: 2.1830825805664062
epoch: 8, batch_loss: 2.0545730590820312
Nearest to had: has been have knew having
Nearest to for: perilous musculoskeletal with replica facing
Nearest to being: were taliban together beings mabillon
Nearest to new: york myths similarly wide uganda
Nearest to and: in as of a books
Nearest to two: eight four six zero five
Nearest to i: you we ever xn ii
Nearest to UNK: lawmaking bret mk handke theory
Nearest to up: them shaferi dion him octavian
Nearest to were: are became having being some
Nearest to them: him closer up nutmeg individuals
Nearest to use: proper hebrew finding original name
Nearest to called: defined musculoskeletal known regarded viewed
Nearest to th: eight march four century st
Nearest to six: four eight five seven zero
Nearest to was: is became were cultriformis dice
epoch: 8, batch_loss: 2.9983439445495605

epoch: 12, batch_loss: 3.199989080429077
epoch: 12, batch_loss: 4.404428482055664
epoch: 12, batch_loss: 3.515326499938965
epoch: 12, batch_loss: 2.803966522216797
epoch: 12, batch_loss: 2.7001166343688965
Nearest to had: have would knew scrutinized umpires
Nearest to for: xn and as after book
Nearest to being: peacemaker alarmed freeing often were
Nearest to new: york gecko approbation prothrombin frightening
Nearest to and: was with in for as
Nearest to two: six three eight five bayerischen
Nearest to i: iii t tiles you just
Nearest to UNK: merican revolutionaries reprising jordana sui
Nearest to up: wants yards right reaches down
Nearest to were: are was is being so
Nearest to them: way him contract men statements
Nearest to use: warrantless tasks manufacture multitude refer
Nearest to called: named bound tamarack widget known
Nearest to th: nd nine eight seven six
Nearest to six: eight nine zero four seven
Nearest to was: is were and as play
epoch: 12, batch_loss: 2.397804975509643

epoch: 15, batch_loss: 2.7370412349700928
Nearest to had: has have maintain was hold
Nearest to for: as navigate resistible because property
Nearest to being: was funneled yhwh adasaurus kesteven
Nearest to new: york audi transfered etiology idea
Nearest to and: on postmodernism beyond socialist charted
Nearest to two: six swat seven natural macedon
Nearest to i: comfortably embrace emulated iv ii
Nearest to UNK: tter warhol astrum garland sipapu
Nearest to up: me dion once kneel screaming
Nearest to were: are turncoat rushers many residing
Nearest to them: cars interception reacted hkia memnoch
Nearest to use: absence sense accumulate exist worship
Nearest to called: described used okinawan signed regarded
Nearest to th: century nd rd centuries st
Nearest to six: three zero seven eight ptarmigan
Nearest to was: is had suspending being however
epoch: 15, batch_loss: 2.925800323486328
epoch: 15, batch_loss: 3.09213924407959
epoch: 15, batch_loss: 3.2070226669311523
epoch: 15, batch_loss

epoch: 18, batch_loss: 2.2541916370391846
epoch: 18, batch_loss: 2.073733329772949
epoch: 19, batch_loss: 2.8434689044952393
epoch: 19, batch_loss: 3.2795650959014893
epoch: 19, batch_loss: 2.7318665981292725
Nearest to had: have has could having so
Nearest to for: to vivendi personal youthful dedicating
Nearest to being: instantly men acounting keter emigrating
Nearest to new: york jersey oregon mississippi zealand
Nearest to and: by to eight uniti countries
Nearest to two: seven three six one eight
Nearest to i: t you ll ii toinn
Nearest to UNK: agave cay jericho isles tilsit
Nearest to up: serapeum down ocean petrochemical north
Nearest to were: are comprise be hazlitt had
Nearest to them: people saddle him enabling away
Nearest to use: warrantless technik aspiring timber condensed
Nearest to called: mccormack schlegel named herriman iaijutsu
Nearest to th: nd century cityofathens rd nine
Nearest to six: one seven two three eight
Nearest to was: tapiau be pleads seipel rivaled
epoch