In [8]:
import math
import numpy as np
import os
import random
import tensorflow as tf
import bz2
from collections import Counter, deque
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import nltk # standard preprocessing
import operator # sorting items in dictionary by value
from math import ceil
import csv

%matplotlib inline

In [59]:
def read_data_small(filename):
    """
    Extract a part of the data
    """
    with bz2.BZ2File(filename) as f:
        data = []
        file_size = os.stat(filename).st_size
        chunk_size = 1024 * 1024  # reading 1 MB at a time as the dataset is moderately large
        print('Reading data...')
        for i in range(int(ceil(file_size // chunk_size) + 1)):
            bytes_to_read = min(chunk_size, file_size - (i * chunk_size))
            file_string = f.read(bytes_to_read).decode('utf-8')
            file_string = file_string.lower()
            # tokenizes a string to words residing in a list
            file_string = nltk.word_tokenize(file_string)
            data.extend(file_string)
    return data

filename = "./data/wikipedia2text-extracted.txt.bz2"
words = read_data_small(filename)
print('Data size %d' % len(words))
print('Example words (start): ', words[:10])
print('Example words (end): ', words[-10:])

Reading data...
Data size 3360286
Example words (start):  ['propaganda', 'is', 'a', 'concerted', 'set', 'of', 'messages', 'aimed', 'at', 'influencing']
Example words (end):  ['favorable', 'long-term', 'outcomes', 'for', 'around', 'half', 'of', 'those', 'diagnosed', 'with']


In [60]:
def build_dataset(words, vocabulary_size=50000):
    word_cnt = [['UNK', -1]]

    # Use `Counter` to get the most common words
    word_cnt.extend(Counter(words).most_common(vocabulary_size - 1))
    word2id = dict()

    for word, _ in word_cnt:
        word2id[word] = len(word2id)  # ID start from 0

    data = list()
    unk_cnt = 0
    for word in words:
        if word in word2id:
            index = word2id[word]
        else:
            index = 0  # word2id['UNK']
            unk_cnt += 1
        data.append(index)

    # word_cnt[0] = ['UNK', unk_cnt]
    word_cnt[0][1] = unk_cnt

    id2word = dict(zip(word2id.values(), word2id.keys()))
    assert len(word2id) == vocabulary_size

    return data, word_cnt, word2id, id2word

data, count, word2id, id2word = build_dataset(words)
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])

Most common words (+UNK) [['UNK', 69215], ('the', 226881), (',', 184013), ('.', 120944), ('of', 116323)]
Sample data [1721, 9, 8, 16471, 223, 4, 5165, 4456, 26, 11590]


In [89]:
span = 2 * skip_window + 1
data_index = len(data) - span
"""指定整个滑动窗口 [context center context] 的起始位置"""

def generate_batch_sg(data, batch_size=8, skip_window=1, num_skips=None):
    """Function to generate **one** training batch for the skip-gram model.

    每次完成 batch_size // num_skips 个中心词的构建，每个中心词取 num_skips 个上下文词

    Args:
        num_skips: 随机选择`num_skips`个窗口中的 context words
            在CS224n的课程里没有提到这个参数，也就是默认使用所有的上下文词，即 `num_skips=2 * skip_window`
        skip_window: 一侧的窗口长度，完整的窗口大小为 1+2*skip_window

    Returns:
        inputs: center_words
            形如 ndarray([1,1,33,33,55,55，67,67]) 每个中心词的重复次数等于`num_skips`
        labels: context_words
            形如 ndarray([23,243,543,65,7658,342，8567,3123])
            长度与 center_words 相同，分别是对应中心词的上下文词，即 (1,23),(1,243),(33,543),...
    """
    global data_index

    if num_skips is None:
        num_skips = 2 * skip_window
    else:
        assert num_skips <= 2 * skip_window
    assert batch_size % num_skips == 0

    # center_words
    inputs = np.ndarray(shape=(batch_size,), dtype=np.int32)  # (batch_size,)
    # context_words
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)  # (batch_size, 1)

    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = deque(maxlen=span)  # 双端队列，支持自动弹出

    if data_index + span > len(data):
        data_index = 0
    for _ in range(span):
        buffer.append(data[data_index])
        data_index += 1
        
    context_words = [w for w in range(span) if w != skip_window]
    for i in range(batch_size // num_skips):
        context_words = random.sample(context_words, num_skips)

        for j, context_word in enumerate(context_words):
            inputs[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[context_word]
        
        if data_index >= len(data):
            print(data_index)
            buffer = data[:span]
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1

    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return inputs, labels


print('data:', [id2word[di] for di in data[:16]])

for skip_window in [1, 2]:
    data_index = 0
    batch, labels = generate_batch_sg(data, batch_size=8, skip_window=skip_window)
    print('\nwith window_size = %d:' % skip_window)
    print('    batch:', [id2word[bi] for bi in batch])
    print('    labels:', [id2word[li] for li in labels.reshape(8)])

data_index = 0
for i in range(1, 4):
    batch, labels = generate_batch_sg(data, batch_size=8, skip_window=1)
#     print(data_index)
    print('\nbatch %d:' % i)
    print('    batch:', [id2word[bi] for bi in batch])
    print('    labels:', [id2word[li] for li in labels.reshape(8)])

data: ['propaganda', 'is', 'a', 'concerted', 'set', 'of', 'messages', 'aimed', 'at', 'influencing', 'the', 'opinions', 'or', 'behavior', 'of', 'large']

with window_size = 1:
    batch: ['is', 'is', 'a', 'a', 'concerted', 'concerted', 'set', 'set']
    labels: ['a', 'propaganda', 'concerted', 'is', 'a', 'set', 'concerted', 'of']

with window_size = 2:
    batch: ['a', 'a', 'a', 'a', 'concerted', 'concerted', 'concerted', 'concerted']
    labels: ['concerted', 'propaganda', 'is', 'set', 'is', 'of', 'a', 'set']

batch 1:
    batch: ['is', 'is', 'a', 'a', 'concerted', 'concerted', 'set', 'set']
    labels: ['propaganda', 'a', 'is', 'concerted', 'a', 'set', 'concerted', 'of']

batch 2:
    batch: ['of', 'of', 'messages', 'messages', 'aimed', 'aimed', 'at', 'at']
    labels: ['messages', 'set', 'aimed', 'of', 'at', 'messages', 'influencing', 'aimed']

batch 3:
    batch: ['influencing', 'influencing', 'the', 'the', 'opinions', 'opinions', 'or', 'or']
    labels: ['the', 'at', 'influencing',

In [75]:
data_index = len(data)
print(data_index)

3360286


In [90]:
import csv
import math
import random
import numpy as np
import tensorflow as tf

class config_sg:
    """"""
    batch_size = 128
    embedding_size = 128
    window_size = 4
    vocabulary_size = 50000

    # A random validation set
    valid_size = 16
    valid_window = 50

    num_sampled = 32  # 负采样

    num_steps = 100001
    top_k = 8
    
config = config_sg

valid_window, valid_size = config.valid_window, config.valid_size
valid_examples = np.array(random.sample(range(valid_window), valid_size))
valid_examples = np.append(valid_examples, random.sample(range(1000, 1000 + valid_window), valid_size), axis=0)

# Graph
tf.reset_default_graph()
g = tf.get_default_graph()

# Input
batch_size = config.batch_size
train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

# Word2Vec Model
# Embedding layer
vocabulary_size, embedding_size = config.vocabulary_size, config.embedding_size
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, train_dataset)

# Softmax Weights and Biases
softmax_W = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                        stddev=0.5 / math.sqrt(embedding_size)))
softmax_b = tf.Variable(tf.random_uniform([vocabulary_size], 0.0, 0.01))

# 负采样
num_sampled = config.num_sampled
loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(
        weights=softmax_W, biases=softmax_b, inputs=embed,
        labels=train_labels, num_sampled=num_sampled, num_classes=vocabulary_size))

# train_op
optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

# 计算相似度
# norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
# normalized_embeddings = embeddings / norm
normalized_embeddings = tf.nn.l2_normalize(embeddings, axis=1)  # 等价于以上两行
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
# 计算 cosine 相似度（内积）
similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))

# 准备数据
# filename = "./data/wikipedia2text-extracted.txt.bz2"  # for Ipython
# # words = read_data(filename)
# words = read_data_small(filename)
# print('Data size %d' % len(words))
# print('Example words (start): ', words[:10])
# print('Example words (end): ', words[-10:])

# data, count, word2id, id2word = build_dataset(words)
# print('Most common words (+UNK)', count[:5])
# print('Sample data', data[:10])
# del words

# Train
skip_losses = []
num_steps = config.num_steps
with tf.Session(graph=g) as sess:
    """"""
    # init
    tf.global_variables_initializer().run()

    # The average loss is an estimate of the loss over the last 2000 batches.
    average_loss = 0
    for step in range(num_steps):

        batch_data, batch_labels = generate_batch_sg(data, batch_size, config.window_size)

        # run train_op and get loss
        _, loss_val = sess.run([optimizer, loss], feed_dict={train_dataset: batch_data,
                                                             train_labels: batch_labels})

        # Update the average loss
        average_loss += loss_val

        if (step + 1) % 2000 == 0:
            if step > 0:
                average_loss = average_loss / 2000

            skip_losses.append(average_loss)
            print('Average loss at step %d: %f' % (step + 1, average_loss))
            average_loss = 0

        # 评价
        if (step + 1) % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = id2word[valid_examples[i]]
                top_k = config.top_k
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = id2word[nearest[k]]
                    log = '%s %s,' % (log, close_word)
                print(log)

    embeddings_sg = normalized_embeddings.eval()

# We will save the word vectors learned and the loss over time
# as this information is required later for comparisons
np.save('./out/embeddings_sg', embeddings_sg)

with open('skip_losses.csv', 'wt') as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerow(skip_losses)


Average loss at step 2000: 3.918839
Average loss at step 4000: 3.496119
Average loss at step 6000: 3.489038
Average loss at step 8000: 3.527437
Average loss at step 10000: 3.412748
Nearest to 's: and, hold'em, cixi, the, monty, imbalances, this, july,
Nearest to and: the, ,, in, to, ., on, with, 's,
Nearest to one: viewing, equation, verge, humans, alec, sorter, pre-selection, pinprick,
Nearest to be: 20.7, landmasses, truesdell, yezidis, capable, espionage, yams, rica,
Nearest to also: baez, rorty, heidelberg, unionpay, misdemeanor, café, incidental, dent,
Nearest to ): UNK, natalie, rep., constitutionality, physiographic, conclusively, chichibu, sogdian,
Nearest to was: is, give, in, could, ., excl, had, glutamine,
Nearest to from: département, loans, 1.00794, growth, removing, vacationers, languish, for,
Nearest to by: that, unpredictably, ciws, neighborhoods, arsuf, pot-limit, covered, compel,
Nearest to which: that, and, ., tropical, harder, go, pétion, aliases,
Nearest to were: a

Average loss at step 62000: 3.248929
Average loss at step 64000: 3.266676
Average loss at step 66000: 3.253489
Average loss at step 68000: 3.285432
Average loss at step 70000: 3.239971
Nearest to 's: his, the, first, objectives, one, valueless, in, of,
Nearest to and: ,, ., the, in, UNK, (, of, ching,
Nearest to one: most, two, three, part, only, the, 's, mva,
Nearest to be: jazzy, utilised, have, technician, aft, burma, that, pending,
Nearest to also: often, bethlen, 1569, quartered, hoppe, it, slowest, chowtal,
Nearest to ): UNK, (, ,, madrigal, grunts, august, 1486, 2,
Nearest to was: were, is, had, became, has, ,, garry, racism,
Nearest to from: sassanid, burke, o'neill, marauders, in, classical, to, constructions,
Nearest to by: who, and, mieszko, ginseng, ., the, myspace, thunder,
Nearest to which: a, antti, committing, giorgos, pages, meantime, arbitrator, courtyards,
Nearest to were: are, was, had, enlistment, outspoken, two, non-theistic, bluffs,
Nearest to but: though, howeve