In [1]:
import tensorflow as tf
import numpy as np
import time
import os
import collections
import random
import time

In [2]:
def build_dataset(words, n_words):
    """Encode a token sequence as integer ids over a size-capped vocabulary.

    Id 0 is reserved for the unknown-word bucket 'UNK'. The remaining ids
    (1, 2, ...) are handed out to the ``n_words - 1`` most frequent tokens in
    descending frequency order. Every other token is encoded as 0.

    A literal 'UNK' token appearing in ``words`` is folded into the unknown
    bucket. Without that, it would be listed twice in ``count`` and the
    id assignment would give 'UNK' and the following word the same id while
    leaving id 0 unmapped in the reverse dictionary.

    Args:
        words: sequence of hashable tokens (iterated twice).
        n_words: total vocabulary size including the 'UNK' entry.

    Returns:
        A 4-tuple ``(data, count, dictionary, reversed_dictionary)``:
        data: list of ids, one per element of ``words``.
        count: ``[['UNK', unk_count], (word, freq), ...]`` in id order.
        dictionary: token -> id.
        reversed_dictionary: id -> token.
    """
    unk_token = 'UNK'
    frequencies = collections.Counter(words)
    # Keep the sentinel unique: a corpus token spelled 'UNK' must not claim
    # a second slot in the vocabulary.
    frequencies.pop(unk_token, None)

    # The UNK entry is a mutable list because its count is only known once
    # the corpus has been encoded; the other entries stay (word, freq) tuples.
    count = [[unk_token, -1]]
    count.extend(frequencies.most_common(n_words - 1))

    dictionary = {word: index for index, (word, _) in enumerate(count)}

    # Anything outside the kept vocabulary falls back to id 0 ('UNK').
    data = [dictionary.get(word, 0) for word in words]
    count[0][1] = data.count(0)

    reversed_dictionary = {index: word for word, index in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

data_index = 0

# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
    """Produce one skip-gram training batch from the module-level ``data``.

    A window of ``2 * skip_window + 1`` consecutive ids slides over ``data``,
    starting at the module-level cursor ``data_index``. For each window
    position the middle id is the input, and ``num_skips`` distinct other
    positions of the window are drawn with ``random.sample`` as its labels.
    The cursor is advanced as a side effect and wraps to the start of
    ``data`` when the end is reached.

    Args:
        batch_size: number of (input, label) pairs; must be a multiple of
            ``num_skips``.
        num_skips: pairs generated per window position; at most
            ``2 * skip_window``.
        skip_window: number of ids considered on each side of the centre.

    Returns:
        ``(batch, labels)``: int32 arrays of shape ``(batch_size,)`` and
        ``(batch_size, 1)``.
    """
    global data_index
    global data
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    centers = np.empty(batch_size, dtype=np.int32)
    contexts = np.empty((batch_size, 1), dtype=np.int32)

    # Restart from the beginning if a full window no longer fits.
    if data_index + span > len(data):
        data_index = 0
    window = collections.deque(data[data_index:data_index + span], maxlen=span)
    data_index += span

    # Every window position except the centre is a candidate context slot.
    neighbour_offsets = [pos for pos in range(span) if pos != skip_window]

    row = 0
    for _ in range(batch_size // num_skips):
        for offset in random.sample(neighbour_offsets, num_skips):
            centers[row] = window[skip_window]
            contexts[row, 0] = window[offset]
            row += 1
        if data_index != len(data):
            # Slide the window one id to the right (maxlen drops the oldest).
            window.append(data[data_index])
            data_index += 1
        else:
            # Ran off the end: reload the window from the start of the corpus.
            window.extend(data[:span])
            data_index = span

    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return centers, contexts

In [3]:
import pickle

# Deserialize the corpus array from disk.
# NOTE(review): pickle.load executes arbitrary code embedded in the file;
# only run this against a 'dataset.p' from a trusted source.
with open('dataset.p', mode='rb') as dataset_file:
    dataset = pickle.load(dataset_file)

In [6]:
# Column 1 of every dataset row is treated as whitespace-delimited text;
# flatten all rows into a single token stream.
vocabulary = [token for text in dataset[:, 1].tolist() for token in text.split()]
print('example 10 words:', vocabulary[:10])
print('size corpus:', len(vocabulary))

# NOTE(review): build_dataset keeps n_words - 1 real tokens plus 'UNK', so
# passing the distinct-token count maps the least frequent token to 'UNK'.
vocabulary_size = len(set(vocabulary))
print('size of unique words:', vocabulary_size)

# Skip-gram hyperparameters.
dimension = 128   # embedding width
skip_window = 1   # ids considered on each side of the centre word
num_skips = 2     # (input, label) pairs drawn per centre word
batch_size = 64

location = os.getcwd()

example 10 words: ['retailers', 'restaurants', 'schools', 'hospitals', 'also', 'added', 'jobs', 'brisk', 'pace', 'according']
size corpus: 315018
size of unique words: 21022


In [7]:
# Encode the token stream as ids and build the lookup tables in both directions.
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary, vocabulary_size)

# The raw token list is no longer needed once `data` exists.
# Hint to reduce memory.
del vocabulary

print('Most common words (+UNK)', count[:5])
sample_ids = data[:10]
print('Sample data', sample_ids, [reverse_dictionary[word_id] for word_id in sample_ids])

Most common words (+UNK) [['UNK', 1], ('said', 3130), ('percent', 2509), ('market', 2446), ('year', 2128)]
Sample data [656, 4813, 1971, 4358, 36, 173, 82, 2640, 186, 136] ['retailers', 'restaurants', 'schools', 'hospitals', 'also', 'added', 'jobs', 'brisk', 'pace', 'according']


In [8]:
# Validation probes: word ids whose nearest neighbours are printed during
# training. build_dataset hands out ids in descending frequency order, so
# drawing ids below valid_window restricts the probes to the most frequent
# vocabulary entries (id 0 is the 'UNK' bucket).
valid_size = 16      # number of probe ids
valid_window = 100   # probes are drawn without replacement from [0, valid_window)
valid_examples = np.random.choice(valid_window, size=valid_size, replace=False)

In [9]:
# Build the skip-gram graph: an embedding table trained with NCE loss, plus a
# cosine-similarity op used to inspect nearest neighbours of the validation ids.
graph = tf.Graph()

with graph.as_default():

    # One centre-word id per row, and the matching context-word id as its label
    # (shapes match what generate_batch returns).
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, dimension], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, dimension],
                                                      stddev=1.0 / np.sqrt(dimension)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
        # num_sampled must be an integer. Under Python 3 `batch_size / 2` is
        # the float 32.0, so floor division is used to keep it an int.
        loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                             biases=nce_biases,
                                             labels=train_labels,
                                             inputs=embed,
                                             num_sampled=batch_size // 2,
                                             num_classes=vocabulary_size))
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

        # Compute the cosine similarity between the validation examples and all
        # embeddings: rows are L2-normalised first, so the matmul of normalised
        # rows yields cosines directly.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
        similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
        init = tf.global_variables_initializer()

# Number of training batches to run.
num_steps = 300000

In [10]:
# Train the embeddings, logging the running loss every 2000 steps and the
# nearest neighbours of each validation word every 10000 steps.
with tf.Session(graph=graph) as session:
    init.run()
    print('Initialized')

    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)

        _, loss_val = session.run(
            [optimizer, loss],
            feed_dict={train_inputs: batch_inputs, train_labels: batch_labels})
        average_loss += loss_val

        if step % 2000 == 0:
            # At step 0 only one batch has been seen, so the raw loss is
            # reported; afterwards it is the mean over the last 2000 batches.
            reported_loss = average_loss / 2000 if step > 0 else average_loss
            print('Average loss at step ', step, ': ', reported_loss)
            average_loss = 0

        if step % 10000 == 0:
            sim = similarity.eval()
            top_k = 8  # number of nearest neighbors
            for similarities, probe_id in zip(sim, valid_examples):
                valid_word = reverse_dictionary[probe_id]
                # Rank by descending similarity; position 0 is skipped because
                # a word's best match is itself.
                nearest = (-similarities).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for rank in range(top_k):
                    log_str = '%s %s,' % (log_str, reverse_dictionary[nearest[rank]])
                print(log_str)

    # Snapshot the L2-normalised embedding matrix while the session is open.
    final_embeddings = normalized_embeddings.eval()

Initialized
Average loss at step  0 :  142.486282349
Nearest to higher: nonprofits, easiest, sport-utility, atement, straphangers, geography, extreme, less-stimulative,
Nearest to economic: inspires, liberalizations, bowen, crest, hotly, wrapped, teton, intangibilityrespect,
Nearest to months: mid-s, dorrow, negro, belie, reaffirmed, trillion, sync, -nation,
Nearest to consumer: now-useless, kennedy, mid-six-lies, glimmer, lies, keating, dormant, sanders,
Nearest to rate: voted, stumbling, keyboard, rino, lobust, arthrocare, socially, sadly,
Nearest to trading: khedouri, lighter, undervalued, squarely, frost, modestly, director, widely,
Nearest to recent: hispanic, figured, drugs, resoundingly, cautious, defying, trillion, supplied,
Nearest to economists: roust, laggards, intraday, thomas, compensatory, relies, easily, feeds,
Nearest to rise: retroactive, ineffective, differentials, brake, narrowed, formulate, cheaply, lapping,
Nearest to sales: translate, nonfood, lambs, home-loan, in

Average loss at step  42000 :  3.61640396178
Average loss at step  44000 :  3.58337484741
Average loss at step  46000 :  3.54703198516
Average loss at step  48000 :  3.53917959368
Average loss at step  50000 :  3.53142368519
Nearest to higher: rising, lower, sport-utility, suburbs, straphangers, per-share, gasoline, less-stimulative,
Nearest to economic: economy, torvormayororyt, policysetting, lets-get-ours-now, precursor, unsecured, northern, intend,
Nearest to months: years, days, month, downgraded, year, week, dimmed, silvia,
Nearest to consumer: kennedy, allowance, now-useless, weinberg, atmospheric, ammunition, overtake, dormant,
Nearest to rate: percent, rates, sadly, kilman, muggy, finish, executions, wrangling,
Nearest to trading: buoying, performances, khedouri, billings, romney, burned, lighter, flowing,
Nearest to recent: resoundingly, heaviest, aside, disbelief, hispanic, ment, redeem, crummy,
Nearest to economists: analysts, observers, mccarthy, pictures, princeton, trade

Average loss at step  92000 :  3.13483540112
Average loss at step  94000 :  3.13644749981
Average loss at step  96000 :  3.09720442224
Average loss at step  98000 :  3.10535270256
Average loss at step  100000 :  3.10903804255
Nearest to higher: rising, lower, per-share, suburbs, sport-utility, ilmited, landline, shuffled,
Nearest to economic: policysetting, precursor, northern, lets-get-ours-now, visits, pro-gram, candidacy, discrimination,
Nearest to months: years, downgraded, month, days, unhappy, turreted, wood, notch,
Nearest to consumer: producer, ammunition, weinberg, slid, allowance, now-useless, wound, overtake,
Nearest to rate: rates, sadly, rale, kilman, percent, rodriguez, executions, muggy,
Nearest to trading: buoying, performances, billings, khedouri, romney, pernaps, flowing, mitt,
Nearest to recent: heaviest, aside, disbelief, posed, soothed, preceding, smooths, automobilesinspired,
Nearest to economists: analysts, observers, forecasters, some, adjustmentthe, pictures, g

Average loss at step  142000 :  2.89190782869
Average loss at step  144000 :  2.88818569219
Average loss at step  146000 :  2.87269023454
Average loss at step  148000 :  2.86962304056
Average loss at step  150000 :  2.87146334839
Nearest to higher: rising, lower, per-share, suburbs, ilmited, sport-utility, ultralow, ruin,
Nearest to economic: policysetting, candidacy, precursor, lets-get-ours-now, near--point, unsecured, pro-gram, pectations,
Nearest to months: month, downgraded, years, wood, silvia, week, coupons, days,
Nearest to consumer: producer, ammunition, snappy, retail, slid, weinberg, atmospheric, tax-exempt,
Nearest to rate: rale, rates, rodriguez, militar, sadly, butwith, astounding, percent,
Nearest to trading: performances, buoying, pernaps, khedouri, billings, leuthold, mitt, uniformly,
Nearest to recent: preceding, posed, smooths, disbelief, cloque, confusing, soothed, dichotomy,
Nearest to economists: analysts, observers, forecasters, chieftains, announces, pictures, p

Average loss at step  192000 :  2.75270380032
Average loss at step  194000 :  2.72230962199
Average loss at step  196000 :  2.72265428668
Average loss at step  198000 :  2.73260407889
Average loss at step  200000 :  2.70411172068
Nearest to higher: rising, lower, per-share, suburbs, ilmited, ultralow, short-circuited, sport-utility,
Nearest to economic: policysetting, pectations, candidacy, northern, precursor, discredit, upticks, economys,
Nearest to months: month, years, downgraded, turreted, wood, year, elections, signify,
Nearest to consumer: producer, ammunition, overtake, wholesale, retail, snappy, sends, recalling,
Nearest to rate: rale, rates, rodriguez, sadly, militar, butwith, five-tenths, kilman,
Nearest to trading: pernaps, equity-linked, uniformly, buoying, performances, mitt, khedouri, billings,
Nearest to recent: posed, preceding, smooths, oracle, soothed, jcome, disbelief, copycat,
Nearest to economists: analysts, observers, forecasters, purported, chieftains, nakasone,

Average loss at step  242000 :  2.63630158734
Average loss at step  244000 :  2.61227878964
Average loss at step  246000 :  2.59899818319
Average loss at step  248000 :  2.6184320522
Average loss at step  250000 :  2.60498313558
Nearest to higher: rising, lower, per-share, suburbs, eye-opener, ilmited, ruin, ultralow,
Nearest to economic: policysetting, pectations, debates, candidacy, export, magnitude, artificially, economy,
Nearest to months: month, years, downgraded, turreted, thirty-sixth, elections, billion, turnabouts,
Nearest to consumer: producer, ammunition, overtake, sends, rolling, recalling, re-examine, pushes,
Nearest to rate: rates, rale, rodriguez, executions, five-tenths, percent, militar, muggy,
Nearest to trading: pernaps, uniformly, khedouri, commemorate, performances, mitt, equity-linked, leuthold,
Nearest to recent: posed, smooths, oracle, preceding, last, disbelief, pretense, reignite,
Nearest to economists: analysts, observers, forecasters, purported, peo-, orche

Average loss at step  292000 :  2.53581237155
Average loss at step  294000 :  2.52999109167
Average loss at step  296000 :  2.52623327297
Average loss at step  298000 :  2.52403345424


In [13]:
# Persist the vocabulary so that position i in the list is the word with id i,
# i.e. the word whose vector is row i of the saved embedding matrix. The ids
# are sorted explicitly instead of relying on dict iteration order, which the
# language only guarantees from Python 3.7 onwards.
ordered_words = [reverse_dictionary[word_id] for word_id in sorted(reverse_dictionary)]
with open('word-list.p', 'wb') as fopen:
    pickle.dump(ordered_words, fopen, protocol=2)

In [14]:
# Persist the normalised embedding matrix captured at the end of training.
# Pickle protocol 2 is used, matching the word-list dump.
with open('vector.p', mode='wb') as vector_file:
    pickle.dump(final_embeddings, vector_file, protocol=2)