In [1]:
import tensorflow as tf
import numpy as np
import re
import itertools
import time
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

Create a stream of words, then skipgram pairs, then training batches based on the input file.
These streams are built on-demand (see: Python generators) so the whole file does not have to be read into memory at once, allowing training on big datasets.

In [2]:
def word_stream(file_name, buf_bytes=1000000):
    """Lazily yield the lowercase words of a text file, one at a time.

    A word is a run of ``[a-z]`` characters that is terminated by a whitespace
    character or by the end of the file. The file is read in chunks, so it
    never has to fit into memory as a whole.

    Args:
        file_name: Path of the text file to read.
        buf_bytes: Maximum number of characters requested per read (the file
            is opened in text mode).

    Yields:
        str: Each word, in file order.
    """
    word_pattern = re.compile("([a-z]+)\\s")
    trailing_letters = re.compile("[a-z]*\\Z")
    with open(file_name, "r") as f:
        pending = ""
        while True:
            chunk = f.read(buf_bytes)
            if not chunk:
                # An empty read is the only reliable end-of-file signal. A
                # buffer without a complete word (e.g. a word longer than
                # buf_bytes) must not end the stream.
                break
            pending += chunk
            consumed = 0
            for match in word_pattern.finditer(pending):
                yield match.group(1)
                consumed = match.end(0)
            # Only an unterminated run of letters at the very end of the
            # buffer can still grow into a word; everything before it can
            # never match again and is dropped to bound memory use.
            pending = trailing_letters.search(pending, consumed).group(0)
        # End of file terminates the last word just like whitespace would.
        if pending:
            yield pending

In [3]:
def vocabulary_statistics(word_stream):
    """Build a two-way lookup table between words and integer indices.

    The distinct words of ``word_stream`` are sorted and numbered from 0.
    The returned dict holds both directions in one mapping:
    ``stats[index] -> word`` and ``stats[word] -> index``.
    """
    vocabulary = list(set(word_stream))
    vocabulary.sort()
    lookup = {}
    for index in range(len(vocabulary)):
        token = vocabulary[index]
        lookup[index] = token
        lookup[token] = index
    return lookup

In [4]:
def int_stream(word_stream, stats):
    """Lazily translate each word of ``word_stream`` via ``stats[word]``.

    A word that is not a key of ``stats`` raises ``KeyError`` when the
    stream reaches it.
    """
    yield from (stats[token] for token in word_stream)

In [5]:
def skipgram_pair_stream(stream, window_size):
    """Lazily yield skip-gram ``(center, context)`` pairs from a stream.

    Every element becomes the center once and is paired with each element
    at most ``window_size`` positions before or after it. Only a sliding
    buffer of at most ``2 * window_size + 1`` elements is kept in memory.

    Args:
        stream: Any iterable of elements (iterator, generator, list, ...).
        window_size: Number of neighbours considered on each side.

    Yields:
        tuple: ``(center, context)`` pairs, centers in stream order.
    """
    # islice() and next() below must advance one shared iterator; a plain
    # list would otherwise be re-read by islice() and rejected by next().
    stream = iter(stream)
    buffer = list(itertools.islice(stream, window_size + 1))
    pointer = 0
    while pointer < len(buffer):
        center = buffer[pointer]
        first = max(0, pointer - window_size)
        last = min(len(buffer), pointer + window_size + 1)
        for other in range(first, last):
            if other != pointer:
                yield (center, buffer[other])
        # Extend the look-ahead by one element while the stream lasts; once
        # it is exhausted the remaining centers simply see fewer neighbours.
        try:
            buffer.append(next(stream))
        except StopIteration:
            pass
        # Advance the center by one position.
        pointer += 1
        # Drop the oldest element once it is out of reach of the new center.
        if pointer > window_size:
            buffer.pop(0)
            pointer -= 1

In [6]:
def training_batch_stream(skipgram_stream, batch_size, cache_size=100000):
    """Group a stream of pairs into ``(inputs, targets)`` training batches.

    Pairs are pulled from ``skipgram_stream`` in slices of up to
    ``cache_size`` and cut into batches of exactly ``batch_size`` pairs.
    Pairs left over at the end of the stream form one final, smaller batch.

    Args:
        skipgram_stream: Any iterable of indexable pairs ``(input, target)``.
        batch_size: Number of pairs per batch; must be positive.
        cache_size: Maximum number of pairs fetched from the stream at once.

    Yields:
        tuple: ``(inputs, targets)``, two lists of equal length.

    Raises:
        ValueError: If ``batch_size`` is not positive (raised when the
            generator is first advanced).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    def split(pairs):
        return ([pair[0] for pair in pairs], [pair[1] for pair in pairs])

    # Successive islice() calls must continue where the previous one
    # stopped; on a re-iterable such as a list each call would restart from
    # the beginning and the generator would never terminate.
    skipgram_stream = iter(skipgram_stream)
    cache = []
    while True:
        new_elements = list(itertools.islice(skipgram_stream, cache_size))
        if not new_elements:
            break
        cache += new_elements
        # Length of the prefix that divides evenly into full batches.
        full = len(cache) - len(cache) % batch_size
        for i in range(0, full, batch_size):
            yield split(cache[i:i + batch_size])
        # Carry the incomplete remainder over to the next refill.
        cache = cache[full:]
    if cache:
        yield split(cache)

In [7]:
def build_training_stream(text_file_name, stats, window_size, batch_size):
    """Chain the generators: words -> indices -> skip-gram pairs -> batches.

    Returns the batch generator built by ``training_batch_stream``; nothing
    is read from ``text_file_name`` until that generator is iterated.
    """
    return training_batch_stream(
        skipgram_pair_stream(
            int_stream(word_stream(text_file_name), stats),
            window_size,
        ),
        batch_size,
    )

Build the TensorFlow execution graph for the neural network. The network is fed a batch of input classes (a 1d vector of word indices) and a batch of target classes (a 2d column of word indices, one target per input). The result is a single scalar: the mean of the per-input losses over the batch.

In [50]:
def build_network(vocab_size, embedding_size, num_samples):
    """Build the TensorFlow (1.x graph mode) skip-gram network.

    Resets the default graph, then creates an embedding lookup followed by
    either a sampled-softmax or a full-softmax output layer.

    Args:
        vocab_size: Number of classes (rows of the embedding matrix).
        embedding_size: Width of each embedding vector.
        num_samples: Number of classes sampled per batch by the sampled
            softmax loss, or ``None`` to use the regular (full) softmax.

    Returns:
        tuple: ``(inputs_placeholder, targets_placeholder, loss, weights_1)``
        where the placeholders are used to feed data, ``loss`` is the scalar
        mean loss of the batch and ``weights_1`` is the embedding matrix.
    """
    tf.reset_default_graph()

    # Input and target output are passed into the network via these
    # placeholders and feed_dict: inputs as a 1d vector of word indices,
    # targets as a 2d matrix with one row per input.
    inputs_placeholder = tf.placeholder(shape=(None, ), dtype=tf.int32)
    targets_placeholder = tf.placeholder(shape=(None, None), dtype=tf.int32)

    weights_initializer = tf.random_uniform_initializer(minval=-0.05, maxval=0.05)
    # Weights of input -> hidden (embeddings matrix).
    weights_1 = tf.get_variable("weights_1", shape=(vocab_size, embedding_size),
                                dtype=tf.float32, initializer=weights_initializer)

    # Selecting rows by index replaces multiplying one-hot vectors with the
    # embedding matrix.
    net_mul1 = tf.nn.embedding_lookup(weights_1, inputs_placeholder)

    # Use sampled softmax loss (number of samples specified).
    if num_samples is not None:
        weights_2 = tf.get_variable("weights_2", shape=(vocab_size, embedding_size),
                                    dtype=tf.float32, initializer=weights_initializer)
        zero_bias = tf.zeros(vocab_size, dtype=tf.float32)
        loss = tf.nn.sampled_softmax_loss(inputs=net_mul1, weights=weights_2, biases=zero_bias,
                                          labels=targets_placeholder, num_sampled=num_samples,
                                          num_classes=vocab_size)
    # Use regular softmax loss (no number of samples specified).
    else:
        weights_2 = tf.get_variable("weights_2", shape=(embedding_size, vocab_size),
                                    dtype=tf.float32, initializer=weights_initializer)
        net_output = tf.matmul(net_mul1, weights_2)
        # The sparse cross-entropy op requires labels whose rank is one less
        # than the rank of the logits, so the column of targets is flattened
        # to a 1d vector (this assumes one target per input).
        flat_targets = tf.reshape(targets_placeholder, [-1])
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=flat_targets,
                                                              logits=net_output)

    # Return only what is necessary: the placeholders for feeding data, the
    # scalar loss for the optimizer, and the embedding matrix.
    loss = tf.reduce_mean(loss)
    return (inputs_placeholder, targets_placeholder, loss, weights_1)

In [58]:
def train_network(inputs_placeholder, targets_placeholder, weights_1, loss,
                  train_stream_builder, epochs, learning_rate, total_pairs,
                  words):
    """Train the network with Adam and return the embedding matrix.

    Roughly every 10 seconds a status line (pairs processed, average loss of
    the current epoch, duration of the last training step) is printed,
    followed by the output of ``test_analogies_quality``.

    Args:
        inputs_placeholder: Placeholder fed with a 1d array of inputs.
        targets_placeholder: Placeholder fed with a column of targets.
        weights_1: Variable holding the embedding matrix.
        loss: Loss tensor to minimize.
        train_stream_builder: Zero-argument callable returning a fresh
            iterable of ``(inputs, targets)`` batches; called once per epoch.
        epochs: Number of passes over the training stream.
        learning_rate: Learning rate passed to the Adam optimizer.
        total_pairs: Pair count shown in the status line (display only).
        words: Vocabulary list passed to ``test_analogies_quality``.

    Returns:
        The value of ``weights_1`` after training.
    """
    print("training started")
    train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
    # Build the reporting op once. Creating it inside the batch loop would
    # add a new node to the graph on every training step.
    mean_loss = tf.reduce_mean(loss)
    time_baseline = time.time()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for _ in range(epochs):
            batch_count, pairs_count, sum_loss = 0, 0, 0.0
            for batch_inputs, batch_targets in train_stream_builder():
                batch_inputs_np = np.array(batch_inputs)
                # Targets are fed as a column: one target per input row.
                batch_targets_np = np.array(batch_targets)[:, None]
                feed_dict = {inputs_placeholder: batch_inputs_np, targets_placeholder: batch_targets_np}
                time_start = time.time()
                # Fetch the loss in the same run as the update so the batch
                # is not pushed through the network a second time.
                _, batch_loss = sess.run([train_op, mean_loss], feed_dict=feed_dict)
                time_end = time.time()
                batch_count += 1
                pairs_count += len(batch_inputs)
                sum_loss += batch_loss
                if time.time() - time_baseline >= 10.0:
                    status_info = "{}/{} pairs, avg loss: {:.5f}, time per batch: {:.5f}s"
                    status_info = status_info.format(pairs_count, total_pairs,
                                                     sum_loss / float(batch_count),
                                                     time_end - time_start)
                    print(status_info)
                    test_analogies_quality(words, sess.run(weights_1))
                    print()
                    time_baseline = time.time()
        print("training complete")
        return sess.run(weights_1)

In [10]:
def plot_embeddings(words, embeddings_matrix, dot_size=1):
    """Project the embeddings to 2d with t-SNE and draw a labelled scatter plot.

    Args:
        words: Labels; ``words[i]`` is drawn at the projection of row ``i``.
        embeddings_matrix: One embedding vector per row.
        dot_size: Marker size passed to ``plt.scatter``.
    """
    tsne = TSNE(n_components=2, random_state=1)
    embeddings_matrix_2d = tsne.fit_transform(embeddings_matrix)
    # Plain-Python form of the `%matplotlib notebook` line magic. A magic
    # line inside a function body is only valid under IPython, so the
    # backend switch is skipped when no IPython shell is available.
    try:
        get_ipython().run_line_magic("matplotlib", "notebook")
    except NameError:
        pass
    plt.scatter(embeddings_matrix_2d[:, 0], embeddings_matrix_2d[:, 1], s=dot_size)
    for i, word in enumerate(words):
        plt.text(embeddings_matrix_2d[i][0], embeddings_matrix_2d[i][1], word)
    plt.show()

In [11]:
def parse_term(s):
    """Parse an expression such as ``"king - man + woman"`` into signed words.

    Each word may be preceded by ``+`` or ``-``, optionally separated from
    the word by whitespace. A word without a sign counts as positive.

    Args:
        s: The expression to parse.

    Returns:
        list: ``(word, factor)`` tuples in order of appearance, where
        ``factor`` is ``1`` or ``-1``.
    """
    term = []
    # The optional whitespace between sign and word keeps "king - man" from
    # silently turning into "king + man".
    for m in re.finditer("([+-])?\\s*(\\w+)", s):
        symbol, word = m.group(1), m.group(2)
        factor = -1 if symbol == "-" else 1
        term.append((word, factor))
    return term

In [12]:
def embedding_sum(words, embeddings_matrix, term):
    """Return the signed sum of the embeddings named in ``term``.

    ``term`` is a sequence of ``(word, factor)`` tuples. The embedding of
    each word (the row at ``words.index(word)``) is scaled by its factor and
    accumulated into a float32 vector. A word missing from ``words`` makes
    ``list.index`` raise ``ValueError``.
    """
    width = len(embeddings_matrix[0])
    total = np.zeros(width, dtype=np.float32)
    for token, sign in term:
        row = embeddings_matrix[words.index(token)]
        total += row * sign
    return total

In [13]:
def cosine_similarities(words, embeddings_matrix, vector):
    """Rank all words by the cosine similarity of their embedding to ``vector``.

    Args:
        words: Sequence of words; ``words[i]`` belongs to row ``i``.
        embeddings_matrix: 2d array-like with one embedding per row.
        vector: 1d array-like query vector.

    Returns:
        list: ``(word, similarity)`` tuples sorted by descending similarity.
        Words with equal similarity keep their original relative order.

    Raises:
        IndexError: If there are more words than embedding rows.
    """
    if len(words) == 0:
        return []
    matrix = np.asarray(embeddings_matrix)
    vector = np.asarray(vector)
    if len(words) > len(matrix):
        raise IndexError("more words than rows in the embeddings matrix")
    matrix = matrix[:len(words)]
    # One matrix-vector product and one batched norm replace a Python loop
    # over the vocabulary that recomputed the norm of `vector` per word.
    denominators = np.linalg.norm(matrix, axis=1) * np.linalg.norm(vector)
    scores = matrix.dot(vector) / denominators
    # A stable sort on the negated scores gives descending order while
    # preserving the input order of ties.
    order = np.argsort(-scores, kind="stable")
    return [(words[i], scores[i]) for i in order]

In [14]:
def cosine_similarities_s(words, embeddings_matrix, s):
    """Rank the vocabulary against the embedding sum described by ``s``.

    ``s`` is parsed with ``parse_term``, the signed embeddings are summed
    with ``embedding_sum`` and the result is ranked with
    ``cosine_similarities``. Words that appear in ``s`` itself are left out
    of the returned ranking.
    """
    parsed = parse_term(s)
    excluded = [entry[0] for entry in parsed]
    query_vector = embedding_sum(words, embeddings_matrix, parsed)
    ranked = cosine_similarities(words, embeddings_matrix, query_vector)
    return [pair for pair in ranked if pair[0] not in excluded]

In [15]:
def test_analogies_quality(words, embeddings_matrix, test_words=None, top_n=5):
    """Print the nearest neighbours of a few probe expressions.

    Args:
        words: Vocabulary list; ``words[i]`` belongs to embedding row ``i``.
        embeddings_matrix: One embedding vector per row.
        test_words: Expressions understood by ``parse_term``. Defaults to
            the probe words "man", "his", "north", "one", "green", "king".
        top_n: Number of neighbours printed per expression.
    """
    if test_words is None:
        test_words = ["man", "his", "north", "one", "green", "king"]
    for tw in test_words:
        # This helper is called in the middle of training, so a probe word
        # missing from the vocabulary is reported instead of aborting the
        # run with the ValueError raised by words.index().
        missing = [word for word, _ in parse_term(tw) if word not in words]
        if missing:
            print("'{}': skipped, not in vocabulary: {}".format(tw, missing))
            continue
        similarities = cosine_similarities_s(words, embeddings_matrix, tw)
        similarities_short = [s[0] for s in similarities[:top_n]]
        print("'{}': {}".format(tw, similarities_short))

In [60]:
# Corpus file read by word_stream().
text_file_name = "text8"

# Skip-gram context radius (neighbours on each side) and pairs per batch.
window_size, batch_size = 5, 1000

# Embedding width, and the number of classes sampled by the sampled softmax
# loss (None selects the regular softmax in build_network()).
embedding_size, num_samples = 128, 1

# Optimisation: passes over the corpus and Adam learning rate.
epochs, learning_rate = 1, 0.02

In [26]:
stats = vocabulary_statistics(word_stream(text_file_name))
# stats holds two entries per word (index -> word and word -> index), so the
# sorted vocabulary can be read back from it without another pass over the
# corpus file.
vocab_size = len(stats) // 2
words = [stats[i] for i in range(vocab_size)]
# Upper bound on the number of pairs per epoch: every word is paired with up
# to window_size neighbours on each side.
train_pairs_estimated = 2 * window_size * sum(1 for _ in word_stream(text_file_name))


def train_stream_builder():
    """Return a fresh batch stream over the corpus (one call per epoch)."""
    return build_training_stream(text_file_name, stats, window_size, batch_size)


print("vocab size: ", vocab_size)


vocab size:  253854


In [61]:
# Build the graph, then train it; the trained embedding matrix is kept for
# the plotting and evaluation cells.
network = build_network(vocab_size=vocab_size,
                        embedding_size=embedding_size,
                        num_samples=num_samples)
(inputs_placeholder, targets_placeholder, loss, weights_1) = network
embeddings_matrix = train_network(
    inputs_placeholder=inputs_placeholder,
    targets_placeholder=targets_placeholder,
    weights_1=weights_1,
    loss=loss,
    train_stream_builder=train_stream_builder,
    epochs=epochs,
    learning_rate=learning_rate,
    total_pairs=train_pairs_estimated,
    words=words,
)

training started
15000/170052070 pairs, avg loss: 0.31240, time per batch: 0.57898s
'man': ['august', 'the', 'individualist', 'of', 'to']
'his': ['he', 'one', 'property', 'the', 'that']
'north': ['piron', 'schraufite', 'backflips', 'hung', 'cocoanut']
'one': ['he', 'his', 'the', 'property', 'which']
'green': ['aaas', 'tetons', 'derbies', 'bichat', 'uznam']
'king': ['abuse', 'ruler', 'been', 'chief', 'belief']

32000/170052070 pairs, avg loss: 0.28217, time per batch: 0.60257s
'man': ['expound', 'greene', 'doctrine', 'august', 'now']
'his': ['one', 'the', 'nine', 'he', 'that']
'north': ['institute', 'prior', 'syndicalism', 'power', 'include']
'one': ['the', 'nine', 'to', 'in', 'that']
'green': ['aaas', 'tetons', 'derbies', 'bichat', 'uznam']
'king': ['abuse', 'ruler', 'chief', 'belief', 'still']

49000/170052070 pairs, avg loss: 0.25875, time per batch: 0.55599s
'man': ['expound', 'william', 'greene', 'doctrine', 'now']
'his': ['one', 'nine', 'he', 'the', 'state']
'north': ['include', '

336000/170052070 pairs, avg loss: 0.12275, time per batch: 0.56143s
'man': ['democracy', 'imposed', 'senior', 'attack', 'company']
'his': ['agriculture', 'researchers', 'largest', 'archaic', 'hometown']
'north': ['along', 'rudolf', 'camel', 'alphabet', 'help']
'one': ['in', 'three', 'the', 'to', 'that']
'green': ['opportunity', 'sequences', 'neocolonialism', 'patriarchy', 'list']
'king': ['barasti', 'differentiate', 'noticed', 'diagnostic', 'strategic']

353000/170052070 pairs, avg loss: 0.12127, time per batch: 0.54936s
'man': ['democracy', 'imposed', 'senior', 'company', 'decision']
'his': ['agriculture', 'researchers', 'largest', 'archaic', 'existing']
'north': ['rudolf', 'camel', 'alphabet', 'help', 'along']
'one': ['in', 'three', 'to', 'the', 'that']
'green': ['opportunity', 'sequences', 'neocolonialism', 'patriarchy', 'list']
'king': ['ruler', 'defined', 'been', 'buoying', 'hugs']

370000/170052070 pairs, avg loss: 0.11934, time per batch: 0.57397s
'man': ['democracy', 'imposed',

'green': ['unknowingly', 'committee', 'ethologists', 'hierarchies', 'humanity']
'king': ['beyond', 'important', 'irving', 'received', 'organisms']

646000/170052070 pairs, avg loss: 0.09644, time per batch: 0.55740s
'man': ['reader', 'curbed', 'jews', 'e', 'membership']
'his': ['supply', 'arrangement', 'town', 'summit', 'remain']
'north': ['giving', 'arabic', 'fully', 'end', 'pistol']
'one': ['to', 'in', 'the', 'three', 'that']
'green': ['unknowingly', 'committee', 'ethologists', 'hierarchies', 'humanity']
'king': ['beyond', 'important', 'irving', 'received', 'organisms']

662000/170052070 pairs, avg loss: 0.09531, time per batch: 0.57804s
'man': ['reader', 'jews', 'curbed', 'e', 'membership']
'his': ['supply', 'arrangement', 'town', 'street', 'skill']
'north': ['giving', 'needed', 'arabic', 'end', 'pistol']
'one': ['to', 'the', 'in', 'three', 'was']
'green': ['unknowingly', 'committee', 'ethologists', 'diagnose', 'hendricks']
'king': ['beyond', 'irving', 'received', 'price', 'organism

'north': ['forms', 'austria', 'along', 'founded', 'growth']
'one': ['in', 'nine', 'two', 'the', 'provided']
'green': ['unknowingly', 'umbrella', 'bruckner', 'ethologists', 'diagnose']
'king': ['runway', 'nicolas', 'irving', 'verlag', 'noteworthy']

953000/170052070 pairs, avg loss: 0.08765, time per batch: 0.59528s
'man': ['reader', 'curbed', 'jews', 'membership', 'owned']
'his': ['gives', 'affinity', 'services', 'explains', 'determining']
'north': ['forms', 'austria', 'founded', 'effectively', 'along']
'one': ['in', 'nine', 'two', 'the', 'old']
'green': ['bruckner', 'unknowingly', 'umbrella', 'schubert', 'ethologists']
'king': ['runway', 'nicolas', 'sighted', 'irving', 'verlag']

970000/170052070 pairs, avg loss: 0.08652, time per batch: 0.55944s
'man': ['reader', 'curbed', 'jews', 'membership', 'owned']
'his': ['gives', 'affinity', 'services', 'explains', 'determining']
'north': ['forms', 'nations', 'austria', 'total', 'markets']
'one': ['in', 'nine', 'two', 'the', 'old']
'green': ['

'north': ['few', 'nations', 'desert', 'along', 'aggressive']
'one': ['nine', 'in', 'seven', 'two', 'federer']
'green': ['bruckner', 'unknowingly', 'umbrella', 'schubert', 'ethologists']
'king': ['testament', 'compatible', 'computers', 'trojan', 'expression']

1262000/170052070 pairs, avg loss: 0.09359, time per batch: 0.58285s
'man': ['india', 'stories', 'third', 'demographics', 'automobile']
'his': ['in', 'he', 'after', 'to', 'two']
'north': ['few', 'nations', 'desert', 'along', 'aggressive']
'one': ['nine', 'in', 'seven', 'two', 'federer']
'green': ['bruckner', 'unknowingly', 'umbrella', 'schubert', 'ethologists']
'king': ['compatible', 'computers', 'testament', 'sons', 'expression']

1278000/170052070 pairs, avg loss: 0.09277, time per batch: 0.55543s
'man': ['india', 'stories', 'third', 'worn', 'demographics']
'his': ['in', 'he', 'after', 'to', 'agassi']
'north': ['few', 'nations', 'along', 'aggressive', 'minority']
'one': ['nine', 'in', 'seven', 'six', 'federer']
'green': ['bruckn

KeyboardInterrupt: 

In [None]:
# Project the trained embeddings to 2d with t-SNE and draw every vocabulary word.
# NOTE(review): embeddings_matrix is only assigned once the train_network call
# in the training cell returns; the recorded run ended in KeyboardInterrupt,
# so confirm the variable exists before running this cell.
plot_embeddings(words, embeddings_matrix)

In [None]:
# Print the words closest (by cosine similarity) to each built-in probe word,
# using the final embeddings returned by training.
test_analogies_quality(words, embeddings_matrix)