In [1]:
''' Multithreaded word2vec'''

import os
import sys
import threading
import time

import numpy as np
import tensorflow as tf

# word2vec = tf.load_op_library(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'word2vec_ops.so'))

# Command-line configuration for the word2vec trainer.
# Every flag defined here is copied onto an `Options` instance further down.
flags = tf.app.flags

# --- Input / output locations ------------------------------------------------
flags.DEFINE_string("save_path", None, "Directory to write the model and "
                    "training summaries.")
flags.DEFINE_string("train_data", None, "Training text file. "
                    "E.g., unzipped file http://mattmahoney.net/dc/text8.zip.")
flags.DEFINE_string(
    "eval_data", None, "File consisting of analogies of four tokens."
    "embedding 2 - embedding 1 + embedding 3 should be close "
    "to embedding 4."
    "See README.md for how to get 'questions-words.txt'.")

# --- Model and optimisation hyper-parameters ---------------------------------
flags.DEFINE_integer("embedding_size", 200, "The embedding dimension size.")
flags.DEFINE_integer(
    "epochs_to_train", 15,
    "Number of epochs to train. Each epoch processes the training data once "
    "completely.")
flags.DEFINE_float("learning_rate", 0.2, "Initial learning rate.")
flags.DEFINE_integer("num_neg_samples", 100,
                     "Negative samples per training example.")
flags.DEFINE_integer("batch_size", 16,
                     "Number of training examples processed per step "
                     "(size of a minibatch).")
flags.DEFINE_integer("concurrent_steps", 12,
                     "The number of concurrent training steps.")

# --- Corpus pre-processing ---------------------------------------------------
flags.DEFINE_integer("window_size", 5,
                     "The number of words to predict to the left and right "
                     "of the target word.")
flags.DEFINE_integer("min_count", 5,
                     "The minimum number of word occurrences for it to be "
                     "included in the vocabulary.")
flags.DEFINE_float("subsample", 1e-3,
                   "Subsample threshold for word occurrence. Words that appear "
                   "with higher frequency will be randomly down-sampled. Set "
                   "to 0 to disable.")

# --- Run-time behaviour: interactive mode, logging, checkpointing ------------
flags.DEFINE_boolean(
    "interactive", False,
    "If true, enters an IPython interactive session to play with the trained "
    "model. E.g., try model.analogy(b'france', b'paris', b'russia') and "
    "model.nearby([b'proton', b'elephant', b'maxwell'])")
flags.DEFINE_integer("statistics_interval", 5,
                     "Print statistics every n seconds.")
flags.DEFINE_integer("summary_interval", 5,
                     "Save training summary to file every n seconds (rounded "
                     "up to statistics interval).")
flags.DEFINE_integer("checkpoint_interval", 600,
                     "Checkpoint the model (i.e. save the parameters) every n "
                     "seconds (rounded up to statistics interval).")

# `flags.FLAGS` is the object that exposes the flag values as attributes
# (FLAGS.embedding_size, ...). `tf.flags` is only the flags module itself, so
# binding FLAGS to it made every attribute read in `Options` fail.
FLAGS = flags.FLAGS

In [2]:
class Options(object):
    """Plain container holding the run configuration read from ``FLAGS``.

    Each attribute is copied from the command-line flag of (roughly) the same
    name at construction time, so later code can pass a single object around
    instead of reading the global flags directly.
    """

    def __init__(self):
        """Copy the flag values and create the output directory if needed.

        Side effect: when ``--save_path`` is set and does not exist yet, the
        directory (including missing parents) is created.
        """
        # Model hyper-parameters.
        self.emb_dim = FLAGS.embedding_size
        self.train_data = FLAGS.train_data
        self.num_samples = FLAGS.num_neg_samples
        self.learning_rate = FLAGS.learning_rate
        self.epochs_to_train = FLAGS.epochs_to_train
        self.concurrent_steps = FLAGS.concurrent_steps
        self.batch_size = FLAGS.batch_size
        self.window_size = FLAGS.window_size
        self.min_count = FLAGS.min_count
        self.subsample = FLAGS.subsample

        # Reporting / checkpointing cadence, in seconds.
        self.statistics_interval = FLAGS.statistics_interval
        self.summary_interval = FLAGS.summary_interval
        self.checkpoint_interval = FLAGS.checkpoint_interval

        # Where the model and summaries are written. The flag defaults to
        # None, and os.path.exists(None) raises a TypeError, so the directory
        # is only created when a path was actually supplied.
        self.save_path = FLAGS.save_path
        if self.save_path and not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

        # Analogy questions file used for evaluation.
        self.eval_data = FLAGS.eval_data

In [6]:
class Word2Vec(object):
    """Skip-gram word2vec model trained with negative sampling (unfinished).

    The instance keeps the run options, the TensorFlow session and the
    vocabulary lookup tables (``_word2id`` / ``_id2word``).
    """

    def __init__(self, options, session):
        """Store the collaborators and build the model.

        Args:
            options: configuration object (see ``Options``).
            session: TensorFlow session the graph is run in.
        """
        self._options = options
        self._session = session
        self._word2id = {}
        self._id2word = []
        # NOTE(review): buildgraph / build_eval_graph / save_vocab are not
        # defined in the part of the class visible here -- confirm they exist
        # before instantiating this class.
        self.buildgraph()
        self.build_eval_graph()
        self.save_vocab()

    def read_analogies(self):
        """Load the analogy questions from ``options.eval_data``.

        Each non-header line is expected to hold four space-separated words.
        Lines starting with ``:`` are section headers and are ignored. A
        question is skipped when it does not have exactly four tokens or when
        any token is missing from ``self._word2id``.

        The file is read in binary mode, so the tokens looked up are bytes.
        This assumes ``_word2id`` is keyed by bytes, in line with the
        ``b'france'`` style examples in the ``interactive`` flag help --
        TODO confirm against the code that fills the vocabulary.

        Side effect: sets ``self._analogy_questions`` to an int32 array with
        one row of four word ids per kept question.
        """
        questions = []
        questions_skipped = 0
        with open(self._options.eval_data, 'rb') as analogy_f:
            for line in analogy_f:
                # Lines are bytes, so they must be compared against bytes
                # literals (a str literal raises TypeError on Python 3).
                if line.startswith(b':'):
                    continue
                # `.lower()` must be called; the bare attribute has no
                # `.split` and raised AttributeError.
                words = line.strip().lower().split(b' ')
                ids = [self._word2id.get(w.strip()) for w in words]
                if None in ids or len(ids) != 4:
                    questions_skipped += 1
                else:
                    questions.append(np.array(ids))
        print("Eval analogy file: ", self._options.eval_data)
        print("Questions: ", len(questions))
        print("Skipped: ", questions_skipped)
        self._analogy_questions = np.array(questions, dtype=np.int32)

    def forward(self, examples, labels):
        """Build the forward part of the negative-sampling graph.

        Creates the embedding, softmax weight/bias and global-step variables,
        draws the negative samples and looks up the rows needed to score the
        true and the sampled words.

        Args:
            examples: ids of the input (centre) words; presumably a
                ``[batch_size]`` integer tensor -- TODO confirm with caller.
            labels: ids of the words to predict; reshaped to
                ``[batch_size, 1]`` below, so it must hold ``batch_size`` ids.

        Returns:
            None. The method is unfinished: the logits are not computed yet
            (see the TODO at the end).
        """
        opts = self._options

        # embedding: [vocab_size, emb_dim]
        init_width = 0.5 / opts.emb_dim
        emb = tf.Variable(
            tf.random_uniform(
                [opts.vocab_size, opts.emb_dim],
                -init_width, init_width),
            name='emb')
        self._emb = emb

        # softmax weight (transposed): [vocab_size, emb_dim]
        # The options object exposes the dimension as `emb_dim`;
        # `embedding_size` is only the flag name.
        # NOTE(review): the variable name 'sw_m_t' looks like a typo for
        # 'sm_w_t'; left unchanged because it is a runtime graph name.
        sm_w_t = tf.Variable(
            tf.zeros([opts.vocab_size, opts.emb_dim]),
            name='sw_m_t')

        # softmax bias: [vocab_size]
        sm_b = tf.Variable(tf.zeros([opts.vocab_size]), name='sm_b')

        # global step
        self.global_step = tf.Variable(0, name='global_step')

        # The candidate sampler needs the true labels as an int64
        # [batch_size, 1] matrix.
        labels_matrix = tf.reshape(
            tf.cast(labels, dtype=tf.int64),
            [opts.batch_size, 1])

        # Negative sampling: draw `num_samples` word ids from the unigram
        # distribution raised to the 0.75 power.
        # `unique` and `range_max` are required by the sampler, the keyword is
        # `unigrams`, and it takes per-word counts rather than the vocab size.
        # Assumes the graph-building step stores those counts as an array on
        # `opts.vocab_counts` -- TODO confirm.
        sampled_ids, _, _ = (tf.nn.fixed_unigram_candidate_sampler(
            true_classes=labels_matrix,
            num_true=1,
            num_sampled=opts.num_samples,
            unique=True,
            range_max=opts.vocab_size,
            distortion=0.75,
            unigrams=opts.vocab_counts.tolist()))

        # Embeddings for examples: [batch_size, emb_dim]
        example_emb = tf.nn.embedding_lookup(emb, examples)

        # Weights for labels: [batch_size, emb_dim]
        # (looked up with `labels`; using `examples` scored each word against
        # itself instead of against its target)
        true_w = tf.nn.embedding_lookup(sm_w_t, labels)
        # Biases for labels: [batch_size]
        true_b = tf.nn.embedding_lookup(sm_b, labels)

        # Weights for sampled ids: [num_samples, emb_dim]
        sampled_w = tf.nn.embedding_lookup(sm_w_t, sampled_ids)
        # Biases for sampled ids: [num_samples]
        sampled_b = tf.nn.embedding_lookup(sm_b, sampled_ids)

        # TODO: compute and return the true logits and the sampled logits
        # ([batch_size, num_sampled]); nothing is returned yet.
        

# Negative Sampling

## Introduction
A neural probabilistic language model specifies the distribution of the target word $w$ given a sequence of context words $h$:  
  
$$ P_{\theta}^{h}(w) = \frac{\exp(s_\theta(w, h))}{\sum_{w'} \exp(s_\theta(w', h))} $$

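This equation is intractable in practice because the normalizing sum runs over the whole vocabulary; three solutions have been proposed:
1. tree-structured vocabulary --> nontrivial
2. importance sampling --> unstable
3. NCE (noise-contrastive estimation)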

## Scalable Log-Bilinear Models
Let $q_w$ and $\gamma_w$ be the target and context representations of word $w$. Given a sequence of context words $h = w_1, ..., w_n$, the model computes the predicted representation of the target word as $\widehat{q}_h = \sum_{i=1}^{n} c_i \odot \gamma_{w_i}$, where $c_i$ is the weight vector for context position $i$, and scores a candidate word with $s_\theta(w,h) = \widehat{q}_h^T q_w + b_w$  
  
As our main concern is learning word representations, we are free to move away from the paradigm of predicting the target from its context and do the reverse. This is motivated by the **distributional hypothesis**: words with similar meanings often occur in similar contexts. Thus we look for word representations that capture the context distributions, i.e. we predict the context from the word.

Unfortunately, predicting an n-word context requires modeling the joint distribution of n words, which is considerably harder than modeling a single word. We can make this tractable by assuming that, given the current word $w$, the words in different context positions are conditionally independent:

$$P_\theta^w(h) = \prod_{i=1}^{n} p_{i,\theta}^w(w_i) $$
$$s_{i,\theta}(w_i,w) = (c_i \odot \gamma_w)^T q_{w_i} + b_{w_i}$$

where the position-specific weight $c_i$ is optional.

