Implementation of the word2vec skip-gram model, trained with noise-contrastive estimation (NCE) loss

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

from process_data import process_data


In [2]:
# --- Model / data dimensions ---
VOCAB_SIZE = 50000   # rows of the embedding matrix and number of NCE classes
BATCH_SIZE = 128     # examples fed to the graph per training step
EMBED_SIZE = 128     # dimensionality of each word embedding vector
SKIP_WINDOW = 1      # context window, forwarded to process_data

# --- NCE sampling ---
NUM_SAMPLED = 64     # negative examples drawn per batch by the NCE loss

# --- Optimisation / reporting ---
LEARNING_RATE = 1.0      # step size for plain gradient descent
NUM_TRAIN_STEPS = 100000 # total number of training iterations
SKIP_STEP = 5000         # report the averaged loss once every this many steps


In [6]:
def word2vec(dataset):
    """Build the skip-gram graph with an NCE loss and train it on ``dataset``.

    Args:
        dataset: iterator advanced with ``next()``. Each item must unpack into
            ``(centers, context)``, which are fed to the int32 placeholders of
            shape ``[BATCH_SIZE]`` and ``[BATCH_SIZE, 1]`` respectively.

    Returns:
        None. The average loss is printed every ``SKIP_STEP`` steps and the
        graph is written to ``./graphs/no_frills/`` through a summary writer.
    """
    # ---- Phase 1: assemble the graph ----
    # Step 1: placeholders for the input (center word) and output (context word)
    with tf.name_scope('data'):
        center_words = tf.placeholder(tf.int32, shape=[BATCH_SIZE], name='center_words')
        context_words = tf.placeholder(tf.int32, shape=[BATCH_SIZE, 1], name='context_words')

    # Step 2: embedding matrix (weights of the hidden layer)
    with tf.name_scope('embedding_matrix'):
        # The name is attached to the Variable itself (not to the initializer
        # op), matching how nce_weight / nce_bias are named below.
        embed = tf.Variable(tf.random_uniform([VOCAB_SIZE, EMBED_SIZE], -1.0, 1.0),
                            name='embed_matrix')

    # Step 3: forward inference
    with tf.name_scope('loss'):
        representation = tf.nn.embedding_lookup(embed, center_words, name='lookup')

        # Step 4: NCE loss (weights and biases of the output layer)
        # NOTE(review): stddev=1.0 is kept as in the original; a smaller scale
        # such as 1/sqrt(EMBED_SIZE) is often used here -- confirm intent.
        nce_weight = tf.Variable(tf.truncated_normal([VOCAB_SIZE, EMBED_SIZE], stddev=1.0),
                                 name='nce_weight')
        nce_bias = tf.Variable(tf.zeros([VOCAB_SIZE]), name='nce_bias')
        loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                             biases=nce_bias,
                                             labels=context_words,
                                             inputs=representation,
                                             num_sampled=NUM_SAMPLED,
                                             num_classes=VOCAB_SIZE), name='loss')

    # Step 5: optimizer
    optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)

    # ---- Phase 2: execute the computation ----
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter('./graphs/no_frills/', sess.graph)
        try:
            # Loss accumulated since the last report; reset after each print.
            total_loss = 0.0

            for index in range(NUM_TRAIN_STEPS):
                centers, context = next(dataset)
                batch_loss, _ = sess.run([loss, optimizer],
                                         feed_dict={center_words: centers,
                                                    context_words: context})
                total_loss += batch_loss

                if (index + 1) % SKIP_STEP == 0:
                    # `index` is zero-based, so the first report is labelled 4999.
                    print('Average loss at step {} : {}'.format(index, total_loss/SKIP_STEP))
                    total_loss = 0.0
        finally:
            # Close the event file even if a training step raises.
            writer.close()

def main():
    """Create the batch generator from the configured constants and train on it."""
    word2vec(process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW))

# Start training only when this code runs as the main module (which includes
# executing the cell in a notebook kernel), not when it is imported.
if __name__ == '__main__':
    main()

Dataset ready
Average loss at step 4999 : 87.01694188632965
Average loss at step 9999 : 28.162707678699494
Average loss at step 14999 : 15.702117580413818
Average loss at step 19999 : 10.9430840051651
Average loss at step 24999 : 8.775613186168671
Average loss at step 29999 : 7.956060899782181
Average loss at step 34999 : 7.2501611609458925
Average loss at step 39999 : 6.706117380619049
Average loss at step 44999 : 6.396649955368042
Average loss at step 49999 : 6.121219446563721
Average loss at step 54999 : 6.078530253648758
Average loss at step 59999 : 5.902586997079849
Average loss at step 64999 : 5.688620286917686
Average loss at step 69999 : 5.677082128810882
Average loss at step 74999 : 5.526431411600113
Average loss at step 79999 : 5.551550924563408
Average loss at step 84999 : 5.50891513915062
Average loss at step 89999 : 5.412726171064377
Average loss at step 94999 : 5.333961833238602
Average loss at step 99999 : 5.263475458574295
