In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.contrib.tensorboard.plugins import projector
import os
from process_data import process_data,download

In [2]:
# Hyper-parameters and run settings shared by the cells below.
# Vocabulary size: passed to process_data and used as the model's vocab_size.
VOCAB_SIZE = 50000
# Number of (center, target) pairs per batch; also fixes the placeholder shapes.
BATCH_SIZE = 128
EMBED_SIZE = 128 # dimension of the word embedding vectors
SKIP_WINDOW = 1 # the context window (forwarded to process_data)
NUM_SAMPLED = 64    # Number of negative examples to sample.
# Step size handed to the gradient descent optimizer.
LEARNING_RATE = 1.0
# Number of training iterations performed by SkipGramModel.train.
NUM_TRAIN_STEPS = 10000
# NOTE(review): not referenced in the visible cells — presumably the folder
# used by process_data for its files; TODO confirm.
WEIGHTS_FLD = 'processed/'
SKIP_STEP = 2000 # how many steps to skip before reporting the loss (a checkpoint is saved at the same time)

# 클래스 형식 TF 코드 

* FLAG로 config 받기
* init으로 초기화 후, build model으로 연결할 것
* train (or run_epoch) // test 함수를 따로 둘 것
* summary로 텐서보드 사용하기
* generator로 batch를 가져와서 epoch 돌도록 하기 (generator를 공부하기!)
* save와 restore로 serving까지

In [3]:
class SkipGramModel(object):
    """Skip-gram word2vec model trained with noise-contrastive estimation (NCE).

    Usage: construct the model, call build_graph() once to assemble the
    TensorFlow graph, then call train() with a batch generator.
    """

    def __init__(self, vocab_size, embed_size, batch_size, num_sampled, learning_rate):
        """Store the hyper-parameters and create the global step counter.

        Args:
            vocab_size: number of rows of the embedding and NCE weight matrices.
            embed_size: dimension of each embedding vector.
            batch_size: number of (center, target) pairs fed per training step.
            num_sampled: number of negative classes sampled by the NCE loss.
            learning_rate: step size given to the gradient descent optimizer.
        """
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.batch_size = batch_size
        self.num_sampled = num_sampled
        self.lr = learning_rate
        # Incremented by the optimizer on every training step; not trainable.
        self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')

    def _create_placeholders(self):
        """ Step 1: define the placeholders for input and output """
        with tf.name_scope("data"):
            # Center word ids: one int32 id per example.
            self.center_words = tf.placeholder(tf.int32, shape=[self.batch_size], name='center_words')
            # Target word ids: shaped [batch_size, 1] because they are fed to nce_loss as labels.
            self.target_words = tf.placeholder(tf.int32, shape=[self.batch_size, 1], name='target_words')

    def _create_embedding(self):
        """ Step 2: define weights. In word2vec, it's actually the weights that we care about """
        # Assemble this part of the graph on the CPU. You can change it to GPU if you have GPU
        with tf.device('/cpu:0'):
            with tf.name_scope("embed"):
                # One row per vocabulary entry, initialised uniformly in [-1, 1).
                self.embed_matrix = tf.Variable(
                    tf.random_uniform([self.vocab_size, self.embed_size], -1.0, 1.0),
                    name='embed_matrix')

    def _create_loss(self):
        """ Step 3 + 4: define the model + the loss function """
        with tf.device('/cpu:0'):
            with tf.name_scope("loss"):
                # Step 3: define the inference (look up the embedding row of each center word)
                embed = tf.nn.embedding_lookup(self.embed_matrix, self.center_words, name='embed')

                # Step 4: define loss function
                # construct variables for NCE loss
                nce_weight = tf.Variable(
                    tf.truncated_normal([self.vocab_size, self.embed_size],
                                        stddev=1.0 / (self.embed_size ** 0.5)),
                    name='nce_weight')
                # Sized from the instance's vocab_size (not the module-level
                # VOCAB_SIZE) so the bias always matches nce_weight.
                nce_bias = tf.Variable(tf.zeros([self.vocab_size]), name='nce_bias')

                # define loss function to be NCE loss function, averaged over the batch
                self.loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                                          biases=nce_bias,
                                                          labels=self.target_words,
                                                          inputs=embed,
                                                          num_sampled=self.num_sampled,
                                                          num_classes=self.vocab_size), name='loss')

    def _create_optimizer(self):
        """ Step 5: define optimizer """
        with tf.device('/cpu:0'):
            # minimize() also increments self.global_step once per run.
            self.optimizer = tf.train.GradientDescentOptimizer(self.lr).minimize(
                self.loss, global_step=self.global_step)

    def _create_summaries(self):
        """ Define the TensorBoard summaries and merge them into a single op """
        with tf.name_scope("summaries"):
            tf.summary.scalar("loss", self.loss)
            tf.summary.histogram("histogram_loss", self.loss)
            # because you have several summaries, we should merge them all
            # into one op to make it easier to manage
            self.summary_op = tf.summary.merge_all()

    def build_graph(self):
        """ Build the graph for our model """
        self._create_placeholders()
        self._create_embedding()
        self._create_loss()
        self._create_optimizer()
        self._create_summaries()

    def train(self, batch_gen):
        """Run NUM_TRAIN_STEPS training steps, logging summaries and saving checkpoints.

        Resumes from the latest checkpoint in 'ckp/skip_gram/' when one exists.
        Every SKIP_STEP steps the average loss is printed and a checkpoint is
        saved. build_graph() must have been called first.

        Args:
            batch_gen: iterator; each next() call must yield a (centers, targets)
                pair matching the center_words / target_words placeholders.
        """
        ckpt_dir = 'ckp/skip_gram/'
        saver = tf.train.Saver()
        # defaults to saving all variables - in this case embed_matrix, nce_weight, nce_bias

        # Make sure the checkpoint folder exists before the first save.
        if not os.path.isdir(ckpt_dir):
            os.makedirs(ckpt_dir)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            ckpt = tf.train.get_checkpoint_state(os.path.dirname(ckpt_dir))
            # if that checkpoint exists, restore from checkpoint
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)

            # Read the step count after a possible restore so summaries and
            # checkpoints continue the numbering instead of restarting at 0.
            initial_step = int(sess.run(self.global_step))

            total_loss = 0.0 # we use this to calculate late average loss in the last SKIP_STEP steps
            # Name the log directory after this model's own learning rate.
            writer = tf.summary.FileWriter('improved_graph/lr' + str(self.lr), sess.graph)
            try:
                for index in range(NUM_TRAIN_STEPS):
                    centers, targets = next(batch_gen)
                    loss_batch, _, summary = sess.run(
                        [self.loss, self.optimizer, self.summary_op],
                        feed_dict={self.center_words: centers, self.target_words: targets})
                    # The optimizer increments global_step once per run, so this
                    # equals the global step after the update.
                    step = initial_step + index + 1
                    writer.add_summary(summary, global_step=step)
                    total_loss += loss_batch
                    if (index + 1) % SKIP_STEP == 0:
                        print('Average loss at step {}: {:5.1f}'.format(index, total_loss / SKIP_STEP))
                        total_loss = 0.0
                        saver.save(sess, 'ckp/skip_gram/SK.model', step)
            finally:
                # Flush and close the event file even if training is interrupted.
                writer.close()

In [3]:
# Build the batch iterator; each next() call on it yields a (centers, targets) pair.
batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)

Dataset ready


In [4]:
# Draw one batch for inspection; keep the center-word ids and discard the targets.
centers, _ = next(batch_gen)

In [7]:
# Show the center-word ids of the batch drawn above.
centers

array([ 5236,  3082,  3082,    12,    12,     6,     6,   195,   195,
           2,     2,  3137,  3137,    46,    46,    59,    59,   156,
         156,   128,   128,   742,   742,   477,   477, 10588, 10588,
         134,   134,     1,     1, 27511, 27511,     2,     2,     1,
           1,   103,   103,   855,   855,     3,     3,     1,     1,
       15111, 15111,     0,     0,     2,     2,     1,     1,   151,
         151,   855,   855,  3582,  3582,     1,     1,   195,   195,
          11,    11,   191,   191,    59,    59,     5,     5,     6,
           6, 10730, 10730,   215,   215,     7,     7,  1326,  1326,
         105,   105,   455,   455,    20,    20,    59,    59,  2732,
        2732,   363,   363,     7,     7,  3676,  3676,     1,     1,
         709,   709,     2,     2,   372,   372,    27,    27,    41,
          41,    37,    37,    54,    54,   540,   540,    98,    98,
          12,    12,     6,     6,  1424,  1424,  2759,  2759,    19,
          19,   568]

In [4]:
def main():
    """Assemble the skip-gram graph, then train it on a fresh batch generator."""
    skip_gram = SkipGramModel(
        vocab_size=VOCAB_SIZE,
        embed_size=EMBED_SIZE,
        batch_size=BATCH_SIZE,
        num_sampled=NUM_SAMPLED,
        learning_rate=LEARNING_RATE,
    )
    skip_gram.build_graph()
    # The generator is created after the graph is built, right before training.
    skip_gram.train(process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW))


# Start training only when this code runs as the main module, not when it is imported.
if __name__ == '__main__':
    main()

Dataset ready
Average loss at step 1999: 113.1
Average loss at step 3999:  52.5
Average loss at step 5999:  33.4
Average loss at step 7999:  23.8
Average loss at step 9999:  17.4


# API 

#### tf.nn.embedding_lookup(params, ids, partition_strategy='mod', name=None, validate_indices=True, max_norm=None) 

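먼저 임베딩 매트릭스 [Vocab size, Embedding size(dimension)] 를 Variable로 가지고 있어야 함..! 이 파라미터도 jointly 훈련시킬 수 있지만, pre_trained 된 매트릭스를 <strong>params</strong>으로 넣을 수도 있을 것. <strong>ids</strong>에는 one-hot 벡터가 아니라 단어의 정수 id 텐서(여기서는 [batch_size] shape의 center_words)가 들어가며, 각 id에 해당하는 <strong>params</strong>의 행, 즉 {'vocab id' : 'vocab's embedding'} 관계의 임베딩 벡터가 반환됨. one-hot으로 표현하면 (Vocab size)만큼의 차원이 필요하지만, embedding_lookup은 id로 해당 행을 바로 조회함.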

#### tf.nn.nce_loss(weights, biases, labels, inputs, num_sampled, num_classes, ...)

배치 사이즈 만큼의 negative sampling softmax 로 계산한 loss 리턴
* weights는 embedding_matrix와 동일한 shape의 [Vocab D, Embedding D]의 nce weights
* biases는 Vocab size만큼의 shape
* labels는 배치사이즈 만큼의, 현재 center word로부터 나타나는 target_words ([batch_size, 1] shape)
* inputs은 배치사이즈 만큼의 center words를 embedding_lookup으로 변환한 임베딩 벡터 ([batch_size, embed_size] shape)
* num_sampled : negative sampling 갯수
* num_classes :  전체 Vocab의 수