In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.contrib.tensorboard.plugins import projector

from process_data import process_data,download

# Hyperparameters

In [2]:
# Training configuration shared by process_data, SkipGramModel and main below.
VOCAB_SIZE = 50000 # row count of the embedding / NCE weight tables and num_classes of the NCE loss
BATCH_SIZE = 128 # examples per training step; fixes the first dimension of both placeholders
EMBED_SIZE = 128 # dimension of the word embedding vectors
SKIP_WINDOW = 1 # the context window (forwarded to process_data)
NUM_SAMPLED = 64    # Number of negative examples to sample (num_sampled of tf.nn.nce_loss).
LEARNING_RATE = 1.0 # step size handed to the gradient-descent optimizer
NUM_TRAIN_STEPS = 10000 # number of batches pulled from the generator by run_epoch
SKIP_STEP = 2000 # how many steps to skip before reporting the loss

# 클래스 형식 TF 코드 

* FLAG로 config 받기
* init으로 초기화 후, build model으로 연결할 것
* train (or run_epoch) // test 함수를 따로 둘 것
* summary로 텐서보드 사용하기
* generator로 batch를 가져와서 epoch 돌도록 하기 (generator를 공부하기!)
* save와 restore로 serving까지

In [23]:
class SkipGramModel:
    """Skip-gram word2vec model trained with noise-contrastive estimation (NCE).

    Instantiating the class adds the whole training graph (placeholders,
    embedding lookup, NCE loss and gradient-descent step) to the current
    default TensorFlow graph; ``run_epoch`` then trains it in a fresh session.

    Args:
        vocab_size: row count of the embedding / NCE weight tables and
            ``num_classes`` of the NCE loss.
        embed_size: dimension of each embedding vector.
        batch_size: fixed number of examples the placeholders accept per step.
        num_sampled: number of negative classes sampled by the NCE loss.
        learning_rate: step size of the gradient-descent optimizer.

    Every argument left as ``None`` falls back to the module-level constant of
    the same upper-case name (looked up at construction time), so a bare
    ``SkipGramModel()`` builds exactly the same graph as before.
    """

    def __init__(self, vocab_size=None, embed_size=None, batch_size=None,
                 num_sampled=None, learning_rate=None):
        # Resolve defaults lazily so edits to the notebook's config cell are
        # picked up without having to re-run the class definition.
        self.vocab_size = VOCAB_SIZE if vocab_size is None else vocab_size
        self.embed_size = EMBED_SIZE if embed_size is None else embed_size
        self.batch_size = BATCH_SIZE if batch_size is None else batch_size
        self.num_sampled = NUM_SAMPLED if num_sampled is None else num_sampled
        self.learning_rate = LEARNING_RATE if learning_rate is None else learning_rate

        # Order matters: each step consumes tensors created by the previous one.
        self._create_placeholders()
        self._create_embedding()
        self._create_loss()
        self._create_optimizer()

    def _create_placeholders(self):
        """Step 1: define the placeholders for input and output.

        ``center_words`` is int32 of shape [batch_size]; ``target_words`` is
        int32 of shape [batch_size, 1] (the label layout passed to nce_loss).
        """
        self.center_words = tf.placeholder(tf.int32, shape=[self.batch_size],
                                           name='center_words')
        self.target_words = tf.placeholder(tf.int32, shape=[self.batch_size, 1],
                                           name='target_words')

    def _create_embedding(self):
        """Step 2: define the embedding table and look up the center words.

        In word2vec the embedding matrix is the result we actually care about.
        """
        self.embed_matrix = tf.Variable(
            tf.random_uniform([self.vocab_size, self.embed_size], -1.0, 1.0),
            name='embed_matrix')
        self.embed = tf.nn.embedding_lookup(self.embed_matrix, self.center_words,
                                            name='embed')

    def _create_loss(self):
        """Step 3 + 4: define the inference + the loss function (mean NCE loss)."""
        # NOTE(review): the TensorFlow word2vec tutorial scales this initializer
        # with stddev = 1 / sqrt(embed_size); the default stddev is kept here so
        # training results stay comparable with earlier runs -- confirm intent.
        self.nce_weight = tf.Variable(
            tf.truncated_normal([self.vocab_size, self.embed_size]),
            name='nce_weight')
        self.nce_bias = tf.Variable(tf.zeros([self.vocab_size]), name='nce_bias')

        per_example_loss = tf.nn.nce_loss(weights=self.nce_weight,
                                          biases=self.nce_bias,
                                          labels=self.target_words,
                                          inputs=self.embed,
                                          num_sampled=self.num_sampled,
                                          num_classes=self.vocab_size)
        self.loss = tf.reduce_mean(per_example_loss, name='loss')

    def _create_optimizer(self):
        """Step 5: define the optimizer (plain gradient descent on the loss)."""
        self.optimizer = tf.train.GradientDescentOptimizer(
            self.learning_rate).minimize(self.loss)

    def run_epoch(self, batch_gen, num_train_steps=None, skip_step=None,
                  logdir='./my_graph/word2vec/'):
        """Train the model and print the running average loss.

        Opens a new session, initializes all global variables, writes the graph
        to ``logdir`` for TensorBoard and performs ``num_train_steps``
        optimizer steps. The session (and with it the trained variable values)
        is discarded when this method returns.

        Args:
            batch_gen: iterator yielding ``(centers, targets)`` pairs that are
                fed to the ``center_words`` / ``target_words`` placeholders.
            num_train_steps: number of batches to consume; defaults to the
                module-level ``NUM_TRAIN_STEPS``.
            skip_step: report the average loss every this many steps; defaults
                to the module-level ``SKIP_STEP``.
            logdir: directory for the TensorBoard event file.

        Raises:
            StopIteration: if ``batch_gen`` runs out before ``num_train_steps``
                batches were produced.
        """
        if num_train_steps is None:
            num_train_steps = NUM_TRAIN_STEPS
        if skip_step is None:
            skip_step = SKIP_STEP

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            # Handing the graph to the constructor already records it in the
            # event file, so no separate add_graph() call is needed.
            writer = tf.summary.FileWriter(logdir, sess.graph)
            try:
                # Accumulates the loss over the last `skip_step` steps so the
                # average can be reported, then is reset.
                total_loss = 0.0
                for index in range(num_train_steps):
                    centers, targets = next(batch_gen)
                    loss_batch, _ = sess.run(
                        [self.loss, self.optimizer],
                        feed_dict={self.center_words: centers,
                                   self.target_words: targets})
                    total_loss += loss_batch
                    if (index + 1) % skip_step == 0:
                        # `index` is zero-based, hence "step 1999" after 2000 steps.
                        print('Average loss at step {}: {:5.1f}'.format(index, total_loss / skip_step))
                        total_loss = 0.0
            finally:
                # Always flush/close the event file, even if training fails.
                writer.close()

In [24]:
def main():
    """Prepare the training batches, build the skip-gram model and train it."""
    # Data is prepared before the graph is built, then handed to the trainer.
    training_batches = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)
    skip_gram = SkipGramModel()
    skip_gram.run_epoch(training_batches)

# Run the full pipeline (data preparation + training) when executed directly.
if __name__ == '__main__':
    main()

Dataset ready
Average loss at step 1999: 132.6
Average loss at step 3999:  62.8
Average loss at step 5999:  40.8
Average loss at step 7999:  28.7
Average loss at step 9999:  22.1
