In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import numpy as np
from tensorflow.contrib.tensorboard.plugins import projector
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [3]:
import random
import urllib
import zipfile
from collections import Counter

In [4]:
# Base URL of the dataset; download() appends the file name to it.
DOWNLOAD_URL = 'http://mattmahoney.net/dc/'
# Size in bytes the downloaded archive must have; download() compares the
# on-disk size against this value to detect a corrupted download.
EXPECTED_BYTES = 31344016
# Local folder prefix; it is concatenated directly with the file name, so the
# trailing slash is required.
DATA_FOLDER = 'data/'
# Name of the zip archive, used both in the URL and for the local copy.
FILE_NAME = 'text8.zip'

# Download the dataset
def download(file_name, expected_bytes):
    """Download ``file_name`` into ``DATA_FOLDER`` unless a local copy already exists.

    Args:
        file_name: Name of the archive; appended to ``DOWNLOAD_URL`` to build
            the remote URL and to ``DATA_FOLDER`` to build the local path.
        expected_bytes: Size in bytes the downloaded file must have.

    Returns:
        The local path of the archive.

    Raises:
        Exception: If the size of the freshly downloaded file differs from
            ``expected_bytes``.
    """
    # `import urllib` alone does not load the `request` submodule on Python 3,
    # so import the function explicitly where it is needed.
    from urllib.request import urlretrieve

    file_path = DATA_FOLDER + file_name
    if os.path.exists(file_path):
        # NOTE(review): an existing file is trusted without a size check, so a
        # previously corrupted download is reported as ready.
        print("Dataset ready")
        return file_path

    # urlretrieve does not create missing directories, so make sure the
    # destination folder exists before downloading.
    folder = os.path.dirname(file_path)
    if folder:
        os.makedirs(folder, exist_ok=True)

    # Keep the returned path in its own name instead of shadowing `file_name`.
    saved_path, _ = urlretrieve(DOWNLOAD_URL + file_name, file_path)
    file_stat = os.stat(file_path)
    if file_stat.st_size == expected_bytes:
        print('Successfully downloaded the file', saved_path)
    else:
        raise Exception('File ' + saved_path +
                        ' might be corrupted. You should try downloading it with a browser.')
    return file_path

# Read the data
def read_data(file_path):
    """Return the whitespace-separated tokens of the first member of a zip archive.

    Args:
        file_path: Path to the zip archive.

    Returns:
        A list of strings obtained by splitting the decoded contents of the
        archive's first member on whitespace.
    """
    with zipfile.ZipFile(file_path) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    text = tf.compat.as_str(raw_bytes)
    return text.split()

# Build the vocabulary
def build_vocab(words, vocab_size):
    """Build word<->index mappings for the most frequent words.

    Index 0 is reserved for the ``'UNK'`` token; the remaining
    ``vocab_size - 1`` slots go to the most common words, in descending order
    of frequency. As a side effect, the first 1000 vocabulary entries are
    written, one per line, to ``processed/vocab_1000.tsv``.

    Args:
        words: Iterable of word tokens.
        vocab_size: Total vocabulary size, including the ``'UNK'`` slot.

    Returns:
        A ``(dictionary, index_dictionary)`` tuple: word -> index and
        index -> word.
    """
    count = [('UNK', -1)]
    count.extend(Counter(words).most_common(vocab_size - 1))

    # The output folder is not guaranteed to exist on a fresh checkout;
    # create it so that open() below does not fail.
    os.makedirs('processed', exist_ok=True)

    dictionary = dict()
    with open('processed/vocab_1000.tsv', "w") as f:
        for index, (word, _) in enumerate(count):
            dictionary[word] = index
            # Only the first 1000 words are exported as metadata.
            if index < 1000:
                f.write(word + "\n")
    index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, index_dictionary

# Map words to their vocabulary indices
def convert_words_to_index(words, dictionary):
    """Translate each word to its index, using 0 for words missing from ``dictionary``.

    Args:
        words: Iterable of word tokens.
        dictionary: Mapping from word to integer index.

    Returns:
        A list of indices, one per input word.
    """
    unknown_index = 0
    return [dictionary.get(token, unknown_index) for token in words]

# Reorganize the data into skip-gram (center, target) pairs
def generate_sample(index_words, context_window_size):
    """Yield ``(center, target)`` pairs for skip-gram training.

    For every position a window radius is drawn uniformly from
    ``1..context_window_size``; the words up to that many positions before the
    center are yielded first, followed by those after it.

    Args:
        index_words: Sequence of word indices.
        context_window_size: Largest window radius that may be drawn.

    Yields:
        Tuples ``(center, target)`` of word indices.
    """
    for position, center_word in enumerate(index_words):
        radius = random.randint(1, context_window_size)
        # Slice bounds for the words before, then after, the center word.
        window_bounds = (
            (max(0, position - radius), position),
            (position + 1, position + radius + 1),
        )
        for start, stop in window_bounds:
            for neighbor in index_words[start:stop]:
                yield center_word, neighbor

# Group the samples into numpy batches
def get_batch(iterator, batch_size):
    """Yield batches of ``(center, target)`` pairs as numpy arrays.

    Args:
        iterator: Iterator producing ``(center, target)`` index pairs.
        batch_size: Number of pairs per batch.

    Yields:
        ``(center_batch, target_batch)`` where ``center_batch`` is an int32
        array of shape ``(batch_size,)`` and ``target_batch`` is an int32 array
        of shape ``(batch_size, 1)``.

    The generator stops cleanly once ``iterator`` is exhausted; a trailing,
    incomplete batch is dropped.
    """
    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        # Word indices are integers: use int32 here as well so both arrays
        # have the same dtype and no cast is needed downstream.
        target_batch = np.zeros([batch_size, 1], dtype=np.int32)
        for index in range(batch_size):
            try:
                center_batch[index], target_batch[index] = next(iterator)
            except StopIteration:
                # A StopIteration escaping a generator body is turned into a
                # RuntimeError (PEP 479); end the generator explicitly instead.
                return
        yield center_batch, target_batch

# Data processing
def process_data(vocab_size, batch_size, skip_window):
    """Run the full data pipeline and return a batch generator.

    The dataset is downloaded (if needed) and read, a vocabulary is built,
    the corpus is converted to indices, and the resulting skip-gram pairs are
    grouped into batches.

    Args:
        vocab_size: Vocabulary size passed to ``build_vocab``.
        batch_size: Number of pairs per batch.
        skip_window: Maximum context window radius passed to ``generate_sample``.

    Returns:
        The generator produced by ``get_batch``.
    """
    archive_path = download(FILE_NAME, EXPECTED_BYTES)
    corpus = read_data(archive_path)
    word_to_index = build_vocab(corpus, vocab_size)[0]
    indexed_corpus = convert_words_to_index(corpus, word_to_index)
    del corpus  # drop the reference to the raw word list to save memory
    pair_stream = generate_sample(indexed_corpus, skip_window)
    return get_batch(pair_stream, batch_size)

# Create the folder if it does not exist
def make_dir(path):
    """Create directory ``path`` (and any missing parents) if it does not exist.

    An already existing directory is not an error. Other failures, such as
    insufficient permissions, are raised instead of being silently ignored,
    so that later writes into the folder do not fail for an unexplained reason.

    Args:
        path: Directory path to create.

    Raises:
        OSError: If the directory cannot be created.
    """
    os.makedirs(path, exist_ok=True)

In [5]:
# Vocabulary size, including the slot reserved for the 'UNK' token.
VOCAB_SIZE = 50000
# Number of (center, target) pairs per training batch.
BATCH_SIZE = 128
EMBED_SIZE = 128 # dimension of the word embedding vectors
SKIP_WINDOW = 1 # largest context window radius; the radius actually used is drawn at random per center word
NUM_SAMPLED = 64    # Number of negative examples to sample.
# Learning rate handed to the gradient-descent optimizer.
LEARNING_RATE = 1.0
# Number of training steps run by train_model.
NUM_TRAIN_STEPS = 100000
# Passed to train_model as weights_fld; train_model does not currently read it.
WEIGHTS_FLD = 'processed/'
# The average loss is printed and a checkpoint is saved every SKIP_STEP steps.
SKIP_STEP = 2000

In [6]:
class SkipGramModel:
    """Skip-gram word2vec model trained with an NCE loss.

    Construct the object, then call ``build_graph()`` to create the
    placeholders, embedding matrix, loss, optimizer and summaries.
    """

    def __init__(self, vocab_size, embed_size, batch_size, num_sampled, learning_rate):
        """Store the hyper-parameters and create the global step counter.

        Args:
            vocab_size: Number of words in the vocabulary.
            embed_size: Dimension of the word embedding vectors.
            batch_size: Number of (center, target) pairs per batch.
            num_sampled: Number of negative classes sampled by the NCE loss.
            learning_rate: Learning rate of the gradient-descent optimizer.
        """
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.batch_size = batch_size
        self.num_sampled = num_sampled
        self.lr = learning_rate
        self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')

    def _create_placeholders(self):
        """Define the placeholders that receive the input data."""
        with tf.name_scope("data"):
            self.center_words = tf.placeholder(tf.int32, shape=[self.batch_size], name='center_words')
            self.target_words = tf.placeholder(tf.int32, shape=[self.batch_size, 1], name='target_words')

    def _create_embedding(self):
        """Create the embedding matrix.

        In word2vec each row of this weight matrix is the vector of one word,
        which is why it is the important result of training.
        """
        # Run on the CPU
        with tf.device('/cpu:0'):
            with tf.name_scope("embed"):
                self.embed_matrix = tf.Variable(
                    tf.random_uniform([self.vocab_size, self.embed_size], -1.0, 1.0),
                    name='embed_matrix')

    def _create_loss(self):
        """Define the word2vec lookup and the loss function."""
        with tf.device('/cpu:0'):
            with tf.name_scope("loss"):
                # This is simply a table lookup
                embed = tf.nn.embedding_lookup(self.embed_matrix, self.center_words, name='embed')

                # Define the loss. The vocabulary is usually large, so
                # hierarchical softmax or negative sampling is used.
                nce_weight = tf.Variable(
                    tf.truncated_normal([self.vocab_size, self.embed_size],
                                        stddev=1.0 / (self.embed_size ** 0.5)),
                    name='nce_weight')
                # Size the bias from the model's own vocabulary size rather
                # than a module-level constant, so it always matches
                # nce_weight and num_classes.
                nce_bias = tf.Variable(tf.zeros([self.vocab_size]), name='nce_bias')

                self.loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                                          biases=nce_bias,
                                                          labels=self.target_words,
                                                          inputs=embed,
                                                          num_sampled=self.num_sampled,
                                                          num_classes=self.vocab_size), name='loss')

    def _create_optimizer(self):
        """Set up the optimizer."""
        with tf.device('/cpu:0'):
            self.optimizer = tf.train.GradientDescentOptimizer(self.lr).minimize(
                self.loss, global_step=self.global_step)

    def _create_summaries(self):
        """Set up the summaries used for visualization in TensorBoard."""
        with tf.name_scope("summaries"):
            tf.summary.scalar("loss", self.loss)
            tf.summary.histogram("histogram_loss", self.loss)
            # There are several summaries, so merge them all into one op
            self.summary_op = tf.summary.merge_all()

    def build_graph(self):
        """Build the complete graph of the model."""
        self._create_placeholders()
        self._create_embedding()
        self._create_loss()
        self._create_optimizer()
        self._create_summaries()

In [7]:
def train_model(model, batch_gen, num_train_steps, weights_fld):
    """Train ``model``, resuming from the latest checkpoint when one exists.

    Every ``SKIP_STEP`` steps the average loss over those steps is printed
    and a checkpoint is written to ``checkpoints/skip-gram``. Summaries are
    written for every step.

    Args:
        model: Object exposing ``global_step``, ``center_words``,
            ``target_words``, ``loss``, ``optimizer`` and ``summary_op``.
        batch_gen: Iterator yielding ``(centers, targets)`` numpy batches.
        num_train_steps: Number of training steps to run in this call.
        weights_fld: Currently unused; kept so existing callers keep working.
    """
    # defaults to saving all variables - embed_matrix, nce_weight, nce_bias and global_step
    saver = tf.train.Saver()

    make_dir('checkpoints')
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))
        # if that checkpoint exists, restore from checkpoint
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)

        total_loss = 0.0  # accumulates the loss over the last SKIP_STEP steps
        writer = tf.summary.FileWriter('improved_graph/lr' + str(LEARNING_RATE), sess.graph)
        try:
            # Continue counting from the restored step (0 on a fresh start).
            initial_step = model.global_step.eval()
            for index in range(initial_step, initial_step + num_train_steps):
                centers, targets = next(batch_gen)
                # Force the shapes and dtype expected by the placeholders.
                centers = centers.reshape(centers.shape[0]).astype(np.int32)
                targets = targets.reshape((targets.shape[0], 1)).astype(np.int32)

                feed_dict = {model.center_words: centers, model.target_words: targets}
                loss_batch, _, summary = sess.run([model.loss, model.optimizer, model.summary_op],
                                                  feed_dict=feed_dict)
                writer.add_summary(summary, global_step=index)
                total_loss += loss_batch
                if (index + 1) % SKIP_STEP == 0:
                    print('Average loss at step {}: {:5.1f}'.format(index, total_loss / SKIP_STEP))
                    total_loss = 0.0
                    saver.save(sess, 'checkpoints/skip-gram', index)
        finally:
            # Always close the writer so buffered summaries reach disk and the
            # file handle is released, even when training is interrupted.
            writer.close()

        ####################
        # To inspect the learned embeddings, uncomment the code below and
        # then run "tensorboard --logdir=processed"

        # final_embed_matrix = sess.run(model.embed_matrix)

        # This must be a Variable; do not make it a constant
        # embedding_var = tf.Variable(final_embed_matrix[:1000], name='embedding')
        # sess.run(embedding_var.initializer)

        # config = projector.ProjectorConfig()
        # summary_writer = tf.summary.FileWriter('processed')

        # Add the embedding to the projector config
        # embedding = config.embeddings.add()
        # embedding.tensor_name = embedding_var.name

        # Link the metadata file, which holds the first 1000 vocabulary words
        # embedding.metadata_path = 'processed/vocab_1000.tsv'

        # Save the word vectors
        # projector.visualize_embeddings(summary_writer, config)
        # saver_embed = tf.train.Saver([embedding_var])
        # saver_embed.save(sess, 'processed/model3.ckpt', 1)

In [8]:
def main():
    """Build the skip-gram graph, prepare the batch generator and run training."""
    model = SkipGramModel(
        vocab_size=VOCAB_SIZE,
        embed_size=EMBED_SIZE,
        batch_size=BATCH_SIZE,
        num_sampled=NUM_SAMPLED,
        learning_rate=LEARNING_RATE,
    )
    model.build_graph()
    batches = process_data(
        vocab_size=VOCAB_SIZE,
        batch_size=BATCH_SIZE,
        skip_window=SKIP_WINDOW,
    )
    train_model(model, batches, NUM_TRAIN_STEPS, WEIGHTS_FLD)

In [9]:
# Run the whole pipeline: build the graph, prepare the data and train.
main()

Dataset ready
Average loss at step 1999: 113.5
Average loss at step 3999:  52.5
Average loss at step 5999:  33.3
Average loss at step 7999:  23.5
Average loss at step 9999:  18.1
Average loss at step 11999:  14.1
Average loss at step 13999:  11.9
Average loss at step 15999:   9.9
Average loss at step 17999:   8.5
Average loss at step 19999:   8.1
Average loss at step 21999:   7.1
Average loss at step 23999:   6.8
Average loss at step 25999:   6.8
Average loss at step 27999:   6.4
Average loss at step 29999:   6.0
Average loss at step 31999:   6.0
Average loss at step 33999:   5.7
Average loss at step 35999:   5.8
Average loss at step 37999:   5.5
Average loss at step 39999:   5.3
Average loss at step 41999:   5.4
Average loss at step 43999:   5.2
Average loss at step 45999:   5.2
Average loss at step 47999:   5.2
Average loss at step 49999:   5.0
Average loss at step 51999:   5.1
Average loss at step 53999:   5.2
Average loss at step 55999:   5.0
Average loss at step 57999:   5.0
Avera

In [10]:
# NOTE(review): leftover IPython help lookup (`?` suffix) from debugging the
# batch dtypes; consider removing this cell from the final notebook.
np.ndarray.astype?