In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import numpy as np
from tensorflow.contrib.tensorboard.plugins import projector
import tensorflow as tf 
import random
import urllib
import zipfile
from collections import Counter

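# Step 1: process the data
# 1.1 Download the data
# 1.2 Read the data
# 1.3 Build the vocabulary dictionary
# 1.4 Convert words to vocabulary indices
# 1.5 Reorganize the data into skip-gram (center, context) pairs
# 1.6 Pack each batch of examples into a numpy array and return it
# Integration: one data-processing function that chains the steps above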

In [3]:
# 1.1 Download the data: where the corpus comes from and where it is stored
DATA_FOLDER = 'data/'                        # local folder prefix for the archive
FILE_NAME = 'text8.zip'                      # archive file name
DOWNLOAD_URL = 'http://mattmahoney.net/dc/'  # base URL the file name is appended to
EXPECTED_BYTES = 31344016                    # size the downloaded archive is checked against

# Data pre-processing parameters
VOCAB_SIZE = 50000
SKIP_WINDOW = 1      # the context window
BATCH_SIZE = 128

# Model / training parameters
EMBED_SIZE = 128     # dimension of the word embedding vectors
NUM_SAMPLED = 64     # Number of negative examples to sample.
LEARNING_RATE = 1.0
NUM_TRAIN_STEPS = 100000
SKIP_STEP = 2000     # the training loop reports the average loss and saves a checkpoint every SKIP_STEP steps
WEIGHTS_FLD = 'processed/'  # NOTE(review): not referenced anywhere in the visible cells

def download_data(file_name,from_url,to_floder,expected_bytes):
    """Download ``file_name`` unless it is already present, and verify its size.

    Args:
        file_name: Name of the file; appended verbatim to ``from_url`` and ``to_floder``.
        from_url: Base URL, including the trailing slash.
        to_floder: Destination folder prefix, including the trailing separator.
            It is created if it does not exist.
        expected_bytes: Size in bytes the downloaded file must have. ``None``
            falls back to the module-level ``EXPECTED_BYTES``, which is the
            value the check always used before.

    Returns:
        Path of the local copy of the file.

    Raises:
        Exception: If the freshly downloaded file does not have the expected
            size. The bad file is removed so that the next call downloads again
            instead of reporting it as ready.
    """
    # `import urllib` alone does not guarantee the `request` submodule is loaded.
    import urllib.request

    download_from_path = from_url + file_name    # remote location
    save_to_path = to_floder + file_name         # local location
    if os.path.exists(save_to_path):             # already downloaded earlier
        print("Data Ready : Data has Already been download")
        return save_to_path

    if to_floder:
        os.makedirs(to_floder, exist_ok=True)
    print("Down load Data,please wait")
    file_path,_ = urllib.request.urlretrieve(download_from_path,filename=save_to_path)

    # Verify the file that was actually written, against the requested size.
    required_bytes = EXPECTED_BYTES if expected_bytes is None else expected_bytes
    actual_bytes = os.stat(file_path).st_size
    if actual_bytes != required_bytes:
        os.remove(file_path)
        raise Exception(
            "File {} might have been corrupted (expected {} bytes, got {}), download again".format(
                file_name, required_bytes, actual_bytes))
    print("Data Ready : Down load complete")
    return file_path

In [4]:
#1.2 Read the data
def read_data(file_path):
    """Return the whitespace-separated words stored in a zip archive.

    Args:
        file_path: Path of the zip archive to read.

    Returns:
        A list of words from every member of the archive, in archive order.
        Each member is decoded with the default codec (UTF-8) and split on
        whitespace separately, so the last word of one member is never fused
        with the first word of the next.
    """
    words = []
    with zipfile.ZipFile(file_path) as archive:
        for member_name in archive.namelist():
            words.extend(archive.read(member_name).decode().split())
    return words

In [5]:
#1.3 Build the vocabulary dictionary
# To save memory the dictionary size is limited: only the most frequent
# words are kept, everything else is treated as 'UNK'.
def build_word2vec_dict(words,vocab_size,vocab_file='processed/vocab_1000.tsv',num_exported=1000):
    """Build word<->index mappings from the most frequent words.

    Index 0 is reserved for the placeholder word 'UNK'; the ``vocab_size-1``
    most common words follow in descending frequency order.

    Args:
        words: Iterable of word strings.
        vocab_size: Total number of dictionary entries, including 'UNK'.
        vocab_file: File that receives the first ``num_exported`` words, one
            per line. Its parent directory is created when missing.
        num_exported: How many leading dictionary words are written to
            ``vocab_file``.

    Returns:
        A tuple ``(word2vec_dictionary, index_dict_for_word2vec)``: the first
        maps word -> index, the second maps index -> word.
    """
    counted = Counter(words)
    words_most_common = [('UNK',-1)]
    # most_common returns a list shaped like [(word_1,112),(word_2,111)...]
    words_most_common.extend(counted.most_common(vocab_size-1))

    word2vec_dictionary = {}
    for index,(word,_) in enumerate(words_most_common):
        word2vec_dictionary[word] = index
    index_dict_for_word2vec = {index: word for word,index in word2vec_dictionary.items()}

    # Export the leading words; the target directory may not exist on a fresh run.
    vocab_dir = os.path.dirname(vocab_file)
    if vocab_dir:
        os.makedirs(vocab_dir, exist_ok=True)
    with open(vocab_file,'w') as f:
        for word,_ in words_most_common[:num_exported]:
            f.write(word+"\n")
    return word2vec_dictionary,index_dict_for_word2vec

In [6]:
#1.4 Convert words to vocabulary indices
def convert_words_to_index(words,vocab_dict):
    """Map each word to its index in ``vocab_dict``; unknown words map to 0."""
    lookup = vocab_dict.get
    return [lookup(word, 0) for word in words]
#converted_words = convert_words_to_index(words,word2vec_dictionary)
#len(converted_words)

#1.5 Reorganize the data into skip-gram (center word, context word) pairs


Why is the window size random? See: https://www.leiphone.com/news/201706/QprrvzsrZCl4S2lw.html

In [7]:
def get_example(converted_words,skip_window=SKIP_WINDOW):
    """Yield skip-gram training pairs one at a time.

    For every position a window size is drawn uniformly from
    ``1..skip_window``; each word within that distance to the left and then
    to the right of the center word is paired with it.

    Args:
        converted_words: Sequence of word indices (must support slicing).
        skip_window: Largest context window size that may be drawn.

    Yields:
        Tuples ``(center_word, context_word)``.
    """
    for center_index,center_word in enumerate(converted_words):
        # Draw the window per center word; drawing it once before the loop
        # would fix a single size for the whole corpus.
        window_size = random.randint(1,skip_window)
        # Context words to the left of the center word.
        for word in converted_words[max(0,center_index-window_size):center_index]:
            yield center_word,word
        # Context words to the right. Slices clamp at the end of the sequence,
        # so no explicit upper bound is needed (and the last word is included).
        for word in converted_words[center_index+1:center_index+window_size+1]:
            yield center_word,word


In [8]:
def example_batch_producer(raw_data,method_generate_example,batch_size,skip_window=SKIP_WINDOW):
    """Group examples from a generator into fixed-size numpy batches.

    Args:
        raw_data: Data handed to ``method_generate_example``.
        method_generate_example: Callable ``(raw_data, skip_window)`` returning
            an iterator of examples.
        batch_size: Number of examples per batch.
        skip_window: Forwarded to ``method_generate_example``.

    Yields:
        ``np.array`` built from ``batch_size`` consecutive examples. When the
        example iterator runs out, the generator ends; a trailing incomplete
        batch is dropped so every yielded batch has the same length.
    """
    example_iterator = method_generate_example(raw_data,skip_window)
    while True:
        batch_data_list = []
        for _ in range(batch_size):
            try:
                batch_data_list.append(next(example_iterator))
            except StopIteration:
                # Letting StopIteration escape a generator body is turned into
                # RuntimeError by PEP 479; finish the generator cleanly instead.
                return
        yield np.array(batch_data_list)

#batch_example_iterator = example_batch_producer(converted_words,get_example,4)

In [9]:
#1.6 Data processing pipeline
def process_data(vocab_size=VOCAB_SIZE,batch_size=BATCH_SIZE,skip_window=SKIP_WINDOW):
    """Return an iterator that generates example batches.

    Chains the steps above: download the archive, read its words, build the
    dictionary, convert words to indices and wrap the skip-gram example
    generator in a batch producer.

    Args:
        vocab_size: Dictionary size passed to ``build_word2vec_dict``.
        batch_size: Number of examples per batch.
        skip_window: Largest context window, forwarded to the example generator.

    Returns:
        An iterator; each batch is a numpy array of index pairs
        [
        [center_word,target_word],
        [center_word,target_word],
        ... ...
        ]

    Example
    example_batch_generator = process_data()
    for i in range(5):
        print(next(example_batch_generator).shape)
    """
    # Download the data (or reuse the local copy); returns the storage path.
    data_path = download_data(FILE_NAME,DOWNLOAD_URL,DATA_FOLDER,EXPECTED_BYTES)
    docs = read_data(data_path)
    word2vec_dictionary,index_dict_for_word2vec = build_word2vec_dict(docs,vocab_size)
    converted_docs = convert_words_to_index(docs,word2vec_dictionary)
    del docs  # free the word list, only the indices are needed from here on
    # Forward skip_window so the argument actually reaches the example generator.
    batch_example_iterator = example_batch_producer(converted_docs,get_example,batch_size,skip_window)
    return batch_example_iterator



# step 2 : build the model

In [10]:
class SkipGramModel(object):
    """Skip-gram word2vec model whose loss is the mean of ``tf.nn.nce_loss``."""

    def __init__(self,vocab_size,embed_size,batch_size,num_sample,learning_rate):
        """Store the hyper-parameters and create the non-trainable step counter."""
        self.name = "Skip Gram Model"
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.batch_size = batch_size
        self.num_sample = num_sample
        self.learning_rate = learning_rate
        self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')

    def _create_placeholder(self):
        """Define the placeholders that serve as the data entry points."""
        with tf.variable_scope("data"):
            self.center_words_batch = tf.placeholder(
                tf.int32, shape=[self.batch_size], name="center_words")
            self.target_words_batch = tf.placeholder(
                tf.int32, shape=[self.batch_size,1], name="target_words")

    def _create_embedding(self):
        """Define the weight matrix; in word2vec its rows are the word vectors."""
        with tf.device('/cpu:0'), tf.variable_scope("embed"):
            initial_embeddings = tf.random_uniform([self.vocab_size,self.embed_size],-1.0,1.0)
            self.embed_matrix = tf.Variable(initial_embeddings,name="embed_matrix")

    def _get_loss(self):
        """Define the word2vec output and the loss function on top of it."""
        with tf.device('/cpu:0'), tf.name_scope("loss"):
            center_embeddings = tf.nn.embedding_lookup(self.embed_matrix,self.center_words_batch)
            init_stddev = 1.0/(self.embed_size**0.5)
            nce_weights = tf.Variable(
                tf.truncated_normal([self.vocab_size,self.embed_size],stddev=init_stddev),
                name='nce_weight')
            nce_biases = tf.Variable(tf.zeros([self.vocab_size]),name='nce_bias')
            per_example_loss = tf.nn.nce_loss(weights=nce_weights,
                                              biases=nce_biases,
                                              labels=self.target_words_batch,
                                              inputs=center_embeddings,
                                              num_sampled=self.num_sample,
                                              num_classes=self.vocab_size,
                                              name='loss')
            self.loss = tf.reduce_mean(per_example_loss)

    def _create_optimizer(self):
        """Set up the optimizer; minimizing also increments ``global_step``."""
        with tf.device('/cpu:0'):
            gradient_descent = tf.train.GradientDescentOptimizer(self.learning_rate)
            self.optimizer = gradient_descent.minimize(self.loss,global_step=self.global_step)

    def _create_summaries(self):
        """Set up the summaries used for visualization in TensorBoard."""
        with tf.name_scope("summaries"):
            tf.summary.scalar('loss',self.loss)
            tf.summary.histogram('histogram_loss',self.loss)
            self.summary_op = tf.summary.merge_all()

    def _build_graph(self):
        """Build the whole graph by running every construction step in order."""
        build_steps = (
            self._create_placeholder,
            self._create_embedding,
            self._get_loss,
            self._create_optimizer,
            self._create_summaries,
        )
        for build_step in build_steps:
            build_step()

In [11]:
# Instantiate the model with the hyper-parameters from the configuration cell.
skip_gram = SkipGramModel(
    vocab_size=VOCAB_SIZE,
    embed_size=EMBED_SIZE,
    batch_size=BATCH_SIZE,
    num_sample=NUM_SAMPLED,
    learning_rate=LEARNING_RATE,
)

In [12]:
# Create the batch iterator with process_data's default arguments.
example_batch_generator = process_data()
# Create the model's placeholders, embedding, loss, optimizer and summary ops.
skip_gram._build_graph()

Data Ready : Data has Already been download


In [20]:
#training
saver = tf.train.Saver()
initial_step = 0
# Saver.save needs the target directory to exist already.
os.makedirs('checkpoints', exist_ok=True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Resume from the latest checkpoint when there is one.
    ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))
    if ckpt is not None and ckpt.model_checkpoint_path:
        saver.restore(sess,ckpt.model_checkpoint_path)
    initial_step = skip_gram.global_step.eval()

    # Event files for TensorBoard; without a writer the summary op has no effect.
    writer = tf.summary.FileWriter('graphs/skip_gram',sess.graph)
    total_loss = 0.0
    try:
        for i in range(initial_step,initial_step+NUM_TRAIN_STEPS):
            example_batch = next(example_batch_generator)
            # Column 0 holds the center words, column 1 the target words; the
            # target placeholder expects one column per example.
            center_word_batch = example_batch[:,0].astype(np.int32)
            target_word_batch = example_batch[:,1].reshape((-1,1)).astype(np.int32)

            feed_dict = {
                skip_gram.center_words_batch : center_word_batch,
                skip_gram.target_words_batch : target_word_batch
            }
            # Fetch the merged summary as the third value (the optimizer op was
            # previously fetched twice, so no summary was ever produced).
            loss,_,summary = sess.run(
                [skip_gram.loss,skip_gram.optimizer,skip_gram.summary_op],
                feed_dict=feed_dict)
            writer.add_summary(summary,global_step=i)
            total_loss += loss
            if (i+1)%SKIP_STEP == 0:
                print("Average_loss at step {} : {}".format(i,total_loss/SKIP_STEP))
                saver.save(sess,'checkpoints/skip-gram',global_step=i)
                total_loss = 0.0
    finally:
        # Flush pending events even when training is interrupted.
        writer.close()

INFO:tensorflow:Restoring parameters from checkpoints/skip-gram-67999
Average_loss at step 69999 : 4.9190473415255545
Average_loss at step 71999 : 4.909809581637383
Average_loss at step 73999 : 4.84408086168766
Average_loss at step 75999 : 4.857043523907661
Average_loss at step 77999 : 4.793568802595138
Average_loss at step 79999 : 4.849250820279122
Average_loss at step 81999 : 4.657039531826973
Average_loss at step 83999 : 4.572699164271355
Average_loss at step 85999 : 4.738110695719719
Average_loss at step 87999 : 4.6825666139125826
Average_loss at step 89999 : 4.730355766296387
Average_loss at step 91999 : 4.654880146503449
Average_loss at step 93999 : 4.693806883096695
Average_loss at step 95999 : 4.706434701919556
Average_loss at step 97999 : 4.592566548585892
Average_loss at step 99999 : 4.525672862529754
Average_loss at step 101999 : 4.612213665604592
Average_loss at step 103999 : 4.587191364228725
Average_loss at step 105999 : 4.59926750433445
Average_loss at step 107999 : 4.60

In [41]:
#take out word_embedding weight
with tf.Session() as sess:
    ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))
    if ckpt is None or not ckpt.model_checkpoint_path:
        # Without a restore the variables are uninitialized in this new session.
        raise RuntimeError("No checkpoint found in 'checkpoints/'; run the training cell first")
    saver.restore(sess,ckpt.model_checkpoint_path)
    embed_mat = sess.run(skip_gram.embed_matrix)

    # Keep the first 1000 rows only, matching the words written to vocab_1000.tsv.
    embedding_var = tf.Variable(embed_mat[:1000],name='embedding')
    sess.run(embedding_var.initializer)

    config = projector.ProjectorConfig()
    writer = tf.summary.FileWriter('processed')

    embedding = config.embeddings.add()
    embedding.tensor_name = embedding_var.name
    # Point at the file where the vocabulary is actually written. The previous
    # value '/processed/vocab_1000.tsv' referred to the filesystem root.
    # NOTE(review): an absolute path is used because how TensorBoard resolves a
    # relative metadata_path is assumed to depend on its version - confirm.
    embedding.metadata_path = os.path.abspath('processed/vocab_1000.tsv')

    projector.visualize_embeddings(writer,config)
    saver_embed = tf.train.Saver([embedding_var])
    saver_embed.save(sess,'processed/model3.ckpt',1)
    writer.close()
    

INFO:tensorflow:Restoring parameters from checkpoints/skip-gram-167999
