In [1]:
import bz2
import collections
import os
import re
import tensorflow as tf

In [9]:
from lxml import etree 
import numpy as np
from attrdict import AttrDict

In [17]:
class Wikipedia:
    def __init__(self, wikipedia_path, cache_dir, vocabulary_size=10000):
        self._cache_dir = os.path.expanduser(cache_dir)
        self._pages_path = os.path.join(self._cache_dir, 'pages.bz2')
        self._vocabulary_path = os.path.join(self._cache_dir, 'vocabulary.bz2')
        
        if not os.path.isfile(self._pages_path):
            print('Read pages')
            self._read_pages(wikipedia_path)
        if not os.path.isfile(self._vocabulary_path):
            print('Build vocabulary')
            self._build_vocabulary(vocabulary_size)
        with bz2.open(self._vocabulary_path,'rt') as vocabulary:
            print('Read vocabulary')
            self._vocabulary = [x.strip() for x in vocabulary]
        
        self._indices = {x: i for i, x in enumerate(self._vocabulary)}
        
    def __iter__(self):
        with bz2.open(self._pages_path,'rt') as pages:
            for page in pages:
                words = page.strip().split()
                words = [self.encode(x) for x in words]
                yield words
            
    def vocabulary_size(self):
        return len(self._vocabulary)
    
    def encode(self, word):
        return self._indices.get(word,0)
    
    def decode(self,index):
        return self._vocabulary[index]
    
    TOKEN = re.compile(r'[A-Za-z]+|[!?.:,()]')
    def _read_pages(self,wikipedia_path):
        
        with bz2.open(wikipedia_path) as wikipedia, bz2.open(self._pages_path,'wt') as pages:
            for _,element in etree.iterparse(wikipedia, tag='{*}page'):
                if element.find('./{*}redirect') is not None:
                    continue
                page = element.findtext('./{*}revision/{*}text')
                words = self._tokenize(page)
                pages.write(' '.join(words)+'\n')
                element.clear()
                
    def _tokenize(cls, page):
        words = cls.TOKEN.findall(page)
        words = [x.lower() for x in words]
        return words
    
    def _build_vocabulary(self,vocabulary_size):
        
        counter = collections.Counter()
        with bz2.open(self._pages_path, 'rt') as pages:
            for page in pages:
                words = page.strip().split()
                counter.update(words)
                
        common = ['<unk>'] + counter.most_common(vocabulary_size-1)
        common = [x[0] for x in common]
        
        with bz2.open(self._vocabulary_path,'wt') as vocabulary:
            for word in common:
                vocabulary.write(word+'\n')
                
    def skip_grams(pages, max_context):
        for words in pages:
            for index, current in enumerate(words):
                context = np.random.randint(1,max_context)
                for target in words[max(0,current-context):index]:
                    yield current, target
                for target in words[index+1:index+1+context]:
                    yield current, target
                    
    
    def batched(iterator, batch_size):
        data = np.zeros(batch_size)
        target = np.zeros(batch_size)
        
        for index in range(batch_size):
            data[index], target[index] = next(iterator)
            
        yield data, target
        
        
    

In [22]:
def skip_grams(pages, max_context):
        for words in pages:
            for index, current in enumerate(words):
                context = np.random.randint(1,max_context)
                for target in words[max(0,current-context):index]:
                    yield current, target
                for target in words[index+1:index+1+context]:
                    yield current, target
                    
    
def batched(iterator, batch_size):
        data = np.zeros(batch_size)
        target = np.zeros(batch_size)
        
        for index in range(batch_size):
            data[index], target[index] = next(iterator)
            
        yield data, target

In [14]:
class EmbeddingModel:
    
    def __init__(self,data,target,params):
        self.data=data
        self.target=target
        self.params=params
        self.embeddings
        self.cost
        self.optimize
        
        
    def embeddings(self):
        initial = tf.random_uniform([self.params.vocabulary_size, self.params.embedding_size], -1.0, 1.0)
        return tf.Variable(initial)
    
    def optimize(self):
        optimizer = tf.train.MomentumOptimizer(learning_rate=self.params.learning_rate, momentum=self.params.momentum)
        return optimizer.minimize(self.cost)
    
    def cost(self):
        embedded = tf.nn.embedding_lookup(self.embeddings, self.data)
        weight = tf.Variable(tf.truncated_normal([self.params.vocabulary_size, self.params.embedding_size],
                                                 stddev=0.1/self.params.embedding_size**5))
        bias = tf.Variable(tf.zeros([self.params.vocabulary_size]))
        target = tf.expand_dims(self.target, 1)
        return tf.reduce_mean(tf.nn.nce_loss(weight, bias, embedded, target, self.params.constrastive_examples, self.params.vocabulary_size))
    
    

In [10]:
params = AttrDict(
vocabulary_size=10000,
max_context=10,
embedding_size=200,
contrastive_examples=100,
learning_rate=0.5,
momentum=0.5,
batch_size=1000,
)

In [11]:
data = tf.placeholder(tf.int32,[None])
target = tf.placeholder(tf.int32,[None])
model = EmbeddingModel(data, target, params)

In [21]:
corpus = Wikipedia('C:\\Users\\Lei\\regression\\TensorFlow\\enwiki-20170701.bz2',
                   'C:\\Users\\Lei\\regression\\TensorFlow\\wikipedia',params.vocabulary_size)

Read pages
Build vocabulary
Read vocabulary


In [23]:
examples = skip_grams(corpus, params.max_context)

In [25]:
batches = batched(examples, params.batch_size)

In [26]:
sess = tf.Session()

In [27]:
sess.run(tf.global_variables_initializer())

In [28]:
average = collections.deque(maxlen=100)

In [30]:
for index, batch in enumerate(batches):
    feed_dict = {data:batch[0], target:batch[1]}
    cost, _ = sess.run([model.cost, model.optimize], feed_dict=feed_dict)
    average.append(cost)
    print('{}:{:5.1f}'.format(index+1, sum(average)/len(average)) )
    


TypeError: Fetch argument <bound method EmbeddingModel.cost of <__main__.EmbeddingModel object at 0x0000028062BFE2B0>> has invalid type <class 'method'>, must be a string or Tensor. (Can not convert a method into a Tensor or Operation.)

In [33]:
embeddings = sess.run(model.embeddings)

TypeError: Fetch argument <bound method EmbeddingModel.embeddings of <__main__.EmbeddingModel object at 0x0000028062BFE2B0>> has invalid type <class 'method'>, must be a string or Tensor. (Can not convert a method into a Tensor or Operation.)

In [None]:
np.save('/home/user/wikipedia/embeddings.npy', embeddings)