In [1]:
import numpy as np
import tensorflow as tf
import random  
from collections import Counter
import datetime, time, json

  from ._conv import register_converters as _register_converters


In [2]:
def create_word_pairs(int_corpus, window_size):
    idx_pairs = []
    # for each snetence 
    for sentence in int_corpus:
        # for each center word
        for center_word_pos in range(len(sentence)):
            center_word_idx = sentence[center_word_pos]
            # for each context word within window
            for w in range(-window_size, window_size + 1):
                context_word_pos = center_word_pos + w
                # make soure not jump out sentence
                if context_word_pos < 0 or context_word_pos >= len(sentence) or center_word_pos == context_word_pos:
                    continue
                context_word_idx = sentence[context_word_pos]
                idx_pairs.append((center_word_idx, context_word_idx))
                    
    return idx_pairs
        

In [3]:
def get_batches(idx_pairs, batch_size):
    n_batches = len(idx_pairs) // batch_size
    idx_pairs = idx_pairs[:n_batches*batch_size]
    for idx in range(0, len(idx_pairs), batch_size):
        x, y = [], []
        batch = idx_pairs[idx:idx+batch_size]
        for ii in range (len(batch)):
            x.append(batch[ii][0])
            y.append(batch[ii][1])        
        yield x, y  

### create word pairs

In [4]:
corpus = np.load("/Users/zhang/MscProject_tweak2vec/corpus/quora_corpus_int5.npy").tolist()
idx_pairs_SG = create_word_pairs(corpus, window_size = 5)
print('totally {0} word pairs'.format(len(idx_pairs_SG)))

totally 46007576 word pairs


In [5]:
wordlist = np.load('/Users/zhang/MscProject_tweak2vec/corpus/quora_vocab5.npy').tolist()
wordlist.append(['UNK','0'])
word2idx = {w[0]: wordlist.index(w) for w in wordlist }
idx2word = {wordlist.index(w): w[0] for w in wordlist }

### load pivot word vectors

In [7]:
f = open('pivots_google_10000.txt','r')
a = f.read()
pivots_vec = eval(a)
f.close()
print('load {0} pivot words'.format(len(list(pivots_vec.keys()))))

load 10000 pivot words


In [8]:
dict_slice = lambda adict, start, end: dict((k, adict[k]) for k in list(adict.keys())[start:end])
pivots_100 = dict_slice(pivots_vec, 0, 100)

In [9]:
pivots_idx = []
pivots_vec = []
for i in pivots_100.keys():
    pivots_idx.append(i)
    pivots_vec.append(pivots_100[i])

### create subset

In [22]:
corpus_shuffle = corpus[:]
random.shuffle(corpus_shuffle)

In [44]:
idx_pairs_shuffle = idx_pairs_SG[:]
random.shuffle(idx_pairs_shuffle)

In [45]:
idx_pairs_10m = idx_pairs_shuffle[:10000000]
idx_pairs_5m = idx_pairs_shuffle[:5000000]
idx_pairs_1m = idx_pairs_shuffle[:1000000]
idx_pairs_500k = idx_pairs_shuffle[:500000]

### a small tf lab :)

In [106]:
embed = tf.Variable([[0,0],[1,1]])
embed_2 = tf.Variable(tf.identity(embed))
ao = tf.scatter_update(embed_2,[0],[[-5,5]])
diff = tf.reduce_sum((embed-embed_2)**2)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
print(sess.run(diff))
sess.run(ao)
print(sess.run(diff))

0
50


### build graph with negative sampling

In [19]:
n_vocab = len(word2idx)
n_embedding = 50
reg_constant = 1
n_sampled = 100
learning_rate = 0.001
epochs = 10
batch_size = 1000 # number of samples each iteration

In [14]:
train_graph = tf.Graph()
with train_graph.as_default():
    # input layer
    inputs = tf.placeholder(tf.int32, [batch_size], name='inputs')
    # labels is 2 dimensional as required by tf.nn.sampled_softmax_loss used for negative sampling.
    labels = tf.placeholder(tf.int32, [None, None], name='labels')
    
    # embedding layer
    init_width = 0.5 / n_embedding
    embedding = tf.Variable(tf.random_uniform((n_vocab, n_embedding), -init_width, init_width))
    embed = tf.nn.embedding_lookup(embedding, inputs)

    # add regularization term
    embedding_copy = tf.Variable(tf.identity(embedding), trainable=False)
    update_embed_op = tf.scatter_update(embedding_copy,pivots_idx,pivots_vec)
    reg_loss = tf.reduce_sum((embedding-embedding_copy)**2)
    
    # sampled softmax layer
    softmax_w = tf.Variable(tf.truncated_normal((n_vocab, n_embedding)), name="softmax_weights")
    softmax_b = tf.Variable(tf.zeros(n_vocab), name="softmax_bias")
    # Calculate the loss using negative sampling
    loss = tf.nn.sampled_softmax_loss(
        weights=softmax_w,
        biases=softmax_b,
        labels=labels,
        inputs=embed,
        num_sampled=n_sampled,
        num_classes=n_vocab)
    cost = tf.reduce_mean(loss)
    
    total_cost = cost + reg_loss


    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(total_cost)

### training

In [None]:
print("Starting training at", datetime.datetime.now())
t0 = time.time()

with train_graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=train_graph) as sess:
    iteration = 1
    loss = 0
    regular_loss = 0
    loss_best = 100
    loss_list = []
    iteration_best = 0
    sess.run(tf.global_variables_initializer())

    for e in range(1, epochs + 1):
        batches = get_batches(idx_pairs_SG, batch_size)
        start = time.time()
        for x, y in batches:
            feed = {inputs: x,
                    labels: np.array(y)[:, None]}
            sess.run(update_embed_op)
            train_loss, _ = sess.run([total_cost, optimizer], feed_dict=feed)
            
            loss += train_loss
            regular_loss += sess.run(reg_loss)

            if loss < loss_best:
                W = sess.run(embedding).tolist()
                iteration_best = iteration
                loss_best = loss

            if iteration % 100 == 0:
                end = time.time()
                loss_list.append(loss / 100)
                print("Epoch {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Avg. Training loss: {:.4f}".format(loss / 100),
                      "Avg. Reg. loss: {:.4f}".format(regular_loss / 100),
                      "{:.4f} sec/batch".format((end - start) / 100))


                loss = 0
                regular_loss = 0
                start = time.time()
            iteration += 1


Starting training at 2018-07-01 22:29:01.569067
Epoch 1/10 Iteration: 100 Avg. Training loss: 143.1327 Avg. Reg. loss: 134.7475 0.0511 sec/batch
Epoch 1/10 Iteration: 200 Avg. Training loss: 78.6602 Avg. Reg. loss: 70.8882 0.0444 sec/batch
Epoch 1/10 Iteration: 300 Avg. Training loss: 47.9476 Avg. Reg. loss: 40.3660 0.0572 sec/batch
Epoch 1/10 Iteration: 400 Avg. Training loss: 31.2946 Avg. Reg. loss: 23.9538 0.0468 sec/batch
Epoch 1/10 Iteration: 500 Avg. Training loss: 21.6965 Avg. Reg. loss: 14.4811 0.0472 sec/batch
Epoch 1/10 Iteration: 600 Avg. Training loss: 15.8977 Avg. Reg. loss: 8.8075 0.0467 sec/batch
Epoch 1/10 Iteration: 700 Avg. Training loss: 12.3788 Avg. Reg. loss: 5.3570 0.0512 sec/batch
Epoch 1/10 Iteration: 800 Avg. Training loss: 10.1588 Avg. Reg. loss: 3.2565 0.0468 sec/batch
Epoch 1/10 Iteration: 900 Avg. Training loss: 8.7913 Avg. Reg. loss: 1.9846 0.0485 sec/batch
Epoch 1/10 Iteration: 1000 Avg. Training loss: 7.9647 Avg. Reg. loss: 1.2181 0.0487 sec/batch
Epoch 

Epoch 1/10 Iteration: 8800 Avg. Training loss: 5.0737 Avg. Reg. loss: 0.0307 0.0490 sec/batch
Epoch 1/10 Iteration: 8900 Avg. Training loss: 5.0953 Avg. Reg. loss: 0.0303 0.0494 sec/batch
Epoch 1/10 Iteration: 9000 Avg. Training loss: 5.0479 Avg. Reg. loss: 0.0299 0.0498 sec/batch
Epoch 1/10 Iteration: 9100 Avg. Training loss: 5.0493 Avg. Reg. loss: 0.0304 0.0489 sec/batch
Epoch 1/10 Iteration: 9200 Avg. Training loss: 5.0312 Avg. Reg. loss: 0.0299 0.0491 sec/batch
Epoch 1/10 Iteration: 9300 Avg. Training loss: 5.0313 Avg. Reg. loss: 0.0297 0.0490 sec/batch
Epoch 1/10 Iteration: 9400 Avg. Training loss: 5.0180 Avg. Reg. loss: 0.0301 0.0502 sec/batch
Epoch 1/10 Iteration: 9500 Avg. Training loss: 5.0046 Avg. Reg. loss: 0.0290 0.0494 sec/batch
Epoch 1/10 Iteration: 9600 Avg. Training loss: 4.9806 Avg. Reg. loss: 0.0290 0.0490 sec/batch
Epoch 1/10 Iteration: 9700 Avg. Training loss: 5.0163 Avg. Reg. loss: 0.0295 0.0492 sec/batch
Epoch 1/10 Iteration: 9800 Avg. Training loss: 4.9799 Avg. R

Epoch 1/10 Iteration: 17500 Avg. Training loss: 4.6882 Avg. Reg. loss: 0.0256 0.0494 sec/batch
Epoch 1/10 Iteration: 17600 Avg. Training loss: 4.6640 Avg. Reg. loss: 0.0251 0.0472 sec/batch
Epoch 1/10 Iteration: 17700 Avg. Training loss: 4.6548 Avg. Reg. loss: 0.0252 0.0522 sec/batch
Epoch 1/10 Iteration: 17800 Avg. Training loss: 4.6890 Avg. Reg. loss: 0.0253 0.0508 sec/batch
Epoch 1/10 Iteration: 17900 Avg. Training loss: 4.6558 Avg. Reg. loss: 0.0247 0.0503 sec/batch
Epoch 1/10 Iteration: 18000 Avg. Training loss: 4.6617 Avg. Reg. loss: 0.0245 0.0500 sec/batch
Epoch 1/10 Iteration: 18100 Avg. Training loss: 4.6721 Avg. Reg. loss: 0.0247 0.0500 sec/batch
Epoch 1/10 Iteration: 18200 Avg. Training loss: 4.6778 Avg. Reg. loss: 0.0249 0.0504 sec/batch
Epoch 1/10 Iteration: 18300 Avg. Training loss: 4.6746 Avg. Reg. loss: 0.0245 0.0498 sec/batch
Epoch 1/10 Iteration: 18400 Avg. Training loss: 4.6517 Avg. Reg. loss: 0.0247 0.0503 sec/batch
Epoch 1/10 Iteration: 18500 Avg. Training loss: 4.

Epoch 1/10 Iteration: 26200 Avg. Training loss: 4.5582 Avg. Reg. loss: 0.0243 0.0523 sec/batch
Epoch 1/10 Iteration: 26300 Avg. Training loss: 4.5464 Avg. Reg. loss: 0.0238 0.0503 sec/batch
Epoch 1/10 Iteration: 26400 Avg. Training loss: 4.5486 Avg. Reg. loss: 0.0238 0.0505 sec/batch
Epoch 1/10 Iteration: 26500 Avg. Training loss: 4.5623 Avg. Reg. loss: 0.0240 0.0502 sec/batch
Epoch 1/10 Iteration: 26600 Avg. Training loss: 4.5528 Avg. Reg. loss: 0.0240 0.0504 sec/batch
Epoch 1/10 Iteration: 26700 Avg. Training loss: 4.5484 Avg. Reg. loss: 0.0240 0.0547 sec/batch
Epoch 1/10 Iteration: 26800 Avg. Training loss: 4.5435 Avg. Reg. loss: 0.0240 0.0541 sec/batch
Epoch 1/10 Iteration: 26900 Avg. Training loss: 4.5449 Avg. Reg. loss: 0.0239 0.0537 sec/batch
Epoch 1/10 Iteration: 27000 Avg. Training loss: 4.5450 Avg. Reg. loss: 0.0233 0.0522 sec/batch
Epoch 1/10 Iteration: 27100 Avg. Training loss: 4.5145 Avg. Reg. loss: 0.0232 0.0560 sec/batch
Epoch 1/10 Iteration: 27200 Avg. Training loss: 4.

Epoch 1/10 Iteration: 34900 Avg. Training loss: 4.4879 Avg. Reg. loss: 0.0242 0.0488 sec/batch
Epoch 1/10 Iteration: 35000 Avg. Training loss: 4.4905 Avg. Reg. loss: 0.0236 0.0485 sec/batch
Epoch 1/10 Iteration: 35100 Avg. Training loss: 4.4995 Avg. Reg. loss: 0.0237 0.0482 sec/batch
Epoch 1/10 Iteration: 35200 Avg. Training loss: 4.4875 Avg. Reg. loss: 0.0236 0.0482 sec/batch
Epoch 1/10 Iteration: 35300 Avg. Training loss: 4.4903 Avg. Reg. loss: 0.0234 0.0484 sec/batch
Epoch 1/10 Iteration: 35400 Avg. Training loss: 4.4772 Avg. Reg. loss: 0.0235 0.0483 sec/batch
Epoch 1/10 Iteration: 35500 Avg. Training loss: 4.4844 Avg. Reg. loss: 0.0236 0.0487 sec/batch
Epoch 1/10 Iteration: 35600 Avg. Training loss: 4.5023 Avg. Reg. loss: 0.0239 0.0483 sec/batch
Epoch 1/10 Iteration: 35700 Avg. Training loss: 4.4776 Avg. Reg. loss: 0.0236 0.0482 sec/batch
Epoch 1/10 Iteration: 35800 Avg. Training loss: 4.4840 Avg. Reg. loss: 0.0231 0.0484 sec/batch
Epoch 1/10 Iteration: 35900 Avg. Training loss: 4.

Epoch 1/10 Iteration: 43600 Avg. Training loss: 4.4672 Avg. Reg. loss: 0.0240 0.0502 sec/batch
Epoch 1/10 Iteration: 43700 Avg. Training loss: 4.4536 Avg. Reg. loss: 0.0241 0.0504 sec/batch
Epoch 1/10 Iteration: 43800 Avg. Training loss: 4.4609 Avg. Reg. loss: 0.0244 0.0502 sec/batch
Epoch 1/10 Iteration: 43900 Avg. Training loss: 4.4454 Avg. Reg. loss: 0.0241 0.0500 sec/batch
Epoch 1/10 Iteration: 44000 Avg. Training loss: 4.4386 Avg. Reg. loss: 0.0242 0.0504 sec/batch
Epoch 1/10 Iteration: 44100 Avg. Training loss: 4.4434 Avg. Reg. loss: 0.0241 0.0501 sec/batch
Epoch 1/10 Iteration: 44200 Avg. Training loss: 4.4431 Avg. Reg. loss: 0.0242 0.0501 sec/batch
Epoch 1/10 Iteration: 44300 Avg. Training loss: 4.4455 Avg. Reg. loss: 0.0243 0.0505 sec/batch
Epoch 1/10 Iteration: 44400 Avg. Training loss: 4.5171 Avg. Reg. loss: 0.0253 0.0504 sec/batch
Epoch 1/10 Iteration: 44500 Avg. Training loss: 4.4425 Avg. Reg. loss: 0.0240 0.0498 sec/batch
Epoch 1/10 Iteration: 44600 Avg. Training loss: 4.

Epoch 2/10 Iteration: 52300 Avg. Training loss: 4.4071 Avg. Reg. loss: 0.0237 0.0506 sec/batch
Epoch 2/10 Iteration: 52400 Avg. Training loss: 4.4170 Avg. Reg. loss: 0.0247 0.0503 sec/batch
Epoch 2/10 Iteration: 52500 Avg. Training loss: 4.4137 Avg. Reg. loss: 0.0246 0.0501 sec/batch
Epoch 2/10 Iteration: 52600 Avg. Training loss: 4.4229 Avg. Reg. loss: 0.0236 0.0498 sec/batch
Epoch 2/10 Iteration: 52700 Avg. Training loss: 4.4126 Avg. Reg. loss: 0.0246 0.0501 sec/batch
Epoch 2/10 Iteration: 52800 Avg. Training loss: 4.4218 Avg. Reg. loss: 0.0248 0.0499 sec/batch
Epoch 2/10 Iteration: 52900 Avg. Training loss: 4.4585 Avg. Reg. loss: 0.0247 0.0508 sec/batch
Epoch 2/10 Iteration: 53000 Avg. Training loss: 4.4227 Avg. Reg. loss: 0.0239 0.0505 sec/batch
Epoch 2/10 Iteration: 53100 Avg. Training loss: 4.4015 Avg. Reg. loss: 0.0247 0.0501 sec/batch
Epoch 2/10 Iteration: 53200 Avg. Training loss: 4.4259 Avg. Reg. loss: 0.0241 0.0499 sec/batch
Epoch 2/10 Iteration: 53300 Avg. Training loss: 4.

Epoch 2/10 Iteration: 61000 Avg. Training loss: 4.3874 Avg. Reg. loss: 0.0242 0.0508 sec/batch
Epoch 2/10 Iteration: 61100 Avg. Training loss: 4.4087 Avg. Reg. loss: 0.0246 0.0505 sec/batch
Epoch 2/10 Iteration: 61200 Avg. Training loss: 4.4012 Avg. Reg. loss: 0.0249 0.0518 sec/batch
Epoch 2/10 Iteration: 61300 Avg. Training loss: 4.3859 Avg. Reg. loss: 0.0252 0.0512 sec/batch
Epoch 2/10 Iteration: 61400 Avg. Training loss: 4.4059 Avg. Reg. loss: 0.0242 0.0517 sec/batch
Epoch 2/10 Iteration: 61500 Avg. Training loss: 4.4097 Avg. Reg. loss: 0.0258 0.0508 sec/batch
Epoch 2/10 Iteration: 61600 Avg. Training loss: 4.3866 Avg. Reg. loss: 0.0247 0.0515 sec/batch
Epoch 2/10 Iteration: 61700 Avg. Training loss: 4.4108 Avg. Reg. loss: 0.0241 0.0516 sec/batch
Epoch 2/10 Iteration: 61800 Avg. Training loss: 4.4065 Avg. Reg. loss: 0.0255 0.0512 sec/batch
Epoch 2/10 Iteration: 61900 Avg. Training loss: 4.4000 Avg. Reg. loss: 0.0254 0.0511 sec/batch
Epoch 2/10 Iteration: 62000 Avg. Training loss: 4.

In [48]:
# save embedding matrics
np.save('w2v_pivots100_50d.npy',np.array(W))
np.save('loss_pivots100_50d.npy',np.array(loss_list))

print('best result at iteration:{0}'.format(iteration_best))

49601