In [1]:
import numpy as np
import tensorflow as tf
import random  
from collections import Counter
import datetime, time, json

  from ._conv import register_converters as _register_converters


In [2]:
def create_word_pairs(int_corpus, window_size):
    idx_pairs = []
    # for each snetence 
    for sentence in int_corpus:
        # for each center word
        for center_word_pos in range(len(sentence)):
            center_word_idx = sentence[center_word_pos]
            # for each context word within window
            for w in range(-window_size, window_size + 1):
                context_word_pos = center_word_pos + w
                # make soure not jump out sentence
                if context_word_pos < 0 or context_word_pos >= len(sentence) or center_word_pos == context_word_pos:
                    continue
                context_word_idx = sentence[context_word_pos]
                idx_pairs.append((center_word_idx, context_word_idx))
                    
    return idx_pairs
        

In [3]:
def get_batches(idx_pairs, batch_size):
    n_batches = len(idx_pairs) // batch_size
    idx_pairs = idx_pairs[:n_batches*batch_size]
    for idx in range(0, len(idx_pairs), batch_size):
        x, y = [], []
        batch = idx_pairs[idx:idx+batch_size]
        for ii in range (len(batch)):
            x.append(batch[ii][0])
            y.append(batch[ii][1])        
        yield x, y  

### create word pairs

In [4]:
corpus = np.load("/Users/zhang/MscProject_tweak2vec/corpus/quora_corpus_int5.npy").tolist()
idx_pairs_SG = create_word_pairs(corpus, window_size = 5)
print('totally {0} word pairs'.format(len(idx_pairs_SG)))

totally 46007576 word pairs


In [5]:
wordlist = np.load('/Users/zhang/MscProject_tweak2vec/corpus/quora_vocab5.npy').tolist()
wordlist.append(['UNK','0'])
word2idx = {w[0]: wordlist.index(w) for w in wordlist }
idx2word = {wordlist.index(w): w[0] for w in wordlist }

### load pivot word vectors

In [7]:
f = open('pivots_google_10000.txt','r')
a = f.read()
pivots_vec = eval(a)
f.close()
print('load {0} pivot words'.format(len(list(pivots_vec.keys()))))

load 10000 pivot words


In [8]:
dict_slice = lambda adict, start, end: dict((k, adict[k]) for k in list(adict.keys())[start:end])
pivots_100 = dict_slice(pivots_vec, 0, 100)

In [9]:
pivots_idx = []
pivots_vec = []
for i in pivots_100.keys():
    pivots_idx.append(i)
    pivots_vec.append(pivots_100[i])

### create subset

In [22]:
corpus_shuffle = corpus[:]
random.shuffle(corpus_shuffle)

In [44]:
idx_pairs_shuffle = idx_pairs_SG[:]
random.shuffle(idx_pairs_shuffle)

In [45]:
idx_pairs_10m = idx_pairs_shuffle[:10000000]
idx_pairs_5m = idx_pairs_shuffle[:5000000]
idx_pairs_1m = idx_pairs_shuffle[:1000000]
idx_pairs_500k = idx_pairs_shuffle[:500000]

### a small tf lab :)

In [106]:
embed = tf.Variable([[0,0],[1,1]])
embed_2 = tf.Variable(tf.identity(embed))
ao = tf.scatter_update(embed_2,[0],[[-5,5]])
diff = tf.reduce_sum((embed-embed_2)**2)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
print(sess.run(diff))
sess.run(ao)
print(sess.run(diff))

0
50


### build graph with negative sampling

In [37]:
n_vocab = len(word2idx)
n_embedding = 50
reg_constant = 0.1
n_sampled = 100
learning_rate = 0.001
epochs = 5
batch_size = 1000 # number of samples each iteration

In [38]:
train_graph = tf.Graph()
with train_graph.as_default():
    # input layer
    inputs = tf.placeholder(tf.int32, [batch_size], name='inputs')
    # labels is 2 dimensional as required by tf.nn.sampled_softmax_loss used for negative sampling.
    labels = tf.placeholder(tf.int32, [None, None], name='labels')
    
    # embedding layer
    init_width = 0.5 / n_embedding
    embedding = tf.Variable(tf.random_uniform((n_vocab, n_embedding), -init_width, init_width))
    embed = tf.nn.embedding_lookup(embedding, inputs)

    # add regularization term
    embedding_copy = tf.Variable(tf.identity(embedding), trainable=False)
    update_embed_op = tf.scatter_update(embedding_copy,pivots_idx,pivots_vec)
    embed_copy = tf.nn.embedding_lookup(embedding_copy, inputs)
    
    reg_loss = reg_constant * tf.reduce_sum((embed-embed_copy)**2)
    
    # sampled softmax layer
    softmax_w = tf.Variable(tf.truncated_normal((n_vocab, n_embedding)), name="softmax_weights")
    softmax_b = tf.Variable(tf.zeros(n_vocab), name="softmax_bias")
    # Calculate the loss using negative sampling
    loss = tf.nn.sampled_softmax_loss(
        weights=softmax_w,
        biases=softmax_b,
        labels=labels,
        inputs=embed,
        num_sampled=n_sampled,
        num_classes=n_vocab)
    cost = tf.reduce_mean(loss)
    
    total_cost = cost + reg_loss


    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(total_cost)

### training

In [None]:
print("Starting training at", datetime.datetime.now())
t0 = time.time()

with train_graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=train_graph) as sess:
    iteration = 1
    loss = 0
    regular_loss = 0
    loss_best = 100
    loss_list = []
    iteration_best = 0
    sess.run(tf.global_variables_initializer())

    for e in range(1, epochs + 1):
        batches = get_batches(idx_pairs_SG, batch_size)
        start = time.time()
        for x, y in batches:
            feed = {inputs: x,
                    labels: np.array(y)[:, None]}
            sess.run(update_embed_op)
            train_loss, _, regu_loss = sess.run([total_cost, optimizer, reg_loss], feed_dict=feed)
            
            loss += train_loss
            regular_loss += regu_loss
            #regular_loss += sess.run(reg_loss, feed_dict = {inputs: x})

            if loss < loss_best:
                W = sess.run(embedding).tolist()
                iteration_best = iteration
                loss_best = loss

            if iteration % 100 == 0:
                end = time.time()
                loss_list.append(loss / 100)
                print("Epoch {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Avg. Training loss: {:.4f}".format(loss / 100),
                      "Avg. Reg. loss: {:.4f}".format(regular_loss / 100),
                      "{:.4f} sec/batch".format((end - start) / 100))


                loss = 0
                regular_loss = 0
                start = time.time()
            iteration += 1


Starting training at 2018-07-02 11:40:31.932924
Epoch 1/5 Iteration: 100 Avg. Training loss: 75.4638 Avg. Reg. loss: 68.0146 0.0422 sec/batch
Epoch 1/5 Iteration: 200 Avg. Training loss: 52.6324 Avg. Reg. loss: 45.2920 0.0241 sec/batch
Epoch 1/5 Iteration: 300 Avg. Training loss: 40.9645 Avg. Reg. loss: 33.6569 0.0242 sec/batch
Epoch 1/5 Iteration: 400 Avg. Training loss: 32.3355 Avg. Reg. loss: 25.0986 0.0240 sec/batch
Epoch 1/5 Iteration: 500 Avg. Training loss: 27.3704 Avg. Reg. loss: 20.1975 0.0229 sec/batch
Epoch 1/5 Iteration: 600 Avg. Training loss: 21.3062 Avg. Reg. loss: 14.2504 0.0240 sec/batch
Epoch 1/5 Iteration: 700 Avg. Training loss: 18.6573 Avg. Reg. loss: 11.6839 0.0366 sec/batch
Epoch 1/5 Iteration: 800 Avg. Training loss: 16.2559 Avg. Reg. loss: 9.3668 0.0243 sec/batch
Epoch 1/5 Iteration: 900 Avg. Training loss: 13.9183 Avg. Reg. loss: 7.1321 0.0242 sec/batch
Epoch 1/5 Iteration: 1000 Avg. Training loss: 12.2962 Avg. Reg. loss: 5.5680 0.0223 sec/batch
Epoch 1/5 Iter

Epoch 1/5 Iteration: 8900 Avg. Training loss: 5.1936 Avg. Reg. loss: 0.1497 0.0241 sec/batch
Epoch 1/5 Iteration: 9000 Avg. Training loss: 5.1623 Avg. Reg. loss: 0.1542 0.0252 sec/batch
Epoch 1/5 Iteration: 9100 Avg. Training loss: 5.1455 Avg. Reg. loss: 0.1417 0.0251 sec/batch
Epoch 1/5 Iteration: 9200 Avg. Training loss: 5.1398 Avg. Reg. loss: 0.1447 0.0245 sec/batch
Epoch 1/5 Iteration: 9300 Avg. Training loss: 5.1323 Avg. Reg. loss: 0.1430 0.0241 sec/batch
Epoch 1/5 Iteration: 9400 Avg. Training loss: 5.1271 Avg. Reg. loss: 0.1485 0.0233 sec/batch
Epoch 1/5 Iteration: 9500 Avg. Training loss: 5.1049 Avg. Reg. loss: 0.1339 0.0231 sec/batch
Epoch 1/5 Iteration: 9600 Avg. Training loss: 5.0893 Avg. Reg. loss: 0.1418 0.0227 sec/batch
Epoch 1/5 Iteration: 9700 Avg. Training loss: 5.1137 Avg. Reg. loss: 0.1473 0.0232 sec/batch
Epoch 1/5 Iteration: 9800 Avg. Training loss: 5.0754 Avg. Reg. loss: 0.1482 0.0238 sec/batch
Epoch 1/5 Iteration: 9900 Avg. Training loss: 5.0705 Avg. Reg. loss: 0

In [26]:
np.array(W)[1]

array([-0.69832498, -0.40161774, -0.15408248, -0.15335205,  0.11083074,
       -0.2064511 , -0.06023936,  0.18341886,  0.03842158, -0.15629894,
       -0.12491406,  0.10320943, -0.03120194, -0.09884547, -0.21968023,
       -0.06601377, -0.35994545, -0.07534487,  0.28153044,  0.2593551 ,
        0.18049815, -0.18095501, -0.13389163, -0.11190755, -0.09025559,
       -0.02896996,  0.0823247 , -0.35764125, -0.16760482, -0.10051528,
       -0.05259525, -0.06677286,  0.09823136,  0.16070138, -0.11842185,
       -0.03522317,  0.02519188, -0.17191665, -0.1800096 , -0.00420266,
       -0.1877479 , -0.16964671, -0.14890842, -0.08514175, -0.04537127,
       -0.0524059 ,  0.11016116, -0.28318945, -0.21428087,  0.03470111])

In [23]:
# save embedding matrics
np.save('w2v_pivots100_50d.npy',np.array(W))
np.save('loss_pivots100_50d.npy',np.array(loss_list))

print('best result at iteration:{0}'.format(iteration_best))

best result at iteration:122001
