In [1]:
import numpy as np
import tensorflow as tf
import random  
from collections import Counter
import datetime, time, json
from copy import deepcopy

  from ._conv import register_converters as _register_converters


In [2]:
def create_word_pairs(int_corpus, window_size, stop_size):
    idx_pairs = []
    tokens = 0
    # for each snetence 
    for sentence in int_corpus:
        # for each center word
        for center_word_pos in range(len(sentence)):
            center_word_idx = sentence[center_word_pos]
            tokens += 1
            if tokens >= stop_size:
                return idx_pairs, tokens
            else:
                # for each context word within window
                for w in range(-window_size, window_size + 1):
                    context_word_pos = center_word_pos + w
                    # make soure not jump out sentence
                    if context_word_pos < 0 or context_word_pos >= len(sentence) or center_word_pos == context_word_pos:
                        continue
                    context_word_idx = sentence[context_word_pos]
                    idx_pairs.append((center_word_idx, context_word_idx))

                    
    return idx_pairs, tokens
        

In [3]:
def get_batches(idx_pairs, batch_size):
    n_batches = len(idx_pairs) // batch_size
    idx_pairs = idx_pairs[:n_batches*batch_size]
    for idx in range(0, len(idx_pairs), batch_size):
        x, y = [], []
        batch = idx_pairs[idx:idx+batch_size]
        for ii in range (len(batch)):
            x.append(batch[ii][0])
            y.append(batch[ii][1])        
        yield x, y  

### create word pairs

In [18]:
corpus = np.load("/Users/zhang/MscProject_tweak2vec/corpus/quora_corpus_int5.npy").tolist()
#corpus = np.load("/Users/zhang/MscProject_tweak2vec/corpus/pubmed_corpus_int5.npy").tolist()

corpus_shuffle = corpus[:]

random.shuffle(corpus_shuffle)
quora_idx_pairs, tokens = create_word_pairs(corpus_shuffle, window_size = 5, stop_size=7000000)
print('totally {0} word pairs'.format(len(pubmed_idx_pairs)))
print('totally {0} tokens'.format(tokens))


totally 26324522 word pairs
totally 6975371 tokens


### split Quora corpus

In [37]:
quo_tokens_lst = [7,5,3,1,0.5,0.1,0.05,0.01]
quo_idx_pairs = []
for i in quo_tokens_lst:
    random.shuffle(corpus_shuffle)
    pairs, tokens = create_word_pairs(corpus_shuffle, window_size = 5, stop_size = i * 1000000)
    quo_idx_pairs.append(pairs)
    print('totally {0} word pairs'.format(len(pairs)))
    print('totally {0} tokens'.format(tokens))

totally 46007576 word pairs
totally 6975371 tokens
totally 32977208 word pairs
totally 5000000 tokens
totally 19791565 word pairs
totally 3000000 tokens
totally 6606864 word pairs
totally 1000000 tokens
totally 3298821 word pairs
totally 500000 tokens
totally 660021 word pairs
totally 100000 tokens
totally 330586 word pairs
totally 50000 tokens
totally 66163 word pairs
totally 10000 tokens


### split PubMed corpus

In [5]:
tokens_lst = [4,3,2,1,0.5,0.1,0.05,0.01]
idx_pairs = []
for i in tokens_lst:
    random.shuffle(corpus_shuffle)
    pairs, tokens = create_word_pairs(corpus_shuffle, window_size = 5, stop_size = i * 1000000)
    idx_pairs.append(pairs)
    print('totally {0} word pairs'.format(len(pairs)))
    print('totally {0} tokens'.format(tokens))

totally 26324522 word pairs
totally 3258438 tokens
totally 24235016 word pairs
totally 3000000 tokens
totally 16156584 word pairs
totally 2000000 tokens
totally 8082335 word pairs
totally 1000000 tokens
totally 4038440 word pairs
totally 500000 tokens
totally 806713 word pairs
totally 100000 tokens
totally 403567 word pairs
totally 50000 tokens
totally 80929 word pairs
totally 10000 tokens


In [20]:
wordlist = np.load('/Users/zhang/MscProject_tweak2vec/corpus/quora_vocab5.npy').tolist()
wordlist.append(['UNK',0])
word2idx = {w[0]: wordlist.index(w) for w in wordlist }
idx2word = {wordlist.index(w): w[0] for w in wordlist }

# wordlist = np.load('/Users/zhang/MscProject_tweak2vec/corpus/pubmed_vocab5.npy').tolist()
# wordlist.append('UNK')
# word2idx = {w: wordlist.index(w) for w in wordlist }
# idx2word = {wordlist.index(w): w for w in wordlist }

### load pivot word vectors

In [39]:
f = open('/Users/zhang/MscProject_tweak2vec/corpus/quora_pivots_google_full.txt','r')
# f = open('/Users/zhang/MscProject_tweak2vec/corpus/pubmed_pivots_google_full.txt','r')
a = f.read()
pivots_dict = eval(a)
f.close()
print('load {0} pivot words'.format(len(pivots_dict.keys())))

load 24965 pivot words


In [22]:
dict_slice = lambda adict, start, end: dict((k, adict[k]) for k in list(adict.keys())[start:end])
def get_pivots_slice(pivots_dict, size):
    pivots = deepcopy(pivots_dict)
    pivots_slice = dict_slice(pivots, 0, size)
    pivots_idx = []
    pivots_vec = []
    for i in pivots_slice.keys():
        pivots_idx.append(i)
        pivots_vec.append(pivots_slice[i])
    return pivots_idx, pivots_vec

In [41]:
pivots_idx = []
pivots_vec = []
for i in pivots_dict.keys():
    pivots_idx.append(i)
    pivots_vec.append(pivots_dict[i])
len(pivots_idx)

24965

In [24]:
n_pivots = 5000
pivots_idx, pivots_vec = get_pivots_slice(pivots_dict, n_pivots)

### a small tf lab :)

In [10]:
embed = tf.Variable([[0,0],[1,1]])
embed_2 = tf.Variable(tf.identity(embed))
ao = tf.scatter_update(embed_2,[0],[[-5,5]])
diff = tf.reduce_sum((embed-embed_2)**2)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
print(sess.run(diff))
sess.run(ao)
print(sess.run(diff))

0
50


### build graph with negative sampling

In [83]:
google_pretrain = np.load('/Users/zhang/MscProject_tweak2vec/word2vecModel/quora/w2v_google_50d.npy')

In [42]:
n_vocab = len(word2idx)
n_embedding = 50
reg_constant = 0.0001
n_sampled = 100
learning_rate = 0.001
epochs = 10
batch_size = 1000 # number of samples each iteration

In [43]:
train_graph = tf.Graph()
with train_graph.as_default():
    # input layer
    inputs = tf.placeholder(tf.int32, [batch_size], name='inputs')
    # labels is 2 dimensional as required by tf.nn.sampled_softmax_loss used for negative sampling.
    labels = tf.placeholder(tf.int32, [None, None], name='labels')
    
    # embedding layer
    init_width = 0.5 / n_embedding
    embedding = tf.Variable(tf.random_uniform((n_vocab, n_embedding), -init_width, init_width))
#     embedding = tf.Variable(google_pretrain)
    embed = tf.nn.embedding_lookup(embedding, inputs)

    # add regularization term
    embedding_copy = tf.Variable(tf.identity(embedding), trainable=False)
    update_embed_op = tf.scatter_update(embedding_copy,pivots_idx,pivots_vec)
    embed_copy = tf.nn.embedding_lookup(embedding_copy, inputs)
    reg_loss = reg_constant * tf.reduce_sum((embed-embed_copy)**2)
    
    # sampled softmax layer
    softmax_w = tf.Variable(tf.truncated_normal((n_vocab, n_embedding)), name="softmax_weights")
    softmax_b = tf.Variable(tf.zeros(n_vocab), name="softmax_bias")
    # Calculate the loss using negative sampling
    loss = tf.nn.sampled_softmax_loss(
        weights=softmax_w,
        biases=softmax_b,
        labels=labels,
        inputs=embed,
        num_sampled=n_sampled,
        num_classes=n_vocab)
    cost = tf.reduce_mean(loss)
    
#     total_cost = cost 
    total_cost = cost + reg_loss


    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(total_cost)

### training

In [44]:
for i in range(len(quo_idx_pairs)):
    current_tokens = quo_tokens_lst[i] * 1000000
    
    print("Tokens: ", current_tokens)
    print("Starting training at ", datetime.datetime.now())
    t0 = time.time()

    with train_graph.as_default():
        saver = tf.train.Saver()

    with tf.Session(graph=train_graph) as sess:
        iteration = 1
        loss = 0
        regular_loss = 0
        loss_best = 100
        loss_list = []
        iteration_best = 0
        sess.run(tf.global_variables_initializer())

        for e in range(1, epochs + 1):
            batches = get_batches(quo_idx_pairs[i], batch_size)
            start = time.time()
            for x, y in batches:
                feed = {inputs: x,
                        labels: np.array(y)[:, None]}
                sess.run(update_embed_op)
                train_loss, _, regu_loss = sess.run([total_cost, optimizer, reg_loss], feed_dict=feed)
#                 train_loss, _ = sess.run([total_cost, optimizer], feed_dict=feed)

                loss += train_loss
                regular_loss += regu_loss

                if loss < loss_best:
                    W = sess.run(embedding).tolist()
                    iteration_best = iteration
                    loss_best = loss

                if iteration % 1000 == 0:
                    end = time.time()
                    loss_list.append(loss / 1000)
                    print("Epoch {}/{}".format(e, epochs),
                          "Iteration: {}".format(iteration),
                          "Avg. Training loss: {:.4f}".format(loss / 1000),
                          "Avg. Reg. loss: {:.4f}".format(regular_loss / 100),
                          "{:.4f} sec/batch".format((end - start) / 1000))


                    loss = 0
                    regular_loss = 0
                    start = time.time()
                iteration += 1
                
        np.save('w2v_pivotsfull_'+str(quo_idx_pairs[i])+'m.npy',np.array(W))        
        print("Finish training at ", datetime.datetime.now()) 
        print("-------------------------------------------------------------------------") 
        print("-------------------------------------------------------------------------")


Tokens:  7000000
Starting training at  2018-07-21 17:13:49.827191
Epoch 1/10 Iteration: 1000 Avg. Training loss: 6.7379 Avg. Reg. loss: 2.8513 0.0425 sec/batch
Epoch 1/10 Iteration: 2000 Avg. Training loss: 5.3986 Avg. Reg. loss: 2.8845 0.0278 sec/batch
Epoch 1/10 Iteration: 3000 Avg. Training loss: 5.0377 Avg. Reg. loss: 2.7677 0.0277 sec/batch
Epoch 1/10 Iteration: 4000 Avg. Training loss: 4.8716 Avg. Reg. loss: 2.6760 0.0275 sec/batch
Epoch 1/10 Iteration: 5000 Avg. Training loss: 4.7755 Avg. Reg. loss: 2.5883 0.0273 sec/batch
Epoch 1/10 Iteration: 6000 Avg. Training loss: 4.7029 Avg. Reg. loss: 2.5097 0.0273 sec/batch
Epoch 1/10 Iteration: 7000 Avg. Training loss: 4.6407 Avg. Reg. loss: 2.4571 0.0267 sec/batch
Epoch 1/10 Iteration: 8000 Avg. Training loss: 4.6066 Avg. Reg. loss: 2.3913 0.0250 sec/batch
Epoch 1/10 Iteration: 9000 Avg. Training loss: 4.5775 Avg. Reg. loss: 2.3389 0.0242 sec/batch
Epoch 1/10 Iteration: 10000 Avg. Training loss: 4.5448 Avg. Reg. loss: 2.2667 0.0256 sec

Epoch 2/10 Iteration: 87000 Avg. Training loss: 4.0533 Avg. Reg. loss: 1.0518 0.0247 sec/batch
Epoch 2/10 Iteration: 88000 Avg. Training loss: 4.0494 Avg. Reg. loss: 1.0433 0.0231 sec/batch
Epoch 2/10 Iteration: 89000 Avg. Training loss: 4.0426 Avg. Reg. loss: 1.0406 0.0230 sec/batch
Epoch 2/10 Iteration: 90000 Avg. Training loss: 4.0358 Avg. Reg. loss: 1.0321 0.0229 sec/batch
Epoch 2/10 Iteration: 91000 Avg. Training loss: 4.0395 Avg. Reg. loss: 1.0387 0.0229 sec/batch
Epoch 2/10 Iteration: 92000 Avg. Training loss: 4.0292 Avg. Reg. loss: 1.0280 0.0225 sec/batch
Epoch 3/10 Iteration: 93000 Avg. Training loss: 4.0359 Avg. Reg. loss: 1.0132 0.0237 sec/batch
Epoch 3/10 Iteration: 94000 Avg. Training loss: 4.0195 Avg. Reg. loss: 1.0186 0.0228 sec/batch
Epoch 3/10 Iteration: 95000 Avg. Training loss: 4.0306 Avg. Reg. loss: 1.0174 0.0229 sec/batch
Epoch 3/10 Iteration: 96000 Avg. Training loss: 4.0253 Avg. Reg. loss: 1.0132 0.0223 sec/batch
Epoch 3/10 Iteration: 97000 Avg. Training loss: 4.

Epoch 4/10 Iteration: 173000 Avg. Training loss: 3.9585 Avg. Reg. loss: 0.8234 0.0218 sec/batch
Epoch 4/10 Iteration: 174000 Avg. Training loss: 3.9557 Avg. Reg. loss: 0.8248 0.0237 sec/batch
Epoch 4/10 Iteration: 175000 Avg. Training loss: 3.9457 Avg. Reg. loss: 0.8245 0.0236 sec/batch
Epoch 4/10 Iteration: 176000 Avg. Training loss: 3.9497 Avg. Reg. loss: 0.8242 0.0236 sec/batch
Epoch 4/10 Iteration: 177000 Avg. Training loss: 3.9497 Avg. Reg. loss: 0.8269 0.0233 sec/batch
Epoch 4/10 Iteration: 178000 Avg. Training loss: 3.9494 Avg. Reg. loss: 0.8221 0.0236 sec/batch
Epoch 4/10 Iteration: 179000 Avg. Training loss: 3.9560 Avg. Reg. loss: 0.8225 0.0236 sec/batch
Epoch 4/10 Iteration: 180000 Avg. Training loss: 3.9514 Avg. Reg. loss: 0.8190 0.0236 sec/batch
Epoch 4/10 Iteration: 181000 Avg. Training loss: 3.9469 Avg. Reg. loss: 0.8191 0.0237 sec/batch
Epoch 4/10 Iteration: 182000 Avg. Training loss: 3.9407 Avg. Reg. loss: 0.8145 0.0237 sec/batch
Epoch 4/10 Iteration: 183000 Avg. Traini

Epoch 6/10 Iteration: 259000 Avg. Training loss: 3.9150 Avg. Reg. loss: 0.7851 0.0255 sec/batch
Epoch 6/10 Iteration: 260000 Avg. Training loss: 3.9005 Avg. Reg. loss: 0.7849 0.0246 sec/batch
Epoch 6/10 Iteration: 261000 Avg. Training loss: 3.9181 Avg. Reg. loss: 0.7879 0.0252 sec/batch
Epoch 6/10 Iteration: 262000 Avg. Training loss: 3.9124 Avg. Reg. loss: 0.7858 0.0264 sec/batch
Epoch 6/10 Iteration: 263000 Avg. Training loss: 3.9114 Avg. Reg. loss: 0.7848 0.0236 sec/batch
Epoch 6/10 Iteration: 264000 Avg. Training loss: 3.9268 Avg. Reg. loss: 0.7802 0.0246 sec/batch
Epoch 6/10 Iteration: 265000 Avg. Training loss: 3.9322 Avg. Reg. loss: 0.7827 0.0252 sec/batch
Epoch 6/10 Iteration: 266000 Avg. Training loss: 3.9234 Avg. Reg. loss: 0.7835 0.0251 sec/batch
Epoch 6/10 Iteration: 267000 Avg. Training loss: 3.9128 Avg. Reg. loss: 0.7847 0.0254 sec/batch
Epoch 6/10 Iteration: 268000 Avg. Training loss: 3.9062 Avg. Reg. loss: 0.7843 0.0254 sec/batch
Epoch 6/10 Iteration: 269000 Avg. Traini

Epoch 8/10 Iteration: 345000 Avg. Training loss: 3.9111 Avg. Reg. loss: 0.8018 0.0235 sec/batch
Epoch 8/10 Iteration: 346000 Avg. Training loss: 3.9089 Avg. Reg. loss: 0.8004 0.0234 sec/batch
Epoch 8/10 Iteration: 347000 Avg. Training loss: 3.9046 Avg. Reg. loss: 0.7978 0.0236 sec/batch
Epoch 8/10 Iteration: 348000 Avg. Training loss: 3.9054 Avg. Reg. loss: 0.8014 0.0235 sec/batch
Epoch 8/10 Iteration: 349000 Avg. Training loss: 3.8943 Avg. Reg. loss: 0.7947 0.0232 sec/batch
Epoch 8/10 Iteration: 350000 Avg. Training loss: 3.9057 Avg. Reg. loss: 0.7986 0.0239 sec/batch
Epoch 8/10 Iteration: 351000 Avg. Training loss: 3.9042 Avg. Reg. loss: 0.8000 0.0233 sec/batch
Epoch 8/10 Iteration: 352000 Avg. Training loss: 3.8926 Avg. Reg. loss: 0.8023 0.0229 sec/batch
Epoch 8/10 Iteration: 353000 Avg. Training loss: 3.9110 Avg. Reg. loss: 0.8040 0.0236 sec/batch
Epoch 8/10 Iteration: 354000 Avg. Training loss: 3.9005 Avg. Reg. loss: 0.8027 0.0239 sec/batch
Epoch 8/10 Iteration: 355000 Avg. Traini

Epoch 10/10 Iteration: 431000 Avg. Training loss: 3.9050 Avg. Reg. loss: 0.8314 0.0243 sec/batch
Epoch 10/10 Iteration: 432000 Avg. Training loss: 3.8975 Avg. Reg. loss: 0.8327 0.0241 sec/batch
Epoch 10/10 Iteration: 433000 Avg. Training loss: 3.9051 Avg. Reg. loss: 0.8364 0.0250 sec/batch
Epoch 10/10 Iteration: 434000 Avg. Training loss: 3.9076 Avg. Reg. loss: 0.8349 0.0253 sec/batch
Epoch 10/10 Iteration: 435000 Avg. Training loss: 3.9018 Avg. Reg. loss: 0.8316 0.0251 sec/batch
Epoch 10/10 Iteration: 436000 Avg. Training loss: 3.9010 Avg. Reg. loss: 0.8379 0.0269 sec/batch
Epoch 10/10 Iteration: 437000 Avg. Training loss: 3.9072 Avg. Reg. loss: 0.8397 0.0294 sec/batch
Epoch 10/10 Iteration: 438000 Avg. Training loss: 3.9119 Avg. Reg. loss: 0.8370 0.0313 sec/batch
Epoch 10/10 Iteration: 439000 Avg. Training loss: 3.9037 Avg. Reg. loss: 0.8361 0.0321 sec/batch
Epoch 10/10 Iteration: 440000 Avg. Training loss: 3.9001 Avg. Reg. loss: 0.8394 0.0248 sec/batch
Epoch 10/10 Iteration: 441000 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [38]:
print(pivots_dict[1])
print(W[1])

[-0.696653425693512, -0.4016351103782654, -0.15490229427814484, -0.153431236743927, 0.11273664981126785, -0.20632797479629517, -0.05852844938635826, 0.18393464386463165, 0.04037215933203697, -0.15603983402252197, -0.12679000198841095, 0.10461419820785522, -0.03136150911450386, -0.09917640686035156, -0.21953696012496948, -0.06557910144329071, -0.3572455048561096, -0.07304935902357101, 0.2829059362411499, 0.25940605998039246, 0.18046262860298157, -0.18454191088676453, -0.13335512578487396, -0.11446908116340637, -0.09217895567417145, -0.028645846992731094, 0.07994083315134048, -0.3566879630088806, -0.16788771748542786, -0.09856567531824112, -0.05210083723068237, -0.06661748886108398, 0.09986916929483414, 0.1596103459596634, -0.1205173209309578, -0.03440592437982559, 0.028155574575066566, -0.17301133275032043, -0.17946180701255798, -0.0042143226601183414, -0.18912769854068756, -0.17107552289962769, -0.14589069783687592, -0.08563197404146194, -0.043947286903858185, -0.053388938307762146, 0.