In [None]:
import numpy as np
import tensorflow as tf
import random  
from collections import Counter
import datetime, time, json

In [32]:
def create_word_pairs(int_corpus, window_size, stop_size):
    """Build skip-gram (center, context) index pairs from an integer corpus.

    Sentences are visited in order and every position is used as a center
    word until ``stop_size`` center words have been consumed.

    Args:
        int_corpus: iterable of sentences; each sentence is an indexable
            sequence of word indices.
        window_size: maximum distance between a center word and its context
            word (the window is symmetric and never crosses a sentence
            boundary).
        stop_size: maximum number of center words to process. May be a float
            (e.g. ``0.5 * 1000000``); it is only used in a comparison.

    Returns:
        ``(idx_pairs, tokens)`` where ``idx_pairs`` is a list of
        ``(center_word_idx, context_word_idx)`` tuples and ``tokens`` is the
        number of center words that actually contributed pairs.
    """
    idx_pairs = []
    tokens = 0
    # for each sentence
    for sentence in int_corpus:
        sentence_len = len(sentence)
        # for each center word
        for center_word_pos in range(sentence_len):
            # Check the budget *before* consuming the token, so the returned
            # count equals the number of center words that produced pairs.
            if tokens >= stop_size:
                return idx_pairs, tokens
            tokens += 1
            center_word_idx = sentence[center_word_pos]
            # Clamp the window so it does not jump out of the sentence.
            first = max(0, center_word_pos - window_size)
            last = min(sentence_len, center_word_pos + window_size + 1)
            for context_word_pos in range(first, last):
                # A word is never its own context.
                if context_word_pos == center_word_pos:
                    continue
                idx_pairs.append((center_word_idx, sentence[context_word_pos]))

    return idx_pairs, tokens
        

In [5]:
def get_batches(idx_pairs, batch_size):
    """Yield full batches of (center, context) indices.

    ``idx_pairs`` is a sequence of 2-item pairs. It is cut down to the
    largest multiple of ``batch_size``; any trailing partial batch is
    dropped. Each yielded item is a tuple ``(x, y)`` of two lists holding
    the first and second elements of the pairs in that batch.
    """
    usable = (len(idx_pairs) // batch_size) * batch_size
    trimmed = idx_pairs[:usable]
    for start in range(0, len(trimmed), batch_size):
        chunk = trimmed[start:start + batch_size]
        centers = [pair[0] for pair in chunk]
        contexts = [pair[1] for pair in chunk]
        yield centers, contexts

### create word pairs

In [27]:
# Load the Quora corpus and convert the array to nested Python lists.
# Presumably each entry is one sentence of integer word ids (that is what
# create_word_pairs expects) -- TODO confirm against the preprocessing step.
# NOTE(review): absolute, machine-specific path; consider a configurable data dir.
corpus = np.load("/Users/zhang/MscProject_tweak2vec/corpus/quora_corpus_int5.npy").tolist()
#corpus = np.load("/Users/zhang/MscProject_tweak2vec/corpus/pubmed_corpus_int5.npy").tolist()

# Shallow copy so that shuffling does not reorder `corpus` itself.
corpus_shuffle = corpus[:]

# NOTE(review): no random seed is set, so the sentence order (and therefore
# the generated pairs) differs between runs.
random.shuffle(corpus_shuffle)
# Build skip-gram pairs with a +/-5 word window, stopping after at most 7M center words.
idx_pairs_SG_7m, tokens = create_word_pairs(corpus_shuffle, window_size = 5, stop_size = 7000000)
print('totally {0} word pairs'.format(len(idx_pairs_SG_7m)))
print('totally {0} tokens'.format(tokens))


totally 46007576 word pairs
totally 6975371 tokens


In [54]:
# Token budgets, in millions of center words, for which training sets are built.
tokens_lst = [7,6,5,4,3,2,1,0.5,0.1,0.05,0.01]
# idx_pairs[k] holds the (center, context) pairs generated for tokens_lst[k].
idx_pairs = []
for i in tokens_lst:
    # Re-shuffle in place before every budget, so each subset is drawn from a
    # fresh sentence order rather than being a prefix of the previous one.
    random.shuffle(corpus_shuffle)
    # stop_size is a float for the fractional budgets; it is only compared against.
    pairs, tokens = create_word_pairs(corpus_shuffle, window_size = 5, stop_size = i * 1000000)
    idx_pairs.append(pairs)
    # The recorded output shows 6975371 tokens for the 7M budget, i.e. the
    # corpus is exhausted before that budget is reached.
    print('totally {0} word pairs'.format(len(pairs)))
    print('totally {0} tokens'.format(tokens))

totally 46007576 word pairs
totally 6975371 tokens
totally 39582075 word pairs
totally 6000000 tokens
totally 32972469 word pairs
totally 5000000 tokens
totally 26378696 word pairs
totally 4000000 tokens
totally 19784785 word pairs
totally 3000000 tokens
totally 13201468 word pairs
totally 2000000 tokens
totally 6596705 word pairs
totally 1000000 tokens
totally 3292688 word pairs
totally 500000 tokens
totally 661574 word pairs
totally 100000 tokens
totally 328753 word pairs
totally 50000 tokens
totally 66708 word pairs
totally 10000 tokens


In [41]:
# Load the vocabulary; each entry is indexed with [0] below to obtain the word.
# NOTE(review): absolute, machine-specific path.
wordlist = np.load('/Users/zhang/MscProject_tweak2vec/corpus/quora_vocab5.npy').tolist()
# Append an entry for unknown words at the end of the vocabulary.
wordlist.append(['UNK',0])

#wordlist = np.load('/Users/zhang/MscProject_tweak2vec/corpus/pubmed_vocab5.npy').tolist()
#wordlist.append('UNK')

# Map word -> position and position -> word in a single pass each.
# (enumerate replaces wordlist.index(w), which rescanned the list for every
# entry and made these two lines quadratic in the vocabulary size. The result
# is identical as long as the vocabulary has no duplicated entries.)
word2idx = {w[0]: idx for idx, w in enumerate(wordlist)}
idx2word = {idx: w[0] for idx, w in enumerate(wordlist)}

### load pivot word vectors

In [65]:
# Read the pivot-vector file; the context manager closes the handle even if
# reading fails.
# NOTE(review): absolute, machine-specific path.
with open('/Users/zhang/MscProject_tweak2vec/corpus/quora_pivots_google_10000.txt','r') as f:
#f = open('/Users/zhang/MscProject_tweak2vec/corpus/pubmed_pivots_google_5000.txt','r')
    a = f.read()
# NOTE(security): eval() executes whatever Python the file contains. Only use
# it on files you produced yourself. If the file is a plain dict literal
# (numbers, lists, strings), ast.literal_eval is a safe drop-in -- TODO confirm
# the file format before switching.
pivots_dict = eval(a)
print('load {0} pivot words'.format(len(pivots_dict.keys())))

load 10000 pivot words


In [66]:
def dict_slice(adict, start, end):
    """Return a new dict with the entries whose keys sit at positions
    ``start:end`` of ``adict``'s key order. Values are not copied."""
    return {k: adict[k] for k in list(adict.keys())[start:end]}


def get_pivots_slice(pivots_dict, size):
    """Return the first ``size`` pivots as two parallel lists.

    Args:
        pivots_dict: mapping from pivot key to pivot vector.
        size: number of leading entries (in the dict's key order) to keep.

    Returns:
        ``(pivots_idx, pivots_vec)``: the selected keys and deep copies of
        their vectors, so later edits to the result never touch
        ``pivots_dict``.
    """
    # Imported here because the notebook's import cell does not provide it.
    from copy import deepcopy

    # Copy only the requested slice instead of the whole dictionary.
    pivots_slice = deepcopy(dict_slice(pivots_dict, 0, size))
    pivots_idx = list(pivots_slice.keys())
    pivots_vec = list(pivots_slice.values())
    return pivots_idx, pivots_vec

In [76]:
# Number of pivot words used as regularization anchors in the graph below.
n_pivots = 500
# pivots_idx: keys of the first n_pivots entries of pivots_dict;
# pivots_vec: the matching vectors, in the same order.
pivots_idx, pivots_vec = get_pivots_slice(pivots_dict, n_pivots)

### a small tf lab :)

In [106]:
# Scratch experiment: copy a variable, overwrite one row of the copy with
# scatter_update, and watch the squared difference change.
embed = tf.Variable([[0,0],[1,1]])
# initialized_value() makes the copy's initializer wait for `embed` to be
# initialized; global_variables_initializer gives no ordering guarantee, so
# reading the variable directly (tf.identity(embed)) can hit an uninitialized value.
embed_2 = tf.Variable(embed.initialized_value())
# Replace row 0 of the copy with [-5, 5].
ao = tf.scatter_update(embed_2,[0],[[-5,5]])
diff = tf.reduce_sum((embed-embed_2)**2)

# Context manager so the session's resources are released afterwards.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Copy is identical to the original.
    print(sess.run(diff))
    sess.run(ao)
    # Row 0 now differs: (0 - -5)**2 + (0 - 5)**2.
    print(sess.run(diff))

0
50


### build graph with negative sampling

In [83]:
# Pretrained embedding matrix; in the cells visible here it is only referenced
# by the commented-out alternative initialization of `embedding`.
# NOTE(review): absolute, machine-specific path.
google_pretrain = np.load('/Users/zhang/MscProject_tweak2vec/word2vecModel/quora/w2v_google_50d.npy')

In [68]:
n_vocab = len(word2idx)   # vocabulary size (includes the appended 'UNK' entry)
n_embedding = 50          # width of each embedding row
reg_constant = 0.0001     # weight of the squared-distance regularization term
n_sampled = 100           # num_sampled passed to tf.nn.sampled_softmax_loss
learning_rate = 0.001     # learning rate of the Adam optimizer
epochs = 10               # passes over each training set
batch_size = 1000 # number of samples each iteration (also the fixed length of the `inputs` placeholder)

In [69]:
# Skip-gram model trained with a sampled-softmax loss plus a penalty that
# keeps input embeddings close to a fixed reference matrix.
train_graph = tf.Graph()
with train_graph.as_default():
    # input layer: one center-word index per sample; the length is fixed to batch_size
    inputs = tf.placeholder(tf.int32, [batch_size], name='inputs')
    # labels is 2 dimensional as required by tf.nn.sampled_softmax_loss
    labels = tf.placeholder(tf.int32, [None, None], name='labels')
    
    # embedding layer, initialized uniformly in [-init_width, init_width)
    init_width = 0.5 / n_embedding
    embedding = tf.Variable(tf.random_uniform((n_vocab, n_embedding), -init_width, init_width))
#     embedding = tf.Variable(google_pretrain)
    embed = tf.nn.embedding_lookup(embedding, inputs)

    # add regularization term
    # embedding_copy is a non-trainable reference matrix that starts as a copy
    # of the initial embedding. initialized_value() makes its initializer wait
    # for `embedding`; global_variables_initializer gives no ordering guarantee,
    # so tf.identity(embedding) could read an uninitialized variable.
    embedding_copy = tf.Variable(embedding.initialized_value(), trainable=False)
    # Running this op overwrites the pivot rows of the reference with the pivot vectors.
    update_embed_op = tf.scatter_update(embedding_copy,pivots_idx,pivots_vec)
    embed_copy = tf.nn.embedding_lookup(embedding_copy, inputs)
    # Summed (not averaged) over the batch.
    # NOTE(review): rows of non-pivot words keep their initial random values in
    # embedding_copy, so this term also pulls non-pivot words back towards
    # their initialization -- confirm that this is intended.
    reg_loss = reg_constant * tf.reduce_sum((embed-embed_copy)**2)
    
    # sampled softmax layer
    softmax_w = tf.Variable(tf.truncated_normal((n_vocab, n_embedding)), name="softmax_weights")
    softmax_b = tf.Variable(tf.zeros(n_vocab), name="softmax_bias")
    # Calculate the loss using sampled softmax with n_sampled sampled classes per batch
    loss = tf.nn.sampled_softmax_loss(
        weights=softmax_w,
        biases=softmax_b,
        labels=labels,
        inputs=embed,
        num_sampled=n_sampled,
        num_classes=n_vocab)
    cost = tf.reduce_mean(loss)
    
#     total_cost = cost 
    total_cost = cost + reg_loss


    # minimize() only updates trainable variables, so embedding_copy stays fixed.
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(total_cost)

### create subset

In [87]:
# Shuffle a shallow copy of the full pair list so the original order is kept.
# The notebook only defines `idx_pairs_SG_7m`; the previous name
# `idx_pairs_SG` does not exist on a fresh kernel and raised NameError.
# NOTE(review): confirm that the 7M-token pair list is the intended source.
idx_pairs_shuffle = idx_pairs_SG_7m[:]
random.shuffle(idx_pairs_shuffle)

In [31]:
# Nested subsets of the shuffled pairs: each smaller list is a prefix of the
# larger ones (first 10M, 5M and 1M pairs respectively).
# NOTE(review): the training cell visible in this notebook iterates over
# `idx_pairs`, not over these lists -- check whether they are still needed.
idx_pairs_10m = idx_pairs_shuffle[:10000000]
idx_pairs_5m = idx_pairs_shuffle[:5000000]
idx_pairs_1m = idx_pairs_shuffle[:1000000]

### training

In [None]:
# Train one model per token budget in tokens_lst and save, for each, the
# embedding with the lowest windowed average training loss.

# Number of iterations per reporting window. Both running losses are averaged
# over this many batches (the regularization loss used to be divided by 100).
report_every = 1000

with train_graph.as_default():
    # Built once: creating a Saver inside the loop added a fresh set of
    # save/restore ops to the graph for every training run.
    saver = tf.train.Saver()

for i in range(len(idx_pairs)):
    current_tokens = tokens_lst[i] * 1000000
    
    print("Tokens: ", current_tokens)
    print("Starting training at ", datetime.datetime.now())
    t0 = time.time()

    with tf.Session(graph=train_graph) as sess:
        iteration = 1
        # Named running_loss so the graph tensor `loss` is not overwritten.
        running_loss = 0
        regular_loss = 0
        loss_best = float('inf')
        loss_list = []
        iteration_best = 0
        # Reset per run so a snapshot from a previous budget can never be
        # saved under this budget's file name.
        W = None
        sess.run(tf.global_variables_initializer())
        # embedding_copy is non-trainable, so writing the pivot vectors into it
        # once after initialization is enough; no need to repeat it per batch.
        sess.run(update_embed_op)

        for e in range(1, epochs + 1):
            batches = get_batches(idx_pairs[i], batch_size)
            start = time.time()
            for x, y in batches:
                feed = {inputs: x,
                        labels: np.array(y)[:, None]}
                train_loss, _, regu_loss = sess.run([total_cost, optimizer, reg_loss], feed_dict=feed)
#                 train_loss, _ = sess.run([total_cost, optimizer], feed_dict=feed)

                running_loss += train_loss
                regular_loss += regu_loss

                if iteration % report_every == 0:
                    end = time.time()
                    avg_loss = running_loss / report_every
                    loss_list.append(avg_loss)
                    print("Epoch {}/{}".format(e, epochs),
                          "Iteration: {}".format(iteration),
                          "Avg. Training loss: {:.4f}".format(avg_loss),
                          "Avg. Reg. loss: {:.4f}".format(regular_loss / report_every),
                          "{:.4f} sec/batch".format((end - start) / report_every))

                    # Keep the embedding from the best window. The previous
                    # check compared the running *sum* against loss_best, so it
                    # could only fire on the first batch after each reset.
                    if avg_loss < loss_best:
                        W = sess.run(embedding).tolist()
                        iteration_best = iteration
                        loss_best = avg_loss

                    running_loss = 0
                    regular_loss = 0
                    start = time.time()
                iteration += 1

        if W is None:
            # Fewer than report_every iterations ran (small budgets): fall back
            # to the final embedding instead of failing or reusing stale state.
            W = sess.run(embedding).tolist()
            iteration_best = iteration - 1

        np.save('w2v_pivots500_'+str(tokens_lst[i])+'m.npy',np.array(W))        
        print("Finish training at ", datetime.datetime.now()) 
        print("-------------------------------------------------------------------------") 
        print("-------------------------------------------------------------------------")


In [75]:
# Sanity check: show the pivot vector stored under key 1 next to row 1 of the
# embedding snapshot W left over from the most recent training run.
print(pivots_dict[1])
print(W[1])

[-0.696653425693512, -0.4016351103782654, -0.15490229427814484, -0.153431236743927, 0.11273664981126785, -0.20632797479629517, -0.05852844938635826, 0.18393464386463165, 0.04037215933203697, -0.15603983402252197, -0.12679000198841095, 0.10461419820785522, -0.03136150911450386, -0.09917640686035156, -0.21953696012496948, -0.06557910144329071, -0.3572455048561096, -0.07304935902357101, 0.2829059362411499, 0.25940605998039246, 0.18046262860298157, -0.18454191088676453, -0.13335512578487396, -0.11446908116340637, -0.09217895567417145, -0.028645846992731094, 0.07994083315134048, -0.3566879630088806, -0.16788771748542786, -0.09856567531824112, -0.05210083723068237, -0.06661748886108398, 0.09986916929483414, 0.1596103459596634, -0.1205173209309578, -0.03440592437982559, 0.028155574575066566, -0.17301133275032043, -0.17946180701255798, -0.0042143226601183414, -0.18912769854068756, -0.17107552289962769, -0.14589069783687592, -0.08563197404146194, -0.043947286903858185, -0.053388938307762146, 0.