In [1]:
import numpy as np
import tensorflow as tf
import random  
from collections import Counter
import datetime, time, json
from copy import deepcopy

  from ._conv import register_converters as _register_converters


In [2]:
def create_word_pairs(int_corpus, window_size, stop_size):
    """Build skip-gram (center, context) pairs from an integer-encoded corpus.

    Args:
        int_corpus: iterable of sentences; each sentence is an indexable
            sequence of word ids supporting ``len()``.
        window_size: number of positions to look at on each side of the
            center word.
        stop_size: maximum number of center tokens to process (int or float).

    Returns:
        ``(idx_pairs, tokens)`` where ``idx_pairs`` is a list of
        ``(center_word_idx, context_word_idx)`` tuples and ``tokens`` is the
        number of center tokens whose pairs were emitted.
    """
    idx_pairs = []
    tokens = 0
    # for each sentence
    for sentence in int_corpus:
        sentence_len = len(sentence)
        # for each center word
        for center_word_pos in range(sentence_len):
            # Check the cap *before* counting the token: the previous version
            # counted the stop_size-th token but returned without emitting its
            # pairs, so the reported count was one higher than the number of
            # tokens actually used (and stop_size=0 still reported 1 token).
            if tokens >= stop_size:
                return idx_pairs, tokens
            tokens += 1
            center_word_idx = sentence[center_word_pos]
            # Clamp the window to the sentence so we never index outside it.
            first_pos = max(0, center_word_pos - window_size)
            end_pos = min(sentence_len, center_word_pos + window_size + 1)
            for context_word_pos in range(first_pos, end_pos):
                # the center word is not its own context
                if context_word_pos == center_word_pos:
                    continue
                idx_pairs.append((center_word_idx, sentence[context_word_pos]))

    return idx_pairs, tokens
        

In [3]:
def get_batches(idx_pairs, batch_size):
    """Yield full mini-batches of center ids and context ids.

    ``idx_pairs`` is a sliceable sequence of (center, context) pairs. Only
    complete batches are produced: pairs left over after the last full batch
    are dropped. Each yielded item is ``(x, y)`` with ``x`` the list of first
    elements and ``y`` the list of second elements of the batch's pairs.
    """
    usable_len = (len(idx_pairs) // batch_size) * batch_size
    trimmed = idx_pairs[:usable_len]
    for offset in range(0, len(trimmed), batch_size):
        chunk = trimmed[offset:offset + batch_size]
        centers = [pair[0] for pair in chunk]
        contexts = [pair[1] for pair in chunk]
        yield centers, contexts

### create word pairs

In [47]:
# Load the integer-encoded corpus, shuffle a copy of its sentences and build
# (center, context) pairs with a window of 5, capped at 7,000,000 tokens.
# The commented-out line below is the Quora alternative; PubMed is active.
# corpus = np.load("/Users/zhang/MscProject_tweak2vec/corpus/quora_corpus_int5.npy").tolist()

corpus = np.load("/Users/zhang/MscProject_tweak2vec/corpus/pubmed_corpus_int5.npy").tolist()

# Shallow copy so that shuffling does not reorder `corpus` itself.
corpus_shuffle = corpus[:]

# NOTE(review): no random seed is set in the visible cells, so the sentence
# order (and every slice derived from it) differs between runs.
random.shuffle(corpus_shuffle)
# NOTE(review): the result is named `quora_idx_pairs` although the PubMed
# corpus is the one loaded above — confirm which corpus is intended.
quora_idx_pairs, tokens = create_word_pairs(corpus_shuffle, window_size = 5, stop_size=7000000)
print('totally {0} word pairs'.format(len(quora_idx_pairs)))
print('totally {0} tokens'.format(tokens))


totally 26324522 word pairs
totally 3258438 tokens


### split Quora corpus

In [5]:
# Build one set of word pairs per target corpus size for the Quora runs.
# Each entry of quo_tokens_lst is multiplied by 1,000,000 to obtain the
# token cap passed to create_word_pairs (so the cap may be a float).
quo_tokens_lst = [0.01, 0.05, 0.1, 0.5, 1, 3, 5]
quo_idx_pairs = []
for i in quo_tokens_lst:
    # Reshuffle in place before every slice: each slice starts from a fresh
    # sentence order, so the slices are not prefixes of one another.
    # NOTE(review): this draws from whatever `corpus_shuffle` currently holds;
    # the loading cell must have the Quora corpus active for these to be
    # Quora pairs — confirm before running.
    random.shuffle(corpus_shuffle)
    # `tokens` is rebound on every iteration; only the last value survives.
    pairs, tokens = create_word_pairs(corpus_shuffle, window_size = 5, stop_size = i * 1000000)
    quo_idx_pairs.append(pairs)
    print('totally {0} word pairs'.format(len(pairs)))
    print('totally {0} tokens'.format(tokens))

totally 66209 word pairs
totally 10000 tokens
totally 328653 word pairs
totally 50000 tokens
totally 659424 word pairs
totally 100000 tokens
totally 3301805 word pairs
totally 500000 tokens
totally 6593213 word pairs
totally 1000000 tokens
totally 19783540 word pairs
totally 3000000 tokens
totally 32976829 word pairs
totally 5000000 tokens


### split PubMed corpus

In [48]:
# Build one set of word pairs per target corpus size for the PubMed runs.
# Each entry of tokens_lst is multiplied by 1,000,000 to obtain the token cap
# passed to create_word_pairs. The training cell indexes `idx_pairs` and
# `tokens_lst` with the same position, so the two lists must stay aligned.
tokens_lst = [0.01,0.05,0.1,0.5,1,2,3]
idx_pairs = []
for i in tokens_lst:
    # Reshuffle in place before every slice: each slice starts from a fresh
    # sentence order, so the slices are not prefixes of one another.
    # NOTE(review): no random seed is set in the visible cells, so slices are
    # not reproducible across kernel restarts.
    random.shuffle(corpus_shuffle)
    # `tokens` is rebound on every iteration; only the last value survives.
    pairs, tokens = create_word_pairs(corpus_shuffle, window_size = 5, stop_size = i * 1000000)
    idx_pairs.append(pairs)
    print('totally {0} word pairs'.format(len(pairs)))
    print('totally {0} tokens'.format(tokens))

totally 80440 word pairs
totally 10000 tokens
totally 403227 word pairs
totally 50000 tokens
totally 806176 word pairs
totally 100000 tokens
totally 4042017 word pairs
totally 500000 tokens
totally 8074776 word pairs
totally 1000000 tokens
totally 16161575 word pairs
totally 2000000 tokens
totally 24239608 word pairs
totally 3000000 tokens


In [16]:
# Build the word <-> index lookup tables for the active corpus (PubMed).
# Quora alternative, kept for switching corpora (there the word is the first
# field of each vocabulary entry):
# wordlist = np.load('/Users/zhang/MscProject_tweak2vec/corpus/quora_vocab5.npy').tolist()
# wordlist.append(['UNK',0])
# word2idx = {w[0]: wordlist.index(w) for w in wordlist }
# idx2word = {wordlist.index(w): w[0] for w in wordlist }


wordlist = np.load('/Users/zhang/MscProject_tweak2vec/corpus/pubmed_vocab5.npy').tolist()
wordlist.append('UNK')

# Single pass over the vocabulary. The previous comprehensions called
# wordlist.index(w) for every word, which rescans the list each time
# (quadratic in vocabulary size). setdefault keeps the FIRST position of a
# repeated word, which is exactly what list.index returned.
word2idx = {}
for idx, w in enumerate(wordlist):
    word2idx.setdefault(w, idx)
idx2word = {idx: w for w, idx in word2idx.items()}

### load pivots slice

In [6]:
# Load the Quora pivot dictionaries, keyed by the number embedded in the file
# name (1, 10, 20). Each file is expected to contain a Python dict literal.
q_pivots_dict = {}
for i in [1,10,20]:
    # `with` guarantees the file is closed even if reading fails; previously
    # the handle leaked whenever read()/eval() raised before f.close().
    with open('/Users/zhang/MscProject_tweak2vec/corpus/quora_pivot_'+str(i)+'.txt','r') as f:
        a = f.read()
    # NOTE(review): eval() executes arbitrary code found in the file. This is
    # only acceptable for trusted, locally generated files; if the content is
    # a plain literal, ast.literal_eval is the safe replacement.
    q_pivots_dict[i] = eval(a)
    print('load {0} pivot words'.format(len(q_pivots_dict[i].keys())))

load 485 pivot words
load 5075 pivot words
load 9343 pivot words


In [7]:
# For every pivot set, split the dict into two parallel lists:
# q_pivots_idx[i] holds the keys and q_pivots_vec[i] the matching values,
# both in the dict's iteration order.
q_pivots_idx = {}
q_pivots_vec = {}
for i in [1,10,20]:
    q_pivots_idx[i] = list(q_pivots_dict[i].keys())
    q_pivots_vec[i] = [q_pivots_dict[i][key] for key in q_pivots_idx[i]]

In [26]:
# Sanity check: shape of the currently selected pivot vectors as an array.
# NOTE(review): `cur_pivots_vec` is only assigned in cells that appear later
# in the notebook, so on a fresh kernel run top-to-bottom this cell raises
# NameError — it should come after the cell that selects the pivots.
np.array(cur_pivots_vec).shape

(9343, 50)

In [25]:
# Select the pivot set stored under key 20 as the active pivots.
# NOTE(review): the "full pivots" cell further down rebinds both names, so
# which set is active depends on which of the two cells ran last.
cur_pivots_idx = q_pivots_idx[20]
cur_pivots_vec = q_pivots_vec[20]

### full pivots

In [70]:
# Load the full pivot dictionary and make it the active pivot set.
# `with` guarantees the file is closed even if reading fails; previously the
# handle leaked whenever read()/eval() raised before f.close().
with open('/Users/zhang/MscProject_tweak2vec/corpus/pubmed_pivots_google_full.txt','r') as f:
    a = f.read()
# NOTE(review): eval() executes arbitrary code found in the file. This is only
# acceptable for trusted, locally generated files; if the content is a plain
# literal, ast.literal_eval is the safe replacement.
full_pivots_dict = eval(a)
# NOTE(review): the message says "quora corpus" but the file loaded above is
# the PubMed pivot file — confirm and reword.
print('load {0} pivot words from quora corpus'.format(len(full_pivots_dict.keys())))

# Parallel lists: keys and their vectors, in the dict's iteration order.
full_pivots_idx = list(full_pivots_dict.keys())
full_pivots_vec = [full_pivots_dict[w] for w in full_pivots_idx]
# Rebinds the active pivot set (overrides any earlier selection).
cur_pivots_idx = full_pivots_idx
cur_pivots_vec = full_pivots_vec

load 16867 pivot words from quora corpus


In [114]:
# Load the alpha array, scale every entry by 1,000,000 and flatten it to 1-D.
# presumably these are the per-word weights fed to the commented-out
# `alpha_table` placeholder in the graph cell — TODO confirm.
# NOTE(review): relative path, unlike the absolute paths used elsewhere.
alpha = np.load('pubmed_alpha.npy')
# Reshape to a column and back: for a 1-D input the two reshapes cancel out,
# so only the scaling below changes the values.
alpha = alpha.reshape(len(alpha),1)
alpha = alpha * 1000000
alpha = alpha.reshape(len(alpha))
# Displayed as the cell output.
alpha.shape

(27188,)

### ------------ closed --------------

In [33]:
def dict_slice(adict, start, end):
    """Return a new dict with the entries of ``adict`` at key positions [start:end].

    Positions refer to the dict's iteration order; values are not copied.
    """
    selected_keys = list(adict.keys())[start:end]
    return {k: adict[k] for k in selected_keys}


def get_pivots_slice(pivots_dict, size):
    """Return the first ``size`` pivots as two parallel lists.

    Args:
        pivots_dict: mapping from pivot key to its vector.
        size: number of leading entries (in iteration order) to keep.

    Returns:
        ``(pivots_idx, pivots_vec)``: the kept keys and deep copies of their
        vectors, so mutating the result never touches ``pivots_dict``.
    """
    # Slice first, then deep-copy only the entries that are kept; copying the
    # whole dictionary before slicing did work proportional to its full size.
    pivots_slice = deepcopy(dict_slice(pivots_dict, 0, size))
    pivots_idx = list(pivots_slice.keys())
    pivots_vec = list(pivots_slice.values())
    return pivots_idx, pivots_vec

In [35]:
# Take the first n_pivots entries of the pivot dictionary as index/vector lists.
# NOTE(review): `pivots_dict` is not defined in any visible cell (only
# q_pivots_dict and full_pivots_dict are), so this cell presumably fails on a
# fresh kernel; it sits in the section marked "closed" — confirm it is unused.
n_pivots = 10000
pivots_idx, pivots_vec = get_pivots_slice(pivots_dict, n_pivots)

### ------------ closed --------------

### a small tf lab :)

In [146]:
# Scratch experiment: check that scatter_update on a copy of a variable leaves
# the original untouched and that the squared difference reflects the update.
embed = tf.Variable([[0,0],[1,1]])
# Second variable initialised from the value of the first.
embed_2 = tf.Variable(tf.identity(embed))
# Op that overwrites row 0 of embed_2 with [-5, 5] when run.
ao = tf.scatter_update(embed_2,[0],[[-5,5]])
# Sum of squared element-wise differences between the two variables.
diff = tf.reduce_sum((embed-embed_2)**2)

# NOTE(review): this session is never closed.
sess = tf.Session()
sess.run(tf.global_variables_initializer())
# Before the update both variables are equal (recorded output: 0).
print(sess.run(diff))
sess.run(ao)
# After the update only row 0 differs: (0+5)^2 + (0-5)^2 (recorded output: 50).
print(sess.run(diff))

0
50


### build graph with negative sampling

In [40]:
# Load the pretrained embedding matrix used to initialise the `embedding`
# variable in the graph cell. The commented-out line is the Quora alternative.
#google_pretrain = np.load('/Users/zhang/MscProject_tweak2vec/word2vecModel/quora/tokens/w2v_quoragoogle_50d.npy')
google_pretrain = np.load('/Users/zhang/MscProject_tweak2vec/word2vecModel/pubmed/w2v_pubmedgoogle_50d.npy')
# Cast to float32 so the variable created from it has that dtype.
# assumes the array has n_vocab rows and n_embedding columns — TODO confirm.
google_pretrain = np.float32(google_pretrain)

In [22]:
# Hyper-parameters shared by the graph and training cells.
n_vocab = len(word2idx)  # number of distinct words in the lookup table
n_embedding = 50  # embedding dimensionality
reg_constant = 0.0001  # only referenced by commented-out regularisation code
n_sampled = 100  # passed as num_sampled to the sampled softmax loss
learning_rate = 0.001  # Adam learning rate
epochs = 10  # passes over each slice of word pairs
batch_size = 1000 # number of samples each iteration
# Hard-coded reference sizes, printed next to the actual vocabulary size so a
# mismatch with the intended corpus is easy to spot.
print('quora(30300),pubmed(27188)')
print('current vocab word:',n_vocab)

quora(30300),pubmed(27188)
current vocab word: 27188


In [41]:
# Build the training graph: an embedding lookup initialised from the
# pretrained matrix, followed by a sampled softmax loss minimised with Adam.
train_graph = tf.Graph()
with train_graph.as_default():
    # input layer
    # Fixed length: every fed batch must contain exactly batch_size word ids.
    inputs = tf.placeholder(tf.int32, [batch_size], name='inputs')
    # labels is 2 dimensional as required by tf.nn.sampled_softmax_loss used for negative sampling.
    labels = tf.placeholder(tf.int32, [None, None], name='labels')
    
#     alpha_table = tf.placeholder(tf.float32, [n_vocab], name='alpha')

    
    # embedding layer
    # The variable takes its shape and initial values from google_pretrain;
    # the random initialisation below is the disabled alternative.
#     init_width = 0.5 / n_embedding
#     embedding = tf.Variable(tf.random_uniform((n_vocab, n_embedding), -init_width, init_width))
    embedding = tf.get_variable(initializer=google_pretrain, name='embedding')
    embed = tf.nn.embedding_lookup(embedding, inputs)

    # Disabled experiment: penalise the distance between the trained embedding
    # and a non-trainable copy whose pivot rows are overwritten with the pivot
    # vectors, weighted per word by alpha.
#     # add regularization term
#     embedding_copy = tf.Variable(tf.identity(embedding), trainable=False)
#     update_embed_op = tf.scatter_update(embedding_copy, cur_pivots_idx, cur_pivots_vec)
#     embed_copy = tf.nn.embedding_lookup(embedding_copy, inputs)
#     alpha_subet = tf.nn.embedding_lookup(alpha_table, inputs)
#     alpha_subet = tf.cast(alpha_subet, tf.float32)
# #     reg_loss = reg_constant * tf.reduce_sum((embed-embed_copy)**2)
#     w_diff = tf.reduce_sum((embed-embed_copy)**2,1)
#     alpha_diff = tf.multiply(alpha_subet, w_diff)
#     reg_loss = tf.reduce_sum( alpha_diff )
    
    # sampled softmax layer
    # Output weights and biases, one row / entry per vocabulary word.
    softmax_w = tf.Variable(tf.truncated_normal((n_vocab, n_embedding)), name="softmax_weights")
    softmax_b = tf.Variable(tf.zeros(n_vocab), name="softmax_bias")
    # Calculate the loss using negative sampling
    # NOTE(review): the op used is tf.nn.sampled_softmax_loss, i.e. sampled
    # softmax rather than the word2vec negative-sampling objective — confirm
    # the wording (or the op) matches what is intended.
    loss = tf.nn.sampled_softmax_loss(
        weights=softmax_w,
        biases=softmax_b,
        labels=labels,
        inputs=embed,
        num_sampled=n_sampled,
        num_classes=n_vocab)
    # Mean of the per-example losses over the batch.
    cost = tf.reduce_mean(loss)
    
    # Regularisation is disabled, so the optimised quantity is just `cost`.
    total_cost = cost 
#     total_cost = cost + reg_loss


    # `optimizer` is the training op returned by minimize(), not the optimizer object.
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(total_cost)

### training

In [49]:
# Train one model per slice in `idx_pairs` (aligned with `tokens_lst`) and
# save the best embedding snapshot of every run to
# 'w2v_retrain_<size>m.npy'.

# Number of iterations per logging / snapshot window.
report_every = 1000

with train_graph.as_default():
    # Created once: constructing a Saver inside the loop added another set of
    # save/restore ops to the graph for every run.
    saver = tf.train.Saver()

# Iterate over however many slices exist instead of a hard-coded count.
for i in range(len(tokens_lst)):

    current_tokens = tokens_lst[i] * 1000000
#     current_pivots_idx = np.array(q_pivots_idx[p])
#     current_pivots_vec = np.array(q_pivots_vec[p])

    print("Tokens: ", current_tokens)
    #print("Pivots: ", len(cur_pivots_idx))
    print("Starting training at ", datetime.datetime.now())
    t0 = time.time()

    with tf.Session(graph=train_graph) as sess:
        iteration = 1
        # A separate name keeps the graph's `loss` tensor from being rebound
        # to a Python float by this cell.
        running_loss = 0
        regular_loss = 0
        loss_best = float('inf')
        loss_list = []
        iteration_best = 0
        # Reset per run so a snapshot from a previous slice can never be
        # saved under this slice's file name.
        W = None
        sess.run(tf.global_variables_initializer())

        # Started once per window, not once per epoch: windows span epoch
        # boundaries, and resetting here made sec/batch too small.
        start = time.time()
        for e in range(1, epochs + 1):
            batches = get_batches(idx_pairs[i], batch_size)
            for x, y in batches:
                feed = {inputs: x,
                        labels: np.array(y)[:, None]
#                       alpha_table: alpha
                       }
#                 sess.run(update_embed_op)
#                 train_loss, _, regu_loss = sess.run([total_cost, optimizer, reg_loss], feed_dict=feed)

                train_loss, _ = sess.run([total_cost, optimizer], feed_dict=feed)

                running_loss += train_loss
#                 regular_loss += regu_loss

                if iteration % report_every == 0:
                    end = time.time()
                    avg_loss = running_loss / report_every
                    loss_list.append(avg_loss)
                    print("Epoch {}/{}".format(e, epochs),
                          "Iteration: {}".format(iteration),
                          "Avg. Training loss: {:.4f}".format(avg_loss),
#                           "Avg. Reg. loss: {:.4f}".format(regular_loss / 100),
                          "{:.4f} sec/batch".format((end - start) / report_every))

                    # Snapshot on the best *window average*. The previous
                    # check compared the accumulated running sum, so it could
                    # only fire on the first step after a reset and judged
                    # "best" from a single noisy batch.
                    if avg_loss < loss_best:
                        W = sess.run(embedding).tolist()
                        iteration_best = iteration
                        loss_best = avg_loss

                    running_loss = 0
                    regular_loss = 0
                    start = time.time()
                iteration += 1

        # Runs shorter than one window never take a snapshot above; fall back
        # to the final embedding instead of the state after the first step.
        if W is None:
            W = sess.run(embedding).tolist()
            iteration_best = iteration - 1

        np.save('w2v_retrain_'+str(tokens_lst[i])+'m.npy',np.array(W))
        print("Finish training at ", datetime.datetime.now())
        print("-------------------------------------------------------------------------")
        print("-------------------------------------------------------------------------")


Tokens:  10000.0
Starting training at  2018-08-01 21:06:30.806721
Finish training at  2018-08-01 21:06:46.200018
-------------------------------------------------------------------------
-------------------------------------------------------------------------
Tokens:  50000.0
Starting training at  2018-08-01 21:06:46.200382
Epoch 3/10 Iteration: 1000 Avg. Training loss: 7.6915 0.0036 sec/batch
Epoch 5/10 Iteration: 2000 Avg. Training loss: 6.1672 0.0072 sec/batch
Epoch 8/10 Iteration: 3000 Avg. Training loss: 5.2405 0.0032 sec/batch
Epoch 10/10 Iteration: 4000 Avg. Training loss: 4.7014 0.0066 sec/batch
Finish training at  2018-08-01 21:08:44.908590
-------------------------------------------------------------------------
-------------------------------------------------------------------------
Tokens:  100000.0
Starting training at  2018-08-01 21:08:44.915422
Epoch 2/10 Iteration: 1000 Avg. Training loss: 7.6544 0.0036 sec/batch
Epoch 3/10 Iteration: 2000 Avg. Training loss: 6.2636 0

Epoch 6/10 Iteration: 48000 Avg. Training loss: 3.8829 0.0186 sec/batch
Epoch 7/10 Iteration: 49000 Avg. Training loss: 3.8783 0.0105 sec/batch
Epoch 7/10 Iteration: 50000 Avg. Training loss: 3.8736 0.0185 sec/batch
Epoch 7/10 Iteration: 51000 Avg. Training loss: 3.8696 0.0185 sec/batch
Epoch 7/10 Iteration: 52000 Avg. Training loss: 3.8579 0.0188 sec/batch
Epoch 7/10 Iteration: 53000 Avg. Training loss: 3.8631 0.0187 sec/batch
Epoch 7/10 Iteration: 54000 Avg. Training loss: 3.8554 0.0183 sec/batch
Epoch 7/10 Iteration: 55000 Avg. Training loss: 3.8574 0.0183 sec/batch
Epoch 7/10 Iteration: 56000 Avg. Training loss: 3.8465 0.0180 sec/batch
Epoch 8/10 Iteration: 57000 Avg. Training loss: 3.8608 0.0090 sec/batch
Epoch 8/10 Iteration: 58000 Avg. Training loss: 3.8411 0.0714 sec/batch
Epoch 8/10 Iteration: 59000 Avg. Training loss: 3.8367 0.0190 sec/batch
Epoch 8/10 Iteration: 60000 Avg. Training loss: 3.8316 0.0186 sec/batch
Epoch 8/10 Iteration: 61000 Avg. Training loss: 3.8445 0.0189 se

Epoch 5/10 Iteration: 79000 Avg. Training loss: 3.8725 0.0194 sec/batch
Epoch 5/10 Iteration: 80000 Avg. Training loss: 3.8601 0.0215 sec/batch
Epoch 6/10 Iteration: 81000 Avg. Training loss: 3.8572 0.0050 sec/batch
Epoch 6/10 Iteration: 82000 Avg. Training loss: 3.8460 0.0239 sec/batch
Epoch 6/10 Iteration: 83000 Avg. Training loss: 3.8583 0.0180 sec/batch
Epoch 6/10 Iteration: 84000 Avg. Training loss: 3.8477 0.0181 sec/batch
Epoch 6/10 Iteration: 85000 Avg. Training loss: 3.8391 0.0174 sec/batch
Epoch 6/10 Iteration: 86000 Avg. Training loss: 3.8466 0.0179 sec/batch
Epoch 6/10 Iteration: 87000 Avg. Training loss: 3.8444 0.0173 sec/batch
Epoch 6/10 Iteration: 88000 Avg. Training loss: 3.8515 0.0184 sec/batch
Epoch 6/10 Iteration: 89000 Avg. Training loss: 3.8389 0.0183 sec/batch
Epoch 6/10 Iteration: 90000 Avg. Training loss: 3.8444 0.0174 sec/batch
Epoch 6/10 Iteration: 91000 Avg. Training loss: 3.8427 0.0183 sec/batch
Epoch 6/10 Iteration: 92000 Avg. Training loss: 3.8429 0.0184 se

Epoch 2/10 Iteration: 28000 Avg. Training loss: 4.0427 0.0230 sec/batch
Epoch 2/10 Iteration: 29000 Avg. Training loss: 4.0382 0.0215 sec/batch
Epoch 2/10 Iteration: 30000 Avg. Training loss: 4.0477 0.0194 sec/batch
Epoch 2/10 Iteration: 31000 Avg. Training loss: 4.0386 0.0205 sec/batch
Epoch 2/10 Iteration: 32000 Avg. Training loss: 4.0222 0.0219 sec/batch
Epoch 2/10 Iteration: 33000 Avg. Training loss: 4.0239 0.0217 sec/batch
Epoch 2/10 Iteration: 34000 Avg. Training loss: 4.0181 0.0227 sec/batch
Epoch 2/10 Iteration: 35000 Avg. Training loss: 4.0149 0.0199 sec/batch
Epoch 2/10 Iteration: 36000 Avg. Training loss: 4.0087 0.0216 sec/batch
Epoch 2/10 Iteration: 37000 Avg. Training loss: 4.0003 0.0214 sec/batch
Epoch 2/10 Iteration: 38000 Avg. Training loss: 3.9945 0.0199 sec/batch
Epoch 2/10 Iteration: 39000 Avg. Training loss: 3.9953 0.0211 sec/batch
Epoch 2/10 Iteration: 40000 Avg. Training loss: 3.9850 0.0187 sec/batch
Epoch 2/10 Iteration: 41000 Avg. Training loss: 3.9905 0.0224 se

Epoch 6/10 Iteration: 142000 Avg. Training loss: 3.8070 0.0196 sec/batch
Epoch 6/10 Iteration: 143000 Avg. Training loss: 3.8021 0.0201 sec/batch
Epoch 6/10 Iteration: 144000 Avg. Training loss: 3.8032 0.0216 sec/batch
Epoch 6/10 Iteration: 145000 Avg. Training loss: 3.8040 0.0213 sec/batch
Epoch 7/10 Iteration: 146000 Avg. Training loss: 3.8068 0.0117 sec/batch
Epoch 7/10 Iteration: 147000 Avg. Training loss: 3.7909 0.0182 sec/batch
Epoch 7/10 Iteration: 148000 Avg. Training loss: 3.7941 0.0210 sec/batch
Epoch 7/10 Iteration: 149000 Avg. Training loss: 3.7898 0.0234 sec/batch
Epoch 7/10 Iteration: 150000 Avg. Training loss: 3.7981 0.0216 sec/batch
Epoch 7/10 Iteration: 151000 Avg. Training loss: 3.8132 0.0266 sec/batch
Epoch 7/10 Iteration: 152000 Avg. Training loss: 3.8052 0.0245 sec/batch
Epoch 7/10 Iteration: 153000 Avg. Training loss: 3.7999 0.0241 sec/batch
Epoch 7/10 Iteration: 154000 Avg. Training loss: 3.8045 0.0240 sec/batch
Epoch 7/10 Iteration: 155000 Avg. Training loss: 3.

In [41]:
# Number of rows in the last embedding snapshot kept by the training cell.
# NOTE(review): the recorded output (30300) equals the Quora size printed by
# the hyper-parameter cell, not the PubMed size (27188) — presumably left over
# from an earlier Quora run; re-run to confirm.
len(W)

30300