In [1]:
import urllib
import tensorflow as tf
from tensorflow import keras
import collections
import numpy as np
from keras.layers import Input, Embedding, Reshape, Lambda, Dot, Dense
from keras import Model
from keras.preprocessing import sequence
from keras.preprocessing.sequence import skipgrams
import keras.backend as K

In [None]:
############ HELPER FUNCTIONS ############

In [12]:
def read_data(path, filename):
    """Read a whitespace-delimited text corpus and return its tokens.

    Args:
        path: Directory prefix. It is concatenated with ``filename`` as-is,
            so it must already end with a path separator (e.g. "dataset/").
        filename: Name of the text file located under ``path``.

    Returns:
        list[str]: Every whitespace-separated token of the file, in file order.
        An empty file yields an empty list.
    """
    # The context manager closes the handle even if read() raises, which the
    # former open()/close() pair did not guarantee.
    with open(path + filename, "r") as f:
        # Text mode already yields ``str``, so no extra string conversion is
        # needed before splitting on whitespace.
        return f.read().split()

def build_dataset(words, n_words):
    """Encode a token sequence as integer ids over a capped vocabulary.

    Id 0 is reserved for the 'UNK' placeholder. The ``n_words - 1`` most
    common tokens are then assigned the next ids in the order returned by
    ``Counter.most_common``. Any token outside that vocabulary is encoded as 0.

    Args:
        words: Sequence of tokens. It is iterated more than once, so it must
            be re-iterable (e.g. a list).
        n_words: Vocabulary size, including the 'UNK' entry.

    Returns:
        tuple: ``(data, count, dictionary, reversed_dictionary)`` where
            ``data`` is the list of ids for ``words``,
            ``count`` is ``[['UNK', unk_count], (token, freq), ...]``,
            ``dictionary`` maps token -> id, and
            ``reversed_dictionary`` maps id -> token.
    """
    frequent = collections.Counter(words).most_common(n_words - 1)

    # Each new token receives the current dictionary size as its id, which
    # puts 'UNK' at 0 and the frequent tokens right after it.
    dictionary = {}
    for token in ['UNK'] + [entry[0] for entry in frequent]:
        dictionary[token] = len(dictionary)

    # Out-of-vocabulary tokens fall back to id 0 and are tallied separately.
    data = [dictionary.get(token, 0) for token in words]
    unk_count = sum(1 for token in words if token not in dictionary)

    count = [['UNK', unk_count]] + frequent
    reversed_dictionary = {index: token for token, index in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

def similar_words(test_word, sim_words_size):
    """Return the vocabulary words most cosine-similar to ``test_word``.

    Relies on two notebook-level globals: ``dictionary`` (word -> index) and
    ``embedding`` (a callable that maps an index to its vector).

    Args:
        test_word: Query word; must be a key of ``dictionary``.
        sim_words_size: Number of highest-scoring words to keep.

    Returns:
        dict: word -> cosine similarity for the ``sim_words_size`` best
        matches, ordered from least to most similar. The query word is not
        excluded from the ranking.

    Raises:
        KeyError: If ``test_word`` is not in ``dictionary``.
    """
    emb_1 = np.array(embedding(dictionary[test_word]))
    # Loop-invariant: the query norm is identical for every candidate, so it
    # is computed once instead of once per vocabulary word.
    emb_1_norm = np.linalg.norm(emb_1)

    sim = {}
    for word, index in dictionary.items():
        emb_2 = np.array(embedding(index))
        sim[word] = np.dot(emb_1, emb_2) / emb_1_norm / np.linalg.norm(emb_2)

    # Ascending sort, so the best matches are at the tail of the list.
    ranked = sorted(sim.items(), key=lambda item: item[1])
    return dict(ranked[-sim_words_size:])

def load_pretrained(path):
    """Load word vectors from a whitespace-separated UTF-8 text file.

    Every non-blank line is read as a word followed by its vector components.

    Args:
        path: Path of the text file to read.

    Returns:
        dict: word -> ``numpy.ndarray`` of dtype float32 holding the
        components found on that word's line. If a word occurs on several
        lines, the last occurrence wins.

    Raises:
        ValueError: If a component cannot be parsed as a float.
    """
    embedding_dict = {}
    # The ``with`` block closes the file on exit; no explicit close() needed.
    with open(path, 'r', encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank or whitespace-only line (e.g. a trailing newline at
                # the end of the file): nothing to parse.
                continue
            embedding_dict[values[0]] = np.asarray(values[1:], 'float32')
    return embedding_dict

In [None]:
############ PREPARE DATASET ############

In [3]:
# Hyper-parameters and validation settings read by the cells below.
vocab_size = 10000
window_size = 3
vector_dim = 300
epochs = 1000000
validation_words = ['eight']
valid_size = 1

# Tokenise the corpus and map every token to an integer id.
vocabulary = read_data("dataset/", "text8.txt")
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary, vocab_size)
# Ids of the validation words; raises KeyError if one is not in the vocabulary.
valid_examples = [dictionary[word] for word in validation_words]

# Build (target, context) couples and their labels from the id sequence.
sampling_table = sequence.make_sampling_table(vocab_size)
couples, labels = skipgrams(
    data,
    vocab_size,
    window_size=window_size,
    sampling_table=sampling_table,
)
# Split the couples column-wise into two parallel int32 arrays.
word_target, word_context = (
    np.array(column, dtype="int32") for column in zip(*couples)
)

In [None]:
############ MAKE WORD2VEC MODEL ############

In [4]:
# One single-index input per side of a (target, context) couple.
input_target = Input((1,))
input_context = Input((1,))

# A single embedding layer is shared by both inputs; it is looked up later
# through its name 'embedding'.
embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')

# Embed each input and reshape the result to (vector_dim, 1).
target, context = (
    Reshape((vector_dim, 1))(embedding(word_input))
    for word_input in (input_target, input_context)
)

# Dot product of the two embedded vectors along axis 1, flattened to one value.
dot_product = Reshape((1,))(Dot(axes=(1, 1))([target, context]))

# Sigmoid unit on top of the dot product, trained with binary cross-entropy.
output = Dense(1, activation='sigmoid')(dot_product)

model = Model(inputs=[input_target, input_context], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam')

In [None]:
############ LOAD MODEL ############

In [8]:
# Restore a previously saved model from disk. This rebinds ``model``, replacing
# whatever object that name referred to before this cell ran.
# NOTE(review): the path hard-codes "300d", while the training cell builds its
# save path from ``vector_dim`` -- confirm the two stay in sync.
model = keras.models.load_model('save/word2vec.300d')
# Re-point the global ``embedding`` at the restored model's layer named
# 'embedding'; similar_words() reads this global.
embedding = model.get_layer('embedding')

In [None]:
############ TRAINING&VALIDATION ############

In [6]:
# Single-sample batches: one-element buffers reused on every iteration.
arr_1 = np.zeros((1,))
arr_2 = np.zeros((1,))
arr_3 = np.zeros((1,))
# ``with`` guarantees the log is flushed and closed even when training is
# interrupted (e.g. KeyboardInterrupt) or raises; the log is opened in append
# mode so earlier runs are preserved.
with open("log."+str(vector_dim)+"d.txt", "a") as log:
    # The iteration counter starts at 800000 rather than 0.
    for cnt in range(800000, 1000000):
        # randint's upper bound is exclusive, so it must be len(labels)
        # (not len(labels)-1) for the last sample to be selectable.
        idx = np.random.randint(0, len(labels))
        arr_1[0,] = word_target[idx]
        arr_2[0,] = word_context[idx]
        arr_3[0,] = labels[idx]
        loss = model.train_on_batch([arr_1, arr_2], arr_3)
        if cnt % 100 == 0:
            log.write("Iteration {}, loss={}\n".format(cnt, loss))
            print("Iteration {}, loss={}".format(cnt, loss))
        if cnt % 10000 == 0:
            # Periodic qualitative check: nearest neighbours of each
            # configured validation word.
            for valid_word in validation_words:
                valid_sim = similar_words(valid_word, 10)
                print(valid_sim)
                log.write(str(valid_sim) + '\n')
# Reached only when the loop completes without an exception.
model.save('save/word2vec.'+str(vector_dim)+'d')

Iteration 800000, loss=0.5003843307495117
{'one': 0.9586175, 'february': 0.9590623, 'three': 0.9592973, 'december': 0.9595695, 'five': 0.96350306, 'seven': 0.96358466, 'two': 0.9650619, 'six': 0.96973956, 'four': 0.97241116, 'eight': 1.0}
Iteration 800100, loss=0.00025168227148242295
Iteration 800200, loss=0.00023424698156304657
Iteration 800300, loss=0.5108246207237244
Iteration 800400, loss=0.036473412066698074
Iteration 800500, loss=0.2695469558238983
Iteration 800600, loss=0.8477667570114136
Iteration 800700, loss=1.3061277866363525
Iteration 800800, loss=0.5405568480491638
Iteration 800900, loss=0.3265925347805023
Iteration 801000, loss=0.49969157576560974
Iteration 801100, loss=0.8820124268531799
Iteration 801200, loss=0.024744488298892975
Iteration 801300, loss=0.9433138370513916
Iteration 801400, loss=0.9601577520370483
Iteration 801500, loss=0.4792303442955017
Iteration 801600, loss=0.0017340669874101877
Iteration 801700, loss=0.8733683228492737
Iteration 801800, loss=0.476623

KeyboardInterrupt: 

In [None]:
############ FIRST 10 SIMILAR WORDS ############

In [13]:
# Query word for the nearest-neighbour lookup below.
valid_word = 'man'
# Last expression of the cell, so the returned dict is shown as the cell output.
similar_words(valid_word, 10)

{'u': 0.9514369,
 'first': 0.95153284,
 'over': 0.95167065,
 'france': 0.9518879,
 'who': 0.9522274,
 'main': 0.9529657,
 'before': 0.9541421,
 'new': 0.95461816,
 'english': 0.9547105,
 'man': 1.0000001}

In [None]:
# Load the pretrained vectors (word -> vector) used by this and later cells.
embeddings_index = load_pretrained('pretrained/word2vec.6B/word2vec.6B.300d.txt')
valid_word_pretrained = "man"
emb_1 = embeddings_index[valid_word_pretrained]
# Loop-invariant: the query norm is the same for every candidate word, so it
# is computed once instead of once per entry of the pretrained vocabulary.
emb_1_norm = np.linalg.norm(emb_1)
# Cosine similarity between the query word and every pretrained word.
sims = {
    w: np.dot(emb_1, emb_2) / emb_1_norm / np.linalg.norm(emb_2)
    for w, emb_2 in embeddings_index.items()
}
# Ascending sort, so the ten best matches are the last ten entries.
mydict = dict(sorted(sims.items(), key=lambda item: item[1]))
{k: mydict[k] for k in list(mydict)[-10:]}

In [None]:
############ TESTING KING-MAN+WOMAN ############

In [9]:
# Check king-man+woman
emb_king = np.array(embeddings_index['king'])
emb_man = np.array(embeddings_index['man'])
emb_woman = np.array(embeddings_index['woman'])
emb_queen = np.array(embeddings_index['queen'])
formula_queen = emb_king - emb_man + emb_woman
# Loop-invariant: the norm of the analogy vector is reused for the direct
# comparison and for every candidate word in the ranking below.
formula_queen_norm = np.linalg.norm(formula_queen)
print('similarity score with queen:')
print(np.dot(formula_queen, emb_queen) / formula_queen_norm / np.linalg.norm(emb_queen))
print('closest words to king-man+woman:')
emb_1 = formula_queen
# Cosine similarity of the analogy vector with every pretrained word; the
# input words themselves are not excluded from the ranking.
sims = {
    w: np.dot(emb_1, emb_2) / formula_queen_norm / np.linalg.norm(emb_2)
    for w, emb_2 in embeddings_index.items()
}
# Ascending sort, so the ten best matches are the last ten entries.
mydict = dict(sorted(sims.items(), key=lambda item: item[1]))
{k: mydict[k] for k in list(mydict)[-10:]}

similarity score with queen:
0.6896163
closest words to king-man+woman:


{'elizabeth': 0.49080303,
 'prince': 0.501774,
 'kingdom': 0.5025345,
 'daughter': 0.5133157,
 'mother': 0.51421547,
 'princess': 0.55186844,
 'throne': 0.55653733,
 'monarch': 0.5575491,
 'queen': 0.6896163,
 'king': 0.8065858}

In [None]:
############ TESTING DIFFERENCE OF WORD EMBEDDINGS ############

In [10]:
# Check boy-man+woman
emb_boy = np.array(embeddings_index['boy'])
emb_man = np.array(embeddings_index['man'])
emb_woman = np.array(embeddings_index['woman'])
emb_girl = np.array(embeddings_index['girl'])
formula_girl = emb_boy - emb_man + emb_woman
# Loop-invariant: the norm of the analogy vector is reused for the direct
# comparison and for every candidate word in the ranking below.
formula_girl_norm = np.linalg.norm(formula_girl)
print('similarity score with girl:')
print(np.dot(formula_girl, emb_girl) / formula_girl_norm / np.linalg.norm(emb_girl))
print('closest words to boy-man+woman:')
emb_1 = formula_girl
# Cosine similarity of the analogy vector with every pretrained word; the
# input words themselves are not excluded from the ranking.
sims = {
    w: np.dot(emb_1, emb_2) / formula_girl_norm / np.linalg.norm(emb_2)
    for w, emb_2 in embeddings_index.items()
}
# Ascending sort, so the ten best matches are the last ten entries.
mydict = dict(sorted(sims.items(), key=lambda item: item[1]))
{k: mydict[k] for k in list(mydict)[-10:]}

similarity score with girl:
0.8516442
closest words to boy-man+woman:


{'grandmother': 0.5779023,
 'teenage': 0.5985693,
 'pregnant': 0.6020849,
 'daughter': 0.60971236,
 'child': 0.6376473,
 'girls': 0.64778125,
 'mother': 0.6513353,
 'woman': 0.7282232,
 'boy': 0.7855764,
 'girl': 0.8516442}