In [1]:
import urllib
import tensorflow as tf
from tensorflow import keras
import collections
import numpy as np
from keras.layers import Input, Embedding, Reshape, Lambda, Dot, Dense
from keras import Model
from keras.preprocessing import sequence
from keras.preprocessing.sequence import skipgrams
import keras.backend as K

In [None]:
############ HELPER FUNCTIONS ############

In [2]:
def read_data(path, filename):
    """Read a whitespace-delimited text corpus and return its tokens.

    Args:
        path: Location prefix of the file. It is concatenated with
            ``filename`` verbatim, so it must already end with a path
            separator (e.g. ``"dataset/"``).
        filename: Name of the corpus file.

    Returns:
        list[str]: The file contents split on whitespace.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(path + filename, "r") as f:
        # A text-mode read() already yields str, so no extra string
        # conversion is required before splitting.
        return f.read().split()

def build_dataset(words, n_words):
    """Encode a token sequence as integer ids over a frequency-capped vocabulary.

    Args:
        words: Sequence of tokens (iterated more than once).
        n_words: Vocabulary size including the 'UNK' slot; the
            ``n_words - 1`` most common tokens receive their own id.

    Returns:
        tuple: ``(data, count, dictionary, reversed_dictionary)`` where
            * ``data`` is the list of ids, one per token in ``words``
              (0 for tokens outside the vocabulary),
            * ``count`` is ``[['UNK', unk_count], (word, freq), ...]``,
            * ``dictionary`` maps word -> id,
            * ``reversed_dictionary`` maps id -> word.
    """
    # Slot 0 is reserved for unknown tokens; its tally is filled in below.
    count = [['UNK', -1]]
    count += collections.Counter(words).most_common(n_words - 1)

    # Ids are handed out in frequency order, using the current dictionary
    # size as the next free id.
    dictionary = {}
    for token, _freq in count:
        dictionary[token] = len(dictionary)

    data = []
    unk_count = 0
    for token in words:
        token_id = dictionary.get(token)
        if token_id is None:
            # Out-of-vocabulary token: map to the 'UNK' id and tally it.
            token_id = 0
            unk_count += 1
        data.append(token_id)

    count[0][1] = unk_count
    reversed_dictionary = {token_id: token for token, token_id in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

In [None]:
############ PREPARE DATASET ############

In [3]:
# --- Hyper-parameters -------------------------------------------------------
vocab_size = 10000    # vocabulary cap (incl. 'UNK'); also the Embedding input size
window_size = 3       # skip-gram context window passed to skipgrams()
vector_dim = 300      # embedding width; also used in the log / checkpoint file names
epochs = 1000000      # NOTE(review): not referenced by the visible training loop - confirm before relying on it
validation_words = ['eight']             # words whose nearest neighbours are reported
valid_size = len(validation_words)       # kept in sync with the list instead of a separate literal

# --- Corpus -----------------------------------------------------------------
vocabulary = read_data("dataset/", "text8.txt")
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary, vocab_size)
# Integer ids of the validation words; raises KeyError if a word is not in the vocabulary.
valid_examples = [dictionary[w] for w in validation_words]

In [None]:
############ MAKE SAMPLING TABLE ############

In [5]:
# Build the sampling table for the vocabulary and generate labelled
# (target, context) couples from the encoded corpus.
sampling_table = sequence.make_sampling_table(vocab_size)
couples, labels = skipgrams(
    data,
    vocab_size,
    window_size=window_size,
    sampling_table=sampling_table,
)
# Transpose the list of couples into two int32 columns: targets and contexts.
word_target, word_context = (
    np.array(column, dtype="int32") for column in zip(*couples)
)

In [None]:
############ MAKE WORD2VEC MODEL ############

In [7]:
# One integer word id per sample for each side of the pair.
input_target = Input(shape=(1,))
input_context = Input(shape=(1,))

# A single embedding table shared by target and context words.
embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')

# Look up each word and lay its vector out as a (vector_dim, 1) column.
target = Reshape((vector_dim, 1))(embedding(input_target))
context = Reshape((vector_dim, 1))(embedding(input_context))

# Dot product along the embedding axis, flattened to one value per sample.
dot_product = Reshape((1,))(Dot(axes=(1, 1))([target, context]))

# Sigmoid unit turns the dot product into the pair's predicted label.
output = Dense(1, activation='sigmoid')(dot_product)

model = Model(inputs=[input_target, input_context], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy')

In [None]:
############ SIMILARITY CALLBACK ############

In [8]:
def similar_words(test_word, top_k=10):
    """Return the vocabulary words most cosine-similar to ``test_word``.

    Reads the module-level ``embedding`` layer and ``dictionary`` mapping.
    The weight matrix is fetched once and all similarities are computed in a
    single vectorised pass, instead of invoking the layer once per word.

    Args:
        test_word: Query word; must be a key of ``dictionary``.
        top_k: Number of nearest words to return (default 10).

    Returns:
        dict: word -> cosine similarity for the ``top_k`` most similar words,
        ordered from least to most similar (the query word itself comes last).
        Empty when ``top_k`` is not positive.

    Raises:
        KeyError: If ``test_word`` is not in ``dictionary``.
    """
    if top_k <= 0:
        return {}

    weights = np.asarray(embedding.get_weights()[0])
    words = list(dictionary)
    indices = np.fromiter(
        (dictionary[w] for w in words), dtype=np.int64, count=len(words)
    )

    query = weights[dictionary[test_word]]
    candidates = weights[indices]
    sims = (
        candidates @ query
        / np.linalg.norm(query)
        / np.linalg.norm(candidates, axis=1)
    )

    # A stable sort keeps dictionary order among ties, matching sorted().
    nearest = np.argsort(sims, kind="stable")[-top_k:]
    return {words[i]: sims[i] for i in nearest}

In [None]:
############ TRAINING&VALIDATION ############

In [7]:
# Optional: resume from a previously saved model instead of the freshly built one.
# model = keras.models.load_model("save/word2vec.300d")

# Single-sample batches: one target id, one context id and one label.
arr_1 = np.zeros((1,))
arr_2 = np.zeros((1,))
arr_3 = np.zeros((1,))

# The context manager flushes and closes the log even if training is interrupted.
with open("log."+str(vector_dim)+"d.txt", "w") as log:
    for cnt in range(0, 200000):
        # randint's upper bound is exclusive, so len(labels) (not len(labels)-1)
        # is required for the last sample to be selectable.
        idx = np.random.randint(0, len(labels))
        arr_1[0,] = word_target[idx]
        arr_2[0,] = word_context[idx]
        arr_3[0,] = labels[idx]
        loss = model.train_on_batch([arr_1, arr_2], arr_3)
        if cnt % 100 == 0:
            log.write("Iteration {}, loss={}\n".format(cnt, loss))
            print("Iteration {}, loss={}".format(cnt, loss))
        if cnt % 10000 == 0:
            # Report nearest neighbours for every configured validation word.
            for valid_word in validation_words:
                valid_sim = similar_words(valid_word)
                print(valid_sim)
                log.write(str(valid_sim) + '\n')
model.save('save/word2vec.'+str(vector_dim)+'d')

Iteration 156000, loss=0.861236035823822
Iteration 156100, loss=0.7527545690536499
Iteration 156200, loss=0.3800755739212036
Iteration 156300, loss=0.5544195771217346
Iteration 156400, loss=0.5299660563468933
Iteration 156500, loss=0.05654020980000496
Iteration 156600, loss=0.3988248407840729
Iteration 156700, loss=0.5295549035072327
Iteration 156800, loss=3.1463162741829365e-08
Iteration 156900, loss=0.6760252714157104
Iteration 157000, loss=1.1998937129974365
Iteration 157100, loss=1.2775481939315796
Iteration 157200, loss=0.4804970622062683
Iteration 157300, loss=0.5288588404655457
Iteration 157400, loss=0.6456080079078674
Iteration 157500, loss=0.5693507194519043
Iteration 157600, loss=0.3205789029598236
Iteration 157700, loss=0.7414059638977051
Iteration 157800, loss=0.5153306722640991
Iteration 157900, loss=0.8547376394271851
Iteration 158000, loss=0.20423564314842224
Iteration 158100, loss=0.5103796124458313
Iteration 158200, loss=0.3407629430294037
Iteration 158300, loss=1.134137749671936


In [None]:
############ PRETRAINED WORD2VEC MODEL ############

In [9]:
def load_pretrained(path='pretrained/word2vec.6B/word2vec.6B.300d.txt'):
    """Load pretrained word vectors from a whitespace-separated text file.

    Each non-empty line is expected to hold a word followed by its vector
    components.

    Args:
        path: Location of the vectors file (UTF-8). Defaults to the 300-d
            file used by this notebook.

    Returns:
        dict[str, numpy.ndarray]: word -> float32 vector.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    embedding_dict = {}
    # The with-block closes the file; no explicit close() is needed afterwards.
    with open(path, 'r', encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Skip blank lines instead of failing on values[0].
                continue
            embedding_dict[values[0]] = np.asarray(values[1:], 'float32')
    return embedding_dict

# Load the pretrained vectors once; later cells look words up in this dict.
embeddings_index = load_pretrained()

In [None]:
# Ten words closest (by cosine similarity) to the validation word in the
# pretrained vectors, listed from least to most similar.
valid_word_pretrained = "eight"
emb_1 = embeddings_index[valid_word_pretrained]
emb_1_norm = np.linalg.norm(emb_1)

sims = {}
for w, emb_2 in embeddings_index.items():
    sims[w] = np.dot(emb_1, emb_2) / emb_1_norm / np.linalg.norm(emb_2)

# Order all words by ascending similarity and keep the last ten.
mydict = dict(sorted(sims.items(), key=lambda item: item[1]))
dict(list(mydict.items())[-10:])

In [None]:
############ TESTING MODELS ############

In [18]:
# Load Model
model = keras.models.load_model('save/word2vec.100d')
emb_layer = model.get_layer('embedding')

In [21]:
# Check king-man+woman with trained model
emb_king = np.array(emb_layer(dictionary['king']))
emb_man = np.array(emb_layer(dictionary['man']))
emb_woman = np.array(emb_layer(dictionary['woman']))
emb_queen = np.array(emb_layer(dictionary['queen']))
formula_queen = emb_king - emb_man + emb_woman
print('similarity score with queen:')
print(np.dot(formula_queen, emb_queen) / np.linalg.norm(formula_queen) / np.linalg.norm(emb_queen))
print('closest words to king-man+woman:')
sim = dict()
emb_1 = formula_queen
for w in dictionary:
    emb_2 = np.array(emb_layer(dictionary[w]))
    sim[w] = np.dot(emb_1, emb_2) / np.linalg.norm(emb_1) / np.linalg.norm(emb_2)
mydict = {k: v for k, v in sorted(sim.items(), key=lambda item: item[1])}
{k: mydict[k] for k in list(mydict)[-10:]}

similarity score with queen:
0.39472947
closest words to king-man+woman:


{'there': 0.67077494,
 'episode': 0.6708807,
 'aid': 0.67175865,
 'racing': 0.6724363,
 'and': 0.67937243,
 'extreme': 0.6812827,
 'woman': 0.6854999,
 'quickly': 0.6898997,
 'blackadder': 0.69605553,
 'king': 0.83221656}

In [22]:
# Check king-man+woman with pre-trained model
emb_king = np.array(embeddings_index['king'])
emb_man = np.array(embeddings_index['man'])
emb_woman = np.array(embeddings_index['woman'])
emb_queen = np.array(embeddings_index['queen'])
formula_queen = emb_king - emb_man + emb_woman
print('similarity score with queen:')
print(np.dot(formula_queen, emb_queen) / np.linalg.norm(formula_queen) / np.linalg.norm(emb_queen))
print('closest words to king-man+woman:')
sims = dict()
emb_1 = formula_queen
for w in embeddings_index:
    emb_2 = embeddings_index[w]
    sims[w] = np.dot(emb_1, emb_2) / np.linalg.norm(emb_1) / np.linalg.norm(emb_2)
mydict = {k: v for k, v in sorted(sims.items(), key=lambda item: item[1])}
{k: mydict[k] for k in list(mydict)[-10:]}

similarity score with queen:
0.6896163
closest words to king-man+woman:


{'elizabeth': 0.49080303,
 'prince': 0.501774,
 'kingdom': 0.5025345,
 'daughter': 0.5133157,
 'mother': 0.51421547,
 'princess': 0.55186844,
 'throne': 0.55653733,
 'monarch': 0.5575491,
 'queen': 0.6896163,
 'king': 0.8065858}