In [1]:
import urllib
import tensorflow as tf
from tensorflow import keras
import collections
import numpy as np
from keras.layers import Input, Embedding, Reshape, Lambda, Dot, Dense
from keras import Model
from keras.preprocessing import sequence
from keras.preprocessing.sequence import skipgrams
import keras.backend as K

In [None]:
############ HELPER FUNCTIONS ############

In [2]:
# Read the data into a list of strings.
def read_data(path, filename):
    """Read a text file and return its whitespace-separated tokens.

    Args:
        path: Directory prefix; it is concatenated with ``filename`` as-is,
            so it must already end with a path separator.
        filename: Name of the file inside ``path``.

    Returns:
        list[str]: The file content split on whitespace.
    """
    # The context manager closes the handle even if reading or decoding fails.
    with open(path + filename, "r") as f:
        return tf.compat.as_str(f.read()).split()

def build_dataset(words, n_words):
    """Encode a token sequence as integer ids over a frequency-capped vocabulary.

    Id 0 is reserved for the placeholder 'UNK'; the ``n_words - 1`` most
    frequent tokens receive the following ids in descending frequency order.
    Every token outside that vocabulary is encoded as 0.

    Args:
        words: Sequence of tokens.
        n_words: Vocabulary size including the 'UNK' entry.

    Returns:
        A tuple ``(data, count, dictionary, reversed_dictionary)``:
        ``data`` is the id of every token in ``words``; ``count`` starts with
        ``['UNK', <number of out-of-vocabulary tokens>]`` followed by the
        ``(token, frequency)`` pairs; ``dictionary`` maps token -> id and
        ``reversed_dictionary`` maps id -> token.
    """
    unk_entry = ['UNK', -1]  # the -1 placeholder is overwritten once the real tally is known
    count = [unk_entry] + collections.Counter(words).most_common(n_words - 1)

    # Ids are handed out in insertion order, so 'UNK' always gets id 0.
    dictionary = {}
    for token, _freq in count:
        dictionary[token] = len(dictionary)

    data = [dictionary.get(token, 0) for token in words]
    unk_entry[1] = sum(1 for token in words if token not in dictionary)

    reversed_dictionary = {idx: token for token, idx in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

In [None]:
############ PREPARE DATASET ############

In [3]:
# ---- Hyper-parameters -------------------------------------------------------
vocab_size = 10000   # vocabulary size passed to build_dataset (includes the 'UNK' entry)
window_size = 3      # skip-gram window size passed to skipgrams()
vector_dim = 300     # embedding dimensionality
epochs = 1000000     # NOTE(review): not referenced by any visible cell; the training loop hard-codes its own count
validation_words = ['eight']  # words whose nearest neighbours are reported during training
# Derived from the list so the two settings cannot drift apart.
valid_size = len(validation_words)
valid_window = 100  # Only pick dev samples in the head of the distribution (used only by the commented-out sampler below).

# ---- Dataset ----------------------------------------------------------------
vocabulary = read_data("dataset/", "text8.txt")
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary, vocab_size)
# valid_examples = np.random.choice(valid_window, valid_size, replace=False)
# Raises KeyError if a validation word did not make it into the vocabulary.
valid_examples = [dictionary[w] for w in validation_words]

In [None]:
############ MAKE SAMPLING TABLE ############

In [4]:
# Build the sampling table for a vocabulary of vocab_size ranks and hand it to
# skipgrams() together with the encoded corpus.
# NOTE(review): per the Keras docs the table holds rank-based keep probabilities
# that down-sample frequent words — confirm against the installed Keras version.
sampling_table = sequence.make_sampling_table(vocab_size)
# `couples` is a sequence of (target, context) index pairs and `labels` the matching
# per-pair labels (presumably 1 = real context, 0 = negative sample — confirm in Keras docs).
couples, labels = skipgrams(data, vocab_size, window_size=window_size, sampling_table=sampling_table)
# Split the pairs into two parallel int32 arrays; `labels` is left as returned.
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")

In [None]:
############ MAKE WORD2VEC MODEL ############

In [10]:
# Two inputs of shape (1,) per sample: the index of the target word and the
# index of a candidate context word.
input_target = Input((1,))
input_context = Input((1,))
# One embedding table shared by both inputs (the same layer object is applied to each).
embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
target = embedding(input_target)
# Reshape each embedded word to (vector_dim, 1) so that Dot can contract over axis 1.
target = Reshape((vector_dim, 1))(target)
context = embedding(input_context)
context = Reshape((vector_dim, 1))(context)
# Normalised dot product of the two word vectors (normalize=True); this tensor is
# only used as the output of validation_model below, not for training.
similarity = Dot(axes=(1,1), normalize=True)([target, context])
# Un-normalised dot product of the same two vectors, flattened to one value per
# sample; this is what feeds the training head.
dot_product = Dot(axes=(1,1))([target, context])
dot_product = Reshape((1,))(dot_product)
# Sigmoid output layer on top of the dot product.
output = Dense(1, activation='sigmoid')(dot_product)
# Primary training model: (target, context) -> sigmoid output, trained with binary cross-entropy.
model = Model(inputs=[input_target, input_context], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam')
# Secondary model over the same inputs and layers that outputs the normalised
# similarity; it is not compiled in this cell and is used for similarity checks.
validation_model = Model(inputs=[input_target, input_context], outputs=similarity)

In [None]:
############ SIMILARITY CALLBACK ############

In [6]:
class SimilarityCallback:
    """Reports the nearest neighbours of the validation words.

    Relies on the notebook-level names ``valid_examples``, ``valid_size``,
    ``reverse_dictionary``, ``vocab_size`` and ``validation_model``.
    """

    def run_sim(self):
        """Print and return the nearest-neighbour report.

        One line of the form ``'Nearest to <word>: <w1>,  <w2>, ...'`` is
        printed for each of the first ``valid_size`` entries of
        ``valid_examples``.

        Returns:
            str: The report lines joined by newlines (a single line when there
            is one validation word, an empty string when there are none).
        """
        top_k = 8  # number of nearest neighbors
        reports = []
        for word_idx in valid_examples[:valid_size]:
            valid_word = reverse_dictionary[word_idx]
            sim = self._get_sim(word_idx)
            # Rank 0 is skipped on the assumption that the best match is the
            # query word itself.
            nearest = (-sim).argsort()[1:top_k + 1]
            log_str = 'Nearest to %s:' % valid_word
            for neighbour_idx in nearest:
                log_str = '%s %s, ' % (log_str, reverse_dictionary[neighbour_idx])
            print(log_str)
            reports.append(log_str)
        # Returned after the loop so every validation word is reported, not
        # only the first one.
        return '\n'.join(reports)

    @staticmethod
    def _get_sim(valid_word_idx):
        """Return the similarity of one word to every word in the vocabulary.

        Args:
            valid_word_idx: Vocabulary index of the query word.

        Returns:
            numpy.ndarray: Array of shape ``(vocab_size,)`` whose i-th entry is
            the ``validation_model`` output for the pair (query, word i).
        """
        # Score the query against the whole vocabulary in one batched call
        # instead of vocab_size separate single-pair calls.
        query = np.full((vocab_size, 1), valid_word_idx, dtype="float64")
        candidates = np.arange(vocab_size, dtype="float64").reshape(-1, 1)
        out = validation_model.predict_on_batch([query, candidates])
        # The model emits one value per pair; flatten whatever trailing
        # singleton axes it carries.
        return np.asarray(out, dtype="float64").reshape(vocab_size)
sim_cb = SimilarityCallback()

In [None]:
############ TRAINING&VALIDATION ############

In [7]:
# model = keras.models.load_model("save/word2vec.300d")
# Single-element buffers reused for every step: each training step feeds exactly
# one (target, context, label) triple.
arr_1 = np.zeros((1,))
arr_2 = np.zeros((1,))
arr_3 = np.zeros((1,))
# The context manager flushes and closes the log even when training is
# interrupted from the keyboard.
# NOTE(review): the iteration count is hard-coded here; the `epochs` setting from
# the configuration cell is not used.
with open("log.txt", "w") as log:
    for cnt in range(0, 400000):
        # randint's upper bound is exclusive, so len(labels) makes every pair
        # (including the last one) eligible.
        idx = np.random.randint(0, len(labels))
        arr_1[0,] = word_target[idx]
        arr_2[0,] = word_context[idx]
        arr_3[0,] = labels[idx]
        loss = model.train_on_batch([arr_1, arr_2], arr_3)
        if cnt % 100 == 0:
            log.write("Iteration {}, loss={}\n".format(cnt, loss))
            print("Iteration {}, loss={}".format(cnt, loss))
        if cnt % 10000 == 0:
            # run_sim() already prints its report, so it is only written to the
            # log here (printing it again duplicated the line in the output).
            test = sim_cb.run_sim()
            log.write(test + '\n')
model.save('save/word2vec.'+str(vector_dim)+'d')
validation_model.save('save/valword2vec.'+str(vector_dim)+'d')

Iteration 200000, loss=0.736491858959198
Nearest to eight: generates,  establishments,  my,  powered,  alone,  mice,  hole,  abbot, 
Nearest to eight: generates,  establishments,  my,  powered,  alone,  mice,  hole,  abbot, 
Iteration 200100, loss=0.22147411108016968
Iteration 200200, loss=0.1739426851272583
Iteration 200300, loss=0.7199013233184814
Iteration 200400, loss=0.026349104940891266


KeyboardInterrupt: 

In [None]:
############ LOAD MODEL ############

In [13]:
model = keras.models.load_model("save/word2vec.300d")
validation_model = keras.models.load_model("save/valword2vec.300d")
test_word = "eight"
# Take the word vectors from the model that was just loaded. The module-level
# `embedding` layer object belongs to whatever graph was last built in this
# kernel and may hold untrained weights.
embedding_matrix = model.get_layer('embedding').get_weights()[0]  # one row per vocabulary index
emb_1 = embedding_matrix[dictionary[test_word]]
# Cosine similarity of the query against every row in one vectorised pass.
row_norms = np.linalg.norm(embedding_matrix, axis=1)
cosine = embedding_matrix @ emb_1 / np.linalg.norm(emb_1) / row_norms
sim = {w: cosine[i] for w, i in dictionary.items()}
# Sort ascending by similarity and display the ten most similar words
# (the query itself comes last with similarity 1).
mydict = {k: v for k, v in sorted(sim.items(), key=lambda item: item[1])}
{k: mydict[k] for k in list(mydict)[-10:]}

{'constant': 0.17906968,
 'waiting': 0.1802294,
 'by': 0.18080325,
 'puerto': 0.18296587,
 'luther': 0.19149779,
 'announcement': 0.19150901,
 'sees': 0.1949342,
 'orbitals': 0.19964841,
 'dover': 0.20668457,
 'eight': 1.0}

In [None]:
############ PRETRAINED GLOVE MODEL ############

In [28]:
def load_glove(path='pretrained/glove.6B/glove.6B.300d.txt'):
    """Load word vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a word followed by its vector
    components.

    Args:
        path: Location of the vector file; defaults to the 300-d GloVe 6B file
            this notebook uses.

    Returns:
        dict[str, numpy.ndarray]: Mapping from word to its float32 vector.
    """
    embedding_dict = {}
    # The with-block closes the file; no explicit close() is needed.
    with open(path, 'r', encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline at end of file).
                continue
            embedding_dict[values[0]] = np.asarray(values[1:], 'float32')
    return embedding_dict

# Load the GloVe vectors once; the similarity cell below reads them through
# `embeddings_index` (word -> vector).
embeddings_index = load_glove()

In [29]:
# Rank every GloVe word by cosine similarity to the query word and show the
# ten closest (the query itself ends up last, with similarity ~1).
valid_word_glove = "eight"
emb_1 = embeddings_index[valid_word_glove]
query_norm = np.linalg.norm(emb_1)  # same for every comparison, so computed once
sims = {}
for w, emb_2 in embeddings_index.items():
    sims[w] = np.dot(emb_1, emb_2) / query_norm / np.linalg.norm(emb_2)
# Ascending by similarity, so the best matches sit at the end.
mydict = dict(sorted(sims.items(), key=lambda pair: pair[1]))
dict(list(mydict.items())[-10:])

{'eleven': 0.7548681,
 'ten': 0.7740602,
 'two': 0.8049595,
 'three': 0.9032012,
 'four': 0.9342553,
 'five': 0.9508191,
 'seven': 0.9527923,
 'nine': 0.9564403,
 'six': 0.959811,
 'eight': 1.0000001}