In [1]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Reshape, Dot
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.preprocessing import sequence

import urllib
import collections
import os
import zipfile

import numpy as np
import tensorflow as tf

# Report the installed TensorFlow version so the run can be reproduced.
print(tf.__version__)
# Last expression of the cell: displays the physical GPU devices TensorFlow can see.
tf.config.list_physical_devices('GPU')

2.3.0


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
def maybe_download(filename, url, expected_bytes):
    """Download a file if not present, and make sure it's the right size.

    Args:
        filename: Local path of the file. It is also appended to ``url`` to
            form the address the file is fetched from.
        url: Base URL (with trailing slash) that hosts ``filename``.
        expected_bytes: Exact size, in bytes, the local file must have.

    Returns:
        The path of the verified local file.

    Raises:
        Exception: If the size of the local file differs from
            ``expected_bytes``.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule, so
    # import it explicitly instead of relying on another package having
    # loaded it as a side effect.
    import urllib.request

    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)

    actual_bytes = os.stat(filename).st_size
    if actual_bytes != expected_bytes:
        # Show the size that was actually found to help diagnose a truncated
        # or replaced download.
        print(actual_bytes)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')

    print('Found and verified', filename)
    return filename


# Read the data into a list of strings.
def read_data(filename):
    """Return the whitespace-separated words of the first member of a zip file.

    Args:
        filename: Path of the zip archive to read.

    Returns:
        A list of strings obtained by decoding the first archive member and
        splitting it on whitespace.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    text = tf.compat.as_str(raw_bytes)
    return text.split()


def build_dataset(words, n_words):
    """Encode a word sequence as integer ids over a capped vocabulary.

    Args:
        words: Sequence of word strings.
        n_words: Vocabulary size, counting the 'UNK' entry at id 0.

    Returns:
        A tuple ``(data, count, dictionary, reversed_dictionary)``:
        ``data`` is the id of every word (0 for out-of-vocabulary words),
        ``count`` is the ``['UNK', n]`` entry followed by the
        ``(word, frequency)`` pairs of the most common words,
        ``dictionary`` maps word -> id and ``reversed_dictionary`` maps
        id -> word.
    """
    count = [['UNK', -1]] + collections.Counter(words).most_common(n_words - 1)

    # Ids are handed out in order of appearance in ``count``.
    dictionary = {}
    for token, _frequency in count:
        dictionary[token] = len(dictionary)

    # Out-of-vocabulary words fall back to id 0, i.e. dictionary['UNK'].
    data = [dictionary.get(token, 0) for token in words]
    count[0][1] = sum(1 for token in words if token not in dictionary)

    reversed_dictionary = {index: token for token, index in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

def collect_data(vocabulary_size=10000):
    """Fetch the text8 corpus and encode it with ``build_dataset``.

    Args:
        vocabulary_size: Vocabulary size passed on to ``build_dataset``.

    Returns:
        The ``(data, count, dictionary, reverse_dictionary)`` tuple produced
        by ``build_dataset``.
    """
    archive_path = maybe_download('text8.zip', 'http://mattmahoney.net/dc/', 31344016)
    words = read_data(archive_path)
    print(words[:7])
    encoded = build_dataset(words, vocabulary_size)
    del words  # Hint to reduce memory.
    data, count, dictionary, reverse_dictionary = encoded
    return data, count, dictionary, reverse_dictionary

In [3]:
# Vocabulary size, including the 'UNK' entry that build_dataset places at id 0.
vocab_size = 1000
# Download (if needed) and encode the corpus; `data` is the list of word ids.
data, count, dictionary, reverse_dictionary = collect_data(vocabulary_size=vocab_size)
# Peek at the first encoded ids (0 marks out-of-vocabulary words).
print(data[:7])

Found and verified text8.zip
['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse']
[0, 0, 12, 6, 195, 2, 0]


In [4]:
# Decode the first 50 ids back into words to sanity-check the encoding.
' '.join(reverse_dictionary[word_id] for word_id in data[:50])

'UNK UNK as a term of UNK first used against early working class UNK including the UNK of the english revolution and the UNK UNK of the french revolution UNK the term is still used in a UNK way to UNK any act that used UNK means to UNK the'

In [5]:
# Skip-gram hyperparameters.
window_size = 3   # passed to skipgrams() as the context window size
vector_dim = 30   # embedding dimensionality
# NOTE(review): this value is overwritten by the training cells before it is
# ever read in the visible code.
epochs = 200000

valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
# Draw valid_size distinct ids from [0, valid_window); ids are ordered by
# frequency in build_dataset, so these are frequent words.
# NOTE(review): no random seed is set, so the chosen words (and the sampled
# couples below) differ between runs.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

# Build the rank-based sampling table and generate (target, context) couples
# with their labels; the recorded output shows labels taking the values 0 and 1.
sampling_table = sequence.make_sampling_table(vocab_size)
couples, labels = skipgrams(data, vocab_size, window_size=window_size, sampling_table=sampling_table)
# Split the couples into two parallel int32 arrays, aligned with `labels`.
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")
labels = np.array(labels, dtype = "int32")

# Show a few couples next to their labels.
print(couples[:10], labels[:10])

[[61, 1], [940, 39], [133, 851], [698, 491], [20, 997], [178, 34], [819, 7], [844, 205], [304, 3], [799, 836]] [1, 1, 0, 0, 0, 1, 1, 0, 1, 0]


In [6]:
# Create the two model inputs: each example carries a single value per input
# (fed with target / context word ids by the training cells).
# NOTE(review): 'tagert' looks like a typo for 'target'; it is only a layer
# name and appears in the recorded model summary, so it is left unchanged here.
input_target = Input((1,), name = 'tagert')
input_context = Input((1,), name = 'context')

# A single Embedding layer is applied to both inputs, so target and context
# words share one embedding matrix.
embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
target = embedding(input_target)
# Reshape the embedding output to (vector_dim, 1) per example.
target = Reshape((vector_dim, 1))(target)
context = embedding(input_context)
context = Reshape((vector_dim, 1))(context)

In [7]:
# Set up a cosine similarity operation which will be output in a secondary model.
# Dot over axis 1 (the vector_dim axis after the reshape) with normalize=True.
similarity = Dot(axes = 1, normalize = True, name = 'cos_sim')([target, context])
# Flatten the result to one similarity value per example.
similarity = Reshape((1,))(similarity)

# Add the sigmoid output layer that maps the similarity to a value in (0, 1).
output = Dense(1, activation='sigmoid')(similarity)

# Create the primary training model.
model = Model(inputs=[input_target, input_context], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='rmsprop')

# Create a secondary validation model to run our similarity checks during training.
# It is built from the same tensors as `model`, so it shares the embedding
# layer; it is not compiled here and is only used for prediction.
validation_model = Model(inputs=[input_target, input_context], outputs=similarity)

In [8]:
# Print the layer-by-layer structure and parameter counts of the training model.
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
tagert (InputLayer)             [(None, 1)]          0                                            
__________________________________________________________________________________________________
context (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 30)        30000       tagert[0][0]                     
                                                                 context[0][0]                    
__________________________________________________________________________________________________
reshape (Reshape)               (None, 30, 1)        0           embedding[0][0]       

In [11]:
class SimilarityCallback:
    """Print the nearest neighbours of the validation words.

    The class keeps no state of its own; it reads the notebook globals
    ``valid_size``, ``valid_examples``, ``reverse_dictionary``,
    ``vocab_size`` and ``validation_model``.
    """

    def run_sim(self):
        """Print, for each validation word, its most similar vocabulary words."""
        top_k = 8  # number of nearest neighbors
        for i in range(valid_size):
            valid_word = reverse_dictionary[valid_examples[i]]
            sim = self._get_sim(valid_examples[i])
            # Sort by descending similarity. Position 0 is skipped because it
            # is expected to be the query word itself.
            nearest = (-sim).argsort()[1:top_k + 1]
            log_str = 'Nearest to %s:' % valid_word
            # Iterate over what is actually available so a vocabulary smaller
            # than top_k + 1 does not raise IndexError.
            for neighbour_idx in nearest:
                close_word = reverse_dictionary[neighbour_idx]
                log_str = '%s %s,' % (log_str, close_word)
            print(log_str)

    @staticmethod
    def _get_sim(valid_word_idx):
        """Return the similarity of one word to every word in the vocabulary.

        Args:
            valid_word_idx: Id of the query word.

        Returns:
            A float array of shape ``(vocab_size,)`` whose entry ``i`` is the
            output of ``validation_model`` for the pair
            ``(valid_word_idx, i)``.
        """
        # Score the query word against the whole vocabulary in one batched
        # forward pass instead of vocab_size single-row predictions.
        targets = np.full((vocab_size, 1), valid_word_idx, dtype='float64')
        contexts = np.arange(vocab_size, dtype='float64').reshape(-1, 1)
        out = validation_model.predict_on_batch([targets, contexts])
        # Flatten the per-row outputs explicitly rather than relying on an
        # implicit array-to-scalar conversion.
        return np.asarray(out, dtype='float64').reshape(vocab_size)

sim_cb = SimilarityCallback()

In [13]:
# Mini-batch training: each iteration draws `bs` random couples and performs
# one gradient update.
bs = 32
epochs = 200  # number of mini-batch updates, not passes over the data
# Reusable input / label buffers of shape (bs, 1).
arr_1 = np.zeros((bs,1,))
arr_2 = np.zeros((bs,1,))
arr_3 = np.zeros((bs,1,))
for cnt in range(epochs):
    # randint's upper bound is exclusive, so len(labels) (not len(labels)-1)
    # is required for the last training couple to be drawable.
    idx = np.random.randint(0, len(labels), size = bs)
    arr_1[:,0,] = word_target[idx]
    arr_2[:,0,] = word_context[idx]
    arr_3[:,0,] = labels[idx]
    loss = model.train_on_batch([arr_1, arr_2], arr_3)
    if cnt % 10 == 0:
        print("Iteration {}, loss={}".format(cnt, loss))
    # Periodically print the nearest neighbours of the validation words.
    if cnt % 100 == 0:
        sim_cb.run_sim()

Iteration 0, loss=0.6964299082756042
Nearest to d: male, sound, because, jewish, how, give, all, david,
Nearest to first: album, software, central, probably, frequently, roman, result, right,
Nearest to these: jews, map, take, character, alexander, french, out, money,
Nearest to a: five, could, actress, society, related, moved, enough, germany,
Nearest to american: remained, special, mother, foreign, natural, forces, another, technology,
Nearest to six: wife, technology, france, use, left, great, founded, current,
Nearest to some: western, became, provided, similar, style, become, its, went,
Nearest to to: scientific, bit, present, oil, but, network, lost, peace,
Nearest to been: relationship, little, market, written, reference, field, takes, above,
Nearest to united: me, origin, congress, america, go, name, currently, birth,
Nearest to UNK: version, west, r, again, us, structure, their, appear,
Nearest to world: memory, me, radio, as, position, it, century, africa,
Nearest to one: mov

In [17]:
# Single-sample training: one random couple per update. The iteration count
# is scaled by `bs` (defined in the mini-batch training cell) so that the
# same number of couples is seen as in that cell.
epochs = bs * 200
# Reusable one-element input / label buffers.
arr_1 = np.zeros((1,))
arr_2 = np.zeros((1,))
arr_3 = np.zeros((1,))
for cnt in range(epochs):
    # randint's upper bound is exclusive, so len(labels) (not len(labels)-1)
    # is required for the last training couple to be drawable.
    idx = np.random.randint(0, len(labels))
    arr_1[0,] = word_target[idx]
    arr_2[0,] = word_context[idx]
    arr_3[0,] = labels[idx]
    loss = model.train_on_batch([arr_1, arr_2], arr_3)
    if cnt % (10 * bs) == 0:
        print("Iteration {}, loss={}".format(cnt, loss))
    # Periodically print the nearest neighbours of the validation words.
    if cnt % (100 * bs) == 0:
        sim_cb.run_sim()

Iteration 0, loss=0.7518122792243958
Nearest to d: all, male, popular, j, sound, late, american, sent,
Nearest to first: album, h, remained, frequently, central, industry, possible, growth,
Nearest to these: almost, the, rules, take, french, asia, throughout, an,
Nearest to a: was, the, to, with, of, then, is, by,
Nearest to american: true, popular, in, around, body, jewish, used, another,
Nearest to six: two, nine, zero, one, eight, seven, in, three,
Nearest to some: however, similar, oil, its, led, influence, wrote, western,
Nearest to to: the, need, a, claim, out, has, was, almost,
Nearest to been: but, adopted, has, means, market, an, little, by,
Nearest to united: me, church, origin, currently, rather, called, island, america,
Nearest to UNK: us, version, surface, r, three, again, west, structure,
Nearest to world: catholic, japanese, roman, last, included, take, god, memory,
Nearest to one: nine, eight, seven, two, zero, six, in, published,
Nearest to people: group, way, federal,

### Export the embedding

In [39]:
# Export the learned embedding for TensorFlow's embedding visualization:
# a TSV of vectors plus a TSV of the matching word labels.

# NOTE(review): this import sits in a late cell; consider moving it to the
# import cell at the top of the notebook.
import pandas as pd
# First weight tensor of the 'embedding' layer, converted to a NumPy array.
weights = model.get_layer('embedding').weights[0].numpy()
# Keep a binary NumPy copy of the matrix as well.
np.save('embedding_weights', weights)
df_weights = pd.DataFrame(weights)
# One row per embedding vector, tab-separated, without index or header.
df_weights.to_csv('embedding_weights.tsv', sep='\t', index=False, header=False, float_format = '%8.6f')
# One word per line, in dictionary insertion order. build_dataset assigns ids
# in insertion order, so line i is intended to label row i of the weights file.
pd.DataFrame(dictionary.keys()).to_csv('embedding_meta.tsv', sep = '\t', index = False, header = False)