# Word Embeddings

In this notebook we will illustrate how embeddings are created, and we will visualize a pre-trained embedding layer initialized with [GloVe](http://nlp.stanford.edu/projects/glove/) weights.

In [None]:
from __future__ import print_function
import tensorflow as tf
import numpy as np
import timeit
import os

Now we will specify the dimensionality of our embeddings. The available options are [50, 100, 200, 300]; depending on which one we choose, the corresponding pre-trained GloVe matrix will be loaded.

In [None]:
# Directory that holds the downloaded GloVe text files.
emb_dir = '../../data/glove'
# Dimensionality of the embeddings; it selects which GloVe file gets loaded.
EMBEDDING_DIM = 100

# Fail early with a clear message: an unsupported value would otherwise only
# show up later as a file-not-found error when the GloVe file is opened.
if EMBEDDING_DIM not in (50, 100, 200, 300):
    raise ValueError(
        'EMBEDDING_DIM must be one of 50, 100, 200 or 300, got %r' % (EMBEDDING_DIM,))

# function that reads the contents of the downloaded embeddings
# ref: https://github.com/jarfo/dlsl/blob/master/news20/pretrained_word_embeddings.py
def read_glove_vectors(filename):
    """Load a GloVe text file into an embedding matrix and a word index.

    Every non-blank line is expected to look like ``word v1 v2 ... vN``
    (whitespace separated). Blank lines are skipped.

    Args:
        filename: path of the GloVe text file to read.

    Returns:
        A tuple ``(coefsm, embeddings_index)`` where ``coefsm`` is a float32
        array holding an all-zero row at index 0 followed by one row per word
        read, and ``embeddings_index`` maps each word to the index of its row
        in ``coefsm`` (so word indices start at 1).
    """
    embeddings_index = {}
    coefs = None
    # `with` guarantees the file is closed even if parsing raises.
    with open(filename) as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: nothing to parse (values[0] would raise).
                continue
            if coefs is None:
                # Row 0 is a zero vector sized after the first word vector.
                coefs = [[0] * (len(values) - 1)]
            # Take the index from the matrix length (not the line number) so
            # it always matches the row, even when lines were skipped.
            embeddings_index[values[0]] = len(coefs)
            coefs.append(values[1:])
    coefsm = np.asarray(coefs, dtype='float32')
    return coefsm, embeddings_index


print('Reading word vectors.')
glove_path = os.path.join(emb_dir, 'glove.6B.%dd.txt' % EMBEDDING_DIM)
embedding_matrix, word2idx = read_glove_vectors(glove_path)
print('Found %s word vectors.' % len(word2idx))

# Inverse mapping (index -> word). dict.items() works on both Python 2 and 3,
# whereas dict.iteritems() only exists on Python 2.
idx2word = dict((v, k) for k, v in word2idx.items())

# Keep track of the vocabulary size (number of words in word2idx).
VOCAB_SIZE = len(word2idx)

In [None]:
# Build an inverse mapping dict to get words back out of index predictions.
# dict.items() is used because dict.iteritems() only exists on Python 2.
idx2word = dict((v, k) for k, v in word2idx.items())

In [None]:
# Clear the default graph before (re)creating the ops below.
tf.reset_default_graph()

# Embedding op setup: wrap the pre-loaded matrix in a constant and use that
# constant as the initial value of the embedding weights variable.
wemb_init = tf.constant(value=embedding_matrix)
Wemb = tf.get_variable(name='Weights', initializer=wemb_init)

**Exercise:** Define the cosine similarity between an input embedding and all the embeddings in the matrix, so that we can retrieve the nearest neighbours of the word we inject through the `nearby_word` input placeholder.

In [None]:
# Normalize the embedding weights along axis 1 so every row has L2 norm 1
# (points on the surface of the unit hypersphere). With unit-norm rows, the
# dot product of two rows is their cosine similarity.
nemb = tf.nn.l2_normalize(Wemb, 1)

# Input placeholder holding the index (or indices) of the query word(s) whose
# neighbors we want to inspect. No shape is fixed here; nearby() feeds it an
# int32 array of word ids.
nearby_word = tf.placeholder(dtype=tf.int32)

# TODO: select word embedding based on index (nearby_word)
# nearby_emb = ...


# TODO: define the cosine similarity operation between our nearby_emb and the
# normalized embeddings in nemb
# nearby_dist = ...

# NOTE: nearby_dist must be defined by the TODO above; until it is, this cell
# raises a NameError on the line below.
# Now select the top k words, with k = min(1000, VOCAB_SIZE)
nearby_val, nearby_idx = tf.nn.top_k(nearby_dist,
                                     min(1000, VOCAB_SIZE))

In [None]:
# TODO: play around with word_examples to be projected
word_examples = ['dolphin', 'dog', 'house', 'barcelona', 'great']

# Open an interactive session and run the global variables initializer so
# the graph variables hold their initial values.
sess = tf.InteractiveSession()
init_op = tf.global_variables_initializer()
sess.run(init_op)

# make nearby function to obtain nearby words given the list of word ids
# ref: https://github.com/tensorflow/models/blob/master/tutorials/embedding/word2vec.py
def nearby(sess, ids, num=20):
    """Print the nearest neighbours of every word id in `ids`.

    Relies on the module-level graph ops `nearby_word`, `nearby_val` and
    `nearby_idx`, and on the `idx2word` mapping.

    Args:
        sess: session used to run `nearby_val` and `nearby_idx`.
        ids: word indices to query (array or any sequence of ints).
        num: how many neighbours to print per query word.
    """
    # Accept plain lists/tuples too; the code below needs `.shape`.
    ids = np.asarray(ids)
    print('ids shape: ', ids.shape)
    vals, idx = sess.run([nearby_val, nearby_idx], {nearby_word: ids})

    def label(index):
        # Index 0 is the zero row prepended by read_glove_vectors and has no
        # word in idx2word; show the raw index instead of raising KeyError.
        return idx2word.get(index, '<idx %d>' % index)

    for i, word_idx in enumerate(ids):
        print("\n%s\n=====================================" % (label(word_idx)))
        for (neighbor, distance) in zip(idx[i, :num], vals[i, :num]):
            print("%-20s %6.4f" % (label(neighbor), distance))

# Translate each example word into its vocabulary index. Words missing from
# the vocabulary are reported with a warning and skipped.
word_codes = []
for word in word_examples:
    if word not in word2idx:
        print('WARNING: {} not in vocabulary'.format(word))
        continue
    word_idx = word2idx[word]
    print('word {} code {}'.format(word, word_idx))
    word_codes.append(word_idx)

# Print the nearest neighbours of every word that was found.
nearby(sess, np.array(word_codes, dtype=np.int32))