In [127]:
# blog @ https://medium.com/towards-data-science/learn-word2vec-by-implementing-it-in-tensorflow-45641adaf2ac
# A minimal 3-layer network: one-hot input -> linear hidden (embedding) layer -> softmax output
import tensorflow as tf
import numpy as np

In [128]:
corpus_raw = 'He is the king . The king is royal . She is the royal  queen '
corpus_raw = corpus_raw.lower()

# Vocabulary: every distinct whitespace-separated token except the
# sentence separator '.'.
words = {word for word in corpus_raw.split() if word != '.'}
vocab_size = len(words)

print(words)

# Two-way lookup tables between words and integer ids.
# Ids are assigned in sorted order: iterating directly over a set of strings
# can give a different order in every fresh interpreter (string hash
# randomisation), which would silently change which id - and therefore which
# embedding row - belongs to which word from one run to the next.
word2int = {}
int2word = {}
for i, word in enumerate(sorted(words)):
    word2int[word] = i
    int2word[i] = word


{'she', 'queen', 'is', 'the', 'king', 'royal', 'he'}


In [129]:
# Cut the corpus at every '.', then tokenise each piece on whitespace so that
# `sentences` becomes a list of word lists (one inner list per sentence).
raw_sentences = corpus_raw.split('.')
sentences = [raw_sentence.split() for raw_sentence in raw_sentences]

# word2vec - refer : http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
# window size = 2
# 
# source text										    training samples
# The quick brown fox jumps over the lazy dog			    (the, quick)
# ---------------       									(the, brown)
# 
# The quick brown fox jumps over the lazy dog			    (quick, the)
# -------------------        								(quick, brown)
# 												            (quick, fox)
# 
# The quick brown fox jumps over the lazy dog			    (brown, the)
# -------------------------         						(brown, quick)
# 												            (brown, fox)
# 												            (brown, jumps)
# 
# 
# The quick brown fox jumps over the lazy dog			    (fox, quick)
# 	  --------------------------        					(fox, brown)
# 												            (fox, jumps)
# 												            (fox, over)
# Note: if the word is near the beginning or end of a sentence, the window is truncated at the sentence boundary.

# Build the skip-gram training pairs: one [centre_word, context_word] entry
# for every word that falls inside the window around each centre word.
data = []
WINDOW_SIZE = 2
for sentence in sentences:
    for word_index, word in enumerate(sentence):
        # Window covers up to WINDOW_SIZE positions on each side, clipped to
        # the sentence boundaries.
        window_start = max(word_index - WINDOW_SIZE, 0)
        window_end = min(word_index + WINDOW_SIZE + 1, len(sentence))
        for nb_index in range(window_start, window_end):
            # Skip only the centre position itself. Comparing positions
            # (not word values) keeps the pair when the same word occurs
            # again elsewhere inside the window.
            if nb_index != word_index:
                data.append([word, sentence[nb_index]])

# training data ==> data

In [130]:
# One-hot encoding helper: integer index -> indicator vector
def to_one_hot_vector(data_point_index, vocab_size):
    """Return a one-hot vector of length ``vocab_size``.

    Args:
        data_point_index: position that is set to 1.
        vocab_size: length of the returned vector.

    Returns:
        A NumPy array of zeros (NumPy's default float dtype) with a single 1
        at ``data_point_index``.
    """
    one_hot = np.zeros(vocab_size)
    one_hot[data_point_index] = 1
    return one_hot


# Encode every [centre, context] pair as two one-hot rows:
# x_train holds the input (centre) words, y_train the words to predict.
x_train = np.asarray(
    [to_one_hot_vector(word2int[center], vocab_size) for center, context in data]
)
y_train = np.asarray(
    [to_one_hot_vector(word2int[context], vocab_size) for center, context in data]
)

In [131]:
learning_rate = .1
epochs = 10000
EMBEDDING_DIM = 5  # you can choose your own number
RANDOM_SEED = 42

# Fix the graph-level seed before any random op is created, so that the
# initial weights are repeatable when the notebook is run on a fresh kernel.
tf.set_random_seed(RANDOM_SEED)

# Placeholders for the one-hot inputs (x_train) and one-hot targets (y_train).
X = tf.placeholder(tf.float32, shape=[None, vocab_size])
Y = tf.placeholder(tf.float32, shape=[None, vocab_size])

# Hidden layer: project the one-hot input into the embedding space.
W1 = tf.Variable(tf.random_normal([vocab_size, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([EMBEDDING_DIM]))  # bias
hidden_representation = tf.add(tf.matmul(X, W1), b1)

# Output layer: map the embedding back to vocabulary-sized scores and turn
# them into a probability distribution over the neighbouring word.
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, vocab_size]))
b2 = tf.Variable(tf.random_normal([vocab_size]))
logits = tf.add(tf.matmul(hidden_representation, W2), b2)
prediction = tf.nn.softmax(logits)

# Cross-entropy loss, averaged over the batch:
#   H_{y'}(y) = -sum_i y'_i * log(y_i)
# where y' is the one-hot target and y the predicted distribution.
# The log-probabilities are taken with log_softmax on the logits rather than
# log(softmax(...)): a probability that underflows to 0 would give
# log(0) = -inf, and 0 * -inf = NaN would then poison the loss.
cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(Y * tf.nn.log_softmax(logits), axis=1))

# train step
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy_loss)
init_var = tf.global_variables_initializer()


In [132]:
# Train the network, then read the learned word vectors out of the session.
with tf.Session() as sess:
    sess.run(init_var)
    # Every step feeds the whole training set.
    # The loss eventually stabilises on a constant value. Even though we
    # can't get high accuracy, we don't care: all we are interested in is
    # W1 and b1, i.e. the hidden representations.
    for epoch in range(epochs):
        sess.run(train_step, feed_dict={X: x_train, Y: y_train})
        # Uncomment to watch the loss while training:
        # print('loss is : ', sess.run(cross_entropy_loss, feed_dict={X: x_train, Y: y_train}))
    # print('W1 :: ', sess.run(W1))
    # print('b1 :: ', sess.run(b1))

    # Multiplying a one-hot vector by W1 simply selects one row of W1, which
    # is the embedded representation of the word that one-hot vector stands
    # for - so W1 is essentially acting as a lookup table.
    # Adding b1 to every row gives the same result as running the
    # hidden_representation node on each one-hot input.
    vectors = sess.run(W1 + b1)
    # print(vectors)
    # If we want the representation for 'queen', all we have to do is:
    print(vectors[word2int['queen']])


[-0.1771524  -1.58012688  1.69621992  1.2537359  -1.51709461]
