# I followed along [Aneesh Joshi's blog post](https://medium.com/towards-data-science/learn-word2vec-by-implementing-it-in-tensorflow-45641adaf2ac) on word2vec in this notebook.

In [1]:
import numpy as np
import tensorflow as tf

## step 1. read in the data, create the word dictionary, create one-hot vectors for each word

In [2]:
# load data: one sentence per line, with periods and newlines stripped out, lower-cased
with open('darksouls_training.txt', 'r') as fh:
    training = [sent.replace('.', '').replace('\n', '').lower() for sent in fh]
# Drop blank / whitespace-only lines: an empty sentence splits into [''] in the
# later cells, which would add a spurious empty-string "word" to the vocabulary.
training = [sent for sent in training if sent.strip()]
# with open('darksouls_test.txt', 'r') as fh:
#     test = [sent.replace('.','').replace('\n', '').lower() for sent in fh.readlines()]

In [5]:
# gather every space-separated token of the training sentences (duplicates kept)
word_list = [word for sent in training for word in sent.split(' ')]
# a test corpus would be folded in the same way:
# word_list += [word for sent in test for word in sent.split(' ')]

# the vocabulary is the set of distinct tokens
voc = set(word_list)

In [6]:
# build the word <-> integer index lookups; a word's index is the position of
# the 1 in its one-hot vector and the row it occupies in the embedding matrix
#
# Enumerate the vocabulary in sorted order rather than iterating the set
# directly: a set of strings has no guaranteed iteration order, and with hash
# randomization it can change between interpreter sessions. A stable mapping is
# needed for saved weights to line up with the same words after a restart.
vocab_sorted = sorted(voc)
word2int = {}
int2word = {}
for ind, word in enumerate(vocab_sorted):
    word2int[word] = ind
    int2word[ind] = word

In [7]:
# tokenize: every training sentence becomes the list of its space-separated words
sent_train = [sent.split(' ') for sent in training]
# sent_test = [sent.split(' ') for sent in test]

In [8]:
# build [word, neighbour] pairs: each word is paired with every token that lies
# within WINDOW_SIZE positions of it in the same sentence
WINDOW_SIZE = 5
data_train = []
for sentence in sent_train:
    n_words = len(sentence)
    for ind, word in enumerate(sentence):
        window_start = max(ind - WINDOW_SIZE, 0)
        window_stop = min(ind + WINDOW_SIZE, n_words) + 1
        # neighbours equal to the word itself (by value) are skipped
        data_train.extend(
            [word, nb_word]
            for nb_word in sentence[window_start:window_stop]
            if nb_word != word
        )

In [10]:
# one-hot encoding helper
def to_one_hot(data_point_index, vocab_size):
    """Return a one-hot vector of length `vocab_size`.

    The result is a NumPy array of zeros with a single 1 at position
    `data_point_index`.
    """
    encoded = np.zeros(vocab_size)
    encoded[data_point_index] = 1
    return encoded

In [12]:
# peek at the first [word, neighbour] pair as a quick sanity check of the pairing
data_train[0]

['postrelease', 'the']

In [14]:
# one-hot encode both sides of every pair:
# x holds the word itself, y holds its neighbour from the same window
vocab_size = len(voc)
x_train = np.asarray(
    [to_one_hot(word2int[center], vocab_size) for center, context in data_train]
)
y_train = np.asarray(
    [to_one_hot(word2int[context], vocab_size) for center, context in data_train]
)

## step 2. create tensorflow word2vec model

In [24]:
# graph inputs: float32 batches with one column per vocabulary word;
# the batch dimension is left open (None)
vocab_size = len(voc)
x = tf.placeholder(tf.float32, shape=(None, vocab_size))
y_label = tf.placeholder(tf.float32, shape=(None, vocab_size))

In [31]:
# hidden layer
EMBEDDING_DIM = 5
# Fixed op-level seeds so the random initial weights (and therefore the learned
# embeddings and the loss curve) are the same on every run. Each variable gets
# its own seed so the initial values are not copies of one another.
INIT_SEED = 42
vocab_size = len(voc)

# x is fed one-hot rows, so tf.matmul(x, W1) picks out one row of W1 per input
# word; hidden_rep is that row plus the bias b1.
W1 = tf.Variable(tf.random_normal([vocab_size, EMBEDDING_DIM], seed=INIT_SEED))
b1 = tf.Variable(tf.random_normal([EMBEDDING_DIM], seed=INIT_SEED + 1))
hidden_rep = tf.add(tf.matmul(x, W1), b1)

# output layer: project the hidden representation back to vocabulary size and
# turn the scores into a probability distribution over words with softmax
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, vocab_size], seed=INIT_SEED + 2))
b2 = tf.Variable(tf.random_normal([vocab_size], seed=INIT_SEED + 3))
pred = tf.nn.softmax(tf.add(tf.matmul(hidden_rep, W2), b2))

In [65]:
# run the model

# loss function: cross-entropy between the one-hot label and the predicted
# distribution, summed over the vocabulary axis and averaged over the batch.
# The probabilities are clipped away from 0 before taking the log, because
# log(0) = -inf and 0 * -inf = NaN would poison the loss and the gradients.
clipped_pred = tf.clip_by_value(pred, 1e-10, 1.0)
cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(clipped_pred),
                                                   reduction_indices=1))
# training step
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy_loss)

# Open the session and initialise variables only after the whole graph
# (including the optimizer) has been built, so that any variables an optimizer
# adds to the graph are covered by the initializer as well.
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

# epoch number
n_epoch = 10000
# every step trains on the full set of pairs (no mini-batching)
train_feed = {x: x_train, y_label: y_train}
for epoch in range(n_epoch):  # range rather than xrange so this also runs on Python 3
    sess.run(train_step, feed_dict=train_feed)
    # report the loss on the training data every 100 epochs
    if epoch % 100 == 0:
        print('epoch {}: loss is '.format(epoch), sess.run(cross_entropy_loss,
                                                           feed_dict=train_feed))

('epoch 0: loss is ', 9.6988478)
('epoch 100: loss is ', 7.0252614)
('epoch 200: loss is ', 6.4158936)
('epoch 300: loss is ', 6.1022382)
('epoch 400: loss is ', 5.9212928)
('epoch 500: loss is ', 5.7993145)
('epoch 600: loss is ', 5.7100825)
('epoch 700: loss is ', 5.6409087)
('epoch 800: loss is ', 5.5849442)
('epoch 900: loss is ', 5.5382609)
('epoch 1000: loss is ', 5.498414)
('epoch 1100: loss is ', 5.4637589)
('epoch 1200: loss is ', 5.4331913)
('epoch 1300: loss is ', 5.4058681)
('epoch 1400: loss is ', 5.3811574)
('epoch 1500: loss is ', 5.3585715)
('epoch 1600: loss is ', 5.3377142)
('epoch 1700: loss is ', 5.3182721)
('epoch 1800: loss is ', 5.2999716)
('epoch 1900: loss is ', 5.2826042)
('epoch 2000: loss is ', 5.2659693)
('epoch 2100: loss is ', 5.249928)
('epoch 2200: loss is ', 5.2343483)
('epoch 2300: loss is ', 5.2191362)
('epoch 2400: loss is ', 5.204206)
('epoch 2500: loss is ', 5.1894779)
('epoch 2600: loss is ', 5.1749139)
('epoch 2700: loss is ', 5.1604729)
('epoch

In [38]:
# write the current session's variables to a checkpoint;
# the cell output is the path returned by save()
checkpoint_path = 'dark_souls_word2vec_model.ckpt'
saver = tf.train.Saver()
saver.save(sess, checkpoint_path)

'dark_souls_word2vec_model.ckpt'