In [20]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import random
import os
import pickle
import string
import requests
import collections
import io
import tarfile
import urllib.request
import text_helpers
from nltk.corpus import stopwords
from tensorflow.python.framework import ops
# Start from an empty default graph so re-running this cell does not stack
# duplicate ops on top of a previously built model.
ops.reset_default_graph()

# Seed the Python, NumPy and TensorFlow RNGs so that Restart & Run All gives
# repeatable results. The TensorFlow seed is graph-level, so it must be set
# after the reset above and before any op is created.
# NOTE(review): batch sampling lives in text_helpers; this only makes it
# repeatable if that module draws from `random` / `np.random` - confirm.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
tf.set_random_seed(random_seed)

# Session bound to the (now empty) default graph; used by every later cell.
sess = tf.Session()

In [21]:
# --- Model / optimisation hyper-parameters ---------------------------------
batch_size = 500             # examples fed per training step
embedding_size = 200         # width of each word vector
vocabulary_size = 2000       # number of rows in the embedding matrix
generations = 50000          # total number of training steps
model_learning_rate = 0.001  # step size for gradient descent
num_sampled = batch_size // 2  # negative samples drawn per batch for NCE
window_size = 3              # context words taken on each side of the target

# --- Reporting / checkpoint cadence (in training steps) ---------------------
save_embeddings_every = 5000
print_valid_every = 5000
print_loss_every = 100

# English stop-word list from NLTK; handed to the text normaliser below.
stops = stopwords.words('english')

# Probe words for the periodic nearest-neighbour printout during training.
# Words with related meaning are expected to show up as their neighbours.
valid_words = [
    'love', 'hate',
    'happy', 'sad',
    'man', 'woman',
]

In [106]:
# Working folder for the downloaded data and for saved training artefacts.
data_folder_name = 'temp'
if not os.path.exists(data_folder_name):
    os.makedirs(data_folder_name)

# Load the texts with their targets, then normalise the texts
# (the stop-word list is passed through to the helper).
texts, target = text_helpers.load_movie_data(data_folder_name)
texts = text_helpers.normalize_text(texts, stops)

# Texts must contain at least 3 words. Compute the keep/drop decision once
# and apply the same mask to both lists so they stay aligned.
has_enough_words = [len(review.split()) > 2 for review in texts]
target = [target[ix] for ix, keep in enumerate(has_enough_words) if keep]
texts = [review for review, keep in zip(texts, has_enough_words) if keep]

In [107]:
# Word -> integer id mapping built from the corpus by the helper module.
word_dictionary = text_helpers.build_dictionary(texts, vocabulary_size)

# Inverse mapping (integer id -> word), used to print neighbours as text.
word_dictionary_rev = {ix: word for word, ix in word_dictionary.items()}

# Corpus re-expressed through the word dictionary.
text_data = text_helpers.text_to_numbers(texts, word_dictionary)

# Get validation word keys (raises KeyError if a probe word is not in the dictionary)
valid_examples = [word_dictionary[word] for word in valid_words]

In [108]:
# Embedding matrix: one row of `embedding_size` values per vocabulary id,
# initialised uniformly between -1 and 1.
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

# NCE loss parameters
nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                                              stddev=1.0 / np.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# Create data/target placeholders.
# x_inputs holds 2*window_size word ids per example; y_target holds one id.
x_inputs = tf.placeholder(tf.int32, shape=[batch_size, 2 * window_size])
y_target = tf.placeholder(tf.int32, shape=[batch_size, 1])

# Ids of the probe words, baked into the graph as a constant.
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

# CBOW input: look up every word of the window in one op, which yields a
# [batch_size, 2*window_size, embedding_size] tensor, then add the window
# embeddings together along axis 1 to get [batch_size, embedding_size].
# This replaces a Python loop that created one slice + lookup + add per
# window position.
embed = tf.reduce_sum(tf.nn.embedding_lookup(embeddings, x_inputs), axis=1)

In [109]:
# Declare loss function (NCE), averaged into a single scalar.
# Arguments are passed by keyword so the labels/inputs pairing is explicit.
loss = tf.reduce_mean(
    tf.nn.nce_loss(
        weights=nce_weights,
        biases=nce_biases,
        labels=y_target,
        inputs=embed,
        num_sampled=num_sampled,
        num_classes=vocabulary_size,
    )
)

# `optimizer` is the result of `minimize(loss)`; running it performs one
# gradient-descent update.
optimizer = tf.train.GradientDescentOptimizer(
    learning_rate=model_learning_rate
).minimize(loss)

In [110]:
# Cosine similarity between the probe words and every word in the vocabulary.
# Row-wise L2 norm, kept as a column so it broadcasts across each row.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), axis=1, keepdims=True))
normalized_embeddings = tf.divide(embeddings, norm)

# Unit-length vectors of the probe words only.
valid_embeddings = tf.nn.embedding_lookup(params=normalized_embeddings, ids=valid_dataset)

# Dot products of unit vectors: one row per probe word, one column per vocabulary id.
similarity = tf.matmul(a=valid_embeddings, b=normalized_embeddings, transpose_b=True)

In [111]:
# Saver restricted to the embedding matrix, stored under the name "embeddings".
saver = tf.train.Saver(var_list={"embeddings": embeddings})

# Initialise all variables in the session before training starts.
init = tf.global_variables_initializer()
sess.run(init)

In [113]:
# Loss history: loss_vec[n] was measured at training step loss_x_vec[n].
loss_vec = []
loss_x_vec = []

# Values that do not change between iterations.
top_k = 5  # number of nearest neighbors
vocab_path = os.path.join(data_folder_name, 'movie_vocab.pkl')
model_checkpoint_path = os.path.join(os.getcwd(), data_folder_name, 'cbow_movie_embeddings.ckpt')

for i in range(generations):
    step = i + 1  # 1-based step number used for all reporting

    batch_inputs, batch_labels = text_helpers.generate_batch_data(
        text_data, batch_size, window_size, method='cbow')
    feed_dict = {x_inputs: batch_inputs, y_target: batch_labels}

    # Run the train step
    sess.run(optimizer, feed_dict=feed_dict)

    # Record and print the loss on the batch that was just trained on
    if step % print_loss_every == 0:
        loss_val = sess.run(loss, feed_dict=feed_dict)
        loss_vec.append(loss_val)
        loss_x_vec.append(step)
        print('Loss at step {} : {}'.format(step, loss_val))

    # Validation: print the top_k related words for each probe word
    if step % print_valid_every == 0:
        # `similarity` is built from the embeddings and a constant only,
        # so it needs no feed.
        sim = sess.run(similarity)
        for word_id, sim_row in zip(valid_examples, sim):
            valid_word = word_dictionary_rev[word_id]
            # Sort by descending similarity; position 0 is skipped because
            # the best match for a word is expected to be the word itself.
            nearest = (-sim_row).argsort()[1:top_k + 1]
            neighbours = ''.join(' {},'.format(word_dictionary_rev[ix]) for ix in nearest)
            print('Nearest to {}:{}'.format(valid_word, neighbours))

    # Save dictionary + embeddings
    if step % save_embeddings_every == 0:
        # Save vocabulary dictionary
        with open(vocab_path, 'wb') as f:
            pickle.dump(word_dictionary, f)

        # Save embeddings
        save_path = saver.save(sess, model_checkpoint_path)
        print('Model saved in file: {}'.format(save_path))
    
    
    

Loss at step 100 : 217.25083923339844
Loss at step 200 : 140.424072265625
Loss at step 300 : 118.1656494140625
Loss at step 400 : 82.33472442626953
Loss at step 500 : 63.612998962402344
Loss at step 600 : 78.27619934082031
Loss at step 700 : 55.06660842895508
Loss at step 800 : 61.247962951660156
Loss at step 900 : 41.272178649902344
Loss at step 1000 : 41.40550231933594
Loss at step 1100 : 47.52169418334961
Loss at step 1200 : 45.21162796020508
Loss at step 1300 : 53.46511459350586
Loss at step 1400 : 46.61164855957031
Loss at step 1500 : 65.64624786376953
Loss at step 1600 : 40.94310760498047
Loss at step 1700 : 29.713363647460938
Loss at step 1800 : 49.08052444458008
Loss at step 1900 : 60.1457633972168
Loss at step 2000 : 38.47370529174805
Loss at step 2100 : 29.889801025390625
Loss at step 2200 : 41.5339241027832
Loss at step 2300 : 56.318565368652344
Loss at step 2400 : 43.5775032043457
Loss at step 2500 : 37.82508087158203
Loss at step 2600 : 32.870975494384766
Loss at step 2700

Loss at step 17900 : 16.239850997924805
Loss at step 18000 : 14.242884635925293
Loss at step 18100 : 16.896455764770508
Loss at step 18200 : 16.501832962036133
Loss at step 18300 : 21.11435317993164
Loss at step 18400 : 18.422332763671875
Loss at step 18500 : 15.43557071685791
Loss at step 18600 : 17.176883697509766
Loss at step 18700 : 18.188325881958008
Loss at step 18800 : 17.593040466308594
Loss at step 18900 : 16.485340118408203
Loss at step 19000 : 22.91590690612793
Loss at step 19100 : 19.56573486328125
Loss at step 19200 : 12.56234073638916
Loss at step 19300 : 15.439950942993164
Loss at step 19400 : 11.446372985839844
Loss at step 19500 : 14.392125129699707
Loss at step 19600 : 12.602102279663086
Loss at step 19700 : 14.960792541503906
Loss at step 19800 : 18.765453338623047
Loss at step 19900 : 13.668280601501465
Loss at step 20000 : 16.08641242980957
Nearest to love: musical, light, works, jokes, filmmakers,
Nearest to hate: allen, analyze, successful, finale, keep,
Nearest 

Loss at step 35100 : 10.594259262084961
Loss at step 35200 : 9.623647689819336
Loss at step 35300 : 10.572104454040527
Loss at step 35400 : 10.958413124084473
Loss at step 35500 : 11.419717788696289
Loss at step 35600 : 14.147721290588379
Loss at step 35700 : 12.986072540283203
Loss at step 35800 : 9.806304931640625
Loss at step 35900 : 9.004600524902344
Loss at step 36000 : 10.38296890258789
Loss at step 36100 : 10.746953964233398
Loss at step 36200 : 10.63414192199707
Loss at step 36300 : 11.62169075012207
Loss at step 36400 : 11.00817584991455
Loss at step 36500 : 12.083013534545898
Loss at step 36600 : 9.007204055786133
Loss at step 36700 : 11.92452621459961
Loss at step 36800 : 10.878929138183594
Loss at step 36900 : 9.695296287536621
Loss at step 37000 : 10.834305763244629
Loss at step 37100 : 10.561861038208008
Loss at step 37200 : 10.063477516174316
Loss at step 37300 : 11.239116668701172
Loss at step 37400 : 9.872960090637207
Loss at step 37500 : 11.687749862670898
Loss at ste