In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import random
import os
import string
import requests
import collections
import io
import tarfile
import urllib.request
from nltk.corpus import stopwords
from tensorflow.python.framework import ops

ops.reset_default_graph()

#os.chdir(os.path.dirname(os.path.realpath(__file__)))

  from ._conv import register_converters as _register_converters


In [2]:

# Start a graph session
sess = tf.Session()

# Declare model parameters
batch_size = 50
embedding_size = 200
vocabulary_size = 10000
generations = 50000
print_loss_every = 500

num_sampled = int(batch_size/2)    # Number of negative examples to sample.
window_size = 2       # How many words to consider left and right.

# Declare stop words
stops = stopwords.words('english')

# We pick five test words. We are expecting synonyms to appear
print_valid_every = 2000
valid_words = ['cliche', 'love', 'hate', 'silly', 'sad']
# Later we will have to transform these into indices

In [3]:
# Load the movie review data
# Check if data was downloaded, otherwise download it and save for future use
def load_movie_data():
    save_folder_name = 'temp'
    pos_file = os.path.join(save_folder_name, 'rt-polarity.pos')
    neg_file = os.path.join(save_folder_name, 'rt-polarity.neg')

    # Check if files are already downloaded
    if os.path.exists(save_folder_name):
        pos_data = []
        with open(pos_file, 'r') as temp_pos_file:
            for row in temp_pos_file:
                pos_data.append(row)
        neg_data = []
        with open(neg_file, 'r') as temp_neg_file:
            for row in temp_neg_file:
                neg_data.append(row)
    else: # If not downloaded, download and save
        movie_data_url = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'
        stream_data = urllib.request.urlopen(movie_data_url)
        tmp = io.BytesIO()
        while True:
            s = stream_data.read(16384)
            if not s:  
                break
            tmp.write(s)
            stream_data.close()
            tmp.seek(0)
    
        tar_file = tarfile.open(fileobj=tmp, mode="r:gz")
        pos = tar_file.extractfile('rt-polaritydata/rt-polarity.pos')
        neg = tar_file.extractfile('rt-polaritydata/rt-polarity.neg')
        # Save pos/neg reviews
        pos_data = []
        for line in pos:
            pos_data.append(line.decode('ISO-8859-1').encode('ascii',errors='ignore').decode())
        neg_data = []
        for line in neg:
            neg_data.append(line.decode('ISO-8859-1').encode('ascii',errors='ignore').decode())
        tar_file.close()
        # Write to file
        if not os.path.exists(save_folder_name):
            os.makedirs(save_folder_name)
        # Save files
        with open(pos_file, "w") as pos_file_handler:
            pos_file_handler.write(''.join(pos_data))
        with open(neg_file, "w") as neg_file_handler:
            neg_file_handler.write(''.join(neg_data))
    texts = pos_data + neg_data
    target = [1]*len(pos_data) + [0]*len(neg_data)
    return(texts, target)

texts, target = load_movie_data()
print(texts[0])
print(target[0])

the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 

1


In [17]:

# Normalize text
def normalize_text(texts, stops):
    # Lower case
    texts = [x.lower() for x in texts]

    # Remove punctuation
    texts = [''.join(c for c in x if c not in string.punctuation) for x in texts]

    # Remove numbers
    texts = [''.join(c for c in x if c not in '0123456789') for x in texts]

    # Remove stopwords
    texts = [' '.join([word for word in x.split() if word not in (stops)]) for x in texts]

    # Trim extra whitespace
    texts = [' '.join(x.split()) for x in texts]
    
    return(texts)
    
texts = normalize_text(texts, stops)

# Texts must contain at least 3 words (each sentence contains more than 3 words)
target = [target[ix] for ix, x in enumerate(texts) if len(x.split()) > 2]
texts = [x for x in texts if len(x.split()) > 2]

In [6]:
# Build dictionary of words
def build_dictionary(sentences, vocabulary_size):
    # Turn sentences (list of strings) into lists of words
    split_sentences = [s.split() for s in sentences]
    words = [x for sublist in split_sentences for x in sublist]
    
    # Initialize list of [word, word_count] for each word, starting with unknown
    count = [['RARE', -1]]
    
    # Now add most frequent words, limited to the N-most frequent (N=vocabulary size)
    count.extend(collections.Counter(words).most_common(vocabulary_size-1))
    
    # Now create the dictionary
    word_dict = {}
    # For each word, that we want in the dictionary, add it, then make it
    # the value of the prior dictionary length
    for word, word_count in count:
        word_dict[word] = len(word_dict)#################################################### word:index
    
    return(word_dict)
    

In [7]:
# Turn text data into lists of integers from dictionary
def text_to_numbers(sentences, word_dict):
    # Initialize the returned data
    data = []
    for sentence in sentences:
        sentence_data = []
        # For each word, either use selected index or rare word index
        for word in sentence:
            if word in word_dict:
                word_ix = word_dict[word]  #counts
            else:
                word_ix = 0
            sentence_data.append(word_ix)
        data.append(sentence_data)
    return(data)

# Build our data set and dictionaries
word_dictionary = build_dictionary(texts, vocabulary_size)

word_dictionary_rev = dict(zip(word_dictionary.values(), word_dictionary.keys()))


text_data = text_to_numbers(texts, word_dictionary)

# Get validation word keys
valid_examples = [word_dictionary[x] for x in valid_words]

In [35]:
print(text_data[0])

[2700, 0, 1838, 1031, 0, 0, 721, 0, 0, 0, 2654, 721, 0, 0, 0, 0, 0, 1838, 721, 2654, 0, 1811, 2700, 0, 0, 0, 2654, 721, 2812, 0, 1838, 0, 2654, 0, 2654, 0, 2213, 721, 0, 0, 2674, 0, 0, 2654, 2674, 0, 0, 0, 1031, 721, 0, 0, 4993, 3941, 0, 0, 2213, 0, 721, 8449, 721, 2654, 0, 2674, 2700, 721, 0, 0, 721, 2700, 0, 0, 2700, 2654, 0, 3941, 0, 0, 0, 1838, 2213, 2812, 0, 2700, 0, 721, 2654, 721, 2674, 2674, 721, 2700, 0, 1113, 721, 0, 2654, 1838, 3941, 0, 1811, 0, 0, 8449, 0, 2654, 0, 0, 0, 0, 0, 721, 0, 0, 0, 721, 8449, 721, 2654, 0, 0, 721, 2674, 0, 3941]


In [16]:
# Generate data randomly (N words behind, target, N words ahead)
def generate_batch_data(sentences, batch_size, window_size, method='skip_gram'):
    # Fill up data batch
    batch_data = []
    label_data = []
    while len(batch_data) < batch_size:
        # select random sentence to start

        rand_sentence = np.random.choice(sentences) #return one sentence
       # print(rand_sentence)
        # Generate consecutive windows to look at

        window_sequences = [rand_sentence[max((ix-window_size),0):(ix+window_size+1)] for ix, x in enumerate(rand_sentence)]
        
        # Denote which element of each window is the center word of interest

        #print("window_sequences",window_sequences)
        
        label_indices = [ix if ix<window_size else window_size for ix,x in enumerate(window_sequences)]
        
        #print("label_indices",label_indices )
        
        # Pull out center word of interest for each window and create a tuple for each window
        if method=='skip_gram':
            batch_and_labels = [(x[y], x[:y] + x[(y+1):]) for x,y in zip(window_sequences, label_indices)]
            #print("batch_and_labels",batch_and_labels)
            # Make it in to a big list of tuples (target word, surrounding word)
            tuple_data = [(x, y_) for x,y in batch_and_labels for y_ in y]
            #print("tuple_data",tuple_data[:2] )
        elif method=='cbow':
            batch_and_labels = [(x[:y] + x[(y+1):], x[y]) for x,y in zip(window_sequences, label_indices)]
            # Make it in to a big list of tuples (target word, surrounding word)
            tuple_data = [(x_, y) for x,y in batch_and_labels for x_ in x]
        else:
            raise ValueError('Method {} not implmented yet.'.format(method))
            
        # extract batch and labels
        batch, labels = [list(x) for x in zip(*tuple_data)]
        batch_data.extend(batch[:batch_size])
        label_data.extend(labels[:batch_size])
    # Trim batch and label at the end
    batch_data = batch_data[:batch_size]
    label_data = label_data[:batch_size]
    
    # Convert to numpy array
    batch_data = np.array(batch_data)
    label_data = np.transpose(np.array([label_data]))
    
    return(batch_data, label_data)
for i in range(1):
    batch_inputs, batch_labels = generate_batch_data(text_data, batch_size, window_size)    
    #print("batch_inputs",batch_inputs )
    #print("batch_labels",batch_labels )
    

In [10]:
# Define Embeddings:
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

# NCE loss parameters
nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],stddev=1.0 / np.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# Create data/target placeholders
x_inputs = tf.placeholder(tf.int32, shape=[batch_size])
y_target = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

# Lookup the word embedding:
embed = tf.nn.embedding_lookup(embeddings, x_inputs)

# Get loss from prediction
loss = tf.reduce_mean(tf.nn.nce_loss(nce_weights, nce_biases, y_target,  embed,num_sampled,vocabulary_size))
                                     
# Create optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)

# Cosine similarity between words
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims =True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

#Add variable initializer.
init = tf.global_variables_initializer()
sess.run(init)

In [25]:
# Run the skip gram model.
loss_vec = []
loss_x_vec = []
for i in range(generations):
    batch_inputs, batch_labels = generate_batch_data(text_data, batch_size, window_size)
    feed_dict = {x_inputs : batch_inputs, y_target : batch_labels}

    # Run the train step
    sess.run(optimizer, feed_dict=feed_dict)

    # Return the loss
    if (i+1) % print_loss_every == 0:
        loss_val = sess.run(loss, feed_dict=feed_dict)
        loss_vec.append(loss_val)
        loss_x_vec.append(i+1)
        print("Loss at step {} : {}".format(i+1, loss_val))
      
    # Validation: Print some random words and top 5 related words
    if (i+1) % print_valid_every == 0:
        sim = sess.run(similarity, feed_dict=feed_dict)
        for j in range(len(valid_words)):
            valid_word = word_dictionary_rev[valid_examples[j]]
            top_k = 5 # number of nearest neighbors
            nearest = (-sim[j, :]).argsort()[1:top_k+1]
            log_str = "Nearest to {}:".format(valid_word)
            for k in range(top_k):
                close_word = word_dictionary_rev[nearest[k]]
                log_str = "%s %s," % (log_str, close_word)
            print(log_str)

Loss at step 500 : 50.77898406982422
Loss at step 1000 : 15.63180923461914
Loss at step 1500 : 12.383655548095703
Loss at step 2000 : 3.6537389755249023
Nearest to cliche: character, opened, powerpuff, phillip, adam,
Nearest to love: warned, metropolitan, itll, strengths, hints,
Nearest to hate: feardotcom, superbly, hanging, photographers, lint,
Nearest to silly: indistinct, irrelevant, smith, tear, essential,
Nearest to sad: perspective, shadyac, lump, performance, consolation,
Loss at step 2500 : 9.307934761047363
Loss at step 3000 : 28.51190948486328
Loss at step 3500 : 2.0294852256774902
Loss at step 4000 : 3.671635150909424
Nearest to cliche: character, opened, powerpuff, phillip, adam,
Nearest to love: warned, metropolitan, itll, strengths, hints,
Nearest to hate: feardotcom, superbly, hanging, photographers, lint,
Nearest to silly: indistinct, irrelevant, smith, tear, essential,
Nearest to sad: perspective, shadyac, lump, performance, consolation,
Loss at step 4500 : 12.2367725

Loss at step 34500 : 1.0263478755950928
Loss at step 35000 : 1.5766000747680664
Loss at step 35500 : 2.0771398544311523
Loss at step 36000 : 2.193027973175049
Nearest to cliche: character, opened, powerpuff, phillip, adam,
Nearest to love: warned, metropolitan, itll, strengths, hints,
Nearest to hate: feardotcom, superbly, hanging, photographers, lint,
Nearest to silly: indistinct, irrelevant, smith, tear, essential,
Nearest to sad: perspective, shadyac, lump, performance, consolation,
Loss at step 36500 : 1.5933908224105835
Loss at step 37000 : 2.2624058723449707
Loss at step 37500 : 1.4665138721466064
Loss at step 38000 : 2.9751980304718018
Nearest to cliche: character, opened, powerpuff, phillip, adam,
Nearest to love: warned, metropolitan, itll, strengths, hints,
Nearest to hate: feardotcom, superbly, hanging, photographers, lint,
Nearest to silly: indistinct, irrelevant, smith, tear, essential,
Nearest to sad: perspective, shadyac, lump, performance, consolation,
Loss at step 3850