In [None]:
# Text Helper Functions
#---------------------------------------
#
# We pull out text helper functions to reduce redundant code

import string
import os
import urllib.request
import io
import tarfile
import collections
import numpy as np
import requests
import gzip
class text_helpers:
    """Namespace of text-processing helpers shared by the embedding examples.

    Every helper is a static method, so it can be called on the class
    (``text_helpers.normalize_text(texts, stops)``) or on an instance.
    """

    @staticmethod
    def normalize_text(texts, stops):
        """Lower-case, strip punctuation/digits/stop words and collapse whitespace.

        Args:
            texts: iterable of strings to clean.
            stops: iterable of stop words (compared after lower-casing and
                punctuation removal) to drop from every text.

        Returns:
            A list with one cleaned string per input text, words separated by
            single spaces.
        """
        # One translation table removes ASCII punctuation and digits in a
        # single pass over each string.
        drop_chars = str.maketrans('', '', string.punctuation + string.digits)
        # Set membership keeps the stop-word test O(1) per word.
        stop_set = set(stops)

        normalized = []
        for text in texts:
            words = text.lower().translate(drop_chars).split()
            # split()/join() also trims and collapses any extra whitespace.
            normalized.append(' '.join(w for w in words if w not in stop_set))
        return normalized

    @staticmethod
    def build_dictionary(sentences, vocabulary_size):
        """Map the most frequent words to integer ids; id 0 is the 'RARE' bucket.

        Args:
            sentences: list of whitespace-separated strings.
            vocabulary_size: total number of ids, including the 'RARE' id, so
                at most ``vocabulary_size - 1`` real words are kept.

        Returns:
            dict mapping word -> id, with 'RARE' -> 0 and the remaining words
            numbered by decreasing frequency.
        """
        word_counts = collections.Counter(
            word for sentence in sentences for word in sentence.split())
        # A literal 'RARE' token in the corpus must not compete with the
        # reserved entry, otherwise two words would end up sharing one id.
        word_counts.pop('RARE', None)

        word_dict = {'RARE': 0}
        for word, _ in word_counts.most_common(vocabulary_size - 1):
            word_dict[word] = len(word_dict)
        return word_dict

    @staticmethod
    def text_to_numbers(sentences, word_dict):
        """Convert each sentence into a list of word ids.

        Args:
            sentences: list of whitespace-separated strings.
            word_dict: mapping word -> id, e.g. from ``build_dictionary``.

        Returns:
            List of lists of ids; words missing from ``word_dict`` get id 0.
        """
        return [[word_dict.get(word, 0) for word in sentence.split()]
                for sentence in sentences]

    @staticmethod
    def generate_batch_data(sentences, batch_size, window_size, method='skip_gram'):
        """Randomly sample a training batch from id-encoded sentences.

        Args:
            sentences: list of sentences, each a list of word ids.
            batch_size: number of examples to return.
            window_size: number of context words taken on each side of the
                target ('skip_gram', 'cbow') or before it ('doc2vec').
            method: 'skip_gram' -> (target, one context word) pairs;
                'cbow' -> (2*window_size context words, target), only for
                full windows; 'doc2vec' -> (window_size preceding words plus
                the sentence index as last column, next word).

        Returns:
            Tuple ``(batch_data, label_data)`` of numpy arrays; ``label_data``
            has shape (batch_size, 1).

        Raises:
            ValueError: if ``method`` is unknown, or if no sentence is long
                enough to yield a single example for the chosen method.
        """
        # Shortest sentence able to produce at least one example per method.
        min_lengths = {
            'skip_gram': 2,
            'cbow': 2 * window_size + 1,
            'doc2vec': window_size + 1,
        }
        if method not in min_lengths:
            raise ValueError('Method {} not implemented yet.'.format(method))
        if method == 'skip_gram' and window_size < 1:
            raise ValueError('skip_gram requires window_size >= 1.')

        # Sampling only usable sentences avoids the crash on unpacking an
        # empty example list when a too-short sentence is drawn.
        eligible = [ix for ix, sentence in enumerate(sentences)
                    if len(sentence) >= min_lengths[method]]
        if not eligible:
            raise ValueError(
                'No sentence has at least {} words, required for method {}.'.format(
                    min_lengths[method], method))

        batch_data = []
        label_data = []
        while len(batch_data) < batch_size:
            # Index the 1-element result instead of calling int() on the
            # array (array-to-scalar conversion is deprecated in NumPy).
            pick = np.random.choice(len(eligible), size=1)[0]
            rand_sentence_ix = eligible[pick]
            rand_sentence = sentences[rand_sentence_ix]

            if method == 'doc2vec':
                # Only the left-hand window is used to predict the next word.
                examples = [(rand_sentence[i:i + window_size], rand_sentence[i + window_size])
                            for i in range(len(rand_sentence) - window_size)]
                batch, labels = [list(x) for x in zip(*examples)]
                # The sentence (document) index travels as the last column.
                batch = [x + [rand_sentence_ix] for x in batch]
            else:
                # Window around every position, clipped at the sentence start.
                windows = [rand_sentence[max(ix - window_size, 0):ix + window_size + 1]
                           for ix in range(len(rand_sentence))]
                # Position of the centre word inside each (possibly clipped) window.
                centers = [min(ix, window_size) for ix in range(len(windows))]

                if method == 'skip_gram':
                    targets_and_context = [(w[c], w[:c] + w[c + 1:])
                                           for w, c in zip(windows, centers)]
                    # Flatten to one (target word, surrounding word) pair each.
                    pairs = [(target, ctx_word)
                             for target, context in targets_and_context
                             for ctx_word in context]
                    batch, labels = [list(x) for x in zip(*pairs)]
                else:  # 'cbow'
                    context_and_targets = [(w[:c] + w[c + 1:], w[c])
                                           for w, c in zip(windows, centers)]
                    # Keep only windows with a full 2*window_size context.
                    context_and_targets = [(ctx, target)
                                           for ctx, target in context_and_targets
                                           if len(ctx) == 2 * window_size]
                    batch, labels = [list(x) for x in zip(*context_and_targets)]

            batch_data.extend(batch[:batch_size])
            label_data.extend(labels[:batch_size])

        # The last sentence may have overshot the requested size.
        batch_data = batch_data[:batch_size]
        label_data = label_data[:batch_size]

        batch_data = np.array(batch_data)
        label_data = np.transpose(np.array([label_data]))
        return batch_data, label_data

    @staticmethod
    def _read_ascii_lines(path):
        """Read a latin-1 file and return its lines with non-ASCII chars dropped."""
        with open(path, 'r', encoding='latin-1') as f:
            return [line.encode('ascii', errors='ignore').decode().rstrip()
                    for line in f]

    @staticmethod
    def load_movie_data():
        """Load the rt-polarity movie reviews, downloading them on first use.

        The archive is fetched into the working directory and extracted under
        ``temp/``; later calls reuse the extracted files.

        Returns:
            Tuple ``(texts, target)``: the positive reviews followed by the
            negative ones, and a parallel list of labels (1 = positive,
            0 = negative).

        Raises:
            requests.HTTPError: if the download returns an error status.
        """
        save_folder_name = 'temp'
        pos_file = os.path.join(save_folder_name, 'rt-polaritydata', 'rt-polarity.pos')
        neg_file = os.path.join(save_folder_name, 'rt-polaritydata', 'rt-polarity.neg')

        # Download unless both data files are already present.
        if not (os.path.isfile(pos_file) and os.path.isfile(neg_file)):
            movie_data_url = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'
            archive_name = 'temp_movie_review_temp.tar.gz'

            # Stream the archive to disk; fail loudly instead of saving an
            # HTML error page as if it were the tarball.
            with requests.get(movie_data_url, stream=True) as req:
                req.raise_for_status()
                with open(archive_name, 'wb') as f:
                    for chunk in req.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)

            # NOTE(review): the archive comes over plain HTTP and is extracted
            # without member filtering; consider tarfile's extraction filters
            # if this ever runs outside a sandbox.
            with tarfile.open(archive_name, 'r:gz') as tar:
                tar.extractall(path=save_folder_name)

        pos_data = text_helpers._read_ascii_lines(pos_file)
        neg_data = text_helpers._read_ascii_lines(neg_file)

        texts = pos_data + neg_data
        target = [1] * len(pos_data) + [0] * len(neg_data)
        return texts, target

In [None]:
# Fetch the review corpus together with its 0/1 sentiment labels
texts, targets = text_helpers.load_movie_data()

# Preview the first five reviews, each followed by a readable label
divider = "=" * 50
for i in range(5):
    review_no = i + 1
    print(f"Review {review_no}:")
    print(texts[i])
    if targets[i] == 1:
        sentiment = 'Positive'
    else:
        sentiment = 'Negative'
    print(f"Label: {sentiment}")
    print(divider)


Review 1:
the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .
Label: Positive
Review 2:
the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .
Label: Positive
Review 3:
effective but too-tepid biopic
Label: Positive
Review 4:
if you sometimes like to go to the movies to have fun , wasabi is a good place to start .
Label: Positive
Review 5:
emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one .
Label: Positive


In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import random
import os
import pickle
import string
import requests
import collections
import io
import tarfile
import urllib.request
from nltk.corpus import stopwords
from tensorflow.python.framework import ops

# Uncomment the following line if you want to install NLTK in Colab
# !pip install nltk

# Clear whatever is left in the default graph before building the model
ops.reset_default_graph()

# ---- Model parameters ----
batch_size = 500
vocabulary_size = 7500
generations = 100000
model_learning_rate = 0.001

# Word and document vectors are concatenated before the NCE layer
embedding_size = 200
doc_embedding_size = 100
concatenated_size = embedding_size + doc_embedding_size

# Negative samples drawn per batch by the NCE loss
num_sampled = int(batch_size/2)
# Number of preceding words used as context for each target word
window_size = 3

# ---- Reporting / checkpoint cadence (in training steps) ----
save_embeddings_every = 5000
print_valid_every = 5000
print_loss_every = 100

# No stop words are removed; to filter English stop words use
# stopwords.words('english') instead of the empty list.
stops = []

# Probe words whose nearest neighbours are printed during training;
# they are converted to vocabulary indices further down.
valid_words = ['love', 'hate', 'happy', 'sad', 'man', 'woman']

# ---- Data preparation ----
print('Loading Data')
texts, target = text_helpers.load_movie_data()

print('Normalizing Text Data')
texts = text_helpers.normalize_text(texts, stops)

# Keep only reviews with more than window_size words, so every review
# provides at least one (context window, next word) example.
kept_pairs = [(review, label) for review, label in zip(texts, target)
              if len(review.split()) > window_size]
target = [label for _, label in kept_pairs]
texts = [review for review, _ in kept_pairs]
assert(len(target)==len(texts))

print('Creating Dictionary')
word_dictionary = text_helpers.build_dictionary(texts, vocabulary_size)
# Inverse lookup: index -> word
word_dictionary_rev = {ix: word for word, ix in word_dictionary.items()}
text_data = text_helpers.text_to_numbers(texts, word_dictionary)

# Vocabulary indices of the probe words
valid_examples = [word_dictionary[word] for word in valid_words]

# ---- Model variables ----
print('Creating Model')
# One vector per vocabulary word and one per review
embeddings = tf.Variable(
    tf.random.uniform([vocabulary_size, embedding_size], -1.0, 1.0))
doc_embeddings = tf.Variable(
    tf.random.uniform([len(texts), doc_embedding_size], -1.0, 1.0))

# Output layer used by the NCE loss
nce_stddev = 1.0 / np.sqrt(concatenated_size)
nce_weights = tf.Variable(
    tf.random.truncated_normal([vocabulary_size, concatenated_size], stddev=nce_stddev))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# Zero-filled stand-ins shaped like one batch of inputs and labels
x_inputs = tf.Variable(tf.zeros([batch_size, window_size + 1], dtype=tf.int32))
y_target = tf.Variable(tf.zeros([batch_size, 1], dtype=tf.int32))
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

# Sum the embeddings of the window_size context words of each row
embed = tf.zeros([batch_size, embedding_size])
for element in range(window_size):
    embed = embed + tf.nn.embedding_lookup(embeddings, x_inputs[:, element])

# The last input column holds the document index
doc_indices = tf.slice(x_inputs, [0, window_size], [batch_size, 1])
doc_embed = tf.nn.embedding_lookup(doc_embeddings, doc_indices)

# Word-context vector followed by the document vector
final_embed = tf.concat([embed, tf.squeeze(doc_embed)], axis=1)

# Mean NCE loss over the stand-in batch
per_example_loss = tf.nn.nce_loss(weights=nce_weights,
                                  biases=nce_biases,
                                  labels=y_target,
                                  inputs=final_embed,
                                  num_sampled=num_sampled,
                                  num_classes=vocabulary_size)
loss = tf.reduce_mean(per_example_loss)

# Plain SGD optimizer
optimizer = tf.optimizers.SGD(learning_rate=model_learning_rate)

# Create a function to perform one optimization step
@tf.function
def train_step(batch_inputs, batch_labels):
    """Run one SGD update of the doc2vec model and return the batch loss.

    Args:
        batch_inputs: integer array of shape (batch, window_size + 1) as
            produced by ``text_helpers.generate_batch_data(..., method='doc2vec')``:
            the first ``window_size`` columns are word indices and the last
            column is the document index.
        batch_labels: integer array of shape (batch, 1) holding the target
            word index of each row.

    Returns:
        Scalar tensor with the mean NCE loss of the batch (computed before
        the update is applied).
    """
    trainable_vars = [embeddings, doc_embeddings, nce_weights, nce_biases]

    with tf.GradientTape() as tape:
        # Sum the context-word embeddings of every row. Slicing the inputs
        # (rather than sizing tensors with the global batch_size) lets the
        # step accept any number of rows.
        word_ids = batch_inputs[:, :window_size]
        embed = tf.reduce_sum(tf.nn.embedding_lookup(embeddings, word_ids), axis=1)

        # Selecting the document column as a 1-D index yields a
        # (batch, doc_embedding_size) lookup directly; an axis-less squeeze
        # would also drop the batch axis for a single-row batch.
        doc_ids = batch_inputs[:, window_size]
        doc_embed = tf.nn.embedding_lookup(doc_embeddings, doc_ids)

        final_embed = tf.concat([embed, doc_embed], axis=1)

        current_loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                                     biases=nce_biases,
                                                     labels=batch_labels,
                                                     inputs=final_embed,
                                                     num_sampled=num_sampled,
                                                     num_classes=vocabulary_size))

    # Back-propagate and apply one optimizer step to all model variables
    grads = tape.gradient(current_loss, trainable_vars)
    optimizer.apply_gradients(zip(grads, trainable_vars))

    return current_loss

# Cosine similarity between the validation words and the whole vocabulary
def compute_valid_similarity():
    """Return the (len(valid_words), vocabulary_size) cosine-similarity matrix.

    It is recomputed from the current value of ``embeddings`` on every call:
    in eager mode a tensor computed once at setup time would keep describing
    the initial random embeddings forever.
    """
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    return tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)


# Similarity for the untrained embeddings; refreshed at each validation step
similarity = compute_valid_similarity()

# Folder for the vocabulary pickle and the embedding checkpoints
# (the same 'temp' folder the movie data is extracted into).
data_folder_name = 'temp'
os.makedirs(data_folder_name, exist_ok=True)

# Create model saving operation
saver = tf.compat.v1.train.Saver({"embeddings": embeddings, "doc_embeddings": doc_embeddings})

# Add variable initializer.
init = tf.compat.v1.global_variables_initializer()

# The next line is not necessary in Colab, as it automatically starts a session.
# sess.run(init)

# Run the doc2vec model.
print('Starting Training')
loss_vec = []
loss_x_vec = []
for i in range(generations):
    batch_inputs, batch_labels = text_helpers.generate_batch_data(text_data, batch_size,
                                                                  window_size, method='doc2vec')
    current_loss = train_step(batch_inputs, batch_labels)
    step = i + 1

    # Record and report the loss
    if step % print_loss_every == 0:
        # Store a plain float so the history does not hold on to tensors
        loss_value = float(current_loss)
        loss_vec.append(loss_value)
        loss_x_vec.append(step)
        print('Loss at step {} : {}'.format(step, loss_value))

    # Validation: print the top 5 related words of each validation word
    if step % print_valid_every == 0:
        similarity = compute_valid_similarity()
        sim = similarity.numpy()
        top_k = 5  # number of nearest neighbors
        for j, valid_ix in enumerate(valid_examples):
            valid_word = word_dictionary_rev[valid_ix]
            # Rank 0 is the word itself, so start from rank 1
            nearest = (-sim[j, :]).argsort()[1:top_k+1]
            log_str = "Nearest to {}:".format(valid_word)
            for k in range(top_k):
                close_word = word_dictionary_rev[nearest[k]]
                log_str = '{} {},'.format(log_str, close_word)
            print(log_str)

    # Save dictionary + embeddings
    if step % save_embeddings_every == 0:
        # Save vocabulary dictionary
        with open(os.path.join(data_folder_name, 'movie_vocab.pkl'), 'wb') as f:
            pickle.dump(word_dictionary, f)

        # Save embeddings. With eager execution the Saver writes the current
        # variable values itself and does not use a session, so none is
        # created here (the previous code opened a new, never-closed Session
        # at every checkpoint).
        model_checkpoint_path = os.path.join(os.getcwd(), data_folder_name, 'doc2vec_movie_embeddings.ckpt')
        save_path = saver.save(None, model_checkpoint_path)
        print('Model saved in file: {}'.format(save_path))


Loading Data
Normalizing Text Data
Creating Dictionary
Creating Model




Starting Training
Loss at step 100 : 627.3471069335938
Loss at step 200 : 602.0684204101562
Loss at step 300 : 591.26904296875
Loss at step 400 : 566.1998901367188
Loss at step 500 : 568.5272827148438
Loss at step 600 : 530.0843505859375
Loss at step 700 : 532.084228515625
Loss at step 800 : 499.2488708496094
Loss at step 900 : 502.57891845703125
Loss at step 1000 : 495.83477783203125
Loss at step 1100 : 492.978271484375
Loss at step 1200 : 459.86895751953125
Loss at step 1300 : 466.2934265136719
Loss at step 1400 : 438.1857604980469
Loss at step 1500 : 441.8061828613281
Loss at step 1600 : 434.389892578125
Loss at step 1700 : 412.6051025390625
Loss at step 1800 : 342.7693786621094
Loss at step 1900 : 405.4205322265625
Loss at step 2000 : 358.5806884765625
Loss at step 2100 : 333.2110595703125
Loss at step 2200 : 364.3914794921875
Loss at step 2300 : 346.8654479980469
Loss at step 2400 : 335.8286437988281
Loss at step 2500 : 306.0010681152344
Loss at step 2600 : 319.6458740234375
Loss 