# Download & Preprocess the IMDB Dataset

In [1]:
# Download reviews.txt and labels.txt from here: 
# https://github.com/udacity/deep-learning/tree/master/sentiment-network

def pretty_print_review_and_label(i):
    """Print the label of entry ``i`` next to a short preview of its review.

    Reads the notebook-level ``labels`` and ``reviews`` lists, so both must be
    defined before this is called. Only the first 80 characters of the review
    are shown, followed by an ellipsis.
    """
    label = labels[i]
    preview = reviews[i][:80]
    print("\t:\t".join((label, preview)) + "...")

# 1) READ REVIEWS
# - 'reviews.txt' holds one full movie review per line.
# - The 'with' block closes the file even if reading raises.
# - rstrip('\n') removes only a trailing newline, so a final line that has no
#   newline keeps its last character (slicing with [:-1] would cut it off).
with open('reviews.txt', 'r') as g:
    reviews = [line.rstrip('\n') for line in g]
# - 'reviews' is now a list of strings, each string is one full movie review.

# 2) READ LABELS
# - 'labels.txt' holds one label per line, in the same order as the reviews.
# - '.upper()' turns each label into uppercase (e.g., "positive" -> "POSITIVE").
with open('labels.txt', 'r') as g:
    labels = [line.rstrip('\n').upper() for line in g]
# - 'labels' is now a list of uppercase strings, one per line of 'labels.txt'.

# You can optionally view a sample review with its label:
# pretty_print_review_and_label(0)


# -------------------------------------------
# PREPROCESS DATASET (Alternative method)
# -------------------------------------------

import sys

# 1) Read the entire 'reviews.txt' into raw_reviews.
#    readlines() keeps the line endings, so the strings are unmodified lines.
#    The 'with' block guarantees the handle is closed even if reading fails.
with open('reviews.txt') as f:
    raw_reviews = f.readlines()

# 2) Read the entire 'labels.txt' into raw_labels (same treatment as above).
with open('labels.txt') as f:
    raw_labels = f.readlines()

# 3) TOKENIZE REVIEWS
# Each raw line is split on single spaces and stored as a set, which drops
# repeated words inside one review.
tokens = [set(line.split(" ")) for line in raw_reviews]

# 4) BUILD A GLOBAL VOCABULARY
#  - Every non-empty word seen in any review, collected once.
#  - Turned into a list so that each word has a position we can index by.
vocab = list({word for sent in tokens for word in sent if len(word) > 0})

# 5) CREATE A MAPPING FROM WORD -> UNIQUE ID (INDEX)
#  - The ID of a word is simply its position in 'vocab'.
word2index = {word: position for position, word in enumerate(vocab)}

# 6) CONVERT REVIEWS INTO LISTS OF WORD INDICES
#    Words that have no entry in 'word2index' are skipped with an explicit
#    membership test. The vocabulary step leaves out the empty string, so that
#    is the token expected to be missing. Unlike a bare 'except: pass', this
#    does not hide unrelated errors or swallow KeyboardInterrupt.
input_dataset = []
for sent in tokens:
    sent_indices = [word2index[word] for word in sent if word in word2index]
    # De-duplicate the indices before storing them as a list.
    input_dataset.append(list(set(sent_indices)))
# 'input_dataset' is now a list of lists:
# each inner list is the unique word indices for one review.

# 7) CONVERT LABELS INTO BINARY TARGETS (1 or 0)
#    The surrounding whitespace is stripped before comparing, so the result
#    does not depend on how the line ends: a last line without '\n' or a file
#    with '\r\n' endings is still recognised as 'positive'.
target_dataset = [1 if label.strip() == 'positive' else 0 for label in raw_labels]
# Each review's target is 1 if its label is 'positive', else 0.

# 'input_dataset' and 'target_dataset' are now ready for use in an RNN.


# The Surprising Power of Averaged Word Vectors

In [2]:
import numpy as np

# 1) Sum of squares of each row (word vector) of 'weights_0_1'.
#    'weights_0_1' is assumed to be a (vocab_size, embedding_dim) array.
#    keepdims=True returns a column of shape (rows, 1) directly, so no
#    in-place resize is needed before broadcasting against the rows.
norms = np.sum(weights_0_1 * weights_0_1, axis=1, keepdims=True)

# 2) Euclidean length of each row. Rows that are entirely zero get a divisor
#    of 1 so they stay zero instead of becoming NaN.
lengths = np.sqrt(norms)
lengths[lengths == 0] = 1.0

# 3) Normalize: divide each row by its own length so every non-zero word
#    vector has unit length. (Multiplying by the sum of squares, as before,
#    made long vectors even longer rather than normalizing them.)
normed_weights = weights_0_1 / lengths

def make_sent_vect(words):
    """
    Convert an iterable of words into a single averaged vector.

    Words without an entry in 'word2index' are ignored. The remaining words
    are mapped to their indices and the matching rows of 'normed_weights'
    are averaged.

    Returns a 1D array with one value per column of 'normed_weights'. If none
    of the words is known, a zero vector of that length is returned; averaging
    an empty selection would otherwise yield NaN values and a RuntimeWarning.
    """
    indices = [word2index[word] for word in words if word in word2index]

    if not indices:
        return np.zeros(normed_weights.shape[1])

    # Mean over the selected rows -> one vector for the whole word list
    return np.mean(normed_weights[indices], axis=0)

# 4) One averaged vector per tokenized review, stacked into a 2D numpy array
#    with one row per entry of 'tokens'.
reviews2vectors = np.array([make_sent_vect(review) for review in tokens])

def most_similar_reviews(review, top_n=3, preview_chars=40):
    """
    Return previews of the reviews that score highest against a word list.

    Parameters:
        review: list of query words (e.g. ['boring', 'awful']).
        top_n: how many reviews to return (default 3, the previous fixed value).
        preview_chars: how many leading characters of each raw review to
            return (default 40, the previous fixed value).

    Steps:
        1) Build a query vector with 'make_sent_vect'.
        2) Take the dot product of that vector with every row of
           'reviews2vectors' to get one score per review.
        3) Pick the 'top_n' largest scores and return the start of the
           matching entries of 'raw_reviews'.
    """
    from collections import Counter

    # Single vector for the whole query
    v = make_sent_vect(review)

    # review position -> score; Counter provides most_common() for the ranking
    scores = Counter()
    for i, val in enumerate(reviews2vectors.dot(v)):
        scores[i] = val

    return [raw_reviews[idx][0:preview_chars]
            for idx, _score in scores.most_common(top_n)]

# Example usage: as the last expression of the cell, the returned list of
# review previews for this two-word query is displayed as the cell output.
most_similar_reviews(['boring','awful'])


NameError: name 'weights_0_1' is not defined

# Matrices that Change Absolutely Nothing

In [3]:
import numpy as np

# Four sample vectors of length 3 (two integer-valued, two float-valued).
a, b, c, d = (
    np.array(values)
    for values in ([1, 2, 3], [0.1, 0.2, 0.3], [-1, -0.5, 0], [0, 0, 0])
)

# 3x3 identity matrix: ones on the diagonal, zeros elsewhere.
identity = np.eye(3)
print(identity)

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [4]:
# Multiplying each sample vector by the identity matrix, one printout per vector.
for vector in (a, b, c, d):
    print(vector.dot(identity))

[1. 2. 3.]
[0.1 0.2 0.3]
[-1.  -0.5  0. ]
[0. 0. 0.]


In [5]:
# One small vector per word of the phrase "this movie rocks".
this, movie, rocks = (
    np.array(values) for values in ([2, 4, 6], [10, 10, 10], [1, 1, 1])
)

# Plain element-wise sum of the three word vectors.
print(this + movie + rocks)

# Same sum, but the running total is passed through 'identity' after each of
# the first two words before the next word vector is added.
after_this = this.dot(identity)
after_movie = (after_this + movie).dot(identity)
print(after_movie + rocks)

[13 15 17]
[13. 15. 17.]


# Forward Propagation in Python

In [None]:
import numpy as np

def softmax(x_):
    """
    Row-wise softmax.

    'x_' is promoted to at least 2D, so a flat list of scores is treated as a
    single row and the result always has a leading batch dimension. Each row
    of the result sums to 1.

    The row maximum is subtracted before exponentiating. This leaves the
    result unchanged mathematically but keeps np.exp from overflowing to inf
    (and the division from producing NaN) when the scores are large.
    """
    # Convert 'x_' to at least a 2D array so operations work in batch mode
    x = np.atleast_2d(x_)
    # Shift every row so its largest score is 0, then exponentiate
    temp = np.exp(x - np.max(x, axis=1, keepdims=True))
    # Normalize each row so its exponentials sum to 1
    return temp / np.sum(temp, axis=1, keepdims=True)

# 1) Word vectors for a small vocabulary.
#    Every word gets its own (1, 3) array of zeros. The arrays are separate
#    objects, so updating one word in place leaves the others untouched.
word_vects = {
    word: np.zeros((1, 3))
    for word in ('yankees', 'bears', 'braves', 'red', 'socks',
                 'lose', 'defeat', 'beat', 'tie')
}

# 2) Random matrix 'sent2output' with 3 rows and one column per word.
#    It maps a 3-dimensional hidden state to one score per vocabulary word,
#    which softmax then turns into a probability distribution.
vocabulary_size = len(word_vects)
sent2output = np.random.rand(3, vocabulary_size)

# 3) 3x3 identity matrix used as the hidden-state "transition":
#    multiplying by it copies the hidden state unchanged.
identity = np.eye(3)

In [None]:
# 4) Forward pass over the words 'red', 'socks', 'defeat'.
#    The hidden state starts as the vector for 'red' (the very same array
#    object that is stored in 'word_vects', not a copy).
layer_0 = word_vects['red']

#    Each following step multiplies the previous hidden state by 'identity'
#    and adds the vector of the next word. A typical RNN would use
#    h_t = f(W * h_(t-1) + U * x_t); here the transition is the identity and
#    the word vector is added directly.
layer_1 = np.dot(layer_0, identity) + word_vects['socks']
layer_2 = np.dot(layer_1, identity) + word_vects['defeat']

# 5) Predict the distribution over the vocabulary:
#    (1,3) hidden state times (3,9) 'sent2output' gives (1,9) scores,
#    and softmax turns those scores into probabilities.
output_scores = np.dot(layer_2, sent2output)
pred = softmax(output_scores)

print(pred)
# 'pred' is a 1 x 9 array with the predicted probability of each of the 9 words.

[[0.11111111 0.11111111 0.11111111 0.11111111 0.11111111 0.11111111
  0.11111111 0.11111111 0.11111111]]


# How Do We Backpropagate into This?

In [None]:
# Target one-hot vector: the correct next word is the first vocabulary entry
# ("yankees").
y = np.array([1,0,0,0,0,0,0,0,0])

# 'pred' is the predicted probability distribution from the forward pass.
# 'pred_delta' is the error signal at the output (softmax + cross-entropy
# style): predicted distribution minus target.
pred_delta = pred - y

# Backprop through 'sent2output': how the hidden state 'layer_2' should change.
layer_2_delta = pred_delta.dot(sent2output.T)

# 'defeat' was added directly into layer_2, so its gradient is the same delta
# (the "* 1" stands in for an activation derivative that does not exist here).
defeat_delta = layer_2_delta * 1

# Flow the gradient from layer_2 back to layer_1 through the transition matrix.
layer_1_delta = layer_2_delta.dot(identity.T)

# 'socks' was added directly into layer_1.
socks_delta = layer_1_delta * 1

# And once more from layer_1 back to layer_0.
layer_0_delta = layer_1_delta.dot(identity.T)

# Learning rate
alpha = 0.01

# Compute every weight gradient BEFORE touching any parameter.
# 'layer_0' is the very same array object as word_vects['red'], so the
# in-place '-=' on that word vector below would otherwise change 'layer_0'
# before it is used for the 'identity' gradient.
identity_grad_step_1 = np.outer(layer_0, layer_1_delta)
identity_grad_step_2 = np.outer(layer_1, layer_2_delta)
sent2output_grad = np.outer(layer_2, pred_delta)

# 1) Update the word vectors:
#    subtract the respective deltas scaled by 'alpha' from the word embeddings.
word_vects['red'] -= layer_0_delta * alpha
word_vects['socks'] -= socks_delta * alpha
word_vects['defeat'] -= defeat_delta * alpha

# 2) Update the transition matrix:
#    one outer product of (input, gradient) for each step in which it was used.
identity -= identity_grad_step_1 * alpha
identity -= identity_grad_step_2 * alpha

# 3) Update the 'sent2output' matrix:
#    the forward pass used layer_2.dot(sent2output), so its gradient is the
#    outer product of (layer_2, pred_delta).
sent2output -= sent2output_grad * alpha


# Let's Train It!

In [None]:
import sys, random, math
from collections import Counter
import numpy as np

# 1) We load a subset of the bAbI tasks dataset
#    'qa1_single-supporting-fact_train.txt' is a set of simple question-answer pairs.
#    The 'with' block closes the file even if reading raises.
with open('tasksv11/en/qa1_single-supporting-fact_train.txt', 'r') as f:
    raw = f.readlines()

# Only the first 1000 lines are used. Each line is lower-cased, its newline is
# removed, it is split on single spaces, and the first token (usually a
# numeric id in bAbI) is dropped.
tokens = [line.lower().replace("\n", "").split(" ")[1:] for line in raw[0:1000]]

print(tokens[0:3])
# A quick look at the first 3 tokenized lines (for sanity checking).


[['mary', 'moved', 'to', 'the', 'bathroom.'], ['john', 'went', 'to', 'the', 'hallway.'], ['where', 'is', 'mary?', '\tbathroom\t1']]


In [None]:
# Every distinct token that occurs in 'tokens', as a list so that each word
# has a stable position for the rest of the session.
vocab = list({word for sent in tokens for word in sent})

# Map each word to its position in 'vocab' (used as the row index in the embeddings).
word2index = {word: position for position, word in enumerate(vocab)}

def words2indices(sentence):
    """
    Look up every word of 'sentence' in 'word2index' and return the
    resulting integers as a list, in the same order as the words.

    A word that is missing from 'word2index' raises KeyError.
    """
    return [word2index[word] for word in sentence]

def softmax(x):
    """
    Softmax with the usual max-shift for numerical stability.

    The global maximum of 'x' is subtracted before exponentiating, and the
    exponentials are divided by their sum along axis 0. For a 1D input this
    is the ordinary softmax over all entries; for a 2D input each column is
    normalized separately.
    """
    peak = np.max(x)
    weights = np.exp(x - peak)
    total = weights.sum(axis=0)
    return weights / total


In [None]:
def _small_random(rows, cols):
    """Uniform random matrix of the given shape with values in [-0.05, 0.05)."""
    return (np.random.rand(rows, cols) - 0.5) * 0.1


np.random.seed(1)
embed_size = 10

# word embeddings: one row of length 'embed_size' per vocabulary word
# (drawn first, so it consumes the first block of random numbers after seeding)
embed = _small_random(len(vocab), embed_size)

# recurrent: square (embed_size x embed_size) matrix that carries one hidden
# state to the next; it starts out as the identity
recurrent = np.eye(embed_size)

# 'start' is the hidden state for an empty sentence
start = np.zeros(embed_size)

# decoder: (embed_size x vocab size) matrix that maps a hidden state to one
# score per vocabulary word (drawn second from the random stream)
decoder = _small_random(embed_size, len(vocab))

# one_hot: identity matrix whose row k is the one-hot vector of word index k
one_hot = np.eye(len(vocab))


# Forward Propagation with Arbitrary Length

In [None]:
def predict(sent):
    """
    Run the forward pass over a sentence of word indices.

    Parameters:
        sent: sequence of integer word indices. Each one is used both as a
            row index into 'embed' and as a position in the softmax output.

    For every position in 'sent':
        1. The previous hidden state is multiplied by 'decoder' and passed
           through softmax to get a distribution over words ('pred').
        2. -log of the probability assigned to the word at this position is
           added to the loss.
        3. The next hidden state is the previous one times 'recurrent' plus
           the embedding of the word at this position.

    Returns:
        (layers, loss) where 'layers' is a list of len(sent) + 1 dicts.
        layers[0] only has 'hidden', which is the 'start' array itself (not a
        copy); every later layer has 'pred' and 'hidden'. 'loss' is the sum of
        the per-position losses (0 for an empty 'sent').
    """
    layers = [{'hidden': start}]
    loss = 0

    # forward propagate over each word in 'sent'
    for word_index in sent:
        previous_hidden = layers[-1]['hidden']
        layer = {}

        # 1) Predicted distribution from the previous hidden state
        layer['pred'] = softmax(previous_hidden.dot(decoder))

        # 2) Cross-entropy contribution of the actual word at this step
        loss += -np.log(layer['pred'][word_index])

        # 3) Next hidden state: transformed previous state + current word embedding
        layer['hidden'] = previous_hidden.dot(recurrent) + embed[word_index]

        layers.append(layer)

    return layers, loss


# Backpropagation with Arbitrary Length

In [None]:
# Forward + backward pass only: this loop fills in 'output_delta' and
# 'hidden_delta' for every layer but contains no statement that updates
# 'embed', 'recurrent', 'decoder' or 'start'.
for iter in range(30000):
    alpha = 0.001  # learning rate; assigned here but not read inside this loop
    # Cycle through 'tokens' and convert the chosen sentence to word indices.
    # The [1:] slice drops the first token of that sentence.
    # NOTE(review): 'tokens' was already built with the leading numeric id
    # sliced off, so this appears to drop the first real word - confirm intended.
    sent = words2indices(tokens[iter % len(tokens)][1:])

    # 1) Forward pass
    layers, loss = predict(sent)

    # 2) Backpropagate, walking the layers from the last one to the first
    for layer_idx in reversed(range(len(layers))):
        # current layer
        layer = layers[layer_idx]
        # Word index that this layer's 'pred' was scored against in predict():
        # layers[k] for k >= 1 was built from sent[k - 1]. For layer_idx == 0
        # this evaluates to sent[-1], but 'target' is not used on that branch.
        target = sent[layer_idx - 1]

        if(layer_idx > 0):
            # The output delta = predicted distribution - one_hot_vector_of_true_word
            layer['output_delta'] = layer['pred'] - one_hot[target]
            # Push the output delta back through the decoder
            # (multiply by the decoder transpose).
            new_hidden_delta = layer['output_delta'].dot(decoder.transpose())

            # The last layer has no later layer to receive a delta from;
            # every other layer adds the next layer's hidden delta
            # multiplied by the transpose of recurrent.
            if(layer_idx == len(layers) - 1):
                layer['hidden_delta'] = new_hidden_delta
            else:
                layer['hidden_delta'] = new_hidden_delta + \
                    layers[layer_idx + 1]['hidden_delta'].dot(recurrent.transpose())
        else:
            # layers[0] has no 'pred' (predict() only stores 'hidden' there),
            # so its hidden delta comes solely from the next layer.
            layer['hidden_delta'] = layers[layer_idx + 1]['hidden_delta'].dot(recurrent.transpose())


# Weight Update with Arbitrary Length

In [None]:
# Full training loop: forward pass, backward pass, weight update.
# The backward pass is written out here because the update step reads
# layer['output_delta'] and layer['hidden_delta']; without it this cell raises
# KeyError on a fresh kernel. The loop variable is named 'iteration' so the
# builtin 'iter' is not shadowed.
alpha = 0.001  # learning rate
for iteration in range(30000):
    # Cycle through 'tokens'; the [1:] slice drops the first token of the sentence.
    sent = words2indices(tokens[iteration % len(tokens)][1:])

    # 1) Forward pass
    layers, loss = predict(sent)

    # 2) Backward pass, from the last layer to the first
    for layer_idx in reversed(range(len(layers))):
        layer = layers[layer_idx]

        if layer_idx > 0:
            # layers[k] (k >= 1) was scored against sent[k - 1] in predict()
            target = sent[layer_idx - 1]

            # output delta = predicted distribution - one-hot of the true word
            layer['output_delta'] = layer['pred'] - one_hot[target]
            # push it back through the decoder
            new_hidden_delta = layer['output_delta'].dot(decoder.transpose())

            if layer_idx == len(layers) - 1:
                # last layer: nothing arrives from a later layer
                layer['hidden_delta'] = new_hidden_delta
            else:
                layer['hidden_delta'] = new_hidden_delta + \
                    layers[layer_idx + 1]['hidden_delta'].dot(recurrent.transpose())
        else:
            # layers[0] has no 'pred'; its delta comes only from the next layer
            layer['hidden_delta'] = layers[layer_idx + 1]['hidden_delta'].dot(recurrent.transpose())

    # 3) Update weights. Every step is scaled by alpha and divided by the
    #    sentence length.
    # 'start' is the initial hidden state. Note that layers[0]['hidden'] is
    # this same array object, so the in-place change is visible below when
    # layer_idx == 0.
    start -= layers[0]['hidden_delta'] * alpha / float(len(sent))

    for layer_idx, layer in enumerate(layers[1:]):
        # Here 'layer' is layers[layer_idx + 1] and layers[layer_idx] is the
        # layer before it.

        # a) decoder: outer product of the previous layer's hidden state and
        #    this layer's output delta
        decoder -= np.outer(layers[layer_idx]['hidden'], layer['output_delta']) \
                   * alpha / float(len(sent))

        # b) embedding row of the word at this position
        embed_idx = sent[layer_idx]
        embed[embed_idx] -= layers[layer_idx]['hidden_delta'] \
                            * alpha / float(len(sent))

        # c) recurrent: outer product of the previous layer's hidden state and
        #    this layer's hidden delta
        recurrent -= np.outer(layers[layer_idx]['hidden'], layer['hidden_delta']) \
                     * alpha / float(len(sent))

    # Report progress every 1000 sentences
    if (iteration % 1000) == 0:
        print("Perplexity:" + str(np.exp(loss / len(sent))))


Perplexity:82.07190057702232
Perplexity:81.99201065315157
Perplexity:81.90350186145962
Perplexity:81.77235882739159
Perplexity:81.54058043945564
Perplexity:81.09214329745042
Perplexity:80.16498978212181
Perplexity:78.05384313423644
Perplexity:72.10587993172675
Perplexity:45.039919968348315
Perplexity:23.808985116122862
Perplexity:19.983231768605688
Perplexity:18.701956753189645
Perplexity:17.485693470844865
Perplexity:15.850678254758108
Perplexity:13.419442755410344
Perplexity:10.29560845031423
Perplexity:7.887729550068891
Perplexity:6.646312421017693
Perplexity:5.851879733939698
Perplexity:5.286195672699819
Perplexity:4.946446677849613
Perplexity:4.734344492498793
Perplexity:4.5860018979698705
Perplexity:4.486785221893155
Perplexity:4.425031891935898
Perplexity:4.378844800385847
Perplexity:4.326100599939156
Perplexity:4.2550389484900535
Perplexity:4.162303465910049


# Execution and Output Analysis

In [None]:
sent_index = 4
sentence = tokens[sent_index]

# 'predict' returns (layers, loss); only the layers are needed here.
l, _ = predict(words2indices(sentence))

print(sentence)

# One output line per layer in l[1:-1]: the word at the same position, the
# word that follows it, and the word with the highest predicted probability.
for each_layer, input_word, true_word in zip(l[1:-1], sentence, sentence[1:]):
    # predicted word = vocabulary entry at the argmax of this layer's 'pred'
    pred_word = vocab[each_layer['pred'].argmax()]

    # ljust pads with spaces up to the column width and never truncates
    print("Prev Input:" + input_word.ljust(12) +
          "True:" + true_word.ljust(15) + "Pred:" + pred_word)


['sandra', 'moved', 'to', 'the', 'garden.']
Prev Input:sandra      True:moved          Pred:is
Prev Input:moved       True:to             Pred:to
Prev Input:to          True:the            Pred:the
Prev Input:the         True:garden.        Pred:bedroom.
