In [1]:
import sys

f = open('datasets/reviews.txt')
raw_reviews = f.readlines()
f.close()

f = open('datasets/labels.txt')
raw_labels = f.readlines()
f.close()

# Here we split the reviews into tokens and add them to a list of lists.
tokens = list(map(lambda x: set(x.split(" ")), raw_reviews))

# Here we select the unique tokens from the above token list by adding them to a set named vocabulary
vocab = set()
for sent in tokens:
    for word in sent:
        if(len(word)>0):
            vocab.add(word)

# Here we are creating a dictionary to keep track of tokens and their respective index in the vocabulary
vocab = list(vocab)
word2index = {}
for i, word in enumerate(vocab):
    word2index[word]=i

# Here we are converting the reviews in to numeric dataset form by using the above created word2index dictionary
input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
        except:
            ""
    input_dataset.append(list(set(sent_indices)))

# Converting the labels to binary values
target_dataset = list()
for label in raw_labels:
    if label == 'positive\n':
        target_dataset.append(1)
    else:
        target_dataset.append(0)

In [2]:
import numpy as np

np.random.seed(1)

# Sigmoid function for given x
def sigmoid(x):
    return 1/(1 + np.exp(-x))

# Variable declaration
alpha, iterations = (0.01, 2)
hidden_size = 100
weights_0_1 = 0.2*np.random.random((len(vocab), hidden_size)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size, 1)) - 0.1
correct,total = (0,0)

for iter in range(iterations):
        # Keeping 1000 values for testing
    for i in range(len(input_dataset) - 1000):
        
        # Here x contains the indexes of words instead of their one hot encoded values
        x, y = (input_dataset[i], target_dataset[i])

        # Forward Propagation
        layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))    # Embedding layer --> we get the summation with respective to indexes in x
        layer_2 = sigmoid(np.dot(layer_1,weights_1_2))

        # Backward Propagation
        layer_2_delta = layer_2 - y
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)

        # Weight update
        weights_0_1[x] -= layer_1_delta * alpha
        weights_1_2 -= np.outer(layer_1, layer_2_delta) * alpha

        # If prediction and actual value difference is less than 0.5 then it is correct (softmax)
        if(np.abs(layer_2_delta) < 0.5):
            correct += 1
        
        total += 1
        if(i % 10 == 9):
            progress = str(i/float(len(input_dataset)))
            sys.stdout.write(   '\rIter:'+str(iter)\
                                +' Progress:'+progress[2:4]\
                                +'.'+progress[4:6]\
                                +'% Training Accuracy:'\
                                + str(correct/float(total)) + '%')
    print()


Iter:0 Progress:95.99% Training Accuracy:0.8338333333333333%
Iter:1 Progress:95.99% Training Accuracy:0.8671041666666667%


In [3]:

# Testing phase
correct,total = (0,0)

for i in range(len(input_dataset)-1000,len(input_dataset)):
    x = input_dataset[i]
    y = target_dataset[i]

    # Forward pass
    layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0))
    layer_2 = sigmoid(np.dot(layer_1,weights_1_2))

    if(np.abs(layer_2 - y) < 0.5):
        correct += 1
    total += 1
    
print("Test Accuracy:" + str(correct / float(total)))

Test Accuracy:0.848


In [4]:
from collections import Counter
import math

def similar(target='beautiful'):

    # Find the index of the word we are interested
    target_index = word2index[target]
    scores = Counter()

    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference

        # Taking the negative to make smallest difference to large value
        scores[word] = -math.sqrt(sum(squared_difference))

    return scores.most_common(10)

In [5]:
similar("boring")

[('boring', -0.0),
 ('annoying', -0.7856686762236119),
 ('dull', -0.8158243398362439),
 ('disappointing', -0.8172656843915939),
 ('mess', -0.8214104282129204),
 ('fails', -0.8637754215448665),
 ('laughable', -0.8680425930433701),
 ('horrible', -0.8708682010448864),
 ('disappointment', -0.8819927902910096),
 ('badly', -0.8832801659361629)]

In [6]:
import sys,random,math
from collections import Counter
import numpy as np

np.random.seed(1)
random.seed(1)

f = open('Grokking-Deep-Learning-master/reviews.txt')
raw_reviews = f.readlines()
f.close()

tokens = list(map(lambda x:(x.split(" ")),raw_reviews))
wordcnt = Counter()

for sent in tokens:
    for word in sent:
        wordcnt[word] -= 1

vocab = list(set(map(lambda x: x[0], wordcnt.most_common())))

word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i

concatenated = list()
input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
            concatenated.append(word2index[word])
        except:
            ""

    input_dataset.append(sent_indices)

# An array which contains all the tokens by their respective index from the vocabulary
concatenated = np.array(concatenated)

random.shuffle(input_dataset)


In [7]:
alpha, iterations = (0.05, 2)
hidden_size, window, negative = (50, 2, 5)

weights_0_1 = (np.random.rand(len(vocab),hidden_size) - 0.5) * 0.2
weights_1_2 = np.random.rand(len(vocab),hidden_size) * 0

layer_2_target = np.zeros(negative+1)
layer_2_target[0] = 1


# Same as the previous "similar" function
def similar(target='beautiful'):
    target_index = word2index[target]
    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)


def sigmoid(x):
    return 1/(1 + np.exp(-x))


for rev_i, review in enumerate(input_dataset * iterations):
    for target_i in range(len(review)):
        
        # Creates a sample data list with a token from the considering review and random tokens from vocabulary.
        # Theoritically this should have full vocabulary. Since it is an expensive operation we choose random set of tokens
        target_samples = [review[target_i]] + list(concatenated[(np.random.rand(negative) * len(concatenated)).astype('int').tolist()])

        # Getting the context of considering token based on the review tokens
        left_context = review[max(0,target_i-window):target_i]
        right_context = review[target_i+1:min(len(review),target_i+window)]

        # Forward Propagation (Interesting one at that..!)
        layer_1 = np.mean(weights_0_1[left_context+right_context],axis=0)  # Since we are taking the mean from axis 0, layer 1 get reduced to (1, hidden_size) dimension
        layer_2 = sigmoid(layer_1.dot(weights_1_2[target_samples].T))

        # In here we have make the layer_2_target to a constant since correct value will always be at first position
        layer_2_delta = layer_2 - layer_2_target
        layer_1_delta = layer_2_delta.dot(weights_1_2[target_samples])

        # Back Propagation
        weights_0_1[left_context+right_context] -= layer_1_delta * alpha
        weights_1_2[target_samples] -= np.outer(layer_2_delta,layer_1)*alpha

    if(rev_i % 250 == 0):
        sys.stdout.write('\rProgress:' + str(rev_i / float(len(input_dataset) * iterations)) + " " + str(similar('terrible')))
    sys.stdout.write('\rProgress:' + str(rev_i / float(len(input_dataset) * iterations)))





Progress:0.99998

In [8]:
print(similar('terrible'))

[('terrible', -0.0), ('horrible', -2.9634308450007594), ('brilliant', -3.244215002465441), ('pathetic', -3.628799685321204), ('phenomenal', -3.7442593702081486), ('superb', -3.8643080195802937), ('marvelous', -3.8977777105558635), ('masterful', -3.9176688910541415), ('bad', -4.022756787056689), ('mediocre', -4.098835555375042)]


In [9]:
def analogy(positive=['terrible','good'], negative=['bad']):

    # This has been done to make vectors not become zero when adding up
    norms = np.sum(weights_0_1 * weights_0_1, axis=1)
    norms.resize(norms.shape[0],1)
    normed_weights = weights_0_1 * norms

    query_vect = np.zeros(len(weights_0_1[0]))
    
    for word in positive:
        query_vect += normed_weights[word2index[word]]
    
    for word in negative:
        query_vect -= normed_weights[word2index[word]]
    
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - query_vect
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
        
    return scores.most_common(10)[1:]

analogy(['elizabeth','he'],['she'])

[('christopher', -189.07013614555368),
 ('this', -189.33419875503418),
 ('tom', -189.37409015893581),
 ('william', -189.39351349089938),
 ('david', -189.40491594510164),
 ('peter', -189.40836842502662),
 ('it', -189.47355468636258),
 ('him', -189.48179092420622),
 ('simon', -189.4935880133762)]

In [14]:
import numpy as np

# Normalizing the weight values
norms = np.sum(weights_0_1 * weights_0_1,axis=1)
norms.resize(norms.shape[0],1)
normed_weights = weights_0_1 * norms

# Function to create averaged token vector
def make_sent_vect(words):
    indices = list(map(lambda x: word2index[x], filter(lambda x:x in word2index, words)))
    return np.mean(normed_weights[indices],axis=0)

# Converting all the reviews to averaged token vectors
reviews2vectors = list()
for review in tokens:
    reviews2vectors.append(make_sent_vect(review))

reviews2vectors = np.array(reviews2vectors)

def most_similar_reviews(review):
    v = make_sent_vect(review)
    scores = Counter()

    for i, val in enumerate(reviews2vectors.dot(v)):
        scores[i] = val

    most_similar = list()
    for idx,score in scores.most_common(3):
        most_similar.append(raw_reviews[idx][0:40])

    return most_similar

In [15]:
most_similar_reviews(['boring','awful'])

['i don  t know why i like this movie so w',
 'i am   and i hated this film its the wor',
 'i read the reviews before i watched this']