# Word to Vector

In [25]:
import sys

f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

f = open('labels.txt')
raw_labels = f.readlines()
f.close()

tokens = list(map(lambda x: set(x.split(" ")), raw_reviews))

vocab = set()
for sent in tokens:
    for word in sent:
        if (len(word) > 0):
            vocab.add(word)
vocab = list(vocab)

word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i

input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
        except:
            ""
    input_dataset.append(list(set(sent_indices)))
    
target_dataset = list()
for label in raw_labels:
    if label == 'positive\n':
        target_dataset.append(1)
    else:
        target_dataset.append(0)

# Identify positive / negative review

In [12]:
import numpy as np
np.random.seed(1)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

alpha, iterations = (0.01, 2)
hidden_size = 100

weights_0_1 = 0.2 * np.random.random((len(vocab), hidden_size)) - 0.1
weights_1_2 = 0.2 * np.random.random((hidden_size, 1)) - 0.1

correct, total = (0, 0)
for iter in range(iterations):
    
    for i in range(len(input_dataset) - 1000):
        x, y = (input_dataset[i], target_dataset[i])
        layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))
        
        layer_2_delta = layer_2 - y
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)
        
        weights_0_1[x] -= layer_1_delta * alpha
        weights_1_2 -= np.outer(layer_1, layer_2_delta) * alpha
        
        if (np.abs(layer_2_delta) < 0.5):
            correct += 1
        total += 1
        if (i % 10 == 9):
            progress = str(i/float(len(input_dataset)))
            sys.stdout.write('\rIter: ' + str(iter)\
                            +' Progress: ' + progress[2:4]\
                            + '.' + progress[4:6]\
                            + '% Training Accuracy: '\
                            + str(correct/float(total)) + '%')
#             print()
            
    correct, total = (0, 0)
    for i in range(len(input_dataset) - 1000, len(input_dataset)):
        
        x = input_dataset[i]
        y = target_dataset[i]
        
        layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))
        
        if (np.abs(layer_2 - y) < 0.5):
            correct += 1
        total += 1
        
    print("\nTest Accuracy: " + str(correct / float(total)))

Iter: 0 Progress: 95.99% Training Accuracy: 0.8335%39641517298%%
Test Accuracy: 0.847
Iter: 1 Progress: 95.99% Training Accuracy: 0.89748%0076030413%
Test Accuracy: 0.848


In [22]:
from collections import Counter
import math

def similar(target='beautiful'):
    target_index = word2index[target]
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] \
                        - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    
    return scores.most_common(10)

In [23]:
print(similar('beautiful'))

[('beautiful', -0.0), ('recommended', -0.7500406716847221), ('surprisingly', -0.7585962884800961), ('genius', -0.7621018567811624), ('sweet', -0.7657713593527669), ('performances', -0.7714181808402989), ('finest', -0.7776045774289513), ('friendship', -0.7785043258498116), ('innocent', -0.7873230289773757), ('beauty', -0.7883548899694895)]


In [24]:
print(similar('terrible'))

[('terrible', -0.0), ('poor', -0.7718187711519647), ('fails', -0.7861257960111084), ('horrible', -0.7952374836369098), ('save', -0.8043775836192308), ('worse', -0.8168998266390466), ('dull', -0.8172667810917823), ('boring', -0.8186040576439032), ('mess', -0.8289911307476302), ('annoying', -0.8291887883487581)]


# Append missing word

In [36]:
import sys, random, math
from collections import Counter
import numpy as np

np.random.seed(1)
random.seed(1)

f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

tokens = list(map(lambda x: (x.split(" ")), raw_reviews))
wordcnt = Counter()
for sent in tokens:
    for word in sent:
        wordcnt[word] += 1
vocab = list(set(map(lambda x:x[0], wordcnt.most_common())))

word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i
    
concatenated = list()
input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
            concatenated.append(word2index[word])
        except:
            ""
    input_dataset.append(sent_indices)
concatenated = np.array(concatenated)

random.shuffle(input_dataset)
alpha, iterations = (0.05, 2)
hidden_size, window, negative = (50, 2, 5)

weights_0_1 = (np.random.rand(len(vocab), hidden_size) - 0.5) * 0.2
weights_1_2 = np.random.rand(len(vocab), hidden_size) * 0

layer_2_target = np.zeros(negative + 1)
layer_2_target[0] = 1

def similar(target='beautiful'):
    target_index = word2index[target]
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] \
                        - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    
    return scores.most_common(10)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

for rev_i, review in enumerate(input_dataset * iterations):
    for target_i in range(len(review)):
        
        target_samples = [review[target_i]] + list(concatenated\
        [(np.random.rand(negative) * len(concatenated)).astype('int').tolist()])
        
        left_context = review[max(0, target_i-window):target_i]
        right_context = review[target_i+1:min(len(review), target_i+window)]
        
        layer_1 = np.mean(weights_0_1[left_context+right_context], axis=0)
        layer_2 = sigmoid(layer_1.dot(weights_1_2[target_samples].T))
        layer_2_delta = layer_2 - layer_2_target
        layer_1_delta = layer_2_delta.dot(weights_1_2[target_samples])
        
        weights_0_1[left_context+right_context] -= layer_1_delta * alpha
        weights_1_2[target_samples] -= np.outer(layer_2_delta, layer_1) * alpha
    
    if (rev_i % 250 == 0):
        sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)
                        *iterations)) + " " + str(similar('terrible')))
    sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)
                                                 *iterations)))
    

print(similar('terrible'))

Progress:0.99998('terrible', -0.0), ('horrible', -3.053567342300262), ('brilliant', -3.232541598557306), ('superb', -3.641844707742084), ('phenomenal', -3.7770081359183028), ('pathetic', -3.8106285440632), ('mediocre', -3.8268078033307433), ('masterful', -3.965156453620492), ('terrific', -4.044147302525793), ('marvelous', -4.086776091184065)]0168188)]433)]][('terrible', -0.0), ('horrible', -3.044942614735214), ('brilliant', -3.5174019045541294), ('pathetic', -3.7829941486197267), ('mediocre', -3.8769310772091465), ('phenomenal', -3.883798531232351), ('superb', -3.911200876453739), ('bad', -4.010693720892296), ('masterful', -4.088799018018154), ('dreadful', -4.182583168971443)]


# Analogy

In [40]:
def analogy(positive=['terrible', 'good'], negative=['bad']):
    norms = np.sum(weights_0_1 * weights_0_1, axis=1)
    norms.resize(norms.shape[0], 1)
    
    normed_weights = weights_0_1 * norms
    
    query_vect = np.zeros(len(weights_0_1[0]))
    for word in positive:
        query_vect += normed_weights[word2index[word]]
    for word in negative:
        query_vect -= normed_weights[word2index[word]]
        
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - query_vect
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
        
    return scores.most_common(10)[1:]

In [41]:
analogy()

[('superb', -224.61756194552135),
 ('terrific', -224.89530234475765),
 ('decent', -224.95292457087893),
 ('fine', -225.14567733198368),
 ('worth', -225.3643043381763),
 ('nice', -225.38382255366986),
 ('perfect', -225.41542779142944),
 ('terrible', -225.58859216073884),
 ('great', -225.58923211623184)]