# Define vocabulary (set of words)

In [1]:
import sys

f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

f = open('labels.txt')
raw_labels = f.readlines()
f.close()

tokens = list(map(lambda x: set(x.split(" ")), raw_reviews))

vocab = set()
for sent in tokens:
    for word in sent:
        if(len(word)>0):
            vocab.add(word)
vocab = list(vocab)

word2index = {}
for i,word in enumerate(vocab):
    word2index[word] = i

input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
        except:
            ""
    input_dataset.append(list(set(sent_indices)))
target_dataset = list()
for label in raw_labels:
    if label=='positive\n':
        target_dataset.append(1)
    else:
        target_dataset.append(0)

# Word embedding

In [2]:
import numpy as np
np.random.seed(1)

def sigmoid(x):
    return 1/(1+np.exp(-x))

alpha, iterations = (0.01, 100)
hidden_size = 100

weights_0_1 = 0.2*np.random.random((len(vocab), hidden_size))-0.1
weights_1_2 = 0.2*np.random.random((hidden_size, 1))-0.1

correct, total = (0,0)
for iter in range(iterations):
    for i in range(len(input_dataset)-1000):
        x,y = (input_dataset[i], target_dataset[i])
        layer_1  =sigmoid(np.sum(weights_0_1[x],axis=0))
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))
        layer_2_delta = layer_2-y
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)
        weights_0_1[x] -= layer_1_delta*alpha
        weights_1_2 -= np.outer(layer_1, layer_2_delta)*alpha
        if (np.abs(layer_2_delta)<0.5):
            correct += 1
        total+=1
        if(i%10 == 0):
            progress = str(i/float(len(input_dataset)))
            sys.stdout.write('\rIter: ' + str(iter) + 
                            ' Progress: ' + progress[2:4] +
                            '.' + progress[4:6] + 
                            '% Training Accuracy: ' + str(correct/float(total)) + '%')
    print()
for i in range(len(input_dataset)-1000, len(input_dataset)):
    x = input_dataset[i]
    y = target_dataset[i]
    
    layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
    layer_2 = sigmoid(np.dot(layer_1, weights_1_2))
    
    if(np.abs(layer_2-y)<0.5):
        correct += 1
    total += 1
print("Test Accuracy: " + str(correct/float(total)))

Iter: 0 Progress: 18.62% Training Accuracy: 0.30303030303030304%
Iter: 1 Progress: 18.62% Training Accuracy: 0.5021459227467812%%
Iter: 2 Progress: 18.62% Training Accuracy: 0.6362339514978602%
Iter: 3 Progress: 18.62% Training Accuracy: 0.7147435897435898%
Iter: 4 Progress: 18.62% Training Accuracy: 0.7660119555935099%
Iter: 5 Progress: 18.62% Training Accuracy: 0.8036984352773826%
Iter: 6 Progress: 18.62% Training Accuracy: 0.8312004875076173%
Iter: 7 Progress: 18.62% Training Accuracy: 0.8518123667377399%
Iter: 8 Progress: 18.62% Training Accuracy: 0.8678351492183799%
Iter: 9 Progress: 18.62% Training Accuracy: 0.881074168797954%%
Iter: 10 Progress: 18.62% Training Accuracy: 0.891902363425029%%
Iter: 11 Progress: 18.62% Training Accuracy: 0.9009232954545454%
Iter: 12 Progress: 18.62% Training Accuracy: 0.9085545722713865%
Iter: 13 Progress: 18.62% Training Accuracy: 0.9150943396226415%
Iter: 14 Progress: 18.62% Training Accuracy: 0.9207611474013064%
Iter: 15 Progress: 18.62% Trainin

# Words comparison

In [3]:
from collections import Counter
import math

def similar(target='beautiful'):
    target_index = word2index[target]
    scores = Counter()

    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - weights_0_1[target_index]
        squared_difference = raw_difference*raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
        
    return scores.most_common(10)

In [4]:
print(similar('beautiful'))

[('beautiful', -0.0), ('every', -0.7572337074830178), ('face', -0.765454329368547), ('williams', -0.7659953760027031), ('years', -0.7672558387308264), ('language', -0.7685560304767548), ('both', -0.7694482531598512), ('some', -0.7702098505268767), ('great', -0.7749907381614523), ('for', -0.7759880492782278)]


In [5]:
print(similar('terrible'))

[('terrible', -0.0), ('simplistic', -0.6921512305633609), ('purposely', -0.6991689254200739), ('inferiority', -0.6998003894698095), ('bother', -0.707274585303948), ('envy', -0.7089653828985435), ('injuries', -0.709563056704115), ('cucumber', -0.7108000066159067), ('marketplace', -0.7113928490043255), ('inadequacy', -0.714203477291545)]


# Filling missing words

In [6]:
import sys, random, math
from collections import Counter
import numpy as np

np.random.seed(1)
random.seed(1)
f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

tokens = list(map(lambda x: (x.split(" ")), raw_reviews))
wordcnt = Counter()
for sent in tokens:
    for word in sent:
        wordcnt[word] -= 1
vocab = list(set(map(lambda x: x[0], wordcnt.most_common())))
word2indedx = {}
for i,word in enumerate(vocab):
    word2index[word] = i
    
concatenated = list()
input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
            concatenated.append(word2index[word])
        except:
            ""
    input_dataset.append(sent_indices)
concatenated = np.array(concatenated)
random.shuffle(input_dataset)
alpha, iterations = (0.05, 2)
hidden_size, window, negative = (50, 2, 5)

weights_0_1 = (np.random.rand(len(vocab), hidden_size) - 0.5)*0.2
weights_1_2 = np.random.rand(len(vocab), hidden_size)*0
layer_2_target = np.zeros(negative+1)
layer_2_target[0] = 1

def similar(target='beautiful'):
    target_index = word2index[target]
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference*raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)

def sigmoid(x):
    return 1/(1+np.exp(-x))

for rev_i, review in enumerate(input_dataset*iterations):
    for target_i in range(len(review)):
        target_samples = [review[target_i]]+list(concatenated[(np.random.rand(negative)*len(concatenated)).astype('int').tolist()])
        
        left_context = review[max(0,target_i-window):target_i]
        right_context = review[target_i+1:min(len(review), target_i+window)]
        
        layer_1 = np.mean(weights_0_1[left_context+right_context], axis=0)
        layer_2 = sigmoid(layer_1.dot(weights_1_2[target_samples].T))
        
        layer_2_delta = layer_2 - layer_2_target
        layer_1_delta = layer_2_delta.dot(weights_1_2[target_samples])
        
        weights_0_1[left_context+right_context] -= layer_1_delta*alpha
        weights_1_2[target_samples] -= np.outer(layer_2_delta, layer_1)*alpha
    
    if(rev_i % 250 == 0):
        sys.stdout.write('\rProgress: '+str(rev_i/float(len(input_dataset)*iterations))
                        +" "+str(similar('terrible')))
print("\n\n\n")
print(similar('terrible'))

Progress: 0.9109311740890689 [('terrible', -0.0), ('decent', -1.864111821634946), ('horrible', -1.896257491109925), ('fine', -2.0403210154996754), ('charming', -2.0646488383669084), ('memorable', -2.1150698307529447), ('superb', -2.1554300267804116), ('disturbing', -2.1596982208339375), ('disney', -2.2177548152395206), ('happy', -2.246612939342435)]81905)]



[('terrible', -0.0), ('horrible', -1.800534012235169), ('fine', -1.857604131397638), ('decent', -1.8699809881713116), ('popular', -1.9361924654826963), ('disturbing', -1.9721102882052681), ('charming', -1.991120937277976), ('memorable', -2.0442231315297743), ('superb', -2.0739687380745986), ('happy', -2.0843847968253875)]


# Word analogies

In [13]:
def analogy(positive=['terrible', 'good'], negative=['bad']):
    norms = np.sum(weights_0_1*weights_0_1, axis=1)
    norms.resize(norms.shape[0], 1)
    normed_weights = weights_0_1*norms
    query_vect = np.zeros(len(weights_0_1[0]))
    
    for word in positive:
        query_vect += normed_weights[word2index[word]]
    for word in negative:
        query_vect -= normed_weights[word2index[word]]
    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index]-query_vect
        squared_difference = raw_difference*raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)[1:]


In [14]:
analogy(['terrible', 'good'], ['bad'])

[('this', -126.97103362969905),
 ('great', -127.08570980922502),
 ('little', -127.4276429806803),
 ('very', -127.43404700352926),
 ('lot', -127.6935008477101),
 ('such', -127.70024496778453),
 ('long', -127.86455500387312),
 ('an', -127.89975560154767),
 ('few', -127.93558009244795)]

In [15]:
analogy(['elizabeth', 'he'], ['she'])

[('made', -163.99898373985727),
 ('does', -164.0263970776871),
 ('watched', -164.02997143318984),
 ('though', -164.07210865523618),
 ('since', -164.16103757273768),
 ('seen', -164.16855416663418),
 ('if', -164.26078677129084),
 ('again', -164.27222648891853),
 ('isn', -164.27786314485644)]