# Previously ... (NLP)

In [1]:
import sys

f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

f = open('labels.txt')
raw_labels = f.readlines()
f.close()

tokens = list(map(lambda x: set(x.split(" ")), raw_reviews))

vocab = set()
for sent in tokens:
    for word in sent:
        if(len(word)>0):
            vocab.add(word)
vocab = list(vocab)

word2index = {}
for i,word in enumerate(vocab):
    word2index[word] = i

input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
        except:
            ""
    input_dataset.append(list(set(sent_indices)))
target_dataset = list()
for label in raw_labels:
    if label=='positive\n':
        target_dataset.append(1)
    else:
        target_dataset.append(0)

In [2]:
from collections import Counter
import math

def similar(target='beautiful'):
    target_index = word2index[target]
    scores = Counter()

    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - weights_0_1[target_index]
        squared_difference = raw_difference*raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
        
    return scores.most_common(10)

In [3]:
import sys, random, math
from collections import Counter
import numpy as np

np.random.seed(1)
random.seed(1)
f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

tokens = list(map(lambda x: (x.split(" ")), raw_reviews))
wordcnt = Counter()
for sent in tokens:
    for word in sent:
        wordcnt[word] -= 1
vocab = list(set(map(lambda x: x[0], wordcnt.most_common())))
word2indedx = {}
for i,word in enumerate(vocab):
    word2index[word] = i
    
concatenated = list()
input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
            concatenated.append(word2index[word])
        except:
            ""
    input_dataset.append(sent_indices)
concatenated = np.array(concatenated)
random.shuffle(input_dataset)
alpha, iterations = (0.05, 2)
hidden_size, window, negative = (50, 2, 5)

weights_0_1 = (np.random.rand(len(vocab), hidden_size) - 0.5)*0.2
weights_1_2 = np.random.rand(len(vocab), hidden_size)*0
layer_2_target = np.zeros(negative+1)
layer_2_target[0] = 1

def similar(target='beautiful'):
    target_index = word2index[target]
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference*raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)

def sigmoid(x):
    return 1/(1+np.exp(-x))

for rev_i, review in enumerate(input_dataset*iterations):
    for target_i in range(len(review)):
        target_samples = [review[target_i]]+list(concatenated[(np.random.rand(negative)*len(concatenated)).astype('int').tolist()])
        
        left_context = review[max(0,target_i-window):target_i]
        right_context = review[target_i+1:min(len(review), target_i+window)]
        
        layer_1 = np.mean(weights_0_1[left_context+right_context], axis=0)
        layer_2 = sigmoid(layer_1.dot(weights_1_2[target_samples].T))
        
        layer_2_delta = layer_2 - layer_2_target
        layer_1_delta = layer_2_delta.dot(weights_1_2[target_samples])
        
        weights_0_1[left_context+right_context] -= layer_1_delta*alpha
        weights_1_2[target_samples] -= np.outer(layer_2_delta, layer_1)*alpha
    
    if(rev_i % 250 == 0):
        sys.stdout.write('\rProgress: '+str(rev_i/float(len(input_dataset)*iterations))
                        +" "+str(similar('terrible')))
print("\n\n\n")
print(similar('terrible'))

Progress: 0.9109311740890689 [('terrible', -0.0), ('horrible', -1.832427113754475), ('decent', -1.8440760666977758), ('fine', -2.0226154357367685), ('memorable', -2.040675105418206), ('superb', -2.1119848402765826), ('disturbing', -2.126745984732501), ('charming', -2.1859098912941066), ('disney', -2.196644509568583), ('roll', -2.226911837711128)]2)])])]



[('terrible', -0.0), ('horrible', -1.7321327096402326), ('decent', -1.8461941871111498), ('fine', -1.8745017448417307), ('popular', -1.9855882516312902), ('disturbing', -1.9861187761281354), ('memorable', -1.9982102444603995), ('superb', -2.0351908808064243), ('charming', -2.123398818789489), ('moving', -2.1374443039124555)]


# Calculating average of word vectors

In [4]:
import numpy as np
norms = np.sum(weights_0_1*weights_0_1, axis=1)
norms.resize(norms.shape[0], 1)
normed_weights = weights_0_1*norms

def make_sent_vect(words):
    indices = list(map(lambda x:word2index[x], filter(lambda x:x in word2index,words)))
    return np.mean(normed_weights[indices],axis=0)

reviews2vectors = list()
for review in tokens:
    reviews2vectors.append(make_sent_vect(review))
reviews2vectors = np.array(reviews2vectors)

def most_similar_reviews(review):
    v = make_sent_vect(review)
    scores = Counter()
    for i,val in enumerate(reviews2vectors.dot(v)):
        scores[i] = val
    most_similar = list()
    for idx, score in scores.most_common(3):
        most_similar.append(raw_reviews[idx][0:40])
    return most_similar

most_similar_reviews(['boring', 'awful'])

['this movie had to be the worst horror mo',
 'stargate is the best show ever . all the',
 'mt little sister and i are self  proclai']

# Transition matrix - simple example
## Forward propagation

In [5]:
import numpy as np

def softmax(x_):
    x = np.atleast_2d(x_)
    temp = np.exp(x)
    return temp/np.sum(temp, axis=1, keepdims=True)

word_vects = {}
word_vects['yankees'] = np.array([[0.,0.,0.]])
word_vects['bears'] = np.array([[0.,0.,0.]])
word_vects['braves'] = np.array([[0.,0.,0.]])
word_vects['red'] = np.array([[0.,0.,0.]])
word_vects['sox'] = np.array([[0.,0.,0.]])
word_vects['lose'] = np.array([[0.,0.,0.]])
word_vects['defeat'] = np.array([[0.,0.,0.]])
word_vects['beat'] = np.array([[0.,0.,0.]])
word_vects['tie'] = np.array([[0.,0.,0.]])

sent2output = np.random.rand(3, len(word_vects))
identity = np.eye(3)

layer_0 = word_vects['red']
layer_1 = layer_0.dot(identity) + word_vects['sox']
layer_2 = layer_1.dot(identity) + word_vects['defeat']

pred = softmax(layer_2.dot(sent2output))
print(pred)

[[0.11111111 0.11111111 0.11111111 0.11111111 0.11111111 0.11111111
  0.11111111 0.11111111 0.11111111]]


## Back propagation

In [6]:
y = np.array([1, 0, 0, 0, 0, 0, 0, 0, 0]) #label for 'yankees'

pred_delta = pred-y
layer_2_delta = pred_delta.dot(sent2output.T)
defeat_delta = layer_2_delta*1 #back-prop for word_vects['defeat'] term
layer_1_delta = layer_2_delta.dot(identity.T)
sox_delta = layer_1_delta*1
layer_0_delta = layer_1_delta.dot(identity.T)

alpha = 0.01
word_vects['red'] -= layer_0_delta*alpha
word_vects['sox'] -= sox_delta*alpha
word_vects['defeat'] -= defeat_delta*alpha
identity -= np.outer(layer_0, layer_1_delta)*alpha
identity -= np.outer(layer_1, layer_2_delta)*alpha
sent2output -= np.outer(layer_2, pred_delta)*alpha

# Training RNN for QA set

In [7]:
import sys, random, math
from collections import Counter
import numpy as np

f = open('tasksv11/en/qa1_single-supporting-fact_train.txt','r')
raw = f.readlines()
f.close()

tokens = list()
for line in raw[0:1000]:
    tokens.append(line.lower().replace("\n","").split(" ")[1:])

print(tokens[:3])

[['mary', 'moved', 'to', 'the', 'bathroom.'], ['john', 'went', 'to', 'the', 'hallway.'], ['where', 'is', 'mary?', '\tbathroom\t1']]


In [13]:
vocab = set()
for sent in tokens:
    for word in sent:
        vocab.add(word)
vocab = list(vocab)

word2index = {}
for i,word in enumerate(vocab):
    word2index[word] = i

def words2indices(sentence):
    idx = list()
    for word in sentence:
        idx.append(word2index[word])
    return idx

def softmax(x):
    e_x = np.exp(x-np.max(x))
    return e_x/e_x.sum(axis=0)

In [14]:
np.random.seed(1)
embed_size = 10
embed = (np.random.rand(len(vocab), embed_size)-0.5)*0.1
recurrent = np.eye(embed_size)
start = np.zeros(embed_size)
decoder = (np.random.rand(embed_size, len(vocab))-0.5)*0.1
one_hot = np.eye(len(vocab))

### Forward propagation

In [15]:
def predict(sent):
    layers = list()
    layer = {}
    layer['hidden'] = start
    layers.append(layer)
    
    loss = 0
    preds = list()
    for target_i in range(len(sent)):
        layer = {}
        layer['pred'] = softmax(layers[-1]['hidden'].dot(decoder))
        loss += -np.log(layer['pred'][sent[target_i]])
        layer['hidden'] = layers[-1]['hidden'].dot(recurrent) + embed[sent[target_i]]
        layers.append(layer)
    return layers, loss

### Back propagation

In [18]:
for iter in range(30000):
    alpha = 0.001
    sent = words2indices(tokens[iter%len(tokens)][1:])
    layers, loss = predict(sent)
    
    for layer_idx in reversed(range(len(layers))):
        layer = layers[layer_idx]
        target = sent[layer_idx-1]
        
        if(layer_idx>0):
            layer['output_delta'] = layer['pred']-one_hot[target]
            new_hidden_delta = layer['output_delta'].dot(decoder.transpose())
            if(layer_idx==len(layers)-1):
                layer['hidden_delta'] = new_hidden_delta
            else:
                layer['hidden_delta'] = new_hidden_delta+layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())
        else:
            layer['hidden_delta'] = layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())
    start -= layers[0]['hidden_delta']*alpha/float(len(sent))
    for layer_idx, layer in enumerate(layers[1:]):
        decoder -= np.outer(layers[layer_idx]['hidden'], layer['output_delta']*alpha/float(len(sent)))
        embed_idx = sent[layer_idx]
        embed[embed_idx] -= layers[layer_idx]['hidden_delta']*alpha/float(len(sent))
        recurrent -= np.outer(layers[layer_idx]['hidden'], layer['hidden_delta']*alpha/float(len(sent)))
    if(iter%1000==0):
        print('Perplexity: '+str(np.exp(loss/len(sent))))

Perplexity: 80.94604498191212
Perplexity: 80.02557844138092
Perplexity: 78.07451594051061
Perplexity: 73.17813896643796
Perplexity: 54.23091952537488
Perplexity: 27.181958461960072
Perplexity: 20.06151350864147
Perplexity: 18.473769800049887
Perplexity: 17.02035772569649
Perplexity: 14.903587333330526
Perplexity: 11.844797598164067
Perplexity: 8.723647428452939
Perplexity: 7.048127798842412
Perplexity: 6.1429774155237675
Perplexity: 5.480627642652905
Perplexity: 5.044603643791873
Perplexity: 4.793159340899614
Perplexity: 4.630364872452967
Perplexity: 4.532523625107382
Perplexity: 4.477987983589834
Perplexity: 4.433774322838025
Perplexity: 4.378079564587689
Perplexity: 4.307909135327972
Perplexity: 4.229962531530703
Perplexity: 4.15328279608556
Perplexity: 4.085967238825636
Perplexity: 4.029156666660759
Perplexity: 3.9776789752941037
Perplexity: 3.927619989282917
Perplexity: 3.920620784128345


In [19]:
sent_index = 4
l,_ = predict(words2indices(tokens[sent_index]))
print(tokens[sent_index])

for i,each_layer in enumerate(l[1:-1]):
    input = tokens[sent_index][i]
    true = tokens[sent_index][i+1]
    pred = vocab[each_layer['pred'].argmax()]
    print('Prev Input: ' + input + (' '*(12-len(input))) + 'True: ' + true + (' '*(15-len(true))) + 'Pred: ' + pred)

['sandra', 'moved', 'to', 'the', 'garden.']
Prev Input: sandra      True: moved          Pred: is
Prev Input: moved       True: to             Pred: to
Prev Input: to          True: the            Pred: the
Prev Input: the         True: garden.        Pred: bedroom.
