In [8]:
import sys, random, math
from collections import Counter
import numpy as np

f = open('datasets/tasksv11/en/qa1_single-supporting-fact_train.txt','r')
raw = f.readlines()
f.close()

tokens = list()
for line in raw[0:1000]:
    temp = []
    for i in line.lower().replace("\n","").replace("\t", " ").split(" ")[:]:
        if(i != "" and not i.isdigit()):
            temp.append(i)

    tokens.append(temp)

print(tokens[0:3])

[['mary', 'moved', 'to', 'the', 'bathroom.'], ['john', 'went', 'to', 'the', 'hallway.'], ['where', 'is', 'mary?', 'bathroom']]


In [9]:
vocab = set()
for sent in tokens:
    for word in sent:
        vocab.add(word)
vocab = list(vocab)


word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i

def words2indices(sentence):
    idx = list()
    for word in sentence:
        idx.append(word2index[word])
    return idx

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [10]:
np.random.seed(1)

embed_size = 10
embed = (np.random.rand(len(vocab), embed_size) - 0.5) * 0.1

recurrent = np.eye(embed_size)

# Initial Input layer
start = np.zeros(embed_size)

# Final Layer
decoder = (np.random.rand(embed_size, len(vocab)) - 0.5) * 0.1

# Smart way of creating a one hot encodeded values for tokens
one_hot = np.eye(len(vocab))

In [11]:
print(vocab)

['sandra?', 'bathroom.', 'to', 'mary', 'hallway.', 'moved', 'garden', 'bedroom.', 'john', 'is', 'journeyed', 'went', 'hallway', 'daniel', 'daniel?', 'mary?', 'garden.', 'where', 'sandra', 'bedroom', 'travelled', 'john?', 'bathroom', 'the', 'office.', 'office', 'back', 'kitchen', 'kitchen.']


In [12]:
def predict(sent):
    '''
    sent is a list with token indexes from the vocabulary.
    eg:- [15, 56, 561, 35432, 321, 5468]
    '''

    layers = list()
    layer = {}

    # Hidden layer setup to previously defined embedded layer
    layer['hidden'] = start
    layers.append(layer)

    loss = 0
    preds = list()

    # Running a loop for each token in a sentence
    for target_i in range(len(sent)):
        layer = {}
        
        # Similar to ((embeded vector*identity matrix) + next embeded vector) calculation
        # This part is responsible to keep the sequence factor.
        # Check the below image token vector multiplacation.
        layer['hidden'] = layers[-1]['hidden'].dot(recurrent) + embed[sent[target_i]]

        # Predicting which word is going to come up from the vocabulary (returns (1, len(vocab)) vector)
        layer['pred'] = softmax(layers[-1]['hidden'].dot(decoder))

        # From layer["pred"] of dim (len(vocab), 1) get the actual required value vector
        # if it is 1 then loss is zero, else we get a value because of log transformation
        loss += -np.log(layer['pred'][sent[target_i]])

        layers.append(layer)
        
    return layers, loss

<img src="./images/sentence_embedding.png" width=400 height=400 />

In [13]:
for iter in range(30000):
    alpha = 0.001

    # Getting the sentence as vector of indexes
    sent = words2indices(tokens[iter%len(tokens)][1:])
    
    # do the forward propagation for the respective sentence
    layers, loss = predict(sent)

    for layer_idx in reversed(range(len(layers))):

        # Select the token related weight values
        layer = layers[layer_idx]

        # Select the target token
        target = sent[layer_idx-1]

        # If not the First layer
        if(layer_idx > 0):
            
            # Take the error between output and target
            layer['output_delta'] = layer['pred'] - one_hot[target]
            
            # calculate the (1, embed_size) delta vector for the embed layer
            new_hidden_delta = layer['output_delta'].dot(decoder.transpose())

            if(layer_idx == len(layers)-1):
                # If this is the last layer, then no other derivatives to be add up from recurrent part
                layer['hidden_delta'] = new_hidden_delta
            else:
                # So we need to calculate the recurrent derivative from its higher layer
                layer['hidden_delta'] = new_hidden_delta + layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())

        # If the first layer
        else: 
            layer['hidden_delta'] = layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())


    # Weight Update part in backpropagation
    start -= layers[0]['hidden_delta'] * alpha / float(len(sent))

    for layer_idx, layer in enumerate(layers[1:]):
        decoder -= np.outer(layers[layer_idx]['hidden'], layer['output_delta']) * alpha / float(len(sent))
        
        embed_idx = sent[layer_idx]
        embed[embed_idx] -= layers[layer_idx]['hidden_delta'] * alpha / float(len(sent))

        recurrent -= np.outer(layers[layer_idx]['hidden'], layer['hidden_delta']) * alpha / float(len(sent))


    if(iter % 1000 == 0):
        print("Perplexity:" + str(np.exp(loss/len(sent))))

Perplexity:28.91321137929614
Perplexity:28.87885734561516
Perplexity:28.83392695493964
Perplexity:28.766093148203232
Perplexity:28.655626954084312
Perplexity:28.466625441746253
Perplexity:28.126154884554534
Perplexity:27.46374425206181
Perplexity:25.98831940347549
Perplexity:22.060655133662745
Perplexity:17.879780591425416
Perplexity:15.559868761620836
Perplexity:14.007077789678492
Perplexity:13.290651920540157
Perplexity:12.856605663120146
Perplexity:12.357552448478888
Perplexity:11.63623606085471
Perplexity:10.533524614286977
Perplexity:8.928377581547414
Perplexity:7.189938234298541
Perplexity:6.177569417522595
Perplexity:5.629040455667786
Perplexity:5.217876243207617
Perplexity:4.866188308225435
Perplexity:4.5664227632170284
Perplexity:4.335533204217807
Perplexity:4.188973193195896
Perplexity:4.108886898051218
Perplexity:4.0580026819125345
Perplexity:4.019927855779019


In [16]:
sent_index = 4

l, _ = predict(words2indices(tokens[sent_index]))
print(tokens[sent_index])

for i, each_layer in enumerate(l[1:-1]):
    input = tokens[sent_index][i]
    
    true = tokens[sent_index][i+1]

    pred = vocab[each_layer['pred'].argmax()]

    print("Prev Input:" + input + (' ' * (12 - len(input))) + "True:" + true + (" " * (15 - len(true))) + "Pred:" + pred)

['sandra', 'moved', 'to', 'the', 'garden.']
Prev Input:sandra      True:moved          Pred:to
Prev Input:moved       True:to             Pred:to
Prev Input:to          True:the            Pred:the
Prev Input:the         True:garden.        Pred:bedroom.
