# 9. Effectiveness of word2vec

We now repeat everything done in the file A3_one_hot using word2vec embeddings in place of one-hot embeddings. This will require re-running steps 1-8.

In [1]:
from collections import Counter
from gensim.models import Word2Vec
from random import random
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from torch import nn
from torch.autograd import Variable
import torch.optim as optim
import math as math
import random

import numpy as np
import torch
import torch.nn.functional as F

# Data Acquisition

For this assignment, you must download the data and extract it into `data/`. The dataset contains two files, both containing a single caption on each line. We should have 415,795 sentences in the training captions and 500 sentences in the validation captions.

To download the data, run the following directly on your server: `wget https://s3-us-west-2.amazonaws.com/cpsc532l-data/a3_data.zip`

In [2]:
# Load the data into memory.
train_sentences = [line.strip() for line in open("data/mscoco_train_captions.txt").readlines()]
val_sentences = [line.strip() for line in open("data/mscoco_val_captions.txt").readlines()]

# Preprocessing

The code provided below creates word embeddings for you to use. After creating the vocabulary, we construct both one-hot embeddings and word2vec embeddings. 

All of the packages utilized should be installed on your Azure servers, however you will have to download an NLTK corpus. To do this, follow the instructions below:

1. SSH to your Azure server
2. Open up Python interpreter
3. `import nltk`
4. `nltk.download()`

    You should now see something that looks like:

    ```
    >>> nltk.download()
    NLTK Downloader
    ---------------------------------------------------------------------------
        d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
    ---------------------------------------------------------------------------
    Downloader> 

    ```

5. `d punkt`
6. Provided the download finished successfully, you may now exit out of the Python interpreter and close the SSH connection.

Please look through the functions provided below **carefully**, as you will need to use all of them at some point in your assignment.

In [3]:
sentences = train_sentences

# Lower-case the sentence, tokenize them and add <SOS> and <EOS> tokens
sentences = [["<SOS>"] + word_tokenize(sentence.lower()) + ["<EOS>"] for sentence in sentences]

# Create the vocabulary. Note that we add an <UNK> token to represent words not in our vocabulary.
vocabularySize = 1000
word_counts = Counter([word for sentence in sentences for word in sentence])
vocabulary = ["<UNK>"] + [e[0] for e in word_counts.most_common(vocabularySize-1)]
word2index = {word:index for index,word in enumerate(vocabulary)}
one_hot_embeddings = np.eye(vocabularySize)

# Build the word2vec embeddings
wordEncodingSize = 300
filtered_sentences = [[word for word in sentence if word in word2index] for sentence in sentences]
w2v = Word2Vec(filtered_sentences, min_count=0, size=wordEncodingSize)
w2v_embeddings = np.concatenate((np.zeros((1, wordEncodingSize)), w2v.wv.syn0))

# Define the max sequence length to be the longest sentence in the training data. 
maxSequenceLength = max([len(sentence) for sentence in sentences])

def preprocess_numberize(sentence):
    """
    Given a sentence, in the form of a string, this function will preprocess it
    into list of numbers (denoting the index into the vocabulary).
    """
    tokenized = word_tokenize(sentence.lower())
        
    # Add the <SOS>/<EOS> tokens and numberize (all unknown words are represented as <UNK>).
    tokenized = ["<SOS>"] + tokenized + ["<EOS>"]
    numberized = [word2index.get(word, 0) for word in tokenized]
    
    return numberized

def preprocess_one_hot(sentence):
    """
    Given a sentence, in the form of a string, this function will preprocess it
    into a numpy array of one-hot vectors.
    """
    numberized = preprocess_numberize(sentence)
    
    # Represent each word as it's one-hot embedding
    one_hot_embedded = one_hot_embeddings[numberized]
    
    return one_hot_embedded

def preprocess_word2vec(sentence):
    """
    Given a sentence, in the form of a string, this function will preprocess it
    into a numpy array of word2vec embeddings.
    """
    numberized = preprocess_numberize(sentence)
    
    # Represent each word as it's one-hot embedding
    w2v_embedded = w2v_embeddings[numberized]
    
    return w2v_embedded

def compute_bleu(reference_sentence, predicted_sentence):
    """
    Given a reference sentence, and a predicted sentence, compute the BLEU similary between them.
    """
    reference_tokenized = word_tokenize(reference_sentence.lower())
    predicted_tokenized = word_tokenize(predicted_sentence.lower())
    return sentence_bleu([reference_tokenized], predicted_tokenized)


# 1. Building a Language Decoder

We now implement a language decoder. For now, we will have the decoder take a single training sample at a time (as opposed to batching). For our purposes, we will also avoid defining the embeddings as part of the model and instead pass in embedded inputs. While this is sometimes useful, as it learns/tunes the embeddings, we avoid doing it for the sake of simplicity and speed.

Remember to use LSTM hidden units!

In [4]:
class DecoderLSTM(nn.Module):
    def __init__(self, hidden_size, isCuda):
        super(DecoderLSTM, self).__init__()
        self.hidden_size = 300
        
        self.num_layers = 1
        self.input_size = np.shape(w2v_embeddings)[1]
        self.output_size = vocabularySize
        self.lstm = nn.LSTM(self.input_size, self.hidden_size, self.num_layers)
        self.linear = nn.Linear(hidden_size, self.output_size)
        #self.softmax = nn.LogSoftmax(dim=1)
        self.isCuda = isCuda

    def forward(self, embedded_input, hx ,cx):
        output, (hx, cx) = self.lstm(embedded_input,(hx, cx))
        output = self.linear(output[0])
        return output, hx, cx

    def initHidden(self):
        hx = Variable(torch.zeros(self.num_layers, 1, self.hidden_size))
        cx = Variable(torch.zeros(self.num_layers, 1, self.hidden_size))
        
        return hx.cuda(),cx.cuda()
        


# 2. Training a Language Decoder

We must now train the language decoder we implemented above. An important thing to pay attention to is the [inputs for an LSTM](http://pytorch.org/docs/master/nn.html#torch.nn.LSTM).

In [6]:
softmax = nn.Softmax()

def train(sentence, 
          decoder, 
          decoder_optimizer, 
          criterion,
          embeddings=w2v_embeddings): 
    
    target_variable = preprocess_word2vec(sentence)
    target_variable2 = word_tokenize(sentence.lower())

    numberized = preprocess_numberize(sentence)
    
    teacher_forcing_ratio = 0.5
    
    decoder_input = Variable(torch.FloatTensor(target_variable[1])).unsqueeze(0).unsqueeze(0)
    decoder_input = decoder_input.cuda() 
    
    decoder_optimizer.zero_grad()
    hx,cx = decoder.initHidden()
    
    target_length = np.shape(target_variable)[0]
    
    loss = 0
    
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    if use_teacher_forcing:
        for di in range(2, target_length):
            decoder_output, hx, cx = decoder(decoder_input, hx, cx)
            #topv, topi = softmax(decoder_output).data.topk(1)
            #ni = topi[0][0]
            #print('predicted value: %s'%(vocabulary[ni]))
            #decoder_input = Variable(torch.FloatTensor(np.eye(vocabularySize)[[ni]])).unsqueeze(0)
            
            decoder_input = Variable(torch.FloatTensor(target_variable[di])).unsqueeze(0).unsqueeze(0)
            decoder_input = decoder_input.cuda() 
            
            if (di != (target_length-1)):
                    ti = word2index.get(target_variable2[di-1],0)
            else:
                    ti = word2index["<EOS>"]
                    
            trueValue = Variable(torch.LongTensor(1))
            trueValue.data[0] = ti
            #print('true value: %s'%(vocabulary[trueWord]))

            trueValue = trueValue.cuda() 
 
            loss += criterion(decoder_output, trueValue)
            
    else:
        for di in range(2, target_length):

                decoder_output, hx, cx = decoder(decoder_input, hx, cx)
                topv, topi = softmax(decoder_output).data.topk(1)
                ni = topi[0][0]
                #print('predicted value: %s'%(vocabulary[ni]))

                decoder_input = Variable(torch.FloatTensor(w2v_embeddings[[ni]])).unsqueeze(0)
                decoder_input = decoder_input.cuda() 

                
                if (di != (target_length-1)):
                    ti = word2index.get(target_variable2[di-1],0)
                else:
                    ti = word2index["<EOS>"]
                    
                trueValue = Variable(torch.LongTensor(1))
                trueValue.data[0] = ti

                trueValue = trueValue.cuda() 

                loss += criterion(decoder_output, trueValue)
                if ni == word2index["<EOS>"] :
                    break
    
    loss.backward()
    
    decoder_optimizer.step()
    
    return loss.data[0] / target_length



In [7]:

def trainIters(decoder, n_iters, print_every=1000, plot_every=500, learning_rate=0.01):
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    
    learning_rate = 0.01
    optimizer = optim.Adam(decoder.parameters(),learning_rate)
    criterion = nn.CrossEntropyLoss().cuda()

    #training_pairs = [variablesFromPair(random.choice(pairs))for i in range(n_iters)]

    for iter in range(1, n_iters + 1):
        if(np.size(preprocess_numberize(train_sentences[iter-1]))>2): 
                
            loss = train(train_sentences[iter-1], decoder, optimizer, criterion)

            print_loss_total += loss
            plot_loss_total += loss

            if iter % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('(%d %d%%) %.4f' % (iter, iter / n_iters * 100, print_loss_avg))

            '''if iter % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0'''



In [8]:
isCuda = True
hidden_size = 300
decoder = DecoderLSTM(hidden_size, isCuda).cuda()
size = math.floor(np.size(sentences)/2)
trainIters(decoder,size)

(1000 0%) 3.3667
(2000 0%) 3.3648
(3000 1%) 3.4044
(4000 1%) 3.3877
(5000 2%) 3.3873
(6000 2%) 3.3066
(7000 3%) 3.3172
(8000 3%) 3.3110
(9000 4%) 3.2074
(10000 4%) 3.2956
(11000 5%) 3.2625
(12000 5%) 3.2573
(13000 6%) 3.1770
(14000 6%) 3.1486
(15000 7%) 3.3253
(16000 7%) 3.2798
(17000 8%) 3.3501
(18000 8%) 3.2866
(19000 9%) 3.2907
(20000 9%) 3.2381
(21000 10%) 3.2682
(22000 10%) 3.3218
(23000 11%) 3.2938
(25000 12%) 6.5249
(26000 12%) 3.3910
(27000 12%) 3.3370
(28000 13%) 3.2200
(29000 13%) 3.3154
(30000 14%) 3.3127
(31000 14%) 3.3945
(32000 15%) 3.3787
(33000 15%) 3.3552
(34000 16%) 3.7236
(35000 16%) 3.8140
(36000 17%) 3.5887
(37000 17%) 3.4109
(38000 18%) 3.4786
(39000 18%) 3.3816
(40000 19%) 3.5108
(41000 19%) 3.4438
(42000 20%) 3.5171
(43000 20%) 3.4081
(44000 21%) 3.5175
(45000 21%) 3.4784
(46000 22%) 3.5536
(47000 22%) 3.4538
(48000 23%) 3.5859
(49000 23%) 3.6646
(50000 24%) 3.5939
(51000 24%) 3.5293
(52000 25%) 3.5226
(53000 25%) 3.4092
(54000 25%) 3.6979
(55000 26%) 3.6117
(56

# 3. Building Language Decoder MAP Inference

We now define a method to perform inference with our decoder and test it with a few different starting words. This code will be fairly similar to your training function from part 2.

In [9]:
softmax = nn.Softmax()
def inference(decoder, init_word, embeddings=w2v_embeddings, max_length = maxSequenceLength):
    
    decoder.eval()
    
    isCuda = True
    
    decoder_input = Variable(torch.FloatTensor(w2v_embeddings[[word2index.get(init_word, 0)]])).unsqueeze(0)
    decoder_input = decoder_input.cuda() 

    hx,cx = decoder.initHidden()

    result = init_word + " "
    i = 0 
    
    while True:
            i += 1
            decoder_output, hx, cx = decoder(decoder_input, hx, cx)
            topv, topi = softmax(decoder_output).data.topk(1)
            ni = topi[0][0]
            if ni == word2index["<EOS>"] or i == max_length:
                break
            result = result + vocabulary[ni] + " "
            decoder_input = Variable(torch.FloatTensor(w2v_embeddings[[ni]])).unsqueeze(0)
            decoder_input = decoder_input.cuda() if isCuda else decoder_input
            
                      
    return result

print(inference(decoder, init_word="the"))
print(inference(decoder, init_word="man"))
print(inference(decoder, init_word="woman"))
print(inference(decoder, init_word="dog"))

the man is white clouds <UNK> a man and a frisbee <UNK> a . 
man and a frisbee <UNK> a . 
woman is white clouds <UNK> a man and a frisbee <UNK> a . 
dog is white clouds <UNK> a man and a frisbee <UNK> a . 


# 4. Building Language Decoder Sampling Inference

We must now modify the method defined in part 3, to sample from the distribution outputted by the LSTM rather than taking the most probable word.

It might be useful to take a look at the output of your model and (depending on your implementation) modify it so that the outputs sum to 1. 

In [10]:
def sampling_inference(decoder, init_word, embeddings=w2v_embeddings, max_length=maxSequenceLength):
    
    decoder.eval()
    
    isCuda = True
    
    decoder_input = Variable(torch.FloatTensor(w2v_embeddings[[word2index.get(init_word, 0)]])).unsqueeze(0)
    decoder_input = decoder_input.cuda()
    hx,cx = decoder.initHidden()

    result = init_word + " "
    i = 0 
    
    while True:
            i += 1
            decoder_output, hx, cx = decoder(decoder_input, hx, cx)
            probabilities = np.squeeze(np.transpose(softmax(decoder_output.cpu()).data.numpy()))
            ni = np.random.choice(np.arange(0, 1000), p = probabilities)
            if ni == word2index["<EOS>"] or i == max_length:
                break
            result = result + vocabulary[ni] + " "
            decoder_input = Variable(torch.FloatTensor(w2v_embeddings[[ni]])).unsqueeze(0)
            decoder_input = decoder_input.cuda() if isCuda else decoder_input       
                      
    return result

# Print the results with sampling_inference by drawing 5 samples per initial word, requiring to run 
# the code below 5 times
print(sampling_inference(decoder, init_word="the"))
print(sampling_inference(decoder, init_word="man"))
print(sampling_inference(decoder, init_word="woman"))
print(sampling_inference(decoder, init_word="dog"))

the brown horse next to to bat under a dog bed 
man holding a zoo enclosure with a frisbee <UNK> <UNK> . frisbee together in playing grazing on grass next to a rocky to bat under and a frisbee the same a tree . 
woman a dog <UNK> <UNK> to their hat his head the ocean with trees around a yellow chair . 
dog man drawn a person , and sliced eaten . 


In [15]:
for i in range(0,5):
    print(sampling_inference(decoder, init_word="the"))
    print(sampling_inference(decoder, init_word="man"))
    print(sampling_inference(decoder, init_word="woman"))
    print(sampling_inference(decoder, init_word="dog"))
    print("\n ************** \n")

the young boy is white clouds showing `` topped with lots of a people in a . 
man grazing on grass 
woman on back topped with people in a stall field . 
dog sitting in an enclosure next to a and horses on grass <UNK> a large <UNK> this is a short park . 

 ************** 

the playing in front of a . 
man is white photograph black a a a person a zebra dog 's <UNK> red and white photo on the beach . 
woman laying horses and clouds as she air . 
dog <UNK> people of dogs lying on back area 

 ************** 

the three different colored on a a rock . 
man dog . preparing to catch a frisbee <UNK> two <UNK> hole a game with frisbee . from a . 
woman playing together in a rug grass field 
dog in a some <UNK> their and some horses running with two men in a street . in a grassy pasture . 

 ************** 

the stand next a a brown frisbee 
man in brown <UNK> with several zebras 
woman is served . '' on the <UNK> counter . 
dog . horses are playing horses . a <UNK> suit posing 

 *************

# 5.  Building Language Encoder

We now build a language encoder, which will encode an input word by word, and ultimately output a hidden state that we can then be used by our decoder.

In [5]:
class EncoderLSTM(nn.Module):
    def __init__(self, hidden_size, isCuda):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.isCuda = isCuda

        self.num_layers = 1
        self.input_size = np.shape(w2v_embeddings)[1]
        #self.output_size = vocabularySize
        self.lstm = nn.LSTM(self.input_size, self.hidden_size, self.num_layers)
        #self.linear = nn.Linear(hidden_size, self.output_size)

    def forward(self, embedded_input, hx ,cx):
        output, (hx, cx) = self.lstm(embedded_input,(hx, cx))
        #output = self.linear(output[0])
        return output, hx, cx

    def initHidden(self):
        hx = Variable(torch.zeros(self.num_layers, 1, self.hidden_size))
        cx = Variable(torch.zeros(self.num_layers, 1, self.hidden_size))
        if isCuda:
            return hx.cuda(),cx.cuda()
        else:
            return hx,cx


# 6. Connecting Encoder to Decoder and Training End-to-End

We now connect our newly created encoder with our decoder, to train an end-to-end seq2seq architecture. 

It's likely that you'll be able to re-use most of your code from part 2. For our purposes, the only interaction between the encoder and the decoder is that the *last hidden state of the encoder is used as the initial hidden state of the decoder*. 

In [10]:

def train(sentence, 
          encoder,
          encoder_optimizer,
          decoder, 
          decoder_optimizer, 
          criterion,
          embeddings=w2v_embeddings): 
    
    target_variable = preprocess_word2vec(sentence)
    target_variable2 = word_tokenize(sentence.lower())

    numberized = preprocess_numberize(sentence)
    
    softmax = nn.Softmax()

    isCuda = True
    target_length = np.shape(target_variable)[0]

    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    hx,cx = encoder.initHidden()
    for ei in range(0,target_length):
        encoder_input = Variable(torch.FloatTensor(target_variable[ei])).unsqueeze(0).unsqueeze(0)
        encoder_input = encoder_input.cuda()
        encoder_output, hx, cx = encoder(encoder_input, hx, cx)    
    
    decoder_input = Variable(torch.FloatTensor(target_variable[0])).unsqueeze(0).unsqueeze(0)
    decoder_input = decoder_input.cuda()
    
    loss = 0
    for di in range(2, target_length):
            
            decoder_output, hx, cx = decoder(decoder_input, hx, cx)
            topv, topi = softmax(decoder_output).data.topk(1)
            ni = topi[0][0]
            #print('predicted value: %s'%(vocabulary[ni]))
            
            decoder_input = Variable(torch.FloatTensor(w2v_embeddings[[ni]])).unsqueeze(0)
            decoder_input = decoder_input.cuda() if isCuda else decoder_input
            if (di != (target_length-1)):
                ti = word2index.get(target_variable2[di-1],0)
            else:
                ti = word2index["<EOS>"]
                    
            trueValue = Variable(torch.LongTensor(1))
            trueValue.data[0] = ti
            trueValue = trueValue.cuda() 
            loss += criterion(decoder_output, trueValue)
            if ni == word2index["<EOS>"] :
                break

    loss.backward()
    
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss.data[0] / target_length



In [11]:

def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=500, learning_rate=0.01):
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    
    isCuda= True
    learning_rate=0.01
    decoder_optimizer = optim.Adam(decoder.parameters(),learning_rate)
    encoder_optimizer = optim.Adam(encoder.parameters(),learning_rate)

    criterion = nn.CrossEntropyLoss().cuda()

    #training_pairs = [variablesFromPair(random.choice(pairs))for i in range(n_iters)]

    for iter in range(1, n_iters+1):
        if(np.size(preprocess_numberize(train_sentences[iter-1]))>2): 
                

            loss = train(train_sentences[iter-1], encoder, encoder_optimizer, decoder, decoder_optimizer, criterion)
            print_loss_total += loss
            plot_loss_total += loss

            if iter % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('(%d %d%%) %.4f' % (iter, iter / n_iters * 100, print_loss_avg))

                

In [12]:
isCuda = True
hidden_size = 300
decoder = DecoderLSTM(hidden_size, isCuda).cuda()
encoder = EncoderLSTM(hidden_size, isCuda).cuda()

size = math.floor(np.size(sentences)/4)
trainIters(encoder, decoder, size)

(1000 0%) 3.4012
(2000 1%) 3.3075
(3000 2%) 3.4080
(4000 3%) 3.2311
(5000 4%) 3.3215
(6000 5%) 3.3710
(7000 6%) 3.3383
(8000 7%) 2.9751
(9000 8%) 3.2291
(10000 9%) 3.4056
(11000 10%) 3.2156
(12000 11%) 3.2290
(13000 12%) 3.2564
(14000 13%) 3.2803
(15000 14%) 3.4824
(16000 15%) 3.3378
(17000 16%) 3.3116
(18000 17%) 3.4178
(19000 18%) 3.1997
(20000 19%) 3.3462
(21000 20%) 3.3656
(22000 21%) 3.1741
(23000 22%) 3.3956
(25000 24%) 6.5449
(26000 25%) 3.3148
(27000 25%) 3.2540
(28000 26%) 3.3362
(29000 27%) 3.4114
(30000 28%) 3.4567
(31000 29%) 3.3378
(32000 30%) 3.2783
(33000 31%) 3.3807
(34000 32%) 3.7689
(35000 33%) 3.6520
(36000 34%) 3.5407
(37000 35%) 3.4346
(38000 36%) 3.7283
(39000 37%) 3.5816
(40000 38%) 3.7096
(41000 39%) 3.6674
(42000 40%) 3.5629
(43000 41%) 3.6084
(44000 42%) 3.7970
(45000 43%) 3.7142
(46000 44%) 3.6690
(47000 45%) 3.5484
(48000 46%) 3.5976
(49000 47%) 3.6336
(50000 48%) 3.5839
(51000 49%) 3.6670
(52000 50%) 3.5234
(53000 50%) 3.4926
(54000 51%) 3.5647
(55000 52%) 

# 7. Testing 

We must now define a method that allows us to do inference using the seq2seq architecture. We then run the 500 validation captions through this method, and ultimately compare the **reference** and **generated** sentences using our **BLEU** similarity score method defined above, to identify the average BLEU score.

In [13]:
softmax = nn.Softmax()

def seq2seq_inference(sentence, encoder, decoder, embeddings=w2v_embeddings, max_length = maxSequenceLength):
    
    encoder.eval()
    decoder.eval()
    isCuda = True
    
    hx,cx = encoder.initHidden()
    target_variable = preprocess_word2vec(sentence) 
    sentence_length = np.shape(target_variable)[0]



    for ei in range(0,sentence_length):
        encoder_input = Variable(torch.FloatTensor(target_variable[ei])).unsqueeze(0).unsqueeze(0)
        encoder_input = encoder_input.cuda()
        encoder_output, hx, cx = encoder(encoder_input, hx, cx)

    decoder_input = Variable(torch.FloatTensor(target_variable[0])).unsqueeze(0).unsqueeze(0)
    decoder_input = decoder_input.cuda()

    predicted_sentence = word_tokenize(sentence)[0]+" "
    i = 0 
    
    while True:
            i += 1
            decoder_output, hx, cx = decoder(decoder_input, hx, cx)
            topv, topi = softmax(decoder_output).data.topk(1)
            ni = topi[0][0]
            if ni == word2index["<EOS>"] or i == max_length:
                break
            predicted_sentence = predicted_sentence + vocabulary[ni] + " "
            decoder_input = Variable(torch.FloatTensor(w2v_embeddings[[ni]])).unsqueeze(0)
            decoder_input = decoder_input.cuda() if isCuda else decoder_input
            
    return predicted_sentence, compute_bleu(sentence, predicted_sentence)




In [14]:
# Perform inference for all validation sequences and report the average BLEU score
total = 0

for i,val_sentence in enumerate(val_sentences):
    predicted_sentence, similarity = seq2seq_inference(val_sentence, encoder, decoder)
    total += similarity

avg = total/len(val_sentences)
print(avg)

Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.43043897164777595


# 8. Encoding as Generic Feature Representation

We now use the final hidden state of our encoder, to identify the nearest neighbor amongst the training sentences for each sentence in our validation data.

It would be effective to first define a method that would generate all of the hidden states and store these hidden states **on the CPU**, and then loop over the generated hidden states to identify/output the nearest neighbors.

In [15]:
def final_encoder_hidden(sentence):
    
    target_variable = preprocess_word2vec(sentence) 
    sentence_length = np.shape(target_variable)[0]
    hx,cx = encoder.initHidden()
    for ei in range(0,sentence_length):
        encoder_input = Variable(torch.FloatTensor(target_variable[ei])).unsqueeze(0).unsqueeze(0)
        encoder_input = encoder_input.cuda()
        encoder_output, hx, cx = encoder(encoder_input, hx, cx)
    return hx.data[0][0].cpu().numpy()


In [16]:
# Now run all training data and validation data to store hidden states
size = math.floor(np.size(sentences)/4)
hx_store = np.zeros((size,hidden_size))
for i,train_sentence in enumerate(train_sentences[0:size]):
    hx_store[i] = final_encoder_hidden(train_sentence)


In [17]:
# Now get nearest neighbors and print
size = math.floor(np.size(sentences)/4)

for val_sentence in val_sentences[:10]:
    hx_val = final_encoder_hidden(val_sentence)
    min_dist = math.inf
    neighbor_index = 0
    for i,train_sentence in enumerate(train_sentences[0:size]):
        dist = np.linalg.norm(hx_val-hx_store[i])
        if(dist < min_dist):
            closest_sentence = train_sentence
            min_dist = dist 
            
    print('val_sentence: %s '% val_sentence)
    print('close_sentence: %s \n'% closest_sentence)

val_sentence: A man and woman at a table with beer and wine 
close_sentence: A man stands near a very large selection of fruits and vegetables 

val_sentence: A man speaking into a microphone on a stage with a bicycle and dressed in cyclist gear. 
close_sentence: A man in a room with a LCD television showing a movie. 

val_sentence: Four horses are skattered around a small water hole. 
close_sentence: A bathroom divided with the tub and toilet in one partition and the sink in the other half of the room. 

val_sentence: A man and a young girl playing Wii 
close_sentence: A little girl is reading a book in the living room. 

val_sentence: A boat home sitting on a river bay. 
close_sentence: A white police vehicle parked in a parking lot. 

val_sentence: Several Tim's of mints are stacked up with a bottle that has several  clipped roses inside 
close_sentence: an airplane hanging down from the cieling inside 

val_sentence: Family at a pizza restaurant posing for a picture before meal. 
c