# Q3: Char Sequence Embeddings Test Case 

In [3]:
import torch
import numpy as np
from config import config
from model import sequence_labeling
from randomness import apply_random_seed
from data_io import DataReader, gen_embedding_from_file, read_tag_vocab

## Reading word and char index dictionaries

In [4]:
_config = config()
apply_random_seed()

    
tag_dict = read_tag_vocab(config.output_tag_file)
reversed_tag_dict = {v: k for (k, v) in tag_dict.items()}
word_embedding, word_dict = gen_embedding_from_file(config.word_embedding_file, config.word_embedding_dim)
char_embedding, char_dict = gen_embedding_from_file(config.char_embedding_file, config.char_embedding_dim)

_config.nwords = len(word_dict)
_config.ntags = len(tag_dict)
_config.nchars = len(char_dict)
model = sequence_labeling(_config, word_embedding, char_embedding)


def get_word_ids(w):
    word = w.lower()
    if word in word_dict:
        return word_dict[word]
    else:
        return word_dict[UNKNOWN_WORD]

def get_char_ids(c):
    if c in char_dict:
        return char_dict[c]
    else:
        return char_dict[UNKNOWN_CHAR]

## Example Sentences...
Lets consider two example sentences:<br>
Here we have 2 sentences with 7 words in each sentence, and a maximum of 14 characters in a sentence.<br>
We will generate a char_index_batch of the size [2,7,14], and word_len_batch of the size [2,7].


In [76]:
sen1 = ['Potion', 'Mastery', 'is', 'specialization', 'of', 'Alchemy', '.']
sen2 = ['A', 'Guild', 'is', 'association', 'of', 'craftsmen', '.']

#sen1 = ['Potion', 'Mastery', 'is', 'specialization', 'of', '.']
#sen2 = ['A', 'Guild', 'is', 'association', 'of', '.']

In [77]:
sentence_list = [sen1] + [sen2]

word_index_lists = [[get_word_ids(word) for word in sentence] for sentence in sentence_list]
char_index_matrix = [[[get_char_ids(char) for char in word] for word in sentence] for sentence in sentence_list]
word_len_lists = [[len(word) for word in sentence] for sentence in char_index_matrix]
sentence_len_list = [len(x) for x in word_len_lists]


batch_char_index_matrices = np.zeros((len(word_index_lists), max(sentence_len_list), max(map(max, word_len_lists))), dtype=int)
for i, (char_index_matrix, word_len_list) in enumerate(zip(char_index_matrix, word_len_lists)):
    for j in range(len(word_len_list)):
        batch_char_index_matrices[i, j, :word_len_list[j]] = char_index_matrix[j]
        
        
batch_word_len_lists = np.ones((len(word_index_lists), max(sentence_len_list)), dtype=int) # cannot set default value to 0
for i, (word_len, sent_len) in enumerate(zip(word_len_lists, sentence_len_list)):
    batch_word_len_lists[i, :sent_len] = word_len
    
batch_word_len_lists = torch.from_numpy(np.array(batch_word_len_lists)).long()
batch_char_index_matrices = torch.from_numpy(batch_char_index_matrices).long()

We will pass the `batch_char_index_matrices` and `batch_word_len_lists` to the method `get_char_sequence()`.

In [98]:
## 
def get_char_sequence(model, batch_char_index_matrices, batch_word_len_lists):
    
    # Given an input of the size [2,7,14], we will convert it a minibatch of the shape [14,14] to 
    # represent 14 words(7 in each sentence), and 14 characters in each word.
    batch_char_index_matrices = batch_char_index_matrices.view(batch_char_index_matrices.size()[0]*batch_char_index_matrices.size()[1], batch_char_index_matrices.size()[2])
    print('batch_char_index_matrices', batch_char_index_matrices.size())
    ## NOTE: Please DO NOT USE for Loops to iterate over the mini-batch.
    
    batch_word_len_lists = batch_word_len_lists.view(batch_word_len_lists.size()[0]*batch_word_len_lists.size()[1], 1)
    print('batch_word_len_lists', batch_word_len_lists.size())
    
    
    # Get corresponding char_Embeddings, we will have a Final Tensor of the shape [14, 14, 50]
    # Sort the mini-batch wrt word-lengths, to form a pack_padded sequence.
    # Feed the pack_padded sequence to the char_LSTM layer.
    char_embed = model.char_embeds(batch_char_index_matrices)
    print('char_embed', char_embed.size())
    pack_padded_sequence, sorted_index = torch.sort(char_embed, descending=False)
    char_lstm, hidden = model.char_lstm(pack_padded_sequence)
    print('pack_padded_sequence', pack_padded_sequence.size())
    print('char_lstm', char_lstm.size())
    print('hidden', hidden[0].size())
    print('sorted',  sorted_index.size())
#     print('sorted',  sorted_index)
#     print('char_lstm', char_lstm)
    
    # Get hidden state of the shape [2,14,50].
    # Recover the hidden_states corresponding to the sorted index.
    # Re-shape it to get a Tensor the shape [2,7,100].
    print(char_lstm[sorted_index])
    
    
    result = hidden_state.view(hidden_state[0], hidden_state[1]/2, hidden_state[2]*2)
    #return result
    pass

## Comparing the result returned by the method against the Ground Truth Values...

You can compare the result returned by your method against the ground truth value, stored in the file `answer.npy`. 

In [99]:
answer = get_char_sequence(model, batch_char_index_matrices, batch_word_len_lists)
answer = answer.data.numpy()
result = np.load('./answer.npy')

batch_char_index_matrices torch.Size([14, 14])
batch_word_len_lists torch.Size([14, 1])
char_embed torch.Size([14, 14, 50])
pack_padded_sequence torch.Size([14, 14, 50])
char_lstm torch.Size([14, 14, 100])
hidden torch.Size([2, 14, 50])
sorted torch.Size([14, 14, 50])


RuntimeError: index 49 is out of bounds for dimension 0 with size 14

In [8]:
try:
    assert np.allclose(np.asarray(answer.tolist()), np.asarray(result.tolist()), atol=0.001)
    print('Your implementation is Correct')
except:
    print('Your implementation is not Correct')

Your implementation is Correct
