# Q3: Char Sequence Embeddings Test Case 

In [103]:
import torch
import numpy as np
from config import config
from model import sequence_labeling
from randomness import apply_random_seed
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from data_io import DataReader, gen_embedding_from_file, read_tag_vocab

## Reading word and char index dictionaries

In [104]:
# Instantiate the project configuration and call the project's seeding helper
# before the model is built (presumably so weight initialisation is
# reproducible -- confirm in randomness.py).
_config = config()
apply_random_seed()

# Read every setting from the `_config` instance rather than from the `config`
# class: the derived sizes below are written to the instance, so reading from
# the same object keeps a single source of truth and also works if the
# settings are assigned in `config.__init__`.
tag_dict = read_tag_vocab(_config.output_tag_file)
reversed_tag_dict = {v: k for (k, v) in tag_dict.items()}
word_embedding, word_dict = gen_embedding_from_file(_config.word_embedding_file, _config.word_embedding_dim)
char_embedding, char_dict = gen_embedding_from_file(_config.char_embedding_file, _config.char_embedding_dim)

# Vocabulary sizes are stored on the config instance that is handed to the model.
_config.nwords = len(word_dict)
_config.ntags = len(tag_dict)
_config.nchars = len(char_dict)
model = sequence_labeling(_config, word_embedding, char_embedding)


def get_word_ids(w):
    """Return the `word_dict` index of `w` (lower-cased), or the unknown-word index."""
    key = w.lower()
    # NOTE(review): UNKNOWN_WORD is neither defined nor imported in the visible
    # cells; the fallback raises NameError unless it is provided elsewhere -- confirm.
    return word_dict[key] if key in word_dict else word_dict[UNKNOWN_WORD]

def get_char_ids(c):
    """Return the `char_dict` index of character `c`, or the unknown-character index."""
    # NOTE(review): UNKNOWN_CHAR is neither defined nor imported in the visible
    # cells; the fallback raises NameError unless it is provided elsewhere -- confirm.
    return char_dict[c] if c in char_dict else char_dict[UNKNOWN_CHAR]

## Example Sentences...
Let's consider two example sentences:<br>
Here we have 2 sentences with 7 words in each sentence, and a maximum of 14 characters in a word (the longest word is "specialization").<br>
We will generate a character-index batch (`batch_char_index_matrices`) of size [2,7,14], and a word-length batch (`batch_word_len_lists`) of size [2,7].


In [105]:
# Two toy sentences of 7 tokens each; the longest token has 14 characters.
sen1 = [
    'Potion', 'Mastery', 'is', 'specialization',
    'of', 'Alchemy', '.',
]
sen2 = [
    'A', 'Guild', 'is', 'association',
    'of', 'craftsmen', '.',
]

# Alternative 6-token variants (disabled):
#sen1 = ['Potion', 'Mastery', 'is', 'specialization', 'of', '.']
#sen2 = ['A', 'Guild', 'is', 'association', 'of', '.']

In [106]:
# Batch the two example sentences together.
sentence_list = [sen1] + [sen2]

# Vocabulary ids per word and per character, plus the matching lengths.
word_index_lists = [[get_word_ids(word) for word in sentence] for sentence in sentence_list]
char_index_matrix = [[[get_char_ids(char) for char in word] for word in sentence] for sentence in sentence_list]
word_len_lists = [[len(word) for word in sentence] for sentence in char_index_matrix]
sentence_len_list = [len(x) for x in word_len_lists]

n_sentences = len(word_index_lists)
max_sentence_len = max(sentence_len_list)
max_word_len = max(map(max, word_len_lists))

# Zero-padded character ids of shape [n_sentences, max_sentence_len, max_word_len].
# The loop variables deliberately do NOT reuse the name `char_index_matrix`:
# rebinding it as a loop target would leave it pointing at the last sentence
# only once the loop has finished.
padded_char_ids = np.zeros((n_sentences, max_sentence_len, max_word_len), dtype=int)
for sent_idx, (sentence_char_ids, sentence_word_lens) in enumerate(zip(char_index_matrix, word_len_lists)):
    for word_idx, n_chars in enumerate(sentence_word_lens):
        padded_char_ids[sent_idx, word_idx, :n_chars] = sentence_char_ids[word_idx]

# Word lengths of shape [n_sentences, max_sentence_len]. Padding slots get
# length 1, not 0: the lengths are later fed to pack_padded_sequence, which
# does not accept zero-length sequences.
padded_word_lens = np.ones((n_sentences, max_sentence_len), dtype=int)
for sent_idx, (sentence_word_lens, sent_len) in enumerate(zip(word_len_lists, sentence_len_list)):
    padded_word_lens[sent_idx, :sent_len] = sentence_word_lens

# Convert to LongTensors under the names used by the following cells.
batch_word_len_lists = torch.from_numpy(padded_word_lens).long()
batch_char_index_matrices = torch.from_numpy(padded_char_ids).long()

We will pass the `batch_char_index_matrices` and `batch_word_len_lists` to the method `get_char_sequence()`.

In [216]:
## 
def get_char_sequence(model, batch_char_index_matrices, batch_word_len_lists):
    """Encode every word of a padded batch with the model's character-level LSTM.

    Args:
        model: object providing `char_embeds`, `sort_input` and `char_lstm`,
            used exactly as called below.
        batch_char_index_matrices: 3-D LongTensor of character ids,
            [n_sentences, max_words, max_chars] (e.g. [2, 7, 14]).
        batch_word_len_lists: LongTensor of word lengths,
            [n_sentences, max_words] (e.g. [2, 7]).

    Returns:
        Tensor of shape [n_sentences, max_words, D], where D is the size of the
        two concatenated final hidden states (e.g. [2, 7, 100]).
    """
    ## NOTE: Please DO NOT USE for Loops to iterate over the mini-batch.
    # Take the batch dimensions from the input instead of hard-coding them, so
    # the function works for any number of sentences.
    n_sentences, max_words, max_chars = batch_char_index_matrices.size()

    # Flatten sentences so that every word is one row of the mini-batch,
    # e.g. [2, 7, 14] -> [14, 14].
    mini_batch = batch_char_index_matrices.reshape(n_sentences * max_words, max_chars)

    # Look up the character embeddings, e.g. -> [14, 14, 50].
    char_embeddings = model.char_embeds(mini_batch)

    # Sort the words by length; pack_padded_sequence expects the lengths in
    # decreasing order. `sort_input` is used as returning
    # (permutation indices, sorted lengths).
    word_lengths = batch_word_len_lists.reshape(-1)
    perm_idx, sorted_word_lengths = model.sort_input(word_lengths)
    sorted_input_embeds = char_embeddings[perm_idx]

    # Feed the packed sequence to the char LSTM and keep only the final states.
    packed = pack_padded_sequence(sorted_input_embeds,
                                  lengths=sorted_word_lengths.tolist(),
                                  batch_first=True)
    _, hidden_state = model.char_lstm(packed)

    # Concatenate the two final hidden states per word.
    # Assumes char_lstm is a single-layer bidirectional LSTM, so
    # hidden_state[0] is h_n with shape [2, n_words, H] -- TODO confirm in model.py.
    result = torch.cat([hidden_state[0][0], hidden_state[0][1]], dim=-1)

    # Undo the length sort: sorting the permutation yields its inverse.
    _, desorted_indices = torch.sort(perm_idx, descending=False)
    result = result[desorted_indices]

    # Restore the [n_sentences, max_words, D] layout.
    return result.view(n_sentences, max_words, result.size(-1))

# Run the char-level encoder on the example batch and convert the result to a
# NumPy array. `.detach()` replaces the legacy `.data` attribute.
answer = get_char_sequence(model, batch_char_index_matrices, batch_word_len_lists)
answer = answer.detach().numpy()
# Reference values that `answer` is compared against.
result = np.load('./answer.npy')

## Comparing the result returned by the method against the Ground Truth Values...

You can compare the result returned by your method against the ground truth value, stored in the file `answer.npy`. 

In [200]:
# Compute the char-sequence representation for the example batch as a NumPy
# array (`.detach()` is used instead of the legacy `.data` attribute).
answer = get_char_sequence(model, batch_char_index_matrices, batch_word_len_lists)
answer = answer.detach().numpy()
# Ground-truth values loaded from disk.
result = np.load('./answer.npy')

>>batch_char_index_matrices =>  torch.Size([2, 7, 14]) >>batch_word_len_lists =>  torch.Size([2, 7])
====mini_batch torch.Size([14, 14])
===char_embed torch.Size([14, 14, 50])
===batch_word_lengths torch.Size([14])
>>perm_idx=>  tensor([ 3, 10, 12,  1,  5,  0,  8,  4,  2,  9, 11,  6,  7, 13]) >>sorted_batch_word_len_lists=>  tensor([14, 11,  9,  7,  7,  6,  5,  2,  2,  2,  2,  1,  1,  1])
===sorted_input_embeds torch.Size([14, 14, 50])
==== torch.Size([2, 14, 50])
>>> torch.Size([2, 7, 100])


In [217]:
# Compare explicitly rather than with `assert` inside a bare `except:`. The
# bare except hid every error (including typos and KeyboardInterrupt) behind
# the "not Correct" message, and `assert` is stripped when Python runs with -O.
# The shape test comes first so that a wrongly shaped answer is reported as
# incorrect instead of being broadcast or raising inside np.allclose.
is_correct = answer.shape == result.shape and np.allclose(answer, result, atol=0.001)
if is_correct:
    print('Your implementation is Correct')
else:
    print('Your implementation is not Correct')

Your implementation is Correct


In [149]:
# Nested-list copy of the computed array, kept for manual inspection.
a = answer.tolist()

In [157]:
# Display the shape of the computed array.
# NOTE(review): the recorded output of this cell appears to come from an
# earlier run (execution counts are out of order) -- re-run to confirm.
answer.shape

(14, 14, 100)

In [201]:
# Walk over every element of the computed answer and the reference values,
# printing the (i, j, k) position each time a new largest absolute difference
# is found, and finally the largest difference itself.
a = answer.tolist()
b = result.tolist()
d = 0
for i, plane in enumerate(a):
    for j, row in enumerate(plane):
        for k, value in enumerate(row):
            gap = abs(value - b[i][j][k])
            if gap > d:
                d = gap
                print(i, j, k)
print(d)

0 0 0
0 0 1
0 0 2
0 0 3
0 0 4
0 0 6
0 0 15
0 0 39
0 0 47
0 0 50
0 0 51
0 0 52
0 0 55
0 0 91
1 4 21
0.597233384847641


In [211]:
## 
def get_char_sequence(model, batch_char_index_matrices, batch_word_len_lists):
    """Return one char-LSTM vector per word for a padded batch of sentences.

    Args:
        model: object providing `char_embeds`, `sort_input` and `char_lstm`,
            used exactly as called below.
        batch_char_index_matrices: 3-D LongTensor of character ids,
            [n_sentences, max_words, max_chars].
        batch_word_len_lists: LongTensor of word lengths,
            [n_sentences, max_words].

    Returns:
        Tensor of shape [n_sentences, max_words, D], where D is the size of the
        two concatenated final hidden states.
    """
    # Batch dimensions come from the input, so the final reshape is not tied
    # to one particular example (previously hard-coded as 2 x 7 x 100).
    n_sentences, max_words, max_chars = batch_char_index_matrices.size()

    # One row per word: [n_sentences * max_words, max_chars].
    mini_batch = batch_char_index_matrices.reshape(n_sentences * max_words, max_chars)
    char_embeddings = model.char_embeds(mini_batch)

    # Order the words by length; pack_padded_sequence expects the lengths in
    # decreasing order. `sort_input` is used as returning
    # (permutation indices, sorted lengths).
    word_lengths = batch_word_len_lists.reshape(-1)
    perm_idx, sorted_word_lengths = model.sort_input(word_lengths)
    sorted_input_embeds = char_embeddings[perm_idx]

    packed = pack_padded_sequence(sorted_input_embeds,
                                  lengths=sorted_word_lengths.tolist(),
                                  batch_first=True)
    _, hidden_state = model.char_lstm(packed)

    # Concatenate the two final hidden states per word.
    # Assumes char_lstm is a single-layer bidirectional LSTM, so
    # hidden_state[0] is h_n with shape [2, n_words, H] -- TODO confirm in model.py.
    out_final = torch.cat([hidden_state[0][0], hidden_state[0][1]], dim=-1)

    # Shape trace kept because its output is recorded below the cell.
    print(out_final.size())

    # Sorting the permutation yields its inverse, restoring the original word order.
    _, desorted_indices = torch.sort(perm_idx, descending=False)
    out_final = out_final[desorted_indices]

    result = out_final.view(n_sentences, max_words, out_final.size(-1))
    print(result.size())

    return result

# Encode the example batch and convert the result to NumPy; `.detach()` is the
# supported way to drop autograd tracking (the `.data` attribute is legacy).
answer = get_char_sequence(model, batch_char_index_matrices, batch_word_len_lists)
answer = answer.detach().numpy()
# Reference array stored alongside the notebook.
result = np.load('./answer.npy')

torch.Size([14, 100])
torch.Size([2, 7, 100])
