# Q3: Char Sequence Embeddings Test Case 

In [4]:
import torch
import numpy as np
from config import config
from model import sequence_labeling
from randomness import apply_random_seed
from data_io import DataReader, gen_embedding_from_file, read_tag_vocab

## Reading word and char index dictionaries

In [5]:
_config = config()
apply_random_seed()

    
tag_dict = read_tag_vocab(config.output_tag_file)
reversed_tag_dict = {v: k for (k, v) in tag_dict.items()}
word_embedding, word_dict = gen_embedding_from_file(config.word_embedding_file, config.word_embedding_dim)
char_embedding, char_dict = gen_embedding_from_file(config.char_embedding_file, config.char_embedding_dim)

_config.nwords = len(word_dict)
_config.ntags = len(tag_dict)
_config.nchars = len(char_dict)
model = sequence_labeling(_config, word_embedding, char_embedding)


def get_word_ids(w):
    word = w.lower()
    if word in word_dict:
        return word_dict[word]
    else:
        return word_dict[UNKNOWN_WORD]

def get_char_ids(c):
    if c in char_dict:
        return char_dict[c]
    else:
        return char_dict[UNKNOWN_CHAR]

## Example Sentences...
Lets consider two example sentences:<br>
Here we have 2 sentences with 7 words in each sentence, and a maximum of 14 characters in a sentence.<br>
We will generate a char_index_batch of the size [2,7,14], and word_len_batch of the size [2,7].


In [6]:
sen1 = ['Potion', 'Mastery', 'is', 'specialization', 'of', 'Alchemy', '.']
sen2 = ['A', 'Guild', 'is', 'association', 'of', 'craftsmen', '.']

#sen1 = ['Potion', 'Mastery', 'is', 'specialization', 'of', '.']
#sen2 = ['A', 'Guild', 'is', 'association', 'of', '.']

In [7]:
sentence_list = [sen1] + [sen2]

word_index_lists = [[get_word_ids(word) for word in sentence] for sentence in sentence_list]
char_index_matrix = [[[get_char_ids(char) for char in word] for word in sentence] for sentence in sentence_list]
word_len_lists = [[len(word) for word in sentence] for sentence in char_index_matrix]
sentence_len_list = [len(x) for x in word_len_lists]


batch_char_index_matrices = np.zeros((len(word_index_lists), max(sentence_len_list), max(map(max, word_len_lists))), dtype=int)
for i, (char_index_matrix, word_len_list) in enumerate(zip(char_index_matrix, word_len_lists)):
    for j in range(len(word_len_list)):
        batch_char_index_matrices[i, j, :word_len_list[j]] = char_index_matrix[j]
        
        
batch_word_len_lists = np.ones((len(word_index_lists), max(sentence_len_list)), dtype=int) # cannot set default value to 0
for i, (word_len, sent_len) in enumerate(zip(word_len_lists, sentence_len_list)):
    batch_word_len_lists[i, :sent_len] = word_len
    
batch_word_len_lists = torch.from_numpy(np.array(batch_word_len_lists)).long()
batch_char_index_matrices = torch.from_numpy(batch_char_index_matrices).long()

We will pass the `batch_char_index_matrices` and `batch_word_len_lists` to the method `get_char_sequence()`.

In [108]:
## 
def get_char_sequence(model, batch_char_index_matrices, batch_word_len_lists):
    
    # Given an input of the size [2,7,14], we will convert it a minibatch of the shape [14,14] to 
    # represent 14 words(7 in each sentence), and 14 characters in each word.
    mini_batch = batch_char_index_matrices.view(batch_char_index_matrices.size()[0]*batch_char_index_matrices.size()[1], batch_char_index_matrices.size()[2])
    print('mini_batch', mini_batch.size())
    batch_word_lengths = batch_word_len_lists.view(1, batch_word_len_lists.size()[0]*batch_word_len_lists.size()[1])
    ## NOTE: Please DO NOT USE for Loops to iterate over the mini-batch.

    
    # Get corresponding char_Embeddings, we will have a Final Tensor of the shape [14, 14, 50]
    # Sort the mini-batch wrt word-lengths, to form a pack_padded sequence.
    # Feed the pack_padded sequence to the char_LSTM layer.
    char_Embeddings = model.char_embeds(mini_batch)
    print('char_embed', char_Embeddings.size())
    print('mini_batch', mini_batch)
    print('batch_word_len_lists', batch_word_len_lists.size())
    print('batch_word_len_lists', batch_word_len_lists)
    print('batch_word_lengths', batch_word_lengths.size())
    print('batch_word_lengths', batch_word_lengths)

    pack_padded_sequence = [x for _,x in sorted(zip(batch_word_lengths.tolist()[0], char_Embeddings.tolist()))]
    pack_padded_sequence = torch.from_numpy(np.array(pack_padded_sequence))
    pack_padded_sequence = pack_padded_sequence.type(torch.FloatTensor)
    
    print('pack_padded_sequence', pack_padded_sequence.size())
    print('pack_padded_sequence', pack_padded_sequence)
    
    lstm_char_out, hidden_state = model.char_lstm(pack_padded_sequence)
    
    print('hidden_state_0', hidden_state[0].size())
    
    print('hidden_state_1', hidden_state[1].size())
    
    # Get hidden state of the shape [2,14,50].
    # Recover the hidden_states corresponding to the sorted index.
    # Re-shape it to get a Tensor the shape [2,7,100].

    pass

## Comparing the result returned by the method against the Ground Truth Values...

You can compare the result returned by your method against the ground truth value, stored in the file `answer.npy`. 

In [109]:
answer = get_char_sequence(model, batch_char_index_matrices, batch_word_len_lists)
answer = answer.data.numpy()
result = np.load('./answer.npy')

mini_batch torch.Size([14, 14])
char_embed torch.Size([14, 14, 50])
mini_batch tensor([[47,  7, 11, 13,  7, 15,  0,  0,  0,  0,  0,  0,  0,  0],
        [56,  5, 18, 11,  3,  6, 25,  0,  0,  0,  0,  0,  0,  0],
        [13, 18,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [18, 24,  3, 14, 13,  5, 21, 13, 33,  5, 11, 13,  7, 15],
        [ 7, 22,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [30, 21, 14,  2,  3, 19, 25,  0,  0,  0,  0,  0,  0,  0],
        [29,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [30,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [48,  9, 13, 21, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [13, 18,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 5, 18, 18,  7, 14, 13,  5, 11, 13,  7, 15,  0,  0,  0],
        [ 7, 22,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [14,  6,  5, 22, 11, 18, 19,  3, 15,  0,  0,  0,  0,  0],
        [29,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0

AttributeError: 'NoneType' object has no attribute 'data'

In [110]:
try:
    assert np.allclose(np.asarray(answer.tolist()), np.asarray(result.tolist()), atol=0.001)
    print('Your implementation is Correct')
except:
    print('Your implementation is not Correct')

Your implementation is not Correct
