## Inspect data

In [2]:
training_data = [
    ("The cat ate the cheese".lower().split(), ["DET", "NN", "V", "DET", "NN"]),
    ("She read that book".lower().split(), ["NN", "V", "DET", "NN"]),
    ("The dog loves art".lower().split(), ["DET", "NN", "V", "NN"]),
    ("The elephant answers the phone".lower().split(), ["DET", "NN", "V", "DET", "NN"])
]

test_sentence = "The cheese loves the elephant".lower().split()

In [3]:
for i, d in enumerate(training_data):
    print('sequence {}:len={}'.format(i, len(d[0])))
    print('\tdata={}'.format(d[0]))
    print('\ttag= {}'.format(d[1]))

sequence 0:len=5
	data=['the', 'cat', 'ate', 'the', 'cheese']
	tag= ['DET', 'NN', 'V', 'DET', 'NN']
sequence 1:len=4
	data=['she', 'read', 'that', 'book']
	tag= ['NN', 'V', 'DET', 'NN']
sequence 2:len=4
	data=['the', 'dog', 'loves', 'art']
	tag= ['DET', 'NN', 'V', 'NN']
sequence 3:len=5
	data=['the', 'elephant', 'answers', 'the', 'phone']
	tag= ['DET', 'NN', 'V', 'DET', 'NN']


## Create Encoders for Words and Tags

In [4]:
from collections import defaultdict
word2idx = defaultdict()

index = 0

for d in training_data:
    for w in d[0]:
        if w not in word2idx:
            word2idx[w] = index
            index += 1

            
print('word2idx: len={}\n\t{}'.format(len(word2idx), word2idx))

word2idx: len=14
	defaultdict(None, {'the': 0, 'cat': 1, 'ate': 2, 'cheese': 3, 'she': 4, 'read': 5, 'that': 6, 'book': 7, 'dog': 8, 'loves': 9, 'art': 10, 'elephant': 11, 'answers': 12, 'phone': 13})


In [5]:
tag2idx = defaultdict()

index = 0
for d in training_data:
    for t in d[1]:
        if t not in tag2idx:
            tag2idx[t] = index
            index += 1

idx2tag = {idx: tag for tag, idx in tag2idx.items()}
print('tag2idx: len={}\n\t{}'.format(len(tag2idx), tag2idx))
print('idx2tag: len={}\n\t{}'.format(len(idx2tag), idx2tag))

tag2idx: len=3
	defaultdict(None, {'DET': 0, 'NN': 1, 'V': 2})
idx2tag: len=3
	{0: 'DET', 1: 'NN', 2: 'V'}


In [6]:
import numpy as np
import torch

def prepare_sequence(seq, to_idx):
    idxs = [to_idx[w] for w in seq]
    idxs = np.array(idxs)
    idxs = torch.from_numpy(idxs)
    return idxs

## Build Generator for Words and Tags

In [7]:
def train_loader():
    for seq in training_data:
        word = seq[0]
        tag = seq[1]
        yield word, tag

## Define Model and Create it

In [8]:
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim

class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size):
        super().__init__()
        
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = 1
        self.num_batches = 1
        
        self.word_embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        
        # layer, hidden size, input size
        self.lstm = nn.LSTM(input_size=self.embedding_dim, 
                             hidden_size=self.hidden_dim,
                             num_layers=1)
        
        self.hidden2tag = nn.Linear(self.hidden_dim, target_size)
        
        self.hidden = self.init_hidden()
        
        
    def init_hidden(self):
        
        # num_layers, num_batches, hidden_dim
        return (torch.zeros(self.num_layers, self.num_batches, self.hidden_dim),
                torch.zeros(self.num_layers, self.num_batches, self.hidden_dim))
    
    def forward(self, sentence):
        embeds = self.word_embedding(sentence)
        
        print('\tLSTM:')
        print('\t\tsentence.size()={}'.format(sentence.size() ))
        print('\t\tembeds.size()={}'.format(embeds.size()))
        
        lstm_out, self.hidden = self.lstm(
                                embeds.view(len(sentence), 1, -1),
                                self.hidden)
        
        print('\t\thidden[0].size={}'.format(self.hidden[0].size()))
        print('\t\thidden[1].size={}'.format(self.hidden[1].size()))
        print('\t\tlstm_out.size={}'.format(lstm_out.size()))
        
        tag_outputs = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_outputs, dim=1)
        
        return tag_scores 
    
    
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

# len(word2idx) = 14                                                                                       
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word2idx), len(tag2idx))

loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

## Check Inital State

In [9]:
print('test sentence={}\n'.format(test_sentence))

inputs = prepare_sequence(test_sentence, word2idx)
print('inputs.size={}'.format(inputs.size()))

tag_scores = model.forward(inputs)

# size = (sequence, batch_size, vocab_size)
print('\n')
print('tag_scores.size()={}'.format(tag_scores.size()))
print('\t{}'.format(tag_scores))

_, predicted_tags = torch.max(tag_scores, 1)

tags = [idx2tag[idx] for idx in predicted_tags.numpy()]
print('\n')
print('Predicted tags: \n',predicted_tags)
print('Predicted tags: \n',tags)

test sentence=['the', 'cheese', 'loves', 'the', 'elephant']

inputs.size=torch.Size([5])
	LSTM:
		sentence.size()=torch.Size([5])
		embeds.size()=torch.Size([5, 6])
		hidden[0].size=torch.Size([1, 1, 6])
		hidden[1].size=torch.Size([1, 1, 6])
		lstm_out.size=torch.Size([5, 1, 6])


tag_scores.size()=torch.Size([5, 3])
	tensor([[-1.3208, -0.7837, -1.2860],
        [-1.4068, -0.7602, -1.2465],
        [-1.3667, -0.7648, -1.2743],
        [-1.3366, -0.7938, -1.2548],
        [-1.5143, -0.7828, -1.1304]])


Predicted tags: 
 tensor([ 1,  1,  1,  1,  1])
Predicted tags: 
 ['NN', 'NN', 'NN', 'NN', 'NN']


## Start Training

In [12]:
n_epochs = 300
loss_history = []

model.train()

epoch_loss = 0.0

for epoch in range(n_epochs):
    
    # iterate data
    for sentence, tags in train_loader():
#         for w, t in zip(words, tags):
            
        model.zero_grad()
        
        model.hidden = model.init_hidden()
        
        sentence_in = prepare_sequence(sentence, word2idx)
        targets = prepare_sequence(tags, tag2idx)

        tag_scores = model(sentence_in)

        loss = loss_function(tag_scores, targets)
        
        loss_history.append(loss.item())
        epoch_loss += loss.item()
        loss.backward()

        optimizer.step()
       
    
    if epoch % 20 == 19:
        print('epoch={}, loss={}'.format(epoch+1, epoch_loss/len(training_data)))
        epoch_loss = 0.0
            

	LSTM:
		sentence.size()=torch.Size([5])
		embeds.size()=torch.Size([5, 6])
		hidden[0].size=torch.Size([1, 1, 6])
		hidden[1].size=torch.Size([1, 1, 6])
		lstm_out.size=torch.Size([5, 1, 6])
	LSTM:
		sentence.size()=torch.Size([4])
		embeds.size()=torch.Size([4, 6])
		hidden[0].size=torch.Size([1, 1, 6])
		hidden[1].size=torch.Size([1, 1, 6])
		lstm_out.size=torch.Size([4, 1, 6])
	LSTM:
		sentence.size()=torch.Size([4])
		embeds.size()=torch.Size([4, 6])
		hidden[0].size=torch.Size([1, 1, 6])
		hidden[1].size=torch.Size([1, 1, 6])
		lstm_out.size=torch.Size([4, 1, 6])
	LSTM:
		sentence.size()=torch.Size([5])
		embeds.size()=torch.Size([5, 6])
		hidden[0].size=torch.Size([1, 1, 6])
		hidden[1].size=torch.Size([1, 1, 6])
		lstm_out.size=torch.Size([5, 1, 6])
	LSTM:
		sentence.size()=torch.Size([5])
		embeds.size()=torch.Size([5, 6])
		hidden[0].size=torch.Size([1, 1, 6])
		hidden[1].size=torch.Size([1, 1, 6])
		lstm_out.size=torch.Size([5, 1, 6])
	LSTM:
		sentence.size()=torch.Size([4])

# Start Testing

In [13]:
model.eval()

# inputs = prepare_sequence(test_sentence, word2idx)

sentence = training_data[1][0]
print(sentence)
inputs = prepare_sequence(sentence, word2idx)

tag_scores = model(inputs)

print('tag_scores.size()={}'.format(tag_scores))
_, predicted_tags = torch.max(tag_scores, 1)

tags = [idx2tag[idx] for idx in predicted_tags.numpy()]

print('Predicted tags:\n\t{} \n'.format(tags))
print('Test sequence:\n\t{}'.format(test_sentence))

['she', 'read', 'that', 'book']
	LSTM:
		sentence.size()=torch.Size([4])
		embeds.size()=torch.Size([4, 6])
		hidden[0].size=torch.Size([1, 1, 6])
		hidden[1].size=torch.Size([1, 1, 6])
		lstm_out.size=torch.Size([4, 1, 6])
tag_scores.size()=tensor([[-7.0071, -0.0123, -4.4796],
        [-3.8768, -7.1522, -0.0217],
        [-0.0350, -3.8869, -4.2737],
        [-7.6565, -0.0022, -6.3431]])
Predicted tags:
	['NN', 'V', 'DET', 'NN'] 

Test sequence:
	['the', 'cheese', 'loves', 'the', 'elephant']
