# Deterministic POS Word Level Language Model with Penn Treebank
Sequence Tagger: https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html<br>
Penn Treebank: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.9.8216&rep=rep1&type=pdf

https://gist.github.com/williamFalcon/f27c7b90e34b4ba88ced042d9ef33edd <br>
https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel

In [1]:
import numpy as np
import nltk
from nltk.corpus import treebank
import os
import codecs

In [2]:
nltk.download('treebank')
nltk.download('universal_tagset')

[nltk_data] Downloading package treebank to /home/tyler/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/tyler/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7fc993cb5270>

In [4]:
from IPython.display import clear_output

In [5]:
import pyro
from pyro.distributions import Normal, Categorical
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam

pyro.enable_validation(True)
pyro.clear_param_store()

# An LSTM for Part-of-Speech Tagging

### Load Data

In [6]:
# Penn tree bank
sentences = treebank.tagged_sents(tagset='universal')

In [7]:
def format_sequence(seq):
    """
    Formats penn treebank POS format into tuple ([tokens], [POS])
    """
    tokens = [x[0] for x in seq]
    tags = [x[1] for x in seq]
    return (tokens, tags)

In [8]:
sentences = [format_sequence(sentence) for sentence in sentences]

In [15]:
def data_vocab(sentences):
    """Builds vocab based on input data"""
    vocab = dict()
    for sentence in sentences:
        for word in sentence[0]:
            if word not in vocab:
                vocab[word] = len(vocab) + 1    # counts from 1+ as 0 is reserved for <PAD> token
    return vocab

In [16]:
# Vocab of input data (this will likely be a subset of any word embedding array)
data_vocab = data_vocab(sentences)

In [18]:
# Add padding token to data vocab
data_vocab['<PAD>'] = 0

### Build generator for data batching

## Prepare data

In [20]:
# Train/Test split
split_ratio = 0.80
training_data = sentences[:int(len(sentences)*split_ratio)]
test_data = sentences[len(training_data):]

In [21]:
print(f'Dataset Size: {len(sentences)} | Training Set Size: {len(training_data)} | Test Set Size: {len(test_data)}')

Dataset Size: 3914 | Training Set Size: 3131 | Test Set Size: 783


In [35]:
word_to_ix = {}
for sent, tags in sentences:   # training_data
#     print(sent, tags)
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix) + 1
# print(word_to_ix)

In [36]:
# Create tag-index lookups
tag_to_ix = {}
for _, tags in sentences:
    for tag in tags:
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix) + 1

ix_to_tag = {v:k for k, v in tag_to_ix.items()}

In [37]:
print(f'Word dictionary size: {len(word_to_ix)}')
print(f'Tag dictionary size: {len(tag_to_ix)}')

Word dictionary size: 12408
Tag dictionary size: 12


In [55]:
# add padding to word dict
word_to_ix['<PAD>'] = 0

In [51]:
# add padding to tag dict
tag_to_ix['<PAD>'] = 0

In [81]:
len(tag_to_ix)

13

In [38]:
def prepare_sequence(seq, to_ix):
    """Encodes sentence tokens as ids from word_to_ix dictionary"""
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [39]:
def prepare_seq_batch(batch, to_ix, pad_len, pad_token_id=0):
    """Encodes batch of sequences as ids from word_to_ix dictionary and pads"""
    batch_size = len(batch)
    padded_batch = np.full((batch_size, pad_len), pad_token_id)
    seq_lens = [len(seq) for seq in batch]
    
    for i, seq in enumerate(batch):
        # encode tokens as ids
        idxs = [to_ix[w] for w in seq]    
        # pad length
        padded_batch[i,0:len(idxs)] = idxs
        
    return torch.tensor(padded_batch, dtype=torch.long)

In [40]:
batch = [sentences[0][0], sentences[1][0], sentences[2][0]]

In [41]:
sentences[0][0]

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [42]:
prepare_sequence(sentences[2][0], word_to_ix)

tensor([27, 28,  3, 29,  5,  6, 30, 31, 20, 21, 32, 33, 34, 35,  3, 36, 37, 38,
        12, 13, 14, 21, 39, 40, 41, 42, 17])

In [43]:
prepare_seq_batch(batch, word_to_ix, 30)

tensor([[ 1,  2,  3,  4,  5,  6,  3,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [18,  2, 19, 20, 21, 22, 23,  3,  9, 24, 25, 26, 17,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [27, 28,  3, 29,  5,  6, 30, 31, 20, 21, 32, 33, 34, 35,  3, 36, 37, 38,
         12, 13, 14, 21, 39, 40, 41, 42, 17,  0,  0,  0]])

In [47]:
def prepare_targets_batch(batch, to_ix, pad_len, pad_token_id=0):
    """Encodes batch of sequence targets as ids from target_to_ix dictionary and pads length"""
    batch_size = len(batch)
    padded_batch = np.full((batch_size, pad_len), pad_token_id)
    targets_lens = [len(targets) for targets in batch]
    
    for i, targets in enumerate(batch):
        # encode targets as ids
        idxs = [to_ix[target] for target in targets]
        # pad length
        padded_batch[i, 0:len(idxs)] = idxs
    
    return torch.tensor(padded_batch, dtype=torch.long)

In [48]:
targets_batch = [sentences[0][1], sentences[1][1], sentences[2][1]]

In [49]:
prepare_targets_batch(targets_batch, tag_to_ix, 30)

tensor([[1, 1, 2, 3, 1, 4, 2, 5, 5, 6, 1, 7, 6, 4, 1, 1, 3, 2, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0],
        [1, 1, 5, 1, 7, 1, 1, 2, 6, 1, 5, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0],
        [1, 1, 2, 3, 1, 4, 8, 4, 1, 7, 1, 1, 1, 1, 2, 5, 5, 9, 6, 4, 1, 7, 6, 4,
         4, 1, 2, 0, 0, 0]])

### Create LSTM model

In [74]:
class LSTMTagger(nn.Module):
    def __init__(self,
                 embedding_dim,
                 hidden_dim,
                 vocab,
                 tagset_size,
                 batch_size=32,
                 pretrained_embeddings=None):
        
        super(LSTMTagger, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.num_layers = 1
        self.embedding_dim = embedding_dim
        self.vocab = vocab
        self.vocab_size = len(self.vocab)
        self.tagset_size = tagset_size - 1 # minus <TAG>
        self.padding_idx = self.vocab['<PAD>']
        
        
        self.word_embeddings = nn.Embedding(num_embeddings=self.vocab_size,
                                            embedding_dim=self.embedding_dim,
                                            padding_idx=self.padding_idx)
        
        if pretrained_embeddings is not None:
            self.word_embeddings.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
            self.word_embeddings.weight.requires_grad = False
        
        self.hidden = self.init_hidden()

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        # If batch_first is true the input/output tensors are provided as (batch, seq, feature)
        self.lstm = nn.LSTM(input_size=self.embedding_dim,
                            hidden_size=self.hidden_dim,
                            num_layers=self.num_layers,
                            batch_first=True)

        # The linear layer that maps from hidden state space to tag space
        self.out = nn.Linear(in_features=self.hidden_dim,
                             out_features=self.tagset_size)

    def init_hidden(self):
        """
        Initialises weights for hidden layers of LSTM
        Weights are in the form of (num_layers, batch_size, embedding_dim)"""
        return (torch.zeros(self.num_layers, self.batch_size, self.hidden_dim))
        
    def forward(self, X, X_lengths):
        """"""
        self.hidden = self.init_hidden()
        
        batch_size, seq_len, _ = X.size()
        
        X = self.word_embeddings(X)
        
        X = torch.nn.utils.rnn.pack_padded_sequence(X, X_lengths, batch_first=True)
        
        X, self.hidden = self.lstm(X, self.hidden)
        
        X, _ = torch.nn.utils.nn.pad_packed_sequence(X, batch_first=True)
        
        X = X.contiguous()
        X = X.view(-1, X.shape[2])
        
        X = self.out(X)
        
        X = F.log_softmax(X, dim=1)
        
        X = X.view(batch_size, seq_len, self.tagset_size)
        
        Y_hat = X
        return Y_hat
        
        
#         embeds = self.word_embeddings(X)
#         lstm_out, _ = self.lstm(embeds.view(len(X), 1, -1))
#         tag_space = self.out(lstm_out.view(len(X), -1))
#         tag_scores = F.log_softmax(tag_space, dim=1)
#         return tag_scores
    
        
    def loss(self, Y_hat, Y, X_lengths):
        """"""
        
        Y = y.view(-1)
        
        Y_hat = Y_hat.view(-1, self.tagset_size)
        
        tag_pad_token = self.tags['<PAD>']
        mask = (Y > tag_pad_token).float()
        
        num_tokens = int(torch.sum(mask).data[0])
        
        Y_hat = Y_hat[range(Y_hat.shape[0]), Y] * mask
        
        ce_loss = -torch.sum(Y_hat) / num_tokens
        
        return ce_loss

### Initialise the NN model

In [75]:
path_to_embeddings = './data/embeddings/glove.6B.300d.txt'
path_to_trimmed_embeddings = './data/embeddings/trimmed_emb.npz'

In [76]:
# Load trimmed embeddings from disk
pretrained_embeddings = np.load(path_to_trimmed_embeddings)

In [77]:
# Check embedding shape
pretrained_embeddings['embeddings'].shape

(12409, 300)

In [78]:
EMBEDDING_DIM = 300   # Glove 300
HIDDEN_DIM = 32

In [79]:
lstm_net = LSTMTagger(embedding_dim=EMBEDDING_DIM,
                      hidden_dim=HIDDEN_DIM,
                      vocab=data_vocab,
                      tagset_size=len(tag_to_ix),
                      pretrained_embeddings=pretrained_embeddings['embeddings'])
loss_function = nn.NLLLoss()
optimizer = optim.SGD(lstm_net.parameters(), lr=0.1)

In [80]:
print(lstm_net)

LSTMTagger(
  (word_embeddings): Embedding(12409, 300, padding_idx=0)
  (lstm): LSTM(300, 32, batch_first=True)
  (out): Linear(in_features=32, out_features=12, bias=True)
)


### Train standard NN model

In [None]:
for epoch in range(2):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        lstm_net.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = lstm_net(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
    
    if epoch % 1 == 0:
        print(f'Epoch: {epoch} - Loss: {loss}')

In [None]:
# helper function for deterministic nn inference
def tag_score_to_tag_name(tag_score, ix_to_tag):
    """Converts tag score to tag names"""
    return ix_to_tag.get(torch.argmax(tag_score).item())

In [None]:
# Single test example
test_data_sm = test_data[:1]

In [None]:
# Inference
with torch.no_grad():
    inputs = prepare_sequence(test_data_sm[0][0], word_to_ix)
    tag_scores = lstm_net(inputs)
    
#     print(f'Tag Scores:\n{tag_scores}\n')
    print(f'{"Token":<20} {"Pred":<10} {"Actual":<10}')
    print(f'{"-----":<20} {"----":<10} {"------":<10}')
    for i, token in enumerate(training_data[0][0]):
        print(f'{token:<20} {tag_score_to_tag_name(tag_scores[i], ix_to_tag):<10} {test_data_sm[0][1][i]:<10}')