# Deterministic POS Word Level Language Model with Penn Treebank
Sequence Tagger: https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html<br>
Penn Treebank: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.9.8216&rep=rep1&type=pdf

https://gist.github.com/williamFalcon/f27c7b90e34b4ba88ced042d9ef33edd <br>
https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel

In [1]:
import numpy as np
import nltk
from nltk.corpus import treebank
import os
import codecs

In [2]:
nltk.download('treebank')
nltk.download('universal_tagset')

[nltk_data] Downloading package treebank to /home/tyler/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/tyler/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

torch.manual_seed(1)

<torch._C.Generator at 0x7fe1dbd95f70>

In [4]:
from IPython.display import clear_output

# An LSTM for Part-of-Speech Tagging

## Create DataLoader
Something something data loader...
- See refs for future optimisation (perhaps, havent read): https://towardsdatascience.com/building-efficient-custom-datasets-in-pytorch-2563b946fd9f and https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel

- TODO split dataset into train, test, val

In [532]:
class PennTreeBankDataset(Dataset):
    """Penn Tree Bank dataset"""
    
    def __init__(self):
        # download data, read, process
        np_array = np.zeros((1000,10))
        
        # load Penn tree bank sentences
        self.sentences = treebank.tagged_sents(tagset='universal')
        self.sentences = [self.format_sequence(sentence) for sentence in self.sentences]
        self.len = len(self.sentences)
        
        self.pad_token = '<PAD>'
        self.pad_token_id = 0
        self.pad_len = max([len(sentence[0]) for sentence in self.sentences])
        
        # Vocab of input data (this will likely be a subset of any word embedding array)
        self.build_vocab()
        self.create_word_to_idx_dict()
        self.create_tag_to_idx_dict()
        self.X, self.y, self.X_lens = self.encode_and_pad_data()
        
    def format_sequence(self,seq):
        """
        Formats penn treebank POS format into tuple ([tokens], [POS])
        """
        tokens = [x[0] for x in seq]
        tags = [x[1] for x in seq]
        return (tokens, tags)
    
    def build_vocab(self):
        """Builds vocab dictionary based on input data"""
        self.vocab = dict()
        for sentence in self.sentences:
            for word in sentence[0]:
                if word not in self.vocab:
                    self.vocab[word] = len(self.vocab) + 1    # counts from 1+ as 0 is reserved for PAD token
        
        # Add padding token to data vocab
        self.vocab[self.pad_token] = 0
    
    def create_word_to_idx_dict(self):
        """Builds word to index dictionary"""
        self.word_to_idx = {}
        for sentence, tags in self.sentences:
            for word in sentence:
                if word not in self.word_to_idx:
                    self.word_to_idx[word] = len(self.word_to_idx) + 1    # counts from 1+ as 0 is reserved for PAD token
        
        # add padding to word dict
        self.word_to_idx[self.pad_token] = 0
        
    def create_tag_to_idx_dict(self):
        """Builds tag to index and index to tag dictionary"""
        self.tag_to_idx = {}
        for _, tags in self.sentences:
            for tag in tags:
                if tag not in self.tag_to_idx:
                    self.tag_to_idx[tag] = len(self.tag_to_idx) + 1    # counts from 1+ as 0 is reserved for PAD token

        # Add padding to tag dict
        self.tag_to_idx[self.pad_token] = 0
        self.idx_to_tag = {v:k for k, v in self.tag_to_idx.items()}
        
    def encode_and_pad_data(self):
        """Encodes data (seq and tags) into ids from id dictionary and pads"""
        batch_size = len(self.sentences)

        padded_seq_batch = np.full((batch_size, self.pad_len), self.pad_token_id)
        padded_tags_batch = np.full((batch_size, self.pad_len), self.pad_token_id)
        seq_lengths = [len(sentence[0]) for sentence in self.sentences]
        
        # encode and pad
        for i, (seq, tags) in enumerate(self.sentences):
            # encode
            seq_idxs = [self.word_to_idx[token] for token in seq]
            tags_idxs = [self.tag_to_idx[tag] for tag in tags]        
            # pad
            padded_seq_batch[i,0:len(seq_idxs)] = seq_idxs
            padded_tags_batch[i, 0:len(tags_idxs)] = tags_idxs
            
        return torch.tensor(padded_seq_batch, dtype=torch.long), torch.tensor(padded_tags_batch, dtype=torch.long), torch.tensor(seq_lengths, dtype=torch.int)
    
    def __getitem__(self, index):
        return self.X[index], self.y[index], self.X_lens[index]
    
    def __len__(self):
        return self.len

In [533]:
dataset = PennTreeBankDataset()

In [534]:
train_loader = DataLoader(dataset=dataset,
                         batch_size=32,
                         shuffle=True,
                         num_workers=2)

In [535]:
dataset.idx_to_tag

{1: 'NOUN',
 2: '.',
 3: 'NUM',
 4: 'ADJ',
 5: 'VERB',
 6: 'DET',
 7: 'ADP',
 8: 'CONJ',
 9: 'X',
 10: 'ADV',
 11: 'PRT',
 12: 'PRON',
 0: '<PAD>'}

### Create LSTM model

### Initialise Word Embeddings

In [536]:
path_to_embeddings = './data/embeddings/glove.6B.300d.txt'
path_to_trimmed_embeddings = './data/embeddings/trimmed_emb.npz'

### Initialse Model

In [613]:
class LSTMTagger(nn.Module):
    def __init__(self,
                 embedding_dim,
                 hidden_dim,
                 vocab,
                 tags,
                 batch_size=32,
                 pretrained_embeddings=None):
        
        super(LSTMTagger, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.num_layers = 1
        self.embedding_dim = embedding_dim
        self.vocab = vocab
        self.vocab_size = len(self.vocab)
        self.tags = tags
        self.tagset_size = len(self.tags)    # -1 not added as we want the model to predict where there is a <PAD> tag? doesn't seem right, but model works...
#         self.tagset_size = self.tagset_size - 1 # minus <TAG>
        self.padding_idx = self.vocab['<PAD>']
        
        self.word_embeddings = nn.Embedding(num_embeddings=self.vocab_size,
                                            embedding_dim=self.embedding_dim,
                                            padding_idx=self.padding_idx)
        
        if pretrained_embeddings is not None:
            self.word_embeddings.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
            self.word_embeddings.weight.requires_grad = False
        
        self.hidden = self.init_hidden()

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        # If batch_first is true the input/output tensors are provided as (batch, seq, feature)
        self.lstm = nn.LSTM(input_size=self.embedding_dim,
                            hidden_size=self.hidden_dim,
                            num_layers=self.num_layers,
                            batch_first=True)

        # The linear layer that maps from hidden state space to tag space
        self.out = nn.Linear(in_features=self.hidden_dim,
                             out_features=self.tagset_size)

    def init_hidden(self):
        """
        Initialises weights for hidden layers of LSTM
        Weights are in the form of (num_layers, batch_size, embedding_dim)"""
        return (torch.zeros(self.num_layers, self.batch_size, self.hidden_dim), torch.zeros(self.num_layers, self.batch_size, self.hidden_dim))
        
    def forward(self, X, seq_lengths):
        """"""
        self.hidden = self.init_hidden()
        batch_size, seq_len = X.size()       
        
        # Embed the input
        # (batch_size, seq_len) -> (batch_size, seq_len, embedding_dim)
        embedded_seq_tensor = self.word_embeddings(X)
        
        # --- Run through model ---
        # pack_padded_sequence so that padded items in the sequence won't be shown to the LSTM
        # Sequences have not been sorted descending by length, this reduces the efficiency of the algorith... [TODO: fix in the data loader]
        packed_input = torch.nn.utils.rnn.pack_padded_sequence(embedded_seq_tensor,
                                                               seq_lengths,
                                                               batch_first=True,
                                                               enforce_sorted=False)
        
        # run through LSTM
        packed_output, self.hidden = self.lstm(packed_input, self.hidden)
        
        # undo the packing operation
        output, _ = torch.nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        
        # --- Project to tag space ---
        # (batch_size, seq_len, embedding_dim) -> (batch_size * seq_len, embedding_dim)
        output = output.contiguous()
        output = output.view(-1, output.shape[2])
        
        # Run through linear layer
        output = self.out(output)
        
        # Perform softmax
        # (batch_size * seq_len, embedding_dim) -> (batch_size, seq_len, tagset_size)
        output = F.log_softmax(output, dim=1)

        # Reshape for sanity
        output = output.view(batch_size, max(seq_lengths), self.tagset_size)   # max(seq_lengths) is for MAX across batch   
    
        Y_hat = output
        return Y_hat
    
        
    def loss(self, Y_hat, Y, seq_lengths):
        """Compute cross-entropy loss"""
        # Y_hat (batch_size, max_seq_len_pad, tagset_size)
        # Y (batch_size, max_seq_len_all)      
        
        # ignore any words/sentences that are completely padding
        ymask = ~torch.all(Y == 0, dim=1)
        Y = Y[ymask]
        Y_hat = Y_hat[ymask]
        
        # Truncate targets with batch max length
        Y = Y[:, :max(seq_lengths)]
        
        # Flatten all the target labels
        Y = Y.reshape(-1)   # reshape truncated targets
        
        # Flatten all the predicted labels
        Y_hat = Y_hat.view(-1,self.tagset_size)
        
        tag_pad_token = self.tags['<PAD>']    # should be 0
        mask = (Y > tag_pad_token).float()
            
        # Count how many tokens there are
        num_tokens = int(torch.sum(mask).item())
        
        # Pick the values for the label and zero out the rest with the mask       
        Y_hat = Y_hat[range(Y_hat.shape[0]), Y] * mask
        
        ce_loss = -torch.sum(Y_hat) / num_tokens
        
        return ce_loss

In [614]:
# Load trimmed embeddings from disk
pretrained_embeddings = np.load(path_to_trimmed_embeddings)

In [615]:
EMBEDDING_DIM = 300   # Using Glove 300 dim
HIDDEN_DIM = 64

In [616]:
lstm_net = LSTMTagger(embedding_dim=EMBEDDING_DIM,
                      hidden_dim=HIDDEN_DIM,
                      vocab=dataset.vocab,
                      tags=dataset.tag_to_idx,
                      pretrained_embeddings=pretrained_embeddings['embeddings'])
optimizer = optim.SGD(lstm_net.parameters(), lr=0.1)

In [617]:
print(lstm_net)

LSTMTagger(
  (word_embeddings): Embedding(12409, 300, padding_idx=0)
  (lstm): LSTM(300, 64, batch_first=True)
  (out): Linear(in_features=64, out_features=13, bias=True)
)


### Train Deterministic Sequence Tagger
Training performance (10 epochs; 300/16 dim):
- Without optimised pad packing e.g. max_len padding   | time: 3min 52s | EOT loss: 0.8258
- With optimised pad packing e.g. truncated per batch  | time: 33s | EOT loss: 0.7542

In [618]:
EPOCHS = 10

In [620]:
%%time
for epoch in range(0,EPOCHS+1,1):
    min_loss = 100
    for batch_idx, (data, targets, data_lens) in enumerate(train_loader, 0):
        lstm_net.zero_grad()
        # Forward pass
        tag_scores = lstm_net(data, data_lens)
        # compute loss, gradients and update parameters by calling optimzier.step()
        loss = lstm_net.loss(tag_scores, targets, data_lens)
        loss.backward()
        optimizer.step()
        if loss < min_loss:
            min_loss = loss
        
        if batch_idx % 10 == 0:
            clear_output(wait=True)
            print(f'EPOCH: {epoch}\nLoss: {loss.item():0.4f}')

print(f'Minimum Loss: {min_loss:0.4f}')

EPOCH: 10
Loss: 0.4732
Minimum Loss: 0.3891
CPU times: user 2min 28s, sys: 2.05 s, total: 2min 30s
Wall time: 41.2 s


## Model Inference

In [None]:
# helper function for deterministic nn inference
def tag_score_to_tag_name(tag_score, ix_to_tag):
    """Converts tag score to tag names"""
    return ix_to_tag.get(torch.argmax(tag_score).item())

In [None]:
# Single test example
test_data_sm = test_data[:1]

In [None]:
# Inference
with torch.no_grad():
    inputs = prepare_sequence(test_data_sm[0][0], word_to_ix)
    tag_scores = lstm_net(inputs)
    
#     print(f'Tag Scores:\n{tag_scores}\n')
    print(f'{"Token":<20} {"Pred":<10} {"Actual":<10}')
    print(f'{"-----":<20} {"----":<10} {"------":<10}')
    for i, token in enumerate(training_data[0][0]):
        print(f'{token:<20} {tag_score_to_tag_name(tag_scores[i], ix_to_tag):<10} {test_data_sm[0][1][i]:<10}')