# Sequence Models and LSTM Networks - Probabilistic Model with Pyro and Penn Treebank
Sequence Tagger: https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html<br>
Bayesian NN: https://github.com/paraschopra/bayesian-neural-network-mnist/blob/master/bnn.ipynb<br>
Penn Treebank: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.9.8216&rep=rep1&type=pdf

In [1]:
import numpy as np
import nltk
from nltk.corpus import treebank

In [2]:
# Fetch the Penn Treebank sample and the universal tagset mapping; the
# corpus is loaded below with tagset='universal'.
nltk.download('treebank')
nltk.download('universal_tagset')

[nltk_data] Downloading package treebank to /home/tyler/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/tyler/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7f088617be10>

In [4]:
from IPython.display import clear_output

In [5]:
import pyro
from pyro.distributions import Normal, Categorical
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam

# An LSTM for Part-of-Speech Tagging

### Load Data

In [6]:
# Penn Treebank sample from NLTK, loaded with the universal tagset.
# Each sentence is a sequence of (token, tag) pairs (see format_sequence).
sentences = treebank.tagged_sents(tagset='universal')

In [7]:
# Work with a small subset: only the first `samples` sentences are kept.
samples = 200
sentences = sentences[:samples]

In [8]:
def format_sequence(seq):
    """
    Split a tagged sentence into parallel token and tag lists.

    Every element of `seq` is indexed as a pair: item 0 is the token and
    item 1 is its POS tag. Returns the tuple ([tokens], [POS]).
    """
    def column(position):
        # Collect the `position`-th entry of every (token, tag) pair
        return [pair[position] for pair in seq]

    return (column(0), column(1))

In [9]:
# Rebuild from the raw corpus instead of from `sentences` itself, so that
# re-running this cell never feeds already formatted ([tokens], [tags])
# tuples back into format_sequence.
sentences = [format_sequence(sentence)
             for sentence in treebank.tagged_sents(tagset='universal')[:samples]]

## Prepare data

In [10]:
def prepare_sequence(seq, to_ix):
    """Look up every item of `seq` in `to_ix` and return the ids as a 1-D long tensor."""
    ids = []
    for item in seq:
        # Plain indexing: an item missing from `to_ix` raises KeyError
        ids.append(to_ix[item])
    return torch.tensor(ids, dtype=torch.long)

In [11]:
# Train/Test split: the leading `split_ratio` share trains, the remainder tests
split_ratio = 0.80
n_train = int(split_ratio * len(sentences))
training_data, test_data = sentences[:n_train], sentences[n_train:]

In [12]:
# Sanity check: the two split sizes should add up to the dataset size.
print(f'Dataset Size: {len(sentences)} | Training Set Size: {len(training_data)} | Test Set Size: {len(test_data)}')

Dataset Size: 200 | Training Set Size: 160 | Test Set Size: 40


In [13]:
# The vocabulary is built over every sentence (train and test alike), so
# prepare_sequence can look up any word from either split.
word_to_ix = {}
for sent, tags in sentences:
    for word in sent:
        # setdefault only inserts unseen words; the next free id is the current size
        word_to_ix.setdefault(word, len(word_to_ix))

In [14]:
# Create tag-index lookups
tag_to_ix = {}
for _, tags in sentences:
    for tag in tags:
        # First occurrence of a tag gets the next free index
        tag_to_ix.setdefault(tag, len(tag_to_ix))

# Reverse lookup: index -> tag name
ix_to_tag = dict(zip(tag_to_ix.values(), tag_to_ix.keys()))

In [15]:
# These sizes become the embedding table size and the number of output tags.
print(f'Word dictionary size: {len(word_to_ix)}')
print(f'Tag dictionary size: {len(tag_to_ix)}')

Word dictionary size: 1660
Tag dictionary size: 12


### Create LSTM model

In [16]:
class LSTMTagger(nn.Module):
    """LSTM part-of-speech tagger: embedding -> single-layer LSTM -> linear -> log-softmax.

    Args:
        embedding_dim: size of each word embedding vector.
        hidden_dim: size of the LSTM hidden state.
        vocab_size: number of rows in the embedding table (distinct word ids).
        tagset_size: number of output tags.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.out = nn.Linear(hidden_dim, tagset_size)

        # Created after the layers because init_hidden reads the embedding
        # weight to pick device and dtype.
        self.hidden = self.init_hidden()

    def forward(self, sentence):
        """Return log-probabilities over tags, one row per token of `sentence`.

        `sentence` is a 1-D tensor of word ids; it is processed as a single
        sequence with batch size 1.
        """
        # Every sentence starts from a fresh all-zero state
        self.hidden = self.init_hidden()

        embeds = self.word_embeddings(sentence)
        # nn.LSTM (batch_first=False) expects (seq_len, batch, features)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.out(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

    def init_hidden(self):
        """Return the all-zero initial LSTM state as the tuple (h_0, c_0).

        Each tensor has shape (num_layers=1, batch=1, hidden_dim) and lives on
        the same device / has the same dtype as the embedding weights.
        """
        reference = self.word_embeddings.weight
        return (reference.new_zeros(1, 1, self.hidden_dim),
                reference.new_zeros(1, 1, self.hidden_dim))

### Initialise the NN model

In [17]:
# Dimensionality of the word embeddings and of the LSTM hidden state.
EMBEDDING_DIM = 32
HIDDEN_DIM = 32

In [18]:
# Base network; the Pyro `model` and `guide` below lift its parameters to random variables.
lstm_net = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
# No NLLLoss / SGD optimizer is created here: training goes through Pyro's SVI instead.

In [19]:
# Show the layer structure of the tagger.
print(lstm_net)

LSTMTagger(
  (word_embeddings): Embedding(1660, 32)
  (lstm): LSTM(32, 32)
  (out): Linear(in_features=32, out_features=12, bias=True)
)


### Initialise Pyro model

Ref:<br>
- https://forum.pyro.ai/t/bayesian-rnn-nan-loss-issue/254

- Loc = mean, Scale = standard deviation
- mu = 0, sigma = 1 -> Unit Gaussian distribution

Refs:<br>
- https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html
- https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html

In [20]:
def model(input, target):
    """Pyro model: unit-Gaussian priors over every parameter of `lstm_net`.

    Args:
        input: encoded sentence, passed unchanged to the sampled network.
        target: observed tag indices the network output is conditioned on.
    """
    # One Normal(loc=0, scale=1) prior per network parameter, shaped like the
    # parameter itself. Deriving the dict from named_parameters() keeps the
    # priors in step with the architecture; the keys are the dotted parameter
    # names (e.g. 'lstm.weight_ih_l0', 'out.bias').
    priors = {name: Normal(loc=torch.zeros_like(param),
                           scale=torch.ones_like(param))
              for name, param in lstm_net.named_parameters()}

    # Lift module parameters to random variables sampled from the priors
    lifted_module = pyro.random_module("module", lstm_net, priors)

    # Sample one concrete network (this draws all of its weights and biases)
    lifted_reg_model = lifted_module()

    # The network already returns log-softmax scores, which serve as logits
    output = lifted_reg_model(input)

    pyro.sample("obs", Categorical(logits=output), obs=target)

In [21]:
softplus = torch.nn.Softplus()

def guide(input, target):
    """Variational guide: an independent Normal over every `lstm_net` parameter.

    For each parameter a learnable mean and a learnable raw scale are
    registered with `pyro.param`; the raw scale goes through softplus so the
    Normal's scale stays positive. `input` and `target` are not used here.

    Returns:
        A network sampled from the lifted module.
    """
    # (sample-site key, pyro.param name prefix, tensor providing the shape)
    # The order matters: it fixes the order of the random initial draws.
    specs = [
        ('word_embeddings.weight', 'word_embeddings_w', lstm_net.word_embeddings.weight),
        ('lstm.weight_ih_l0', 'lstm_w_ih_l0', lstm_net.lstm.weight_ih_l0),
        ('lstm.weight_hh_l0', 'lstm_w_hh_l0', lstm_net.lstm.weight_hh_l0),
        ('lstm.bias_ih_l0', 'lstm_b_ih_l0', lstm_net.lstm.bias_ih_l0),
        ('lstm.bias_hh_l0', 'lstm_b_hh_l0', lstm_net.lstm.bias_hh_l0),
        ('out.weight', 'out_w', lstm_net.out.weight),
        ('out.bias', 'out_b', lstm_net.out.bias),
    ]

    posteriors = {}
    for site, prefix, reference in specs:
        # Random initial values for the mean and the raw (pre-softplus) scale
        mu_init = torch.randn_like(reference)
        sigma_init = torch.randn_like(reference)
        mu_param = pyro.param(f"{prefix}_mu", mu_init)
        sigma_param = softplus(pyro.param(f"{prefix}_sigma", sigma_init))
        posteriors[site] = Normal(loc=mu_param, scale=sigma_param)

    lifted_module = pyro.random_module("module", lstm_net, posteriors)

    return lifted_module()

In [22]:
# Pyro's parameter store is global and survives cell re-runs. Clear it so that
# re-executing this cell restarts inference from freshly initialised
# variational parameters instead of silently resuming from old ones.
pyro.clear_param_store()
inference = SVI(model, guide, Adam({"lr": 0.01}), loss=Trace_ELBO())

In [23]:
# TODO: update to use batches; atm its single sample...

num_iterations = 100
loss = 0
for j in range(num_iterations):
    # Sum the per-sentence losses over one full pass of the training set
    loss = 0
    for sentence, tags in training_data:
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # One SVI step: compute the loss and take a gradient step
        loss = loss + inference.step(sentence_in, targets)

    total_epoch_loss_train = loss / len(training_data)

    # Report after every epoch, replacing the previous progress line
    clear_output(wait=True)
    print(f'Epoch {j} - Loss {total_epoch_loss_train:0.4f}')

Epoch 99 - Loss 363.3785


In [24]:
def predict(x, num_samples, verbose=True):
    """
    Predict one tag index per token by averaging networks sampled from the guide.

    Draws `num_samples` networks from `guide`, runs each on `x`, averages
    their score matrices element-wise and takes the argmax along axis 1.

    Args:
        x: encoded sentence, passed unchanged to every sampled network.
        num_samples: number of networks to sample from the guide.
        verbose: when True (the default) print the averaged score matrix.

    Returns:
        NumPy array holding the argmax index of each row of the averaged scores.
    """
    # Initialise set of probabilistic models for inference
    sampled_models = [guide(None, None) for _ in range(num_samples)]

    # Inference only: no autograd graph is needed for the forward passes
    with torch.no_grad():
        yhats = [sampled_model(x) for sampled_model in sampled_models]
        mean = torch.mean(torch.stack(yhats), 0)

    if verbose:
        print(f'\nMean:\n{mean}')
    return np.argmax(mean.numpy(), axis=1)

In [25]:
# helper function
def tag_score_to_tag_name(tag_score, ix_to_tag):
    """
    Converts tag score to tag names

    Args:
        tag_score: scores with one entry per tag; a torch tensor, a NumPy
            array, or any sequence NumPy can convert to an array.
        ix_to_tag: dict mapping tag index -> tag name.

    Returns:
        Name of the highest-scoring tag, or None when that index is not a
        key of `ix_to_tag`.
    """
    if torch.is_tensor(tag_score):
        best_ix = torch.argmax(tag_score).item()
    else:
        # np.asarray lets plain lists/tuples work as well as arrays
        best_ix = int(np.argmax(np.asarray(tag_score)))
    return ix_to_tag.get(best_ix)

In [26]:
# Small evaluation set: just the first test sentence.
test_data_sm = test_data[:1]

In [27]:
# Predictions
num_samples = 10
correct = 0
total = 0
for j, (sentence, tags) in enumerate(test_data_sm):
    sentence_in = prepare_sequence(sentence, word_to_ix)
    print(sentence, tags)

    # Gold tags expressed as indices into the tag dictionary
    tag_indices = np.array([tag_to_ix.get(tag) for tag in tags])

    predicted = predict(sentence_in, num_samples)
    total += len(tags)
    correct += np.sum(predicted == tag_indices)

    # One line per token: the token followed by its predicted tag name
    for token, tag_ix in zip(sentence, predicted):
        print(f'{token:<10} {ix_to_tag.get(tag_ix)}')
    print('\n')

print(f'Accuracy: {correct/total * 100:0.1f}%')

['October', 'sales', ',', 'compared', '*', 'with', 'the', 'previous', 'month', ',', 'inched', 'down', '0.4', '%', '.'] ['NOUN', 'NOUN', '.', 'VERB', 'X', 'ADP', 'DET', 'ADJ', 'NOUN', '.', 'VERB', 'ADV', 'NUM', 'NOUN', '.']

Mean:
tensor([[-1.4936, -3.0441, -3.5166, -3.1182, -2.9232, -3.0208, -2.1408, -3.1277,
         -3.5863, -3.8820, -3.0258, -3.2294],
        [-1.5775, -2.8532, -3.2604, -3.5181, -2.7871, -3.2136, -2.2108, -3.3034,
         -3.7129, -3.7334, -3.3108, -3.2258],
        [-1.5602, -2.8143, -3.6457, -3.3381, -2.8239, -3.2848, -2.1105, -3.3884,
         -3.2771, -4.0242, -3.4533, -3.2076],
        [-1.4979, -2.6493, -3.1390, -3.4661, -2.7001, -3.4760, -2.2484, -3.8118,
         -3.7700, -4.1569, -3.4654, -3.3058],
        [-1.4652, -2.6129, -3.2697, -3.1860, -2.8692, -3.3591, -2.2423, -3.2747,
         -3.7282, -3.8712, -3.3318, -3.3149],
        [-1.4781, -2.7361, -3.0891, -3.1847, -2.3990, -3.1741, -2.2813, -3.5945,
         -3.3305, -4.1622, -3.3531, -3.4549],
        