<a href="https://colab.research.google.com/github/CharlesPoletowin/YCBS-273/blob/master/Lecture9_RNN_pos_tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import time
from sklearn.metrics import accuracy_score

This notebook was inspired by https://github.com/bentrevett/pytorch-sentiment-analysis. Many thanks to the authors!

# Data setup

In [0]:
import torch
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe
import random

SEED = 1234
# NOTE(review): MAX_VOCAB_SIZE is not referenced by any cell visible in this notebook.
MAX_VOCAB_SIZE = 25_000
BATCH_SIZE = 64

# Seed PyTorch and request deterministic cuDNN kernels so runs are repeatable.
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Define the fields associated with the sequences. Both the token sequence and
# the tag sequence are wrapped in <bos>/<eos> markers.
TEXT = data.Field(init_token="<bos>", eos_token="<eos>")
UD_TAG = data.Field(init_token="<bos>", eos_token="<eos>")

# Download and load the default UDPOS data.
train_data, valid_data, test_data = datasets.UDPOS.splits(fields=(('text', TEXT), ('udtag', UD_TAG)))

# Re-split the training set into train/validation; this replaces the
# validation split returned by UDPOS.splits above.
# `random.seed()` returns None, so it must not be passed as `random_state`
# directly: seed the global RNG first, then hand its state to `split`, which
# expects a `random.getstate()` tuple.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

# Build the vocabularies from the training portion only. Tokens seen fewer
# than 3 times are left out of the TEXT vocabulary.
TEXT.build_vocab(train_data.text, min_freq=3)
UD_TAG.build_vocab(train_data.udtag)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split; batches are created directly on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)

print(device)

downloading en-ud-v2.zip


en-ud-v2.zip: 100%|██████████| 688k/688k [00:00<00:00, 11.0MB/s]


extracting
cuda


In [0]:
# Inspect the training set: its field mapping, its number of examples,
# and the attributes of its first example (one item per output line).
print(train_data.fields, len(train_data), vars(train_data[0]), sep='\n')

{'text': <torchtext.data.field.Field object at 0x7f33ce0e3d30>, 'udtag': <torchtext.data.field.Field object at 0x7f33f0cd4cc0>}
12543
{'text': ['Al', '-', 'Zaman', ':', 'American', 'forces', 'killed', 'Shaikh', 'Abdullah', 'al', '-', 'Ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'Qaim', ',', 'near', 'the', 'Syrian', 'border', '.'], 'udtag': ['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT']}


In [0]:
# Report how many examples each split contains, one line per split.
print(
    f'Number of training examples: {len(train_data)}',
    f'Number of validation examples: {len(valid_data)}',
    f'Number of testing examples: {len(test_data)}',
    sep='\n',
)

Number of training examples: 8780
Number of validation examples: 3763
Number of testing examples: 2077


In [0]:
# Report the size of the token vocabulary and of the tag vocabulary.
print("\n".join((
    f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}",
    f"Unique tokens in UD_TAG vocabulary: {len(UD_TAG.vocab)}",
)))

Unique tokens in TEXT vocabulary: 5220
Unique tokens in UD_TAG vocabulary: 21


In [0]:
# Tag frequency counter collected while building the UD_TAG vocabulary:
# first the full counter, then its 20 most common entries.
tag_counts = UD_TAG.vocab.freqs
print(tag_counts)

print(tag_counts.most_common(20))

Counter({'NOUN': 24253, 'PUNCT': 16626, 'VERB': 16199, 'PRON': 13017, 'ADP': 12301, 'DET': 11315, 'PROPN': 9196, 'ADJ': 8800, 'AUX': 8716, 'ADV': 7476, 'CCONJ': 4705, 'PART': 3898, 'NUM': 2825, 'SCONJ': 2696, 'X': 562, 'INTJ': 470, 'SYM': 416})
[('NOUN', 24253), ('PUNCT', 16626), ('VERB', 16199), ('PRON', 13017), ('ADP', 12301), ('DET', 11315), ('PROPN', 9196), ('ADJ', 8800), ('AUX', 8716), ('ADV', 7476), ('CCONJ', 4705), ('PART', 3898), ('NUM', 2825), ('SCONJ', 2696), ('X', 562), ('INTJ', 470), ('SYM', 416)]


In [0]:
# Show the first ten entries of the TEXT vocabulary's index-to-token list.
index_to_token = TEXT.vocab.itos
print(index_to_token[:10])

['<unk>', '<pad>', '<bos>', '<eos>', '.', 'the', ',', 'to', 'and', 'a']


In [0]:
# Show the tag-to-index mapping of the UD_TAG vocabulary.
tag_to_index = UD_TAG.vocab.stoi
print(tag_to_index)

defaultdict(<function _default_unk_index at 0x7f343bfb26a8>, {'<unk>': 0, '<pad>': 1, '<bos>': 2, '<eos>': 3, 'NOUN': 4, 'PUNCT': 5, 'VERB': 6, 'PRON': 7, 'ADP': 8, 'DET': 9, 'PROPN': 10, 'ADJ': 11, 'AUX': 12, 'ADV': 13, 'CCONJ': 14, 'PART': 15, 'NUM': 16, 'SCONJ': 17, 'X': 18, 'INTJ': 19, 'SYM': 20})


# Model definition

## Exercise: Write a bi-directional RNN model and an LSTM variant of it. Test their performance on the given task.

In [0]:
import torch.nn as nn


class RNN_pos_tagger(nn.Module):
    """Single-layer RNN tagger: embedding -> nn.RNN -> linear scoring layer.

    Args:
        input_dim: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the RNN hidden state.
        output_dim: number of scores produced for every token position.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Score every token position of a batch.

        Args:
            text: index tensor of shape [sent len, batch size].

        Returns:
            Tensor of shape [sent len * batch size, output_dim]; the sentence
            and batch axes are flattened into one.
        """
        # [sent len, batch size] -> [sent len, batch size, emb dim]
        token_vectors = self.embedding(text)

        # Per-step states: [sent len, batch size, hid dim]. The final hidden
        # state ([1, batch size, hid dim]) is not needed for tagging.
        states, _ = self.rnn(token_vectors)

        # Merge the sentence and batch axes so the linear layer sees one row
        # per token position.
        hid_dim = states.shape[-1]
        flat_states = states.view(-1, hid_dim)
        return self.fc(flat_states)

class RNN_deep_pos_tagger(nn.Module):
    """Stacked (multi-layer) RNN tagger: embedding -> nn.RNN -> linear scoring layer.

    Args:
        input_dim: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the RNN hidden state.
        output_dim: number of scores produced for every token position.
        n_layers: number of stacked recurrent layers.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=n_layers)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Score every token position of a batch.

        Args:
            text: index tensor of shape [sent len, batch size].

        Returns:
            Tensor of shape [sent len * batch size, output_dim]; the sentence
            and batch axes are flattened into one.
        """
        # [sent len, batch size] -> [sent len, batch size, emb dim]
        token_vectors = self.embedding(text)

        # Per-step states: [sent len, batch size, hid dim]. The final hidden
        # state ([num_layers, batch size, hid dim]) is discarded.
        states, _ = self.rnn(token_vectors)

        # One row per token position for the linear layer.
        hid_dim = states.shape[-1]
        flat_states = states.view(-1, hid_dim)
        return self.fc(flat_states)

def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters.

    Only parameters whose ``requires_grad`` flag is set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [0]:
def evaluate(model, data_iterator, loss_func):
    """Compute the mean per-batch accuracy and loss of ``model`` on an iterator.

    The model is switched to eval mode (and left in it) and gradients are
    disabled for the duration of the pass.

    Accuracy is computed over every position of the flattened tag tensor of a
    batch; no positions are masked out.

    Args:
        model: module mapping ``batch.text`` to per-position scores of shape
            [positions, classes].
        data_iterator: iterable of batches exposing ``.text`` and ``.udtag``.
        loss_func: callable invoked as ``loss_func(predictions, targets)``.

    Returns:
        Tuple ``(accuracy, loss)``, each averaged over the batches.

    Raises:
        ValueError: if ``data_iterator`` yields no batches.
    """
    total_loss = 0.0
    total_acc = 0.0
    num_batches = 0

    model.eval()

    with torch.no_grad():
        for batch in data_iterator:
            gold_tags = batch.udtag.view(-1).long()

            # squeeze(1) leaves the usual 2-D [positions, classes] output
            # untouched; it is kept for models that emit a singleton dim 1.
            predictions = model(batch.text).squeeze(1)

            loss = loss_func(predictions, gold_tags)

            # Count matches on the tensors' own device; only two Python
            # numbers leave the device per batch.
            predicted_tags = torch.argmax(predictions, dim=1)
            num_correct = (predicted_tags == gold_tags).sum().item()

            total_loss += loss.item()
            total_acc += num_correct / gold_tags.numel()
            num_batches += 1

    if num_batches == 0:
        raise ValueError('data_iterator yielded no batches; nothing to evaluate')

    return total_acc / num_batches, total_loss / num_batches

def train_model(model, train_data_iterator, valid_data_iterator, loss_func, optimizer, epochs=5):
    """Train ``model`` for ``epochs`` epochs, printing one progress line per epoch.

    After each pass over the training batches the model is scored on the
    validation iterator with ``evaluate``; the printed line contains the mean
    training loss, the validation accuracy, and the wall-clock time spent in
    training and in evaluation.

    Args:
        model: module mapping ``batch.text`` to per-position scores.
        train_data_iterator: sized iterable of training batches exposing
            ``.text`` and ``.udtag``.
        valid_data_iterator: iterable of validation batches.
        loss_func: callable invoked as ``loss_func(predictions, targets)``.
        optimizer: optimizer that updates ``model``'s parameters.
        epochs: number of passes over the training data.

    Returns:
        None.
    """
    for epoch in range(epochs):

        # evaluate() leaves the model in eval mode, so re-enable training
        # mode at the start of every epoch.
        model.train()
        epoch_loss = 0

        tic = time.time()
        for batch in train_data_iterator:

            # Clear gradients *before* the backward pass so that anything
            # left over from earlier work (e.g. an interrupted run) cannot
            # leak into this update.
            optimizer.zero_grad()

            predictions = model(batch.text).squeeze(1)

            loss = loss_func(predictions, batch.udtag.view(-1).long())

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
        toc = time.time()

        acc, _ = evaluate(model, valid_data_iterator, loss_func)
        toe = time.time()

        print('Loss at epoch %d : %f, validation acc : %f | training time : %d sec, evaluation time : %d sec' % (epoch, epoch_loss / len(train_data_iterator), acc, toc-tic, toe - toc))

In [0]:
def get_model(model_type, n_layers=2):
    """Build a POS tagger of the requested architecture.

    Sizes are taken from the notebook's vocabularies: the embedding table has
    one row per TEXT vocabulary entry and the output layer has one score per
    UD_TAG vocabulary entry.

    NOTE(review): ``RNN_deep_bidir_pos_tagger`` is not defined in the cells
    visible alongside this function; it must be defined before
    ``'rnn_deep_bidir'`` is requested.

    Args:
        model_type: one of ``'rnn'``, ``'rnn_deep'`` or ``'rnn_deep_bidir'``.
        n_layers: number of recurrent layers for the deep variants (ignored
            for ``'rnn'``).

    Returns:
        The freshly constructed model.

    Raises:
        ValueError: if ``model_type`` is not one of the supported names.
    """
    known_types = ('rnn', 'rnn_deep', 'rnn_deep_bidir')
    if model_type not in known_types:
        # Fail with a clear message instead of reaching `return model` with
        # `model` unbound.
        raise ValueError(f'Unknown model_type {model_type!r}; expected one of {known_types}')

    INPUT_DIM = len(TEXT.vocab)
    EMBEDDING_DIM = 100
    HIDDEN_DIM = 256
    # Derived from the tag vocabulary rather than hard-coded, mirroring INPUT_DIM.
    OUTPUT_DIM = len(UD_TAG.vocab)
    N_LAYERS = n_layers

    if model_type == 'rnn':
        model = RNN_pos_tagger(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
    elif model_type == 'rnn_deep':
        model = RNN_deep_pos_tagger(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS)
    else:  # 'rnn_deep_bidir'
        model = RNN_deep_bidir_pos_tagger(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS)

    return model

In [0]:
import torch.nn.functional as F
import torch.optim as optim

# Build the deep bi-directional RNN tagger and place it on the selected device.
model = get_model('rnn_deep_bidir').to(device)
print(f'The model has {count_parameters(model):,} trainable parameters')

# Adam with its default hyper-parameters; the loss is passed as a plain
# function and is called by train_model with (predictions, targets) only.
optimizer = optim.Adam(model.parameters())
loss_func = F.cross_entropy

train_model(
    model=model,
    train_data_iterator=train_iterator,
    valid_data_iterator=valid_iterator,
    loss_func=loss_func,
    optimizer=optimizer,
    epochs=10,
)

The model has 1,110,309 trainable parameters
Loss at epoch 0 : 0.373077, validation acc : 0.780759 | training time : 2 sec, evaluation time : 0 sec
Loss at epoch 1 : 0.191019, validation acc : 0.831823 | training time : 1 sec, evaluation time : 0 sec
Loss at epoch 2 : 0.144641, validation acc : 0.859360 | training time : 1 sec, evaluation time : 0 sec
Loss at epoch 3 : 0.119310, validation acc : 0.876442 | training time : 1 sec, evaluation time : 0 sec
Loss at epoch 4 : 0.098550, validation acc : 0.888743 | training time : 1 sec, evaluation time : 0 sec
Loss at epoch 5 : 0.082022, validation acc : 0.897652 | training time : 1 sec, evaluation time : 0 sec
Loss at epoch 6 : 0.067951, validation acc : 0.903980 | training time : 1 sec, evaluation time : 0 sec
Loss at epoch 7 : 0.056687, validation acc : 0.905779 | training time : 1 sec, evaluation time : 0 sec
Loss at epoch 8 : 0.045985, validation acc : 0.909110 | training time : 1 sec, evaluation time : 0 sec
Loss at epoch 9 : 0.038476, 