In [None]:
# !pip install 'sru[cuda]<2.1.9'

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
%matplotlib inline

import time
import warnings

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from sru import SRU, SRUCell

In [None]:
import nltk
# nltk.download('treebank')
# nltk.download('universal_tagset')

# Load the NLTK treebank corpus with its tags mapped to the universal tagset,
# then derive the flat token list, the vocabulary and the tag inventory.
tagged_sentence = nltk.corpus.treebank.tagged_sents(tagset='universal')
tagged_words = [pair for sentence in tagged_sentence for pair in sentence]
vocab = {token for token, _ in tagged_words}
tags = {label for _, label in tagged_words}

# Corpus statistics.
print("Number of Tagged Sentences ", len(tagged_sentence))
print("Total Number of Tagged words", len(tagged_words))
print("Vocabulary of the Corpus", len(vocab))
print("Number of Tags in the Corpus ", len(tags))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sentences for testing; the fixed random_state keeps
# the split identical between runs.
train_set, test_set = train_test_split(
    tagged_sentence,
    test_size=0.2,
    random_state=1234,
)
print("Number of Sentences in Training Data ", len(train_set))
print("Number of Sentences in Testing Data ", len(test_set))

In [None]:
def prepare_sequence(seq, to_ix):
    """Look up every item of ``seq`` in ``to_ix`` and return the ids as a 1-D long tensor.

    Raises ``KeyError`` if an item has no entry in ``to_ix``.
    """
    indices = []
    for item in seq:
        indices.append(to_ix[item])
    return torch.tensor(indices, dtype=torch.long)

# Build the word -> id and tag -> id lookup tables.
#
# Ids start at 1, leaving 0 unused by any real word or tag.
# The sets are sorted before numbering: set iteration order for strings can
# differ between interpreter runs, which would silently change every id after
# a kernel restart.
word_to_ix = {word: index for index, word in enumerate(sorted(vocab), start=1)}
tag_to_ix = {tag: index for index, tag in enumerate(sorted(tags), start=1)}

assert len(word_to_ix) == len(vocab)
assert len(tag_to_ix) == len(tags)

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class Tagger(nn.Module):
    """Sequence tagger: embedding -> 2-layer recurrent encoder (LSTM or SRU) -> linear -> log-softmax.

    All tensors are length-first: a batch of sentences is ``[L x B]`` and the
    returned scores are ``[L x B x tagset_size]``.
    """

    def __init__(self, embedding_dim, hidden_dim, use_lstm, vocab_size, tagset_size):
        """
        Args:
            embedding_dim: size of each word embedding vector.
            hidden_dim: hidden size of the recurrent encoder.
            use_lstm: if true the encoder is ``nn.LSTM``, otherwise ``SRU``.
            vocab_size: number of rows in the embedding table.
            tagset_size: number of output classes.
        """
        super(Tagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.tagset_size = tagset_size
        self.vocab_size = vocab_size
        self.use_lstm = use_lstm
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # NOTE: the attribute name (including its spelling) is kept as-is because it
        # is part of the module's parameter names / state_dict keys.
        if use_lstm:
            self.reccurent_layer = nn.LSTM(embedding_dim, hidden_dim, num_layers=2)
        else:
            self.reccurent_layer = SRU(embedding_dim, hidden_dim,
                                      num_layers = 2,          # number of stacking RNN layers
                                      dropout = 0.0,           # dropout applied between RNN layers
                                      bidirectional = False,   # bidirectional RNN
                                      layer_norm = False,      # apply layer normalization on the output of each layer
                                      highway_bias = 0,        # initial bias of highway gate (<= 0)
                                      rescale = True,          # whether to use scaling correction
                                    )

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence, lengths):
        """Return per-token log-probabilities over tags.

        Args:
            sentence: ``[L x B]`` long tensor of word ids (padded).
            lengths: true length of each of the ``B`` sentences.

        Returns:
            ``[L x B x tagset_size]`` tensor of log-softmax scores.
        """
        embeds = self.word_embeddings(sentence)  # ---> [L x B x D]

        if self.use_lstm:
            # Packing lets the LSTM skip the padded positions.
            packed_input = pack_padded_sequence(embeds, lengths=lengths, enforce_sorted=False, batch_first=False)
            packed_out, _ = self.reccurent_layer(packed_input)
            output, _ = pad_packed_sequence(packed_out, batch_first=False)  # ---> [L x B x H]
        else:
            # Padding mask for SRU: 0 on real tokens, 1 on padded positions.
            # It is built from `lengths` and moved to the input's own device, so the
            # module does not rely on notebook-level globals.
            mask = nn.utils.rnn.pad_sequence(
                [torch.zeros(int(n)) for n in lengths],
                batch_first=False,
                padding_value=1,
            ).to(sentence.device)
            # assumed in mini-batch length-first
            assert mask.shape == sentence.shape, "Wrong shape, should be [L x B]"

            output, _ = self.reccurent_layer(embeds, mask_pad=mask)

        tag_space = self.hidden2tag(output)
        tag_scores = F.log_softmax(tag_space, dim=2)

        return tag_scores

In [None]:
# Model hyper-parameters shared by both taggers.
# Size of each word embedding vector (passed to Tagger as `embedding_dim`).
EMBEDDING_DIM = 128
# Hidden size of the recurrent encoder (passed to Tagger as `hidden_dim`).
HIDDEN_DIM = 300

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)

In [None]:
from torch.utils import data

class MyDataset(data.Dataset):
    """Dataset over tagged sentences, each a sequence of ``(word, tag)`` pairs."""

    def __init__(self, data):
        """Store the collection of tagged sentences to index into."""
        self.list_data = data

    def __len__(self):
        """Return the number of sentences."""
        return len(self.list_data)

    def __getitem__(self, index):
        """Return ``(word_ids, tag_ids)`` for the sentence at ``index``.

        Both are produced by ``prepare_sequence`` using the notebook-level
        ``word_to_ix`` / ``tag_to_ix`` tables, and have one entry per token.
        """
        line = self.list_data[index]

        # Split the pairs positionally. Going through dict(line) would merge
        # repeated words into one key, dropping tokens and keeping only the
        # tag of the last occurrence.
        words = [word for word, _ in line]
        tags = [tag for _, tag in line]

        sentence_in = prepare_sequence(words, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        return sentence_in, targets

In [None]:
# Wrap both splits so they can be fed to a DataLoader.
training_set, testing_set = MyDataset(train_set), MyDataset(test_set)

In [None]:
from torch.nn.utils import rnn

class PadSequence:
    """Collate callable that pads a batch of ``(sentence, tags)`` items length-first."""

    def __call__(self, batch):
        """Return ``(padded_sentences, padded_tags, lengths)`` for ``batch``.

        Both padded tensors are ``[L x B]`` and filled with 0 past each
        sequence's end; ``lengths`` holds the unpadded sentence lengths.
        """
        sequences = []
        tags = []
        for item in batch:
            sequences.append(item[0])
            tags.append(item[1])

        # Keep the true lengths so the padding can be undone later.
        lengths = torch.tensor([len(sequence) for sequence in sequences], dtype=torch.long)

        padded_sequences = rnn.pad_sequence(sequences, batch_first=False, padding_value=0)
        padded_tags = rnn.pad_sequence(tags, batch_first=False, padding_value=0)

        return padded_sequences, padded_tags, lengths

In [None]:
def calculate_score_generator(model, dataset, batch_size=512):
    """Compute token-level accuracy of ``model`` on ``dataset``, ignoring padding.

    Args:
        model: module called as ``model(sentences, lengths)`` and returning
            ``[L x B x n_tags]`` scores.
        dataset: dataset whose items are ``(sentence, tags)`` pairs; it is
            batched with ``PadSequence`` and never shuffled.
        batch_size: evaluation batch size.

    Returns:
        Fraction of non-padded tokens whose arg-max prediction equals the
        target, as a NumPy float scalar.

    Raises:
        ValueError: if the dataset yields no tokens.

    Side effects:
        Leaves ``model`` in eval mode.
    """
    loader = data.DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=PadSequence())

    # Run on the device the model lives on, not on a notebook-level global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()

    n_mismatch = 0
    n_all = 0
    with torch.no_grad():
        for sentences, tags, lengths in loader:
            tag_scores = model(sentences.to(model_device), lengths)
            y_pred = tag_scores.argmax(dim=2).cpu()

            # Packing both tensors with the same lengths drops the padded
            # positions and keeps predictions aligned with targets.
            preds = pack_padded_sequence(y_pred, lengths=lengths, enforce_sorted=False, batch_first=False)
            targets = pack_padded_sequence(tags, lengths=lengths, enforce_sorted=False, batch_first=False)

            n_mismatch += (targets.data != preds.data).sum().item()
            n_all += len(targets.data)

    if n_all == 0:
        raise ValueError("cannot compute accuracy: dataset contains no tokens")

    return np.float64(1 - n_mismatch / n_all)

In [None]:
import time
import warnings

def train_generator(model, train_loader, eval_datasets=None, loss_function = nn.NLLLoss(), n_epoch = 3, lr = 0.1):
    """Train ``model`` with plain SGD and print progress after every epoch.

    Args:
        model: module called as ``model(sentences, lengths)``; must expose
            ``tagset_size`` and return ``[L x B x tagset_size]`` log-scores.
        train_loader: iterable yielding ``(sentences, tags, lengths)`` batches.
        eval_datasets: optional sequence of datasets; after each epoch the
            accuracy on every one of them is printed as ``Acc_eval_<i>``.
        loss_function: loss applied to the flattened scores and targets.
            NOTE(review): with the default ``nn.NLLLoss()`` every target position
            counts, including padded ones if the loader pads tags; pass
            ``nn.NLLLoss(ignore_index=<pad id>)`` to exclude them.
        n_epoch: number of passes over ``train_loader``.
        lr: SGD learning rate.

    Returns:
        None. The model is updated in place; elapsed time is printed at the end.
    """
    warnings.filterwarnings(action='once')
    start = time.time()
    optimizer = optim.SGD(model.parameters(), lr=lr)

    # Move batches to wherever the model lives instead of relying on a
    # notebook-level `device` global.
    model_device = next(model.parameters()).device

    for epoch in range(n_epoch):
        # Evaluation below switches the model to eval mode, so re-enable
        # training mode at the start of every epoch.
        model.train()
        for sentences, tags, lengths in train_loader:
            # PyTorch accumulates gradients; clear them before each batch.
            optimizer.zero_grad()

            sentence_in = sentences.to(model_device)
            targets = tags.to(model_device)

            # Call the module itself (not .forward) so registered hooks run.
            tag_scores = model(sentence_in, lengths)

            # Flatten [L x B x T] scores and [L x B] targets to per-token rows.
            loss = loss_function(tag_scores.reshape(-1, model.tagset_size), targets.reshape(-1))
            loss.backward()
            optimizer.step()

        if not eval_datasets:
            print(f"Epoch: {epoch + 1}/{n_epoch}")
        else:
            scores = ",  ".join(
                f"Acc_eval_{position}: {calculate_score_generator(model, eval_dataset)}"
                for position, eval_dataset in enumerate(eval_datasets)
            )
            print(f"Epoch: {epoch + 1}/{n_epoch}     {scores}")

    print("Elapsed time: ", time.time() - start)

In [None]:
# Shared DataLoader settings: small shuffled batches, padded by PadSequence.
params = dict(batch_size=8, shuffle=True, collate_fn=PadSequence())

train_loader, test_loader = (
    data.DataLoader(split, **params) for split in (training_set, testing_set)
)

In [None]:
# Seed torch's RNG before creating the models so that weight initialisation
# (and the loader shuffling that follows) is the same on every
# Restart & Run All.
torch.manual_seed(1234)

# The "+ 1" accounts for id 0, which no word or tag maps to.
tagger_sizes = dict(vocab_size=len(word_to_ix) + 1, tagset_size=len(tag_to_ix) + 1)

sru = Tagger(EMBEDDING_DIM, HIDDEN_DIM, use_lstm=False, **tagger_sizes).to(device)
lstm = Tagger(EMBEDDING_DIM, HIDDEN_DIM, use_lstm=True, **tagger_sizes).to(device)

In [None]:
# Train the SRU tagger, printing accuracy on the train and test splits after each epoch.
train_generator(
    sru,
    train_loader,
    eval_datasets=(training_set, testing_set),
    n_epoch=10,
)

In [None]:
# Train the LSTM tagger with the same settings, for comparison with the SRU run.
train_generator(
    lstm,
    train_loader,
    eval_datasets=(training_set, testing_set),
    n_epoch=10,
)