# chunker: default program

In [1]:
from chunker import *
import os, sys, optparse, gzip, re, logging, random
if os.getcwd().split('\\')[-1]=="answer":
    os.chdir("..")
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tqdm
import string
from collections import OrderedDict
import itertools
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Run the default solution on dev

In [4]:
chunker = LSTMTagger(os.path.join('data', 'train.txt.gz'), os.path.join('data', 'chunker'), '.tar')
decoder_output = chunker.decode('data/input/dev.txt')

100%|██████████| 1027/1027 [00:02<00:00, 459.66it/s]


## Evaluate the default output

In [5]:
flat_output = [ output for sent in decoder_output for output in sent ]
import conlleval
true_seqs = []
with open(os.path.join('data','reference','dev.out')) as r:
    for sent in conlleval.read_file(r):
        true_seqs += sent.split()
conlleval.evaluate(true_seqs, flat_output)

processed 23663 tokens with 11896 phrases; found: 11672 phrases; correct: 8568.
accuracy:  84.35%; (non-O)
accuracy:  85.65%; precision:  73.41%; recall:  72.02%; FB1:  72.71
             ADJP: precision:  36.49%; recall:  11.95%; FB1:  18.00  74
             ADVP: precision:  71.36%; recall:  39.45%; FB1:  50.81  220
            CONJP: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
             INTJ: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
               NP: precision:  70.33%; recall:  76.80%; FB1:  73.42  6811
               PP: precision:  92.40%; recall:  87.14%; FB1:  89.69  2302
              PRT: precision:  65.00%; recall:  57.78%; FB1:  61.18  40
             SBAR: precision:  84.62%; recall:  41.77%; FB1:  55.93  117
               VP: precision:  63.66%; recall:  58.25%; FB1:  60.83  2108


(73.40644276901988, 72.02420981842637, 72.70875763747455)

# chunker: Baseline model

We obtained a dev score of 76.5 for the baseline model. The baseline model was built by concatenating the character-level representation of the word with the word-embedding and passing this as an input to the default chunker.

In [1]:
from chunker import *
import os, sys, optparse, gzip, re, logging, random
if os.getcwd().split('\\')[-1]=="answer":
    os.chdir("..")
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tqdm
import string
from collections import OrderedDict
import itertools
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Run the baseline solution on dev

In [5]:
baseline_chunker = LSTMTagger(os.path.join('../data', 'train.txt.gz'), os.path.join('../data', 'baseline'), '.tar')
decoder_output = baseline_chunker.decode('../data/input/dev.txt')

100%|██████████| 1027/1027 [00:02<00:00, 430.53it/s]


In [8]:
baseline_chunker.model

LSTMTaggerModel(
  (word_embeddings): Embedding(9675, 128)
  (lstm): LSTM(428, 64)
  (hidden2tag): Linear(in_features=64, out_features=22, bias=True)
)

## Evaluate the baseline output

In [7]:
flat_output = [ output for sent in decoder_output for output in sent ]
import conlleval
true_seqs = []
with open(os.path.join('../data','reference','dev.out')) as r:
    for sent in conlleval.read_file(r):
        true_seqs += sent.split()
conlleval.evaluate(true_seqs, flat_output)

processed 23663 tokens with 11896 phrases; found: 11884 phrases; correct: 9106.
accuracy:  86.67%; (non-O)
accuracy:  87.67%; precision:  76.62%; recall:  76.55%; FB1:  76.59
             ADJP: precision:  46.51%; recall:  17.70%; FB1:  25.64  86
             ADVP: precision:  74.27%; recall:  44.97%; FB1:  56.03  241
            CONJP: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
             INTJ: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
               NP: precision:  74.32%; recall:  79.86%; FB1:  76.99  6702
               PP: precision:  91.99%; recall:  87.55%; FB1:  89.71  2323
              PRT: precision:  70.73%; recall:  64.44%; FB1:  67.44  41
             SBAR: precision:  76.92%; recall:  42.19%; FB1:  54.50  130
               VP: precision:  69.46%; recall:  71.18%; FB1:  70.31  2361


(76.62403231235274, 76.546738399462, 76.58536585365853)

# chunker: Additional Improvement - 2nd option 
Made use of a second RNN taking input as the character level representation and used it's hidden layer and concatenated it with the word embeddings and passed it as an input to the chunker RNN.
Obtained a dev score of 77.18 for 10 epochs. However at 3 epochs, the score was 78.4 and then, the model starts to overfit

In [3]:
from chunker import *
import os, sys, optparse, gzip, re, logging, random
if os.getcwd().split('\\')[-1]=="answer":
    os.chdir("..")
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tqdm
import string
from collections import OrderedDict
import itertools
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Run the improved solution on dev

In [4]:
chunker = LSTMTagger(os.path.join('../data', 'train.txt.gz'), os.path.join('../data', 'chunker'), '.tar')
decoder_output = chunker.decode('../data/input/dev.txt')

100%|██████████| 1027/1027 [00:03<00:00, 300.15it/s]


In [5]:

chunker.model

LSTMTaggerModel(
  (word_embeddings): Embedding(9675, 128)
  (lstm): LSTM(192, 64)
  (lstm2): LSTM(300, 64)
  (hidden2tag): Linear(in_features=64, out_features=22, bias=True)
)

## Evaluate the improved output

In [6]:
flat_output = [ output for sent in decoder_output for output in sent ]
import conlleval
true_seqs = []
with open(os.path.join('../data','reference','dev.out')) as r:
    for sent in conlleval.read_file(r):
        true_seqs += sent.split()
conlleval.evaluate(true_seqs, flat_output)

processed 23663 tokens with 11896 phrases; found: 12457 phrases; correct: 9399.
accuracy:  86.49%; (non-O)
accuracy:  87.67%; precision:  75.45%; recall:  79.01%; FB1:  77.19
             ADJP: precision:  45.40%; recall:  32.74%; FB1:  38.05  163
             ADVP: precision:  64.89%; recall:  52.01%; FB1:  57.74  319
            CONJP: precision:   0.00%; recall:   0.00%; FB1:   0.00  3
             INTJ: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
               NP: precision:  76.22%; recall:  82.89%; FB1:  79.42  6783
               PP: precision:  90.91%; recall:  88.04%; FB1:  89.45  2364
              PRT: precision:  42.86%; recall:  60.00%; FB1:  50.00  63
             SBAR: precision:  66.48%; recall:  50.21%; FB1:  57.21  179
               VP: precision:  64.00%; recall:  71.74%; FB1:  67.65  2583


(75.45155334350164, 79.00975117686617, 77.18966862398882)

## Analysis

Do some analysis of the results. What ideas did you try? What worked and what did not?

#### Tried scRNN

Implementation could be incorrect but it seems that the training data might not be enough as the denoiser does not struggle with common words. The results for trained model are shown below. Best results were obtained when vocabulary and training data were converted to lower case.

In [8]:
def char_rep_lower(word):
    beg = {c:0 for c in [c for c in string.printable]}
    inter = {c:0 for c in [c for c in string.printable]}
    end = {c:0 for c in [c for c in string.printable]}
    if word != '[unk]':
        if len(word) == 1:
            beg[word] += 1
        elif len(word) == 2:
            beg[word[0]] += 1
            end[word[0]] += 1
        else:
            beg[word[0]] += 1
            for c in range(1,len(word)-2):
                inter[word[c]] += 1
            end[word[-1]] += 1 
    for l in 'QWERTYUIOPASDFGHJKLZXCVBNM':
        del(beg[l])
        del(inter[l])
        del(end[l])
    res = list(itertools.chain(list(beg.values()), list(inter.values()), list(end.values())))
    return res

class DenoiserModel(nn.Module):

    def __init__(self, hidden_dim, vocab_size):
        torch.manual_seed(1)
        super(DenoiserModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM((100-26)*3, hidden_dim, bidirectional=False)
        self.hidden2tag = nn.Linear(hidden_dim, vocab_size)

    def forward(self, sentence):
        lstm_out, _ = self.lstm(sentence.view(len(sentence), 1, -1))
        word_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        word_scores = F.log_softmax(word_space, dim=1)
        return word_scores

vocab = np.load("../data/vocab.npy")
vocablen = len(vocab)

denoiser = DenoiserModel(650, vocablen).to(device)
saved_model = torch.load("../data/denoiser.tar", map_location='cpu')
denoiser.load_state_dict(saved_model['model_state_dict'])

denoiser.eval()

def forward_denoiser(sentence):
    sentence_lower = [word.lower() for word in sentence.split()]
    inpt = torch.FloatTensor([char_rep_lower(word) for word in sentence_lower]).to(device)
    tag_scores = denoiser(inpt).detach().cpu().numpy()
    return vocab[np.argmax(tag_scores, axis=1)]

In [9]:
print(forward_denoiser("flyng arond the wrld"))
print(forward_denoiser("Rockwell sabid the agreement clals for it to supply 200 additional so-called shipsets for the planes"))

['flying' 'around' 'the' 'worked']
['regional' 'said' 'the' 'agreement' 'class' 'for' 'in' 'to' 'supply'
 '250' 'additional' 'so-called' 'sessions' 'for' 'the' 'plans']


#### scRNN Denoising

We made the B,I,E LSTM from the outlined implementation option 1 as a denoiser and fed the output of this model for dev.txt into the Tagger LSTM as a means to recreate decode noisy words in dev and test sets.

When looking at the results of the output for the denoiser model, it appeared that the model was not working as expected and led to a decrease in F1 score. The noise model was ran for 30 epochs with the same parameters as the Tagger model, but an increased hidden dimension size of 650. Some code is provided below to demonstrate the converting of words into their noisy counter part. The result of read_noisy_annotations() is given the the TaggerLSTM as training data.

This method lowered the F1 score down to 69.897, so we believe that the model was not implemented as in the paper by Sakaguchi, or that our data was too far skewed to [UNK] and common words within the traning data set.

Ouput:
Rockwell Internatinal Corp.'s Tulsa unit said it signed a tentative agreement extending its contract with Boeing Co. to provide structural parts for Boeing's 747 jetliners

Denoised output:
[UNK] International Corp. 's [UNK] unit said it United a little agreement [UNK] its contract with likely Co. to provide structural parts for delivery's,40 [UNK].


accuracy:  83.46%; precision:  69.57%; recall:  69.90%; FB1:  69.7

             ADJP: precision:  41.46%; recall:  15.04%; FB1:  22.08  82
             ADVP: precision:  60.69%; recall:  39.95%; FB1:  48.18  262
            CONJP: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
             INTJ: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
               NP: precision:  66.97%; recall:  73.50%; FB1:  70.08  6845
               PP: precision:  86.14%; recall:  85.83%; FB1:  85.98  2432
              PRT: precision:  66.67%; recall:  53.33%; FB1:  59.26  36
             SBAR: precision:  66.39%; recall:  33.33%; FB1:  44.38  119
               VP: precision:  61.58%; recall:  58.16%; FB1:  59.82  2176
(69.56994645247657, 69.8974445191661, 69.73331096947334)


In [None]:
def read_noisy_annotations(file):
    handle = None
    if file[-3:] == '.gz':
        handle = gzip.open(file, 'rt')
    else:
        handle = open(file, 'r')
    conll_data = []
    contents = re.sub(r'\n\s*\n', r'\n\n', handle.read())
    contents = contents.rstrip()
    for sent_string in contents.split('\n\n'):
        annotations = list(zip(*[ word_string.split() for word_string in sent_string.split('\n') ]))
        annotations = [[], annotations[0]]
        for word in annotations[1]:
            annotations[0].append(noise(word))
        conll_data.append(( annotations[0], annotations[1] ))
        #logging.info("CoNLL: {} ||| {}".format( " ".join(annotations[0]), " ".join(annotations[1])))
    return conll_data

def noise(word):
    random.seed(1)
#     adding '' empty strings to represent do nothing to word.
    choices = ['replace', 'add', 'delete', 'jumble', '', '', '', '', '', '', '']
    option = random.choice(choices)
    if len(word) > 3 and not hasnum(word) and word != '[UNK]':
        if option == 'replace':
            pos_replace = random.randint(1, len(word[1:-1]))
            rand_char = random.choice(string.ascii_letters.lower())
            w = list(word)
            w[pos_replace] = rand_char
            word = ''.join(w)
        elif option == 'add':
            pos_add = random.randint(1, len(word[1:-1]))
            word = word[:pos_add] + random.choice(string.ascii_lowercase) + word[pos_add:]
        elif option == 'delete':
            pos_delete = random.randint(1, len(word[1:-1]))
            word = word[0: pos_delete:] + word[pos_delete + 1::]
        elif option == 'jumble':
            shuf_word = ''.join(random.sample(word[1:-1], len(word[1:-1])))
            word = word[0] + shuf_word + word[-1]
    return word

def hasnum(word):
    for let in word:
        if let.isdigit():
            return True
    return False