In [1]:
import nltk
from nltk.corpus.reader import ConllCorpusReader

In [2]:
emergingE = ConllCorpusReader('emerging_entities_17-master/','.conll',  ('words', 'pos', 'chunk'))

### Data Preparation

In [3]:
## Training and testing

train_sents = list(emergingE.tagged_sents('wnut17train.conll')) 
valid_sents = list(emergingE.tagged_sents('emerging.dev.conll'))
test_sents = list(emergingE.tagged_sents('emerging.test.conll'))

print(train_sents[0])
#each tuple contains token, syntactic tag, ner label


[('@paulwalk', 'O'), ('It', 'O'), ("'s", 'O'), ('the', 'O'), ('view', 'O'), ('from', 'O'), ('where', 'O'), ('I', 'O'), ("'m", 'O'), ('living', 'O'), ('for', 'O'), ('two', 'O'), ('weeks', 'O'), ('.', 'O'), ('Empire', 'B-location'), ('State', 'I-location'), ('Building', 'I-location'), ('=', 'O'), ('ESB', 'B-location'), ('.', 'O'), ('Pretty', 'O'), ('bad', 'O'), ('storm', 'O'), ('here', 'O'), ('last', 'O'), ('evening', 'O'), ('.', 'O')]


In [4]:
# functions of sentence representations for sequence labelling
def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [5]:
# sentence representations for sequence labelling
train_sent_tokens = [sent2tokens(s) for s in train_sents]
train_labels = [sent2labels(s) for s in train_sents]

train_id_2_label = list(set([label for sent in train_labels for label in sent]))
train_label_2_id = {label:i for i, label in enumerate(train_id_2_label)}
print("Number of unique labels in training data:", len(train_id_2_label))

def convert_labels_to_inds(sent_labels, label_2_id):
  return [label_2_id[label] for label in sent_labels]

train_label_inds = [convert_labels_to_inds(sent_labels, train_label_2_id) for sent_labels in train_labels]

test_sent_tokens = [sent2tokens(s) for s in test_sents]
test_labels = [sent2labels(s) for s in test_sents]

### Test set contains label such as (B-corporation,B-person,B-location), 
### so we have to separate them into (B-corporation), (B-person) and (B-location)
### if not we will encounter key error

test_labels_sep = list(([label for sent in test_labels for label in sent]))
test_labels_sep = ",".join(test_labels_sep) 
test_labels_sep = test_labels_sep.split(",") 
###

test_label_inds = [convert_labels_to_inds(test_labels_sep, train_label_2_id) for s in test_labels_sep]

Number of unique labels in training data: 13


In [6]:
window_size = 2

# converting tokenized sentence lists to vocabulary indices
id_2_word = list(set([token for sent in train_sent_tokens for token in sent])) + ["<pad>", "<unk>"]
word_2_id = {w:i for i,w in enumerate(id_2_word)}

def convert_tokens_to_inds(sentence, word_2_id):
    return [word_2_id.get(t, word_2_id["<unk>"]) for t in sentence]

# padding for windows
def pad_sentence_for_window(sentence, window_size, pad_token="<pad>"):
    return [pad_token]*window_size + sentence + [pad_token]*window_size 


In [7]:
import pprint
pp = pprint.PrettyPrinter()

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from functools import partial

In [8]:
# Batching sentences together with a DataLoader

def my_collate(data, window_size, word_2_id):
    """
    For some chunk of sentences and labels
        -add winow padding
        -pad for lengths using pad_sequence
        -convert our labels to one-hots
        -return padded inputs, one-hot labels, and lengths
    """
    
    x_s, y_s = zip(*data)

    # deal with input sentences as we've seen
    window_padded = [convert_tokens_to_inds(pad_sentence_for_window(sentence, window_size), word_2_id)
                                                                                  for sentence in x_s]
    # append zeros to each list of token ids in batch so that they are all the same length
    padded = nn.utils.rnn.pad_sequence([torch.LongTensor(t) for t in window_padded], batch_first=True)
    
    # convert labels to one-hots
    labels = []
    lengths = []
    for y in y_s:
        lengths.append(len(y))
        one_hot = torch.zeros(len(y), len(train_id_2_label))
        y = torch.tensor(y)
        y = y.unsqueeze(1)
        label = one_hot.scatter_(1, y, 1)
        labels.append(label)
    padded_labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)
    
    return padded.long(), padded_labels, torch.LongTensor(lengths)

In [9]:
batch_size = 4

# Shuffle True is good practice for train loaders.
# Use functools.partial to construct a partially populated collate function
train_loader = DataLoader(list(zip(train_sent_tokens, train_label_inds)), 
                            batch_size=batch_size, shuffle=True, 
                            collate_fn=partial(my_collate, window_size=2, word_2_id=word_2_id))

In [10]:
for batched_input, batched_labels, batch_lengths in train_loader:
    pp.pprint(("inputs", batched_input, batched_input.size()))
    pp.pprint(("labels", batched_labels, batched_labels.size()))
    pp.pprint(batch_lengths)
    break

('inputs',
 tensor([[14878, 14878,  4923, 11797,  1939,  9188,  5905, 14334, 11289,  9563,
          7654,  7312, 11697,  4933, 11886,   738,  1770,  1975,  6561,  1401,
         13472, 12751, 10026, 14878, 14878],
        [14878, 14878,  3772, 13404,  7387,  9851,  3887,  7848,  4655,  1558,
          1106, 14878, 14878,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [14878, 14878,  4923,  1554,  1939,  4452, 10077,  1703, 10096,  4737,
         11692, 12071, 11512, 12153,   783, 13421,  5573,  8290, 14878, 14878,
             0,     0,     0,     0,     0],
        [14878, 14878,  8287, 12870, 13852,  3001,   849,  5544,  4017, 13950,
          1690, 10368, 14878, 14878,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0]]),
 torch.Size([4, 25]))
('labels',
 tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
   

In [11]:
class SoftmaxWordWindowClassifier(nn.Module):
    """
    A one-layer, binary word-window classifier.
    """
    def __init__(self, config, vocab_size, pad_idx=0):
        super(SoftmaxWordWindowClassifier, self).__init__()
        """
        Instance variables.
        """
        self.window_size = 2*config["half_window"]+1
        self.embed_dim = config["embed_dim"]
        self.hidden_dim = config["hidden_dim"]
        self.num_classes = config["num_classes"]
        self.freeze_embeddings = config["freeze_embeddings"]
        
        """
        Embedding layer
        -model holds an embedding for each layer in our vocab
        -sets aside a special index in the embedding matrix for padding vector (of zeros)
        -by default, embeddings are parameters (so gradients pass through them)
        """
        self.embed_layer = nn.Embedding(vocab_size, self.embed_dim, padding_idx=pad_idx)
        if self.freeze_embeddings:
            self.embed_layer.weight.requires_grad = False
        
        """
        Hidden layer
        -we want to map embedded word windows of dim (window_size+1)*self.embed_dim to a hidden layer.
        -nn.Sequential allows you to efficiently specify sequentially structured models
            -first the linear transformation is evoked on the embedded word windows
            -next the nonlinear transformation tanh is evoked.
        """
        self.hidden_layer = nn.Sequential(nn.Linear(self.window_size*self.embed_dim, 
                                                    self.hidden_dim), 
                                          nn.Tanh())
        
        """
        Output layer
        -we want to map elements of the output layer (of size self.hidden dim) to a number of classes.
        """
        self.output_layer = nn.Linear(self.hidden_dim, self.num_classes)
        
        """
        Softmax
        -The final step of the softmax classifier: mapping final hidden layer to class scores.
        -pytorch has both logsoftmax and softmax functions (and many others)
        -since our loss is the negative LOG likelihood, we use logsoftmax
        -technically you can take the softmax, and take the log but PyTorch's implementation
         is optimized to avoid numerical underflow issues.
        """
        self.log_softmax = nn.LogSoftmax(dim=2)
        
    def forward(self, inputs):
        """
        Let B:= batch_size
            L:= window-padded sentence length
            D:= self.embed_dim
            S:= self.window_size
            H:= self.hidden_dim
            
        inputs: a (B, L) tensor of token indices
        """
        B, L = inputs.size()
        
        """
        Reshaping.
        Takes in a (B, L) LongTensor
        Outputs a (B, L~, S) LongTensor
        """
        # Fist, get our word windows for each word in our input.
        token_windows = inputs.unfold(1, self.window_size, 1)
        _, adjusted_length, _ = token_windows.size()
        
        # Good idea to do internal tensor-size sanity checks, at the least in comments!
        assert token_windows.size() == (B, adjusted_length, self.window_size)
        
        """
        Embedding.
        Takes in a torch.LongTensor of size (B, L~, S) 
        Outputs a (B, L~, S, D) FloatTensor.
        """
        embedded_windows = self.embed_layer(token_windows)
        
        """
        Reshaping.
        Takes in a (B, L~, S, D) FloatTensor.
        Resizes it into a (B, L~, S*D) FloatTensor.
        -1 argument "infers" what the last dimension should be based on leftover axes.
        """
        embedded_windows = embedded_windows.view(B, adjusted_length, -1)
        
        """
        Layer 1.
        Takes in a (B, L~, S*D) FloatTensor.
        Resizes it into a (B, L~, H) FloatTensor
        """
        layer_1 = self.hidden_layer(embedded_windows)
        
        
        """
        Layer 2
        Takes in a (B, L~, H) FloatTensor.
        Resizes it into a (B, L~, 2) FloatTensor.
        """
        output = self.output_layer(layer_1)
        
        """
        Softmax.
        Takes in a (B, L~, 2) FloatTensor of unnormalized class scores.
        Outputs a (B, L~, 2) FloatTensor of (log-)normalized class scores.
        """
        output = self.log_softmax(output)
        
        return output

In [12]:
def loss_function(outputs, labels, lengths):
    """Computes negative LL loss on a batch of model predictions."""
    B, L, num_classes = outputs.size()
    num_elems = lengths.sum().float()
        
    # get only the values with non-zero labels
    loss = outputs*labels
    
    # rescale average
    return -loss.sum() / num_elems

In [13]:
def train_epoch(loss_function, optimizer, model, train_data):
    
    ## For each batch, we must reset the gradients
    ## stored by the model.   
    total_loss = 0
    for batch, labels, lengths in train_data:
        # clear gradients
        optimizer.zero_grad()
        # evoke model in training mode on batch
        outputs = model.forward(batch)
        # compute loss w.r.t batch
        loss = loss_function(outputs, labels, lengths)
        # pass gradients back, startiing on loss value
        loss.backward()
        # update parameters
        optimizer.step()
        total_loss += loss.item()
    
    # return the total to keep track of how you did this time around
    return total_loss

In [14]:
config = {"batch_size": 4,
          "half_window": 2,
          "embed_dim": 25,
          "hidden_dim": 25,
          "num_classes": 13,
          "freeze_embeddings": False,
         }
learning_rate = 0.0002
num_epochs = 100
model = SoftmaxWordWindowClassifier(config, len(word_2_id))
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [15]:
for epoch in range(num_epochs):
    epoch_loss = train_epoch(loss_function, optimizer, model, train_loader)
    print(epoch, epoch_loss)

0 1984.0911266803741
1 1792.0715091228485
2 1607.2546564340591
3 1428.6124905347824
4 1261.417154788971
5 1111.3024247288704
6 978.8777705430984
7 868.2715724110603
8 777.5037584900856
9 699.8801460266113
10 634.8877540230751
11 585.3708375096321
12 544.7903020083904
13 506.13670694828033
14 477.1411560624838
15 454.2796929627657
16 431.78946666419506
17 413.5574513077736
18 401.9932784512639
19 385.9289086908102
20 376.22236704826355
21 366.0073767527938
22 358.61624309420586
23 349.6384425610304
24 341.78234745562077
25 336.6677959486842
26 332.50128585100174
27 326.40398567169905
28 321.82110530883074
29 319.4336385950446
30 316.42031425237656
31 313.945821352303
32 309.1180630326271
33 307.0945975407958
34 301.8980015181005
35 302.4247559569776
36 300.71974082291126
37 298.57036846131086
38 297.4774141609669
39 295.9250449799001
40 293.87659879401326
41 294.1891141496599
42 291.8307018019259
43 290.55560522153974
44 290.0176186785102
45 288.4694458581507
46 290.69103196263313
47 28

### Evaluation

In [16]:
test_loader = DataLoader(list(zip(test_sent_tokens, test_label_inds)), 
                            batch_size=batch_size, shuffle=False, 
                            collate_fn=partial(my_collate, window_size=2, word_2_id=word_2_id))

In [17]:
test_outputs = []
for test_instance, labs, _ in test_loader:
    outputs_full = model.forward(test_instance)
    outputs = torch.argmax(outputs_full, dim=2)
    for i in range(outputs.size(0)):
      test_outputs.append(outputs[i].tolist())


In [18]:
y_test = test_labels
y_pred = []
for test, pred in zip(test_labels, test_outputs):
  y_pred.append([train_id_2_label[id] for id in pred[:len(test)]])

assert len(y_pred) == len(y_test), '{} vs. {}'.format(len(y_pred), len(y_test))
for i, pred, test in zip(list(range(len(y_pred))), y_pred, y_test):
  assert len(pred) == len(test), '{}: {} vs. {}'.format(i, len(pred), len(test))


In [19]:
# evaluate CRF model
from sklearn_crfsuite import metrics

metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=train_id_2_label)

  'precision', 'predicted', average, warn_for)


0.9190593176758906