In [1]:
import torch
import torch.nn as nn
from torch.utils import data
from torch.functional import F
import matplotlib.pyplot as plt
from torch.nn.utils.rnn import pad_sequence, pack_sequence

In [2]:
import nltk
import numpy as np
from sklearn.preprocessing import LabelEncoder
import tqdm

## 1. Data parsing 

In [3]:
def read_data():
    reader = nltk.corpus.reader.conll.ConllCorpusReader(".", fileids=['train.txt'],
                                                   columntypes=['words','pos', 'tree', 'chunk'])
    train = reader.iob_sents()
    train = list(filter(lambda x: len(x) > 0, train))
    # Dev
    reader = nltk.corpus.reader.conll.ConllCorpusReader(".", fileids=['dev.txt'],
                                                   columntypes=['words','pos', 'tree', 'chunk'])
    dev = reader.iob_sents()
    dev = list(filter(lambda x: len(x) > 0, dev))
    #Test
    reader = nltk.corpus.reader.conll.ConllCorpusReader(".", fileids=['test.txt'],
                                               columntypes=['words','pos', 'tree', 'chunk'])
    test = reader.iob_sents()
    test = list(filter(lambda x: len(x) > 0, test))
    
    return train, dev, test
    

In [4]:
def read_embeddings():
    embeddings_index = dict()
    f = open('glove.6B.100d.txt')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    return embeddings_index

## 2. Embedding loaders 

In [5]:
def embedding_loader_a(embeddings, word):
    if word in embeddings:
        return embeddings[word]
    else:
        return np.zeros(100)
    
def embedding_loader_b(embeddings, word):
    if word.lower() in embeddings:
        return embeddings[word.lower()]
    else:
        return np.zeros(100)
    
def embedding_loader_c(embeddings, word):
    if word in embeddings:
        return embeddings[word]
    elif word.lower() in embeddings:
        return embeddings[word.lower()]
    else:
        return np.zeros(100)

In [6]:
train, dev, test = read_data()

In [7]:
embeddings = read_embeddings()

In [8]:
label_encoder = LabelEncoder()

In [12]:
all_tokens = list({o[2] for l in train for o in l})

In [14]:
label_encoder = LabelEncoder()
label_encoder.fit(['<PADDING>', *all_tokens])

LabelEncoder()

In [15]:
padding_label = label_encoder.transform(['<PADDING>'])[0]

## Defining neural network and batch dataloader

In [16]:
# Pad batch to the same seq len
def get_padding_collate(padding_label):
    def padding_collate(batch):
        data = [item[0] for item in batch]
        data = pad_sequence(data, batch_first=False)
        target = [item[1] for item in batch]
        target = pad_sequence(target, batch_first=False, padding_value=padding_label)

        mask = [item[2] for item in batch]
        mask = pad_sequence(mask, batch_first=False)

        return [data, target, mask]
    return padding_collate

class DataLoader(data.Dataset):
    def __init__(self, data, label_encoder, embeddings_index, embedding_loader):
        self.data = data
        self.label_encoder = label_encoder
        self.embedding_loader = embedding_loader
        self.embeddings_index = embeddings_index
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        words = np.array(self.data[idx])[:, 0]
        tokens = np.array(self.data[idx])[:, 2]
        x = np.array([self.embedding_loader(self.embeddings_index, word) for word in words])
        y = self.label_encoder.transform(tokens)
        target = torch.from_numpy(y).long()
        return torch.from_numpy(x).float(), target, torch.ones_like(target)

In [17]:
class Network(nn.Module):
    def __init__(self, categories_count):
        super(Network,self).__init__()
        hidden_size = 50
        self.lstm = nn.LSTM(input_size=100, hidden_size=hidden_size, batch_first=False,
                            num_layers=1, bidirectional=True)
        
        self.hidden2tag = nn.Linear(hidden_size * 2, categories_count)
    
    def forward(self, input):
        seq, batch, _ = input.shape
        a, (b, c) = self.lstm(input)
        tag_scores = []
        for output in a:
            tag_space = self.hidden2tag(output)
            score = F.log_softmax(tag_space, dim=1)
            tag_scores.append(score)
            
        return torch.stack(tag_scores)

## 3. Training on batches

In [18]:
class Trainer:
    def __init__(self, net, config, lr=0.001, l2_rate=0):
        net.train()
        net.to(config['DEVICE'])
        self.device = config['DEVICE']
        self.net = net
        self.config = config
        self.optimizer = torch.optim.Adam(net.parameters(), lr=lr, weight_decay=l2_rate)
        self.loss = nn.NLLLoss()
        self.losses = []

    def run(self, dataloader, epochs=1):
        print(">> Running trainer")
        for epoch in range(epochs):
            print(">>> Epoch %s" % epoch)
            for idx, (vectors, target, mask) in enumerate(tqdm.tqdm_notebook(dataloader, ascii=True)):
                vectors, target, mask = vectors.to(self.device), target.to(self.device), mask.to(self.device)
                self.optimizer.zero_grad()
                predicts = self.net(vectors)
                loss = 0
                for seq_idx in range(len(predicts)):
                    seq_loss = self.loss(predicts[seq_idx], target[seq_idx])
                    loss += seq_loss
                
                loss.backward()
                self.losses.append(loss.item())
                self.optimizer.step()
                if idx % 400 == 0:
                    print(">>> Loss: {}".format(np.mean(self.losses[-50:])))

                if self.config['DEBUG'] == True:
                    break
            print("Trainer epoch finished")


## 4. Score calculation and Evaluation 

In [19]:
from numba import jit

@jit
def calculate_metrics(prediction, gt, target):
    tp = np.sum(np.logical_and(prediction == target, gt == target))
    fp = np.sum(np.logical_and(prediction == target, gt != target))
    fn = np.sum(np.logical_and(prediction != target, gt == target))
    return (tp, fp, fn)

def calculate_f1_score(gt, predicted):
    gt = gt.reshape(-1)
    predicted = predicted.reshape(-1)
    tokens = set(np.unique(gt)).union(set(np.unique(predicted)))
    tp_a = 0
    fp_a = 0
    fn_a = 0
    for token in tqdm.tqdm_notebook(tokens):
        tp, fp, fn = calculate_metrics(predicted, gt, token)
        tp_a += tp
        fp_a += fp
        fn_a += fn
    precision = tp_a / (tp_a + fp_a)
    recall = tp_a / (tp_a + fn_a)
    
    f1_micro = (2 * precision * recall) / (precision + recall)
    f05_micro = ((1 + 0.5 * 0.5) * precision * recall) / ( 0.5 * 0.5 * precision + recall)
    return {
        'f1': f1_micro,
        'f05': f05_micro,
        'recall': recall,
        'precision': precision
    }


class Evaluation:
    def __init__(self, net, config):
        net.eval()
        net.to(config['DEVICE'])
        self.device = config['DEVICE']
        self.net = net
        self.config = config
        self.loss = nn.CrossEntropyLoss()

    def run(self, dataloader):
        predicted = []
        labels = []
        with torch.no_grad():
            for idx, (vectors, target, mask) in enumerate(tqdm.tqdm_notebook(dataloader, ascii=True)):
                vectors, target, mask = vectors.to(self.device), target.to(self.device), mask.to(self.device)
                predicts = self.net(vectors)
                batch_preds = []
                for seq_idx in range(len(predicts)):
                    pred = F.softmax(predicts[seq_idx], 1).argmax(dim=1, keepdim=True)
                    batch_preds.extend(list(pred.numpy()))
                
                predicted.extend(batch_preds)
                labels.extend(list(target.view(-1).numpy()))
                if self.config['DEBUG'] == True:
                    break
        predicted = np.array(predicted)
        labels = np.array(labels)
        return calculate_f1_score(labels, predicted.reshape(-1))

In [20]:
config = {
    'DEBUG': False,
    'CUDA': torch.cuda.is_available(),
    'DEVICE': torch.device("cuda" if torch.cuda.is_available() else "cpu")
}

In [23]:
padding_collate = get_padding_collate(padding_label)

In [24]:

def measure_performance(embedding_loader):
    train_data = DataLoader(train, label_encoder, embeddings, embedding_loader)
    test_data = DataLoader(test, label_encoder, embeddings, embedding_loader)
    dev_data = DataLoader(dev, label_encoder, embeddings, embedding_loader)

    train_loader = torch.utils.data.DataLoader(train_data, batch_size=8, num_workers=2,
                                               shuffle=False, collate_fn=padding_collate)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=64, num_workers=2, 
                                              shuffle=False, collate_fn=padding_collate)
    dev_loader = torch.utils.data.DataLoader(dev_data, batch_size=64, num_workers=2, 
                                              shuffle=False, collate_fn=padding_collate)

    net = Network(len(label_encoder.classes_))
    trainer = Trainer(net, config)
    for i in range(5):
        trainer.run(train_loader)
        dev_result = Evaluation(net, config).run(dev_loader)
        print(f"Epoch: {i+1} \n---- Dev result: " + str(dev_result))
        test_result = Evaluation(net, config).run(test_loader)
        print(f"Epoch: {i+1} \n---- Test result: " + str(test_result))

## 5. Performance reports for different embeddings

* Loader A: 0.971
* Loader B: 0.986
* Loader C: 0.986

In [25]:
measure_performance(embedding_loader_a)

>> Running trainer
>>> Epoch 0


HBox(children=(IntProgress(value=0, max=1756), HTML(value='')))

>>> Loss: 92.7218246459961
>>> Loss: 8.048011136054992
>>> Loss: 6.105228921175003
>>> Loss: 6.092414779663086
>>> Loss: 5.237387156486511

Trainer epoch finished


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 1 
---- Dev result: {'f1': 0.9614556105762284, 'f05': 0.9614556105762284, 'recall': 0.9614556105762284, 'precision': 0.9614556105762284}


HBox(children=(IntProgress(value=0, max=54), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 1 
---- Test result: {'f1': 0.9591321450689752, 'f05': 0.9591321450689753, 'recall': 0.9591321450689752, 'precision': 0.9591321450689752}
>> Running trainer
>>> Epoch 0


HBox(children=(IntProgress(value=0, max=1756), HTML(value='')))

>>> Loss: 5.385257003307342
>>> Loss: 5.4594967699050905
>>> Loss: 4.536996765136719
>>> Loss: 4.968982446193695
>>> Loss: 4.260535646677017

Trainer epoch finished


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 2 
---- Dev result: {'f1': 0.9674057241931883, 'f05': 0.9674057241931883, 'recall': 0.9674057241931883, 'precision': 0.9674057241931883}


HBox(children=(IntProgress(value=0, max=54), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 2 
---- Test result: {'f1': 0.9656859742610778, 'f05': 0.9656859742610778, 'recall': 0.9656859742610778, 'precision': 0.9656859742610778}
>> Running trainer
>>> Epoch 0


HBox(children=(IntProgress(value=0, max=1756), HTML(value='')))

>>> Loss: 4.56727454662323
>>> Loss: 4.829206333160401
>>> Loss: 4.10224422454834
>>> Loss: 4.601151878833771
>>> Loss: 3.9309593093395234

Trainer epoch finished


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 3 
---- Dev result: {'f1': 0.9703216739127579, 'f05': 0.9703216739127578, 'recall': 0.9703216739127579, 'precision': 0.9703216739127579}


HBox(children=(IntProgress(value=0, max=54), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 3 
---- Test result: {'f1': 0.9690003198649761, 'f05': 0.9690003198649761, 'recall': 0.9690003198649761, 'precision': 0.9690003198649761}
>> Running trainer
>>> Epoch 0


HBox(children=(IntProgress(value=0, max=1756), HTML(value='')))

>>> Loss: 4.290261154174805
>>> Loss: 4.541636357307434
>>> Loss: 3.86840616106987
>>> Loss: 4.446622521877289
>>> Loss: 3.68896294593811

Trainer epoch finished


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 4 
---- Dev result: {'f1': 0.9723444498443513, 'f05': 0.9723444498443513, 'recall': 0.9723444498443513, 'precision': 0.9723444498443513}


HBox(children=(IntProgress(value=0, max=54), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 4 
---- Test result: {'f1': 0.9704635319899004, 'f05': 0.9704635319899004, 'recall': 0.9704635319899004, 'precision': 0.9704635319899004}
>> Running trainer
>>> Epoch 0


HBox(children=(IntProgress(value=0, max=1756), HTML(value='')))

>>> Loss: 4.063642431497573
>>> Loss: 4.295072264671326
>>> Loss: 3.6730082666873933
>>> Loss: 4.281336473226547
>>> Loss: 3.5233314234018325

Trainer epoch finished


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 5 
---- Dev result: {'f1': 0.9731653816347707, 'f05': 0.9731653816347707, 'recall': 0.9731653816347707, 'precision': 0.9731653816347707}


HBox(children=(IntProgress(value=0, max=54), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 5 
---- Test result: {'f1': 0.9712734028869515, 'f05': 0.9712734028869514, 'recall': 0.9712734028869515, 'precision': 0.9712734028869515}


In [26]:
measure_performance(embedding_loader_b)

>> Running trainer
>>> Epoch 0


HBox(children=(IntProgress(value=0, max=1756), HTML(value='')))

>>> Loss: 94.14912414550781
>>> Loss: 7.677001485824585
>>> Loss: 4.584715450406074
>>> Loss: 5.066996936798096
>>> Loss: 2.840318018198013

Trainer epoch finished


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 1 
---- Dev result: {'f1': 0.9808427357387729, 'f05': 0.980842735738773, 'recall': 0.9808427357387729, 'precision': 0.9808427357387729}


HBox(children=(IntProgress(value=0, max=54), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 1 
---- Test result: {'f1': 0.9787323819051702, 'f05': 0.9787323819051703, 'recall': 0.9787323819051702, 'precision': 0.9787323819051702}
>> Running trainer
>>> Epoch 0


HBox(children=(IntProgress(value=0, max=1756), HTML(value='')))

>>> Loss: 3.3474169081449507
>>> Loss: 3.647911636829376
>>> Loss: 2.3691920852661132
>>> Loss: 3.4510275602340696
>>> Loss: 1.7726290482282638

Trainer epoch finished


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 2 
---- Dev result: {'f1': 0.986149238832044, 'f05': 0.9861492388320439, 'recall': 0.986149238832044, 'precision': 0.986149238832044}


HBox(children=(IntProgress(value=0, max=54), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 2 
---- Test result: {'f1': 0.9839182779014135, 'f05': 0.9839182779014135, 'recall': 0.9839182779014135, 'precision': 0.9839182779014135}
>> Running trainer
>>> Epoch 0


HBox(children=(IntProgress(value=0, max=1756), HTML(value='')))

>>> Loss: 2.312031438946724
>>> Loss: 2.5950870645046233
>>> Loss: 1.6778836411237716
>>> Loss: 2.7679545640945435
>>> Loss: 1.4095083037018776

Trainer epoch finished


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 3 
---- Dev result: {'f1': 0.9880866378574337, 'f05': 0.9880866378574336, 'recall': 0.9880866378574337, 'precision': 0.9880866378574337}


HBox(children=(IntProgress(value=0, max=54), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 3 
---- Test result: {'f1': 0.9852725998216922, 'f05': 0.9852725998216921, 'recall': 0.9852725998216922, 'precision': 0.9852725998216922}
>> Running trainer
>>> Epoch 0


HBox(children=(IntProgress(value=0, max=1756), HTML(value='')))

>>> Loss: 1.9052227288484573
>>> Loss: 2.0562340062856674
>>> Loss: 1.3238331937789918
>>> Loss: 2.316497153043747
>>> Loss: 1.1657696714997292

Trainer epoch finished


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 4 
---- Dev result: {'f1': 0.9889338394651466, 'f05': 0.9889338394651465, 'recall': 0.9889338394651466, 'precision': 0.9889338394651466}


HBox(children=(IntProgress(value=0, max=54), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 4 
---- Test result: {'f1': 0.9861437214588565, 'f05': 0.9861437214588566, 'recall': 0.9861437214588565, 'precision': 0.9861437214588565}
>> Running trainer
>>> Epoch 0


HBox(children=(IntProgress(value=0, max=1756), HTML(value='')))

>>> Loss: 1.6437223947048187
>>> Loss: 1.713949654698372
>>> Loss: 1.084339406490326
>>> Loss: 1.9610538733005525
>>> Loss: 0.9714409869909286

Trainer epoch finished


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 5 
---- Dev result: {'f1': 0.9893081843615777, 'f05': 0.9893081843615777, 'recall': 0.9893081843615777, 'precision': 0.9893081843615777}


HBox(children=(IntProgress(value=0, max=54), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 5 
---- Test result: {'f1': 0.9864159469704703, 'f05': 0.9864159469704703, 'recall': 0.9864159469704703, 'precision': 0.9864159469704703}


In [27]:
measure_performance(embedding_loader_c)

>> Running trainer
>>> Epoch 0


HBox(children=(IntProgress(value=0, max=1756), HTML(value='')))

>>> Loss: 95.03756713867188
>>> Loss: 7.953157591819763
>>> Loss: 4.674965653419495
>>> Loss: 5.1410298538208
>>> Loss: 2.9535986149311064

Trainer epoch finished


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 1 
---- Dev result: {'f1': 0.9806851168350124, 'f05': 0.9806851168350124, 'recall': 0.9806851168350124, 'precision': 0.9806851168350124}


HBox(children=(IntProgress(value=0, max=54), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 1 
---- Test result: {'f1': 0.9789161341255097, 'f05': 0.9789161341255095, 'recall': 0.9789161341255096, 'precision': 0.9789161341255096}
>> Running trainer
>>> Epoch 0


HBox(children=(IntProgress(value=0, max=1756), HTML(value='')))

>>> Loss: 3.4155039381980896
>>> Loss: 3.6533314144611357
>>> Loss: 2.291423262357712
>>> Loss: 3.4486569917201995
>>> Loss: 1.8454523158073426

Trainer epoch finished


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 2 
---- Dev result: {'f1': 0.9864185044593015, 'f05': 0.9864185044593015, 'recall': 0.9864185044593015, 'precision': 0.9864185044593015}


HBox(children=(IntProgress(value=0, max=54), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 2 
---- Test result: {'f1': 0.9838842497124618, 'f05': 0.9838842497124619, 'recall': 0.9838842497124618, 'precision': 0.9838842497124618}
>> Running trainer
>>> Epoch 0


HBox(children=(IntProgress(value=0, max=1756), HTML(value='')))

>>> Loss: 2.364742395877838
>>> Loss: 2.6362554973363874
>>> Loss: 1.6309620189666747
>>> Loss: 2.783191310763359
>>> Loss: 1.4425462484359741

Trainer epoch finished


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 3 
---- Dev result: {'f1': 0.9880997727660804, 'f05': 0.9880997727660804, 'recall': 0.9880997727660804, 'precision': 0.9880997727660804}


HBox(children=(IntProgress(value=0, max=54), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 3 
---- Test result: {'f1': 0.9853202392862247, 'f05': 0.9853202392862247, 'recall': 0.9853202392862247, 'precision': 0.9853202392862247}
>> Running trainer
>>> Epoch 0


HBox(children=(IntProgress(value=0, max=1756), HTML(value='')))

>>> Loss: 2.004744299054146
>>> Loss: 2.1058286398649217
>>> Loss: 1.2832660102844238
>>> Loss: 2.33428885102272
>>> Loss: 1.1694016057252883

Trainer epoch finished


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 4 
---- Dev result: {'f1': 0.9889732441910866, 'f05': 0.9889732441910867, 'recall': 0.9889732441910867, 'precision': 0.9889732441910867}


HBox(children=(IntProgress(value=0, max=54), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 4 
---- Test result: {'f1': 0.9860280256164207, 'f05': 0.9860280256164207, 'recall': 0.9860280256164207, 'precision': 0.9860280256164207}
>> Running trainer
>>> Epoch 0


HBox(children=(IntProgress(value=0, max=1756), HTML(value='')))

>>> Loss: 1.7796859395503999
>>> Loss: 1.753575022816658
>>> Loss: 1.0459512716531754
>>> Loss: 1.9909034770727159
>>> Loss: 0.9722058302164078

Trainer epoch finished


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 5 
---- Dev result: {'f1': 0.9894066961764281, 'f05': 0.9894066961764282, 'recall': 0.9894066961764281, 'precision': 0.9894066961764281}


HBox(children=(IntProgress(value=0, max=54), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Epoch: 5 
---- Test result: {'f1': 0.9863546962303572, 'f05': 0.9863546962303573, 'recall': 0.9863546962303572, 'precision': 0.9863546962303572}
