In [261]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import io
from torch import nn
from torch.utils.data import DataLoader, Dataset
import itertools
import copy

In [241]:
def get_vocab(vocab_path, tags_path):
    """Build the word->index and tag->index lookup tables.

    Both files are read as one entry per line; an entry's index is its
    zero-based line number (so the first word and the first tag both map
    to 0).  A ``'<PAD>'`` entry is then added to the word vocabulary with
    the next free index, ``len(vocab)``.

    Args:
        vocab_path: Path of the vocabulary file (one token per line).
        tags_path: Path of the tag file (one tag per line).

    Returns:
        A ``(vocab, tag_map)`` tuple of ``dict`` objects mapping str -> int.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding (the data file names advertise UTF-8), so non-ASCII tokens
    # load the same way on every OS.
    with open(vocab_path, encoding='utf-8') as f:
        vocab = {token: index for index, token in enumerate(f.read().splitlines())}

    # The padding token takes the first index after the file's entries.
    vocab['<PAD>'] = len(vocab)

    # Tags are needed to map label strings to class indices.
    with open(tags_path, encoding='utf-8') as f:
        tag_map = {tag: index for index, tag in enumerate(f.read().splitlines())}

    return vocab, tag_map

def get_params(vocab, tag_map, sentences_file, labels_file):
    """Load one data split and convert tokens and labels to integer ids.

    Each line of ``sentences_file`` is split on single spaces and every
    token is replaced by ``vocab[token]``; tokens missing from ``vocab``
    are replaced by ``vocab['UNK']``.  Each line of ``labels_file`` is
    split the same way and every label is replaced by ``tag_map[label]``.

    Args:
        vocab: Mapping token -> index; must contain ``'UNK'`` if any token
            of the split is out of vocabulary.
        tag_map: Mapping label -> index.
        sentences_file: Path of the sentences file (one sentence per line).
        labels_file: Path of the labels file (one label sequence per line).

    Returns:
        ``(sentences, labels, n)`` where ``sentences`` and ``labels`` are
        lists of lists of ints and ``n`` is ``len(sentences)``.

    Raises:
        KeyError: If a label is not present in ``tag_map``.
    """
    # Decode explicitly as UTF-8 rather than with the platform default so
    # the split loads identically on every OS.
    with open(sentences_file, encoding='utf-8') as f:
        sentences = [
            [vocab[token] if token in vocab else vocab['UNK']
             for token in line.split(' ')]
            for line in f.read().splitlines()
        ]

    with open(labels_file, encoding='utf-8') as f:
        # Labels are used as-is (no offset is applied to the tag index).
        labels = [
            [tag_map[label] for label in line.split(' ')]
            for line in f.read().splitlines()
        ]

    return sentences, labels, len(sentences)

In [242]:
# Build the lookup tables once, then index the train / eval / test splits
# with the same vocabulary and tag map.
words, tags = get_vocab(
    './data/utf-8\'\'words.txt',
    './data/utf-8\'\'tags.txt',
)
tr_sentence, tr_labels, len_train = get_params(
    words,
    tags,
    './data/train/utf-8\'\'sentences.txt',
    './data/train/utf-8\'\'labels.txt',
)
e_sentence, e_labels, len_eval = get_params(
    words,
    tags,
    './data/eval/utf-8\'\'sentences.txt',
    './data/eval/utf-8\'\'labels.txt',
)
te_sentence, te_labels, len_test = get_params(
    words,
    tags,
    './data/test/utf-8\'\'sentences.txt',
    './data/test/utf-8\'\'labels.txt',
)

In [243]:
# Spot-check the vocabulary: the index of a common word and of the padding
# token that get_vocab appended after the file's entries.
print(f"print word['the']: {words['the']}")
print(f"print word['<PAD>']: {words['<PAD>']}")

print word['the']: 9
print word['<PAD>']: 35180


In [244]:
# Show the full tag -> class-index mapping.
print(tags)

{'O': 0, 'B-geo': 1, 'B-gpe': 2, 'B-per': 3, 'I-geo': 4, 'B-org': 5, 'I-org': 6, 'B-tim': 7, 'B-art': 8, 'I-art': 9, 'I-per': 10, 'I-gpe': 11, 'I-tim': 12, 'B-nat': 13, 'B-eve': 14, 'I-eve': 15, 'I-nat': 16}


In [245]:
# Summary of the loaded data: number of tag classes, vocabulary size
# (including the appended '<PAD>' entry) and split sizes.
print('The number of outputs is tag_map', len(tags))

g_vocab_size = len(words)
print(f"Num of vocabulary words: {g_vocab_size}")
print('The vocab size is', len(words))
print('The training size is', len_train)
print('The validation size is', len_eval)
# Index 1 is the *second* training sentence, so the message says so (the
# original text claimed it was the first).
print('An example of the second sentence is', tr_sentence[1])
print('An example of its corresponding label is', tr_labels[1])

The number of outputs is tag_map 17
Num of vocabulary words: 35181
The vocab size is 35181
The training size is 33570
The validation size is 7194
An example of the first sentence is [22, 1, 23, 24, 11, 9, 25, 26, 9, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 35, 13, 35, 40, 9, 41, 21, 35]
An example of its corresponding label is [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [246]:
class data_generator(Dataset):
    """Dataset of (sentence, tag) id sequences padded to a fixed length.

    Every sentence and its tag sequence are stored as ``int64`` numpy
    arrays of shape ``(1, max_length)``.  Positions past the end of the
    sentence hold the padding value.

    Args:
        sentences: Iterable of token-id sequences.
        tags: Iterable of tag-id sequences, aligned with ``sentences``.
        max_length: Length every sequence is padded (or truncated) to.
        pading: Fill value for the padded positions of the sentences and,
            unless ``tag_padding`` is given, of the tags as well.
        tag_padding: Optional separate fill value for the padded tag
            positions.  Defaults to ``pading``.

    Raises:
        ValueError: If a sentence and its tag sequence differ in length.
    """

    def __init__(self, sentences, tags, max_length, pading, tag_padding=None):
        if tag_padding is None:
            tag_padding = pading

        self.instances = []
        for sentence, tag in zip(sentences, tags):
            if len(sentence) != len(tag):
                # Fail loudly; numpy would otherwise silently broadcast a
                # length-1 tag list over the whole sentence.
                raise ValueError(
                    'sentence and tag lengths differ: {} != {}'.format(
                        len(sentence), len(tag)))

            # Sequences longer than max_length are truncated rather than
            # crashing with an opaque broadcasting error.
            kept = min(len(sentence), max_length)

            # int64 is pinned explicitly: it is the index dtype that
            # nn.Embedding and CrossEntropyLoss expect, and the dtype numpy
            # infers from a Python int is platform dependent.
            padded_sentence = np.full((1, max_length), pading, dtype=np.int64)
            padded_sentence[0, :kept] = sentence[:kept]
            padded_tag = np.full((1, max_length), tag_padding, dtype=np.int64)
            padded_tag[0, :kept] = tag[:kept]
            self.instances.append((padded_sentence, padded_tag))

    def __getitem__(self, index):
        """Return the ``index``-th pair as two ``(1, max_length)`` tensors."""
        sentence, tag = self.instances[index]
        sentence = torch.tensor(sentence).view(1, -1)
        tag = torch.tensor(tag).view(1, -1)
        return sentence, tag

    def __len__(self):
        """Return the number of stored (sentence, tag) pairs."""
        return len(self.instances)
        

In [247]:
# Longest sentence (in tokens) over the train, eval and test splits; every
# sequence is later padded to this length.  `default=0` keeps the original
# result when all three splits are empty.
max_length = max(
    (len(sentence)
     for sentence in itertools.chain(tr_sentence, e_sentence, te_sentence)),
    default=0,
)
print(max_length)

104


In [248]:
# Smoke test: wrap the training split in a tiny loader and look at one
# batch of padded sentences (x) and padded labels (y).
data = data_generator(
    sentences=tr_sentence,
    tags=tr_labels,
    max_length=max_length,
    pading=words['<PAD>'],
)
train_loader = DataLoader(dataset=data, batch_size=2, shuffle=True)
x, y = next(iter(train_loader))
print(x)
print(y)


tensor([[[   49,   126,     9,   533,   686,  2892,  1002,    34, 12794,    29,
            243,  6852, 12568,    63,  6836,  4289,    11,     9,   309,    21,
          35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180,
          35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180,
          35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180,
          35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180,
          35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180,
          35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180,
          35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180,
          35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180,
          35180, 35180, 35180, 35180]],

        [[  272,    78,  4076, 12236,  3534,  4796,     1, 12237,    93,  1346,
             21, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180,

In [357]:
class LSTMNER(nn.Module):
    """Embedding -> LSTM -> linear tagger for named-entity recognition.

    The evaluation helpers feed the network one token position at a time
    and carry the LSTM state across positions.

    Args:
        batch_size: Stored on the instance; not used by the model itself.
        vocab_size: Number of rows of the embedding table.
        num_tags: Number of output classes.
        hidden: LSTM hidden size.
        num_layer: Number of stacked LSTM layers.
        embedding: Embedding dimension.
        pad_index: Id that marks padded label positions.  Defaults to
            ``vocab_size - 1``, i.e. a padding token appended as the last
            vocabulary entry.
    """

    def __init__(self, batch_size, vocab_size, num_tags, hidden, num_layer, embedding,
                 pad_index=None):
        super(LSTMNER, self).__init__()

        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.num_tags = num_tags
        self.embedding = embedding
        self.num_hidden = hidden
        self.num_layer = num_layer
        self.pad_index = vocab_size - 1 if pad_index is None else pad_index

        # Layer creation order is kept so seeded initialisation is unchanged.
        self.embed_layer = nn.Embedding(self.vocab_size, self.embedding)
        self.LSTM = nn.LSTM(self.embedding, self.num_hidden, self.num_layer, batch_first=True)
        self.decoder = nn.Linear(self.num_hidden, self.num_tags)

    def forward(self, x, hidden):
        """Score the tokens in ``x``.

        Args:
            x: Token ids, either ``(batch,)`` for a single time step or
                ``(batch, seq)`` for whole sequences.
            hidden: ``(h, c)`` LSTM state, e.g. from :meth:`init_hidden`.

        Returns:
            ``(preds, hidden)`` where ``preds`` has shape
            ``(batch, seq, num_tags)`` (``seq == 1`` for 1-D input) and
            ``hidden`` is the updated LSTM state.
        """
        embed = self.embed_layer(x)
        if embed.dim() == 2:
            # Single time step: add the sequence axis the LSTM expects.
            embed = embed.unsqueeze(1)

        output, hidden = self.LSTM(embed, hidden)
        preds = self.decoder(output)
        return preds, hidden

    def init_hidden(self, batch_size):
        """Return a freshly sampled ``(h, c)`` state for ``batch_size`` rows.

        NOTE(review): the state is drawn from a standard normal, so
        evaluation results vary from call to call unless the RNG is seeded;
        zeros are the more common choice.  Left unchanged because existing
        checkpoints were trained this way.
        """
        hidden_state = torch.randn(self.num_layer, batch_size, self.num_hidden)
        cell_state = torch.randn(self.num_layer, batch_size, self.num_hidden)
        hidden = (hidden_state, cell_state)
        return hidden

    def _score_loader(self, loader, criterion=None):
        """Run the model over ``loader`` without gradients and collect stats.

        Returns ``(losses, total_correct, total_words)``: the per-batch
        average loss (all zeros when ``criterion`` is None) and the number
        of correctly tagged / total non-padding label positions.
        """
        self.eval()
        losses = []
        total_correct = 0
        total_words = 0

        with torch.no_grad():
            for sentence, label in loader:
                # (batch, 1, seq) -> (batch, seq); only axis 1 is dropped so
                # a batch of size one keeps its batch axis.
                sentence = torch.squeeze(sentence, dim=1)
                label = torch.squeeze(label, dim=1)

                hidden = self.init_hidden(sentence.size(0))
                batch_loss = 0.0

                for word_id in range(sentence.size(1)):
                    output, hidden = self(sentence[:, word_id], hidden)
                    tg_label = label[:, word_id]
                    is_word = tg_label != self.pad_index

                    # A step made only of padding has nothing to score, and
                    # a mean-reduced loss over zero non-ignored targets
                    # evaluates to NaN in recent PyTorch releases.
                    if not bool(is_word.any()):
                        continue

                    logits = torch.squeeze(output, dim=1)
                    if criterion is not None:
                        batch_loss += criterion(logits, tg_label).item()

                    predicted = logits.argmax(dim=-1)
                    total_words += int(is_word.sum())
                    total_correct += int(((predicted == tg_label) & is_word).sum())

                # Averaged over the padded length, as the training loop does.
                losses.append(batch_loss / max(sentence.size(1), 1))

        return losses, total_correct, total_words

    def evaluate(self, eval_loader, criterion):
        """Compute the mean per-batch loss and token accuracy on a loader.

        The module is left in evaluation mode afterwards.

        Args:
            eval_loader: Iterable of ``(sentence, label)`` batches shaped
                ``(batch, 1, seq)``.
            criterion: Loss taking ``(logits, targets)``; expected to ignore
                targets equal to ``pad_index``.

        Returns:
            ``(epoch_loss, accuracy)``.  ``epoch_loss`` is NaN for an empty
            loader; ``accuracy`` is 0.0 when there are no non-padding labels.
        """
        losses, total_correct, total_words = self._score_loader(eval_loader, criterion)
        accuracy = total_correct / total_words if total_words else 0.0
        epoch_loss = sum(losses) / len(losses) if losses else float('nan')
        return epoch_loss, accuracy

    def test_ner(self, test_loader):
        """Return the token accuracy over ``test_loader``, ignoring padding.

        The module is left in evaluation mode afterwards.  Returns 0.0 when
        the loader has no non-padding labels.
        """
        _, total_correct, total_words = self._score_loader(test_loader)
        return total_correct / total_words if total_words else 0.0

    def predict(self, sentence):
        """Placeholder that always returns ``True``.

        TODO: not implemented; the module-level ``prediction`` function
        performs tagging of raw sentences.
        """
        return True
        

            
                

In [358]:
# Mini-batch bookkeeping: how many full batches fit in one pass over the
# training split.
batch_size = 64
num_samples = len(tr_sentence)
steps_per_epoch = num_samples // batch_size
print('Number of samples from the dataset:', num_samples)
print('Batch size (a power of 2):', int(batch_size))
print('Number of steps to cover one epoch:', steps_per_epoch)

Number of samples from the dataset: 33570
Batch size (a power of 2): 64
Number of steps to cover one epoch: 524


In [362]:
# Padded datasets and shuffled mini-batch loaders for training and
# validation; both use the word padding id as fill value.
train_data = data_generator(
    sentences=tr_sentence,
    tags=tr_labels,
    max_length=max_length,
    pading=words['<PAD>'],
)
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
eval_data = data_generator(
    sentences=e_sentence,
    tags=e_labels,
    max_length=max_length,
    pading=words['<PAD>'],
)
eval_loader = DataLoader(dataset=eval_data, batch_size=batch_size, shuffle=True)

In [363]:
def train_network(model, train_loader, optim, loss_fn=None, pad_index=None):
    """Train ``model`` for one pass over ``train_loader``.

    Each batch is fed one token position at a time; the per-step losses are
    summed, back-propagated once per batch and followed by one optimizer
    step.

    Args:
        model: Module exposing ``init_hidden(batch_size)`` and
            ``model(token_ids, hidden) -> (logits, hidden)``.
        train_loader: Iterable of ``(sentence, label)`` batches shaped
            ``(batch, 1, seq)``.
        optim: Optimizer over the model parameters.
        loss_fn: Loss taking ``(logits, targets)``.  Defaults to the
            notebook-level ``criterion``.
        pad_index: Label id that marks padding.  Defaults to the
            notebook-level ``words['<PAD>']``.

    Returns:
        ``(losses, total, accuracy)``: the per-batch loss averaged over the
        padded sequence length, the number of batches processed, and the
        token accuracy over non-padding positions (0.0 if there are none).
    """
    # Fall back to the notebook globals so existing three-argument calls
    # keep working.
    if loss_fn is None:
        loss_fn = criterion
    if pad_index is None:
        pad_index = words['<PAD>']

    losses = []
    total = 0
    total_correct = 0
    total_words = 0

    model.train(mode=True)

    for sentence, label in train_loader:
        model.zero_grad()

        # Drop only the singleton axis 1.  A bare .squeeze() would also
        # drop the batch axis of a batch of size one, which used to make
        # that batch be skipped silently.
        sentence = torch.squeeze(sentence, dim=1)
        label = torch.squeeze(label, dim=1)

        hidden = model.init_hidden(sentence.size(0))
        batch_loss = None

        for word_id in range(sentence.size(1)):
            output, hidden = model(sentence[:, word_id], hidden)
            tg_label = label[:, word_id]
            is_word = tg_label != pad_index

            # A step made only of padding contributes nothing, and a
            # mean-reduced loss over zero non-ignored targets evaluates to
            # NaN in recent PyTorch releases.
            if not bool(is_word.any()):
                continue

            logits = torch.squeeze(output, dim=1)
            step_loss = loss_fn(logits, tg_label)
            batch_loss = step_loss if batch_loss is None else batch_loss + step_loss

            predicted = logits.detach().argmax(dim=-1)
            total_words += int(is_word.sum())
            total_correct += int(((predicted == tg_label) & is_word).sum())

        if batch_loss is None:
            # Batch contained only padding: nothing to learn from.
            losses.append(0.0)
        else:
            batch_loss.backward()
            optim.step()
            losses.append(batch_loss.item() / sentence.size(1))
        total += 1

    accuracy = total_correct / total_words if total_words else 0.0

    return losses, total, accuracy

In [364]:
# Seed both RNGs so weight initialisation and shuffling are repeatable.
np.random.seed(1)
torch.manual_seed(1)

# Positional arguments after the tag count: hidden size 50, 1 LSTM layer,
# embedding size 50.
model = LSTMNER(batch_size, len(words), len(tags), 50, 1, 50)
# Padded label positions carry the word padding id, so the loss skips them.
criterion = nn.CrossEntropyLoss(reduction='mean', ignore_index=words['<PAD>'])
optim = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0, betas=(0.9, 0.999),
                         eps=1e-8, amsgrad=False)

epoch = 5
least_cost = np.inf
train_loss = []
eval_loss = []

for i in range(epoch):

    print('Ep {:4d}'.format(i), end='')

    losses, total, tr_accuracy = train_network(model, train_loader, optim)

    epoch_loss = np.sum(losses)/total
    train_loss.append(epoch_loss)

    print(' |Train loss {:4f}'.format(epoch_loss), end='')
    print(' |Train Acc {:4f}'.format(tr_accuracy), end='')
    evaluate_loss, accuracy = model.evaluate(eval_loader, criterion)
    eval_loss.append(evaluate_loss)
    print(' |Evaluation loss {:4f}'.format(evaluate_loss), end='')
    print(' |Evaluation Acc {:4f}'.format(accuracy), end='')

    # Checkpoint only when the validation loss improves.  The best value is
    # written back to `least_cost`, the variable being compared; assigning
    # it to a different name meant every epoch was saved as "best".
    if least_cost > evaluate_loss:
        least_cost = evaluate_loss
        torch.save(model.state_dict(), './NER.pth')
        best_model = copy.deepcopy(model)
        print('|Saved\n')
    else:
        print('\n')
            

Ep    0 |Train loss 0.328526 |Train Acc 0.813266 |Evaluation loss 0.199674 |Evaluation Acc 0.859466|Saved

Ep    1 |Train loss 0.170322 |Train Acc 0.884685 |Evaluation loss 0.140305 |Evaluation Acc 0.904519|Saved

Ep    2 |Train loss 0.124468 |Train Acc 0.915822 |Evaluation loss 0.109306 |Evaluation Acc 0.922637|Saved

Ep    3 |Train loss 0.100078 |Train Acc 0.931342 |Evaluation loss 0.093730 |Evaluation Acc 0.933803|Saved

Ep    4 |Train loss 0.083213 |Train Acc 0.941031 |Evaluation loss 0.082697 |Evaluation Acc 0.939545|Saved



In [365]:
# Score the best checkpoint on the held-out test split, fed as one batch
# that contains the whole split.
test_data = data_generator(
    sentences=te_sentence,
    tags=te_labels,
    max_length=max_length,
    pading=words['<PAD>'],
)
test_loader = DataLoader(dataset=test_data, batch_size=len(te_sentence), shuffle=True)
accuracy = best_model.test_ner(test_loader)
print(f"Accuracy of the model on test data: {accuracy}" )

Accuracy of the model on test data: 0.938944147880743


In [430]:
def prediction(model, sentence, vocab=None, tag_map=None):
    """Tag a raw sentence and return one label string per token.

    The sentence is split on single spaces, each token is mapped to its
    vocabulary id (``vocab['UNK']`` for unknown tokens) and the ids are fed
    to ``model`` one at a time, carrying the hidden state along.

    Args:
        model: Module exposing ``init_hidden(batch_size)`` and
            ``model(token_ids, hidden) -> (logits, hidden)``.
        sentence: Space-separated text to tag.
        vocab: Mapping token -> id.  Defaults to the notebook-level
            ``words``.
        tag_map: Mapping label -> class index.  Defaults to the
            notebook-level ``tags``.

    Returns:
        List of label strings, one per token of ``sentence``.
    """
    # Fall back to the notebook globals so existing two-argument calls keep
    # working.
    if vocab is None:
        vocab = words
    if tag_map is None:
        tag_map = tags

    # Invert the mapping explicitly instead of relying on the dict's key
    # order matching the class indices.
    index_to_tag = {index: tag for tag, index in tag_map.items()}

    hidden = model.init_hidden(1)
    vec = [vocab[token] if token in vocab else vocab['UNK'] for token in sentence.split(' ')]
    # Shape (1, n_tokens): a batch holding the single sentence.
    sentence_vec = torch.tensor([vec], dtype=torch.long)

    pred = []
    with torch.no_grad():
        for id_word in range(len(vec)):
            output, hidden = model(sentence_vec[:, id_word], hidden)
            pred.append(index_to_tag[int(output.argmax(dim=-1))])
    return pred

In [439]:
# Tag a hand-written example sentence with the best checkpoint.
sentence = 'Ali is going on vacation in December to New York'
pred = prediction(best_model, sentence)

In [440]:
# Print each token next to its predicted tag, tab separated.
for word, label in zip(sentence.split(' '), pred):
    print(f"{word}\t{label}")

Ali	B-per
is	O
going	O
on	O
vacation	O
in	O
December	B-tim
to	O
New	B-geo
York	I-geo
