In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
import gensim
from string import punctuation
import regex as re
import random
import time
from seqeval.metrics import classification_report,accuracy_score,f1_score


define word2vec embeddings and vocab dictonary

In [2]:
path = os.path.join(os.path.expanduser('~'), 'Documents', 'AIT 726','HW3')
googlePath = os.path.join(os.path.expanduser('~'), 'Documents', 'AIT 726','HW3','GoogleNews-vectors-negative300.bin')

google = gensim.models.KeyedVectors.load_word2vec_format(googlePath,binary=True)

idx2word = {idx: word for idx, word in enumerate(google.index2word)}
word2idx = {word: idx for idx, word in enumerate(google.index2word)}
embeddings = nn.Embedding.from_pretrained(torch.Tensor(google.vectors))

use_gpu = torch.cuda.is_available()



Read text file, create padding function for mini batching the data into the lstm

In [3]:
def readfile(path):
    f = open(path)
    data = []
    sentence = []
    label = []
    for line in f:
        if len(line)==0 or line.startswith('-DOCSTART') or line[0]=='\n':
            if len(sentence) > 0:
                data.append((sentence,label))
                sentence = []
                label = []
            continue
        splits = line.split(' ')
        sentence.append(splits[0])
        label.append(splits[-1][:-1])

    if len(sentence) > 0:
        data.append((sentence, label))
        sentence = []
        label = []
    return data


def padding(document):
    _max = max([len(document[i][0]) for i in range(len(document))])
    for i in range(len(document)):
        document[i] = (document[i][0] + ['<PAD>']*(_max-len(document[i][0])),
                document[i][1] + ['<PAD>']*(_max-len(document[i][1])))
    return document


def encoding(document):
    vocab = []
    tag = []
    data = []
    document = padding(document)
    for i in range(len(document)):
        vocab = torch.LongTensor([word2idx[j] if j in word2idx.keys() and j != '<PAD>' else
                                       word2idx['unk'] for j in document[i][0]])
        tag = torch.LongTensor([tag2idx[k] if j in word2idx.keys() or j != '<PAD>' else
                                     tag2idx['<PAD>'] for j, k in zip(document[i][0], document[i][1])])
        data.append((vocab,tag))
    return data



Custom loss function to not learn paddings and custom prediction function to not incorporate padding into the accuracy

In [4]:
def loss_fn(outputs, labels):
    labels = labels.view(-1)
    mask = (labels >= 1).float()
    num_tokens = int(torch.sum(mask))
    outputs = outputs[range(outputs.shape[0]), labels]*mask
    return -torch.sum(outputs)/num_tokens

def predict(outputs, labels):
    labels = labels.view(-1)
    outputs = outputs[labels!=0]
    labels = labels[labels!=0]
    preds = torch.max(outputs, 1)[1]
    correct = torch.sum(preds==labels).type(torch.float)
    return preds, correct
    

def trainModel(model, optimizer, dataset, minibatchsize, epoch):
    since = time.time()
    
    best_model_wts = model.state_dict()
    bestAcc = 0.0

    for i in range(epoch):
        print('Epoch {}/{}'.format(i, epoch-1))
        print('-' * 10)
        
        for phase in ['train','val']:
            if phase == 'train':
                model.train(True)
            else:
                model.train(False)

            random.shuffle(dataset[phase])

            total_loss = 0.0
            total_correct = 0

            for j in range(0,len(dataset[phase]), minibatchsize):
                z = encoding(dataset[phase][j:j+minibatchsize])
                inputs, labels = zip(*z)
                inputs = torch.stack(inputs)
                labels = torch.stack(labels)
                if use_gpu:
                    inputs = inputs.cuda()
                    labels = labels.cuda()
                
                
                optimizer.zero_grad()
                #Forward Pass
                outputs = model(inputs)
                loss = loss_fn(outputs, labels)
                preds, correct = predict(outputs, labels)
                #backward and optimization
                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                total_loss += loss.item()
                total_correct += correct

            epochLoss = total_loss
            epochAcc = total_correct/datasetSize[phase]
            
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                    phase, epochLoss, epochAcc))
            
            if phase == 'val' and epochAcc > bestAcc:
                bestAcc = epochAcc
                best_model_wts = model.state_dict()
            
        timeElapsed = time.time() - since
        print('Elapsed {:.0f}m {:.0f}s\n'.format(timeElapsed // 60, timeElapsed % 60))
        
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(bestAcc))

    model.load_state_dict(best_model_wts)
    return model


In [5]:
train = readfile(path + '/train.txt')
val = readfile(path+'/valid.txt')
test = readfile(path+'/test.txt')
n_t = sum([len(train[i][0]) for i in range(len(train))])
n_v = sum([len(val[i][0]) for i in range(len(val))])
n_ts = sum([len(test[i][0]) for i in range(len(test))])

dataset = {'train': train,
           'val': val,
           'test': test}
datasetSize = {'train': n_t,
               'val': n_v,
               'test': n_ts}
del train, val, n_t, n_v # free memory
tags = ('<PAD>', 'O', 'I-LOC', 'B-PER', 'I-PER', 'I-ORG','I-MISC','B-MISC', 'B-LOC', 'B-ORG')
tag2idx = {tag: idx for idx, tag in enumerate(tags)}
idx2tag = {idx: tag for idx, tag in enumerate(tags)}



In [6]:
class Bilstm(nn.Module):
    def __init__(self, embeddings, n_class, n_hidden):
        super(Bilstm, self).__init__()
        self.embeddings = embeddings
        self.Bilstm = nn.LSTM(input_size = embeddings.embedding_dim, hidden_size = n_hidden, bidirectional=True,
                            num_layers=6)
        self.fc = nn.Linear(n_hidden*2, n_class)

    def forward(self, x):
        x = self.embeddings(x)
        x, _ = self.Bilstm(x)
        x = x.view(-1, x.shape[2])
        x = self.fc(x)
        return F.log_softmax(x, dim=1)


n_batch = 150
epochs = 50


model = Bilstm(embeddings, len(tags), 384)
if use_gpu:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr= 1e-3)
model = trainModel(model,optimizer, dataset, n_batch,epochs)


Epoch 0/49
----------
train Loss: 77.8869 Acc: 0.8243
val Loss: 17.1299 Acc: 0.8325
Elapsed 0m 23s

Epoch 1/49
----------
train Loss: 73.0479 Acc: 0.8328
val Loss: 17.0553 Acc: 0.8325
Elapsed 0m 45s

Epoch 2/49
----------
train Loss: 56.9657 Acc: 0.8429
val Loss: 8.7703 Acc: 0.8668
Elapsed 1m 8s

Epoch 3/49
----------
train Loss: 34.2721 Acc: 0.8844
val Loss: 7.1976 Acc: 0.8992
Elapsed 1m 31s

Epoch 4/49
----------
train Loss: 28.9880 Acc: 0.9037
val Loss: 6.3993 Acc: 0.9091
Elapsed 1m 54s

Epoch 5/49
----------
train Loss: 26.7014 Acc: 0.9123
val Loss: 6.2545 Acc: 0.9127
Elapsed 2m 17s

Epoch 6/49
----------
train Loss: 24.8424 Acc: 0.9195
val Loss: 5.6813 Acc: 0.9269
Elapsed 2m 39s

Epoch 7/49
----------
train Loss: 23.1549 Acc: 0.9282
val Loss: 5.4380 Acc: 0.9318
Elapsed 3m 2s

Epoch 8/49
----------
train Loss: 21.4693 Acc: 0.9363
val Loss: 4.9989 Acc: 0.9384
Elapsed 3m 25s

Epoch 9/49
----------
train Loss: 20.1950 Acc: 0.9405
val Loss: 4.8758 Acc: 0.9404
Elapsed 3m 48s

Epoch 10/4

In [7]:
model.eval()
predCache = []
    

for j in range(0,len(dataset['test']), n_batch):
    z = encoding(dataset['test'][j:j+n_batch])
    inputs, labels = zip(*z)
    inputs = torch.stack(inputs)
    labels = torch.stack(labels)
    if use_gpu:
        inputs = inputs.cuda()
        labels = labels.cuda()
    outputs = model(inputs)
    loss = loss_fn(outputs, labels)
    preds, correct = predict(outputs, labels)
    predCache.append(preds)

yhat = torch.cat(predCache).cpu().numpy()
yhat = [idx2tag[i] for i in yhat]
y = [dataset['test'][i][1] for i in range(len(dataset['test']))]
y = [j for i in y for j in i]
x = [dataset['test'][i][0] for i in range(len(dataset['test']))]
x = [j for i in x for j in i]

print(x[:10],y[:10], yhat[:10])



['SOCCER', '-', 'JAPAN', 'GET', 'LUCKY', 'WIN', ',', 'CHINA', 'IN', 'SURPRISE'] ['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O'] ['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O']


Full report of performance

In [9]:

print("f1 socre: %f"%(f1_score(y, yhat)))
print("Accuracy score: %f"%(accuracy_score(y, yhat)))

report = classification_report(y, yhat,digits=4)

print(report)

f1 socre: 0.620639
Accuracy score: 0.935803
           precision    recall  f1-score   support

      PER     0.5113    0.5745    0.5411      1617
      ORG     0.5076    0.6237    0.5597      1661
     MISC     0.5697    0.6752    0.6180       702
      LOC     0.7029    0.8255    0.7593      1668

micro avg     0.5739    0.6756    0.6206      5648
macro avg     0.5741    0.6756    0.6206      5648

