In [1]:
import pandas as pd
import numpy as np
import re
import string
import gensim
import time 
import json

import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
import math
from argparse import Namespace

from torch.utils.data import Dataset, DataLoader
from collections import Counter
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
import codecs
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from sklearn.metrics import accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
from tabulate import tabulate

In [2]:
args = Namespace(
    # Data and Path hyper parameters
    data_file='./data/ner.csv',
    lable_file='labels.txt',
    vectorizer_file="nervectorizer.json",
    model_state_file="NERteachingForce.h5",
    lr = 0.0001,
    seed=666,
    dropout_p=0.1,
    batch_train=1024,
    batch_valid=512,
    batch_test=1024,
    num_epochs=200,
    cuda=True,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True,
    source_embedding_size=128, 
    target_embedding_size=128, 
    encoding_size=128,
)

np.random.seed(666)
torch.manual_seed(666)
if args.cuda:
    torch.cuda.manual_seed_all(666)

# Functions

In [3]:
def update_train_state(args, model, train_state):
    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        # If loss worsened
        if loss_t >= train_state['early_stopping_best_val']:
            # Update step
            train_state['early_stopping_step'] += 1
        # Loss decreased
        else:
            # Save the best model
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])

            # Reset early stopping step
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
    for data_dict in dataloader:
        lengths = data_dict['x_length'].numpy()
        sorted_length_indices = lengths.argsort()[::-1].tolist()
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name][sorted_length_indices].to(device)
        yield out_data_dict

def sequence_loss(y_pred, y_true, mask_index, weight=None):
    y_pred, y_true = normalize_sizes(y_pred, y_true)
    indexs = []
    for i in range(len(y_true)):
        if y_true[i].item() != 179 or i%4 ==0:
            indexs.append(i)
    return F.cross_entropy(y_pred[indexs], y_true[indexs], ignore_index=mask_index, weight=weight)

def compute_accuracy(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)

    _, y_pred_indices = y_pred.max(dim=1)
    
    correct_indices = torch.eq(y_pred_indices, y_true).float()
    valid_indices = torch.ne(y_true, mask_index).float()
    
    n_correct = (correct_indices * valid_indices).sum().item()
    n_valid = valid_indices.sum().item()

    return n_correct / n_valid * 100

def normalize_sizes(y_pred, y_true):
    if len(y_pred.size()) == 3:
        y_pred = y_pred.contiguous().view(-1, y_pred.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)
    return y_pred, y_true

def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def make_train_state(args):
    return {'stop_early': False, 'early_stopping_step': 0, 'early_stopping_best_val': 1e8, 'learning_rate': args.lr, 'epoch_index': 0, 'train_loss': [], 'train_acc': [], 'val_loss': [],
            'val_acc': [], 'test_loss': -1, 'test_acc': -1, 'model_filename': args.model_state_file}

def pretty_print(epoch, t_loss, t_acc, v_loss, v_acc):
    det = str(epoch)
    if epoch < 100:
        det = '0' + det
    
    if epoch < 10:
        det = '0' + det
    print("Epochs:", det, '| Train loss', str(t_loss)[: 6], 'Train acc', str(t_acc)[: 6], 'Val loss', str(v_loss)[: 6], 'Val acc', str(v_acc)[: 6])

# Dataset

In [4]:
class Vocabulary(object):
    def __init__(self, token_to_idx=None):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token for token, idx in self._token_to_idx.items()}

    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx}

    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)

    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def add_many(self, tokens):
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        return self._token_to_idx[token]

    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

class SequenceVocabulary(Vocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>", mask_token="<MASK>", begin_seq_token="<BEGIN>", end_seq_token="<END>"):

        super(SequenceVocabulary, self).__init__(token_to_idx)
        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

    def to_serializable(self):
        contents = super(SequenceVocabulary, self).to_serializable()
        contents.update({'unk_token': self._unk_token, 'mask_token': self._mask_token, 'begin_seq_token': self._begin_seq_token, 'end_seq_token': self._end_seq_token})
        return contents

    def lookup_token(self, token):
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

class NERVectorizer(object):
    def __init__(self, NARRATIVE_vocab, target_vocab):
        self.NARRATIVE_vocab = NARRATIVE_vocab
        self.target_vocab = target_vocab

    def vectorize(self, NARRATIVE, target, vector_length=-1, target_vector_length=-1):
        indices = [self.NARRATIVE_vocab.begin_seq_index]
        indices.extend(self.NARRATIVE_vocab.lookup_token(token) for token in NARRATIVE.split("|"))
        indices.append(self.NARRATIVE_vocab.end_seq_index)

        if vector_length < 0:
            vector_length = len(indices)

        x_source = np.zeros(vector_length, dtype=np.int64)
        x_source[:len(indices)] = indices
        x_source[len(indices):] = self.NARRATIVE_vocab.mask_index
        
        out_indices = [self.target_vocab.begin_seq_index]
        out_indices.extend(self.target_vocab.lookup_token(token) for token in target.split("|"))
        out_indices.append(self.target_vocab.end_seq_index)
        
        if target_vector_length < 0:
            target_vector_length = len(out_indices)
        
        x_target = np.zeros(target_vector_length, dtype=np.int64)
        x_target[:len(out_indices[:-1])] = out_indices[:-1]
        x_target[len(out_indices[:-1]):] = self.target_vocab.mask_index
        
        y_target = np.zeros(target_vector_length-1, dtype=np.int64)
        y_target[:len(out_indices[1:])] = out_indices[1:]
        y_target[len(out_indices[1:]):] = self.target_vocab.mask_index
        return x_source, x_target, y_target, len(indices), len(out_indices)

    @classmethod
    def from_dataframe(cls, df, targets):
        target_vocab = SequenceVocabulary()        
        for target in sorted(targets):
            target_vocab.add_token(target)

        word_counts = Counter()
        for NARRATIVE in df.NARRATIVE:
            for token in NARRATIVE.split("|"):
                word_counts[token] += 1

        NARRATIVE_vocab = SequenceVocabulary()
        for word, word_count in word_counts.items():
            NARRATIVE_vocab.add_token(word)
        return cls(NARRATIVE_vocab, target_vocab)

    @classmethod
    def from_serializable(cls, contents):
        title_vocab = SequenceVocabulary.from_serializable(contents['NARRATIVE_vocab'])
        category_vocab = Vocabulary.from_serializable(contents['target_vocab'])
        return cls(NARRATIVE_vocab=NARRATIVE_vocab, target_vocab=target_vocab)

    def to_serializable(self):
        return {'NARRATIVE_vocab': self.NARRATIVE_vocab.to_serializable(), 'target_vocab': self.target_vocab.to_serializable()}

class NERDataset(Dataset):
    def __init__(self, df, vectorizer, labels):
        self.df = df
        self._vectorizer = vectorizer
        self.labels=labels

        measure_len = lambda context: len(context.split("|"))
        self._max_seq_length = max(map(measure_len, df.NARRATIVE)) + 2
        
        self.train_df = self.df[self.df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.df[self.df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.df[self.df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size), 'val': (self.val_df, self.validation_size), 'test': (self.test_df, self.test_size)}
        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, news_csv, label_txt):
        labels=[]
        with codecs.open('labels.txt', 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                if len(line)>0:
                    labels.append(line.strip())
        
        df = pd.read_csv(news_csv)
        return cls(df, NERVectorizer.from_dataframe(df, labels), labels)

    @classmethod
    def load_dataset_and_load_vectorizer(cls, news_csv, vectorizer_filepath):
        df = pd.read_csv(news_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(news_csv, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return NameVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        NARRATIVE_vector, x_target, y_target, length1, length2 = self._vectorizer.vectorize(row.NARRATIVE, row.target, self._max_seq_length, self._max_seq_length)
        return {"x_source": NARRATIVE_vector, 
                "x_target": x_target,
                "y_target": y_target, 
                'x_length': length1,
                'y_length': length2}

    def get_num_batches(self, batch_size):
        return len(self) // batch_size

## Load data

In [5]:
dataset = NERDataset.load_dataset_and_make_vectorizer(args.data_file, args.lable_file)
dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

In [6]:
dataset.df.shape

(10000, 3)

In [7]:
dataset[0]

{'x_source': array([ 2,  4,  5,  6,  7,  8,  9, 10,  5, 11, 12, 13, 14,  5, 15,  8, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 13, 27, 23, 28,  5, 29, 30,
        31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 23, 37,  9,
         6,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]),
 'x_target': array([  2,   5, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179,
        179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179,
        179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179,
        179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0]),
 'y_target': array([  5, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179,
        179, 179, 179, 179, 179, 179, 179, 179, 179, 

# Teaching Force

## Model

In [8]:
class SeqToSeq(nn.Module):
    def __init__(self, source_vocab_size, source_embedding_size,  target_vocab_size, target_embedding_size, encoding_size,  target_bos_index):
        super(SeqToSeq, self).__init__()
        self.encoder = Encoder(num_embeddings=source_vocab_size,  embedding_size=source_embedding_size, rnn_hidden_size=encoding_size)
        decoding_size = encoding_size * 2
        self.decoder = Decoder(num_embeddings=target_vocab_size,  embedding_size=target_embedding_size,  rnn_hidden_size=decoding_size, bos_index=target_bos_index)
    
    def forward(self, x_source, x_source_lengths, target_sequence, _train):
        encoder_state, final_hidden_states = self.encoder(x_source, x_source_lengths)
        decoded_states = self.decoder(encoder_state=encoder_state,  initial_hidden_state=final_hidden_states,  target_sequence=target_sequence, _train=_train)
        return decoded_states

class Decoder(nn.Module):
    def __init__(self, num_embeddings, embedding_size, rnn_hidden_size, bos_index):
        super(Decoder, self).__init__()
        self._rnn_hidden_size = rnn_hidden_size
        self.target_embedding = nn.Embedding(num_embeddings=num_embeddings,  embedding_dim=embedding_size,  padding_idx=0)
        self.rnn = nn.GRUCell(embedding_size,  rnn_hidden_size)
        self.classifier1 = nn.Linear(rnn_hidden_size, num_embeddings)
        self.classifier2 = nn.Linear(num_embeddings, num_embeddings)
        self.bos_index = bos_index
    
    def _init_indices(self, batch_size):
        return torch.ones(batch_size, dtype=torch.int64) * self.bos_index
    
    def forward(self, encoder_state, initial_hidden_state, target_sequence, _train):

        target_sequence = target_sequence.permute(1, 0)
        output_sequence_size = target_sequence.size(0)
        
        h_t = initial_hidden_state
        batch_size = encoder_state.size(0)


        y_t_index = self._init_indices(batch_size)
        
        h_t = h_t.to(encoder_state.device)
        y_t_index = y_t_index.to(encoder_state.device)

        output_vectors = []
        y_t_index = target_sequence[0]
        ''' Lab example is from 0-output_sequence_size'''
        for i in range(1, output_sequence_size):
            '''Lab example use target_sequence[i] not correct'''
            y_input_vector = self.target_embedding(y_t_index)
            h_t = self.rnn(y_input_vector, h_t)

            score_for_y_t_index = self.classifier1(h_t)
            score_for_y_t_index = self.classifier2(score_for_y_t_index)
            output_vectors.append(score_for_y_t_index)
            
            _, y_t_index = score_for_y_t_index.max(dim=1)
#             y_t_index = score_for_y_t_index.argmax(1)
            if self.training and torch.rand(1) < 0.2:
                y_t_index = target_sequence[i]
            
        output_vectors = torch.stack(output_vectors).permute(1, 0, 2)
        return output_vectors

class Encoder(nn.Module):
    def __init__(self, num_embeddings, embedding_size, rnn_hidden_size):
        super(Encoder, self).__init__()
    
        self.source_embedding = nn.Embedding(num_embeddings, embedding_size, padding_idx=0)
        self.rnn = nn.GRU(embedding_size, rnn_hidden_size, bidirectional=True, batch_first=True)
    
    def forward(self, x_source, x_lengths):
        x_embedded = self.source_embedding(x_source)
        x_packed = pack_padded_sequence(x_embedded, x_lengths.detach().cpu().numpy(), batch_first=True)

        out, hidden  = self.rnn(x_packed)
        hidden = hidden.permute(1, 0, 2)
        hidden = hidden.contiguous().view(hidden.size(0), -1)
        out, _ = pad_packed_sequence(out, batch_first=True)
        
        return out, hidden

## Build model

In [9]:
NERmodel = SeqToSeq(source_vocab_size=len(vectorizer.NARRATIVE_vocab), 
                 source_embedding_size=args.source_embedding_size, 
                 target_vocab_size=len(vectorizer.target_vocab),
                 target_embedding_size=args.target_embedding_size, 
                 encoding_size=args.encoding_size,
                 target_bos_index=vectorizer.target_vocab.begin_seq_index)

In [10]:
NERmodel

SeqToSeq(
  (encoder): Encoder(
    (source_embedding): Embedding(13225, 128, padding_idx=0)
    (rnn): GRU(128, 128, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (target_embedding): Embedding(180, 128, padding_idx=0)
    (rnn): GRUCell(128, 256)
    (classifier1): Linear(in_features=256, out_features=180, bias=True)
    (classifier2): Linear(in_features=180, out_features=180, bias=True)
  )
)

## Set up

In [11]:
train_state = make_train_state(args)
set_seed_everywhere(args.seed, args.cuda)
args.device = torch.device("cuda" if args.cuda else "cpu")
NERmodel = NERmodel.to(args.device)

## Train

Try to run if model dosent saved, then reset this kernel to free up GPU or CPU memory

In [12]:
optimizer = optim.Adam(NERmodel.parameters(), lr=args.lr)
mask_index = vectorizer.target_vocab.mask_index

for epoch_index in range(args.num_epochs):
    dataset.set_split('train')
    batch_generator = generate_batches(dataset, batch_size=args.batch_train, device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    NERmodel.train()

    for batch_index, batch_dict in enumerate(batch_generator):  
        # step 1. zero the gradients
        optimizer.zero_grad()

        # step 2. compute the output
        y_pred = NERmodel(batch_dict['x_source'],  batch_dict['x_length'], batch_dict['x_target'], _train=True)

        # step 3. compute the loss
        loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index)

        # step 4. use loss to produce gradients
        loss.backward()

        # step 5. use optimizer to take gradient step
        optimizer.step()

        # compute the running loss and running accuracy
        running_loss += (loss.item() - running_loss) / (batch_index + 1)

        acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
        running_acc += (acc_t - running_acc) / (batch_index + 1)
    
    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)
    
    dataset.set_split('val')
    args.batch_valid = args.batch_valid if args.batch_valid else len(dataset)
    batch_generator = generate_batches(dataset,  batch_size=args.batch_valid,  device=args.device)
    
    running_loss = 0.
    running_acc = 0.
    NERmodel.eval()

    for batch_index, batch_dict in enumerate(batch_generator):
        # compute the output
        y_pred = NERmodel(batch_dict['x_source'],  batch_dict['x_length'],  batch_dict['x_target'], _train=False)

        # step 1. compute the loss
        loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index)

        # compute the running loss and accuracy
        running_loss += (loss.item() - running_loss) / (batch_index + 1)

        acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
        running_acc += (acc_t - running_acc) / (batch_index + 1)
    
    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)
    
    try:
        if train_state['train_loss'][-1]< train_state['train_loss'][-2]:
            pretty_print(epoch_index, train_state['train_loss'][-1], train_state['train_acc'][-1], train_state['val_loss'][-1], train_state['val_acc'][-1])
    except:
        pretty_print(epoch_index, train_state['train_loss'][-1], train_state['train_acc'][-1], train_state['val_loss'][-1], train_state['val_acc'][-1])
    train_state = update_train_state(args=args, model=NERmodel, train_state=train_state)

Epochs: 000 | Train loss 5.1436 Train acc 16.288 Val loss 5.0210 Val acc 45.027
Epochs: 001 | Train loss 4.7757 Train acc 75.596 Val loss 4.5663 Val acc 72.631
Epochs: 002 | Train loss 4.2888 Train acc 81.388 Val loss 3.9805 Val acc 81.383
Epochs: 003 | Train loss 3.5968 Train acc 87.895 Val loss 3.1260 Val acc 88.324
Epochs: 004 | Train loss 2.8501 Train acc 87.760 Val loss 2.4846 Val acc 87.610
Epochs: 005 | Train loss 2.4187 Train acc 87.601 Val loss 2.4147 Val acc 87.432
Epochs: 006 | Train loss 2.3595 Train acc 87.588 Val loss 2.2832 Val acc 87.757
Epochs: 007 | Train loss 2.2349 Train acc 87.587 Val loss 2.1087 Val acc 87.979
Epochs: 008 | Train loss 2.1013 Train acc 87.577 Val loss 2.0230 Val acc 87.660
Epochs: 009 | Train loss 2.0399 Train acc 88.142 Val loss 2.0103 Val acc 88.449
Epochs: 010 | Train loss 1.9745 Train acc 88.104 Val loss 1.9361 Val acc 88.009
Epochs: 011 | Train loss 1.9082 Train acc 87.785 Val loss 1.9178 Val acc 87.588
Epochs: 012 | Train loss 1.8652 Train ac

## Test

In [13]:
args.device = torch.device("cpu")
# checkpoint = torch.load(args.model_state_file)
# NERmodel.load_state_dict(checkpoint['model_state_dict'])
NERmodel.load_state_dict(torch.load(train_state['model_filename']))
NERmodel = NERmodel.to(args.device)

dataset.set_split('test')
args.batch_test = args.batch_test if args.batch_test else len(dataset)
args.batch_test = len(dataset)
batch_generator = generate_batches(dataset,  batch_size=args.batch_test, shuffle=False, device=args.device)

running_loss = 0.
running_acc = 0.
NERmodel.eval()
mask_index = vectorizer.target_vocab.mask_index
for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred = NERmodel(batch_dict['x_source'],  batch_dict['x_length'],  batch_dict['x_target'], _train=False)
    y_true = batch_dict['y_target']

    # step 1. compute the loss
    loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index)

    # compute the running loss and accuracy
    running_loss += (loss.item() - running_loss) / (batch_index + 1)

    acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

print('Test loss', str(running_loss)[: 6], 'Test acc', str(running_acc)[: 6])

Test loss 1.4316 Test acc 89.613


In [None]:
View accuracy

In [14]:
pred, true = normalize_sizes(y_pred, y_true)
true = true.cpu().detach().numpy()
_, y_pred_indices = pred.max(dim=1)

y_pred_indices = y_pred_indices.cpu().detach().numpy()
df = pd.DataFrame({'y_true':true, 'y_pred':y_pred_indices})
df = df[df['y_true'] != 0]
df = df[df['y_true'] != 179]
accuracy_score(df['y_true'], df['y_pred'])

0.2773091297383876

In [15]:
# Plot is too big

# from matplotlib.pyplot import figure
# cmp = ConfusionMatrixDisplay.from_predictions(df['y_true'], df['y_pred'], normalize='pred')
# fig, ax = plt.subplots(figsize=(80,80))
# cmp.plot(ax=ax)

## View the entity
By just look at the accuracy of prediciton will not make sense, as some labels in the sentinces may misclassified by human or some entites are similar, to check the actual performance we need to look at it result 

Most entities it did not successfully predicted, only tag it can successfulyy predict is 
```
employee                        B-person/employee                         B-person/employee
```
Methdos below using slef-attention is much better

## Print prediciton

In [16]:
args.device = torch.device("cpu")
# checkpoint = torch.load(args.model_state_file)
# NERmodel.load_state_dict(checkpoint['model_state_dict'])
NERmodel.load_state_dict(torch.load(train_state['model_filename']))
NERmodel = NERmodel.to(args.device)

dataset.set_split('test')
batch_generator = generate_batches(dataset,  batch_size=1, shuffle=False, device=args.device)

running_loss = 0.
running_acc = 0.
NERmodel.eval()
mask_index = vectorizer.target_vocab.mask_index
i = 0
for batch_index, batch_dict in enumerate(batch_generator):
    y_pred = NERmodel(batch_dict['x_source'],  batch_dict['x_length'],  batch_dict['x_target'], _train=False)
    origin = batch_dict['x_source'][0]
    y_true = batch_dict['y_target']
    pred, true = normalize_sizes(y_pred, y_true)
    _, y_pred_indices = pred.max(dim=1)
    
    final = []
    for j in range(len(true)):
        if true[j].item() != 0:
            tmp = []
            tmp.append(vectorizer.NARRATIVE_vocab.lookup_index(origin[j+1].item()))
            tmp.append(vectorizer.target_vocab.lookup_index(true[j].item()))
            tmp.append(vectorizer.target_vocab.lookup_index(y_pred_indices[j].item()))
            
            final.append(tmp)

    print(tabulate(final))

    i += 1
    if i == 50:
        break

------------------------------  ----------------------------------------  -----------------
employee                        B-person/employee                         B-person/employee
when                            O                                         O
work                            O                                         O
in                              O                                         O
his                             O                                         O
area                            O                                         O
cone                            O                                         O
crusher                         B-equipment/mechanical_equipment/crusher  O
he                              O                                         O
be                              O                                         O
wear                            O                                         O
his                             O                       

# Self attention

## Functions

In [17]:
def verbose_attention(encoder_state_vectors, query_vector):
    batch_size, num_vectors, vector_size = encoder_state_vectors.size()
    vector_scores = torch.sum(encoder_state_vectors * query_vector.view(batch_size, 1, vector_size),  dim=2) # Apply dot product
    vector_probabilities = F.softmax(vector_scores, dim=1) # Apply softmax to ensure row sums to 1
    weighted_vectors = encoder_state_vectors * vector_probabilities.view(batch_size, num_vectors, 1)
    context_vectors = torch.sum(weighted_vectors, dim=1)
    return context_vectors, vector_probabilities, vector_scores

class Attention(nn.Module):
    def __init__(self, dim):
        super(Attention, self).__init__()

    def set_mask(self, mask=0):
        self.mask = mask

    def forward(self, encoder_state_vectors, query_vector):
        batch_size, num_vectors, vector_size = encoder_state_vectors.size()
        
        vector_scores = torch.sum(encoder_state_vectors * query_vector.view(batch_size, 1, vector_size),  dim=2) # Apply dot product
        vector_probabilities = F.softmax(vector_scores, dim=1) # Apply softmax to make sure row sums to 1
        
        weighted_vectors = encoder_state_vectors * vector_probabilities.view(batch_size, num_vectors, 1) # Get Y^T
        context_vectors = torch.sum(weighted_vectors, dim=1)  # Get y sum
        
        prediction_vector = torch.cat((context_vectors, query_vector), dim=1)

        return prediction_vector

In [24]:
args = Namespace(
    # Data and Path hyper parameters
    data_file='./data/ner.csv',
    lable_file='labels.txt',
    vectorizer_file="nervectorizer.json",
    model_state_file="NERSelfAttention.h5",
    lr = 0.001,
    seed=666,
    dropout_p=0.1,
    batch_train=1024,
    batch_valid=512,
    batch_test=1024,
    num_epochs=200,
    cuda=True,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True,
    source_embedding_size=128, 
    target_embedding_size=128, 
    encoding_size=128,
)

## Model

In [25]:
class SeqToSeq(nn.Module):
    def __init__(self, source_vocab_size, source_embedding_size,  target_vocab_size, target_embedding_size, encoding_size,  target_bos_index):
        super(SeqToSeq, self).__init__()
        self.encoder = Encoder(num_embeddings=source_vocab_size,  embedding_size=source_embedding_size, rnn_hidden_size=encoding_size)
        decoding_size = encoding_size * 2
        self.decoder = Decoder(num_embeddings=target_vocab_size,  embedding_size=target_embedding_size,  rnn_hidden_size=decoding_size, bos_index=target_bos_index)
    
    def forward(self, x_source, x_source_lengths, target_sequence):
        encoder_state, final_hidden_states = self.encoder(x_source, x_source_lengths)
        decoded_states = self.decoder(encoder_state=encoder_state,  initial_hidden_state=final_hidden_states,  target_sequence=target_sequence)
        return decoded_states

class Decoder(nn.Module):
    def __init__(self, num_embeddings, embedding_size, rnn_hidden_size, bos_index):
        super(Decoder, self).__init__()
        self._rnn_hidden_size = rnn_hidden_size
        self.target_embedding = nn.Embedding(num_embeddings=num_embeddings,  embedding_dim=embedding_size,  padding_idx=0)
        self.rnn = nn.GRUCell(embedding_size + rnn_hidden_size,  rnn_hidden_size)
        self.hidden_map = nn.Linear(rnn_hidden_size, rnn_hidden_size)
        self.classifier1 = nn.Linear(rnn_hidden_size * 2, num_embeddings*2)
        self.classifier2 = nn.Linear(num_embeddings * 2, num_embeddings)
        self.bos_index = bos_index
        self.relu = nn.ReLU()
    
    def _init_indices(self, batch_size):
        return torch.ones(batch_size, dtype=torch.int64) * self.bos_index
    
    def _init_context_vectors(self, batch_size):
        return torch.zeros(batch_size, self._rnn_hidden_size)
            
    def forward(self, encoder_state, initial_hidden_state, target_sequence):
        target_sequence = target_sequence.permute(1, 0)
        output_sequence_size = target_sequence.size(0)

        h_t = self.hidden_map(initial_hidden_state)
        batch_size = encoder_state.size(0)
        context_vectors = self._init_context_vectors(batch_size)

        y_t_index = self._init_indices(batch_size)
        h_t = h_t.to(encoder_state.device)
        y_t_index = y_t_index.to(encoder_state.device)
        context_vectors = context_vectors.to(encoder_state.device)
        y_t_index = target_sequence[0]

        output_vectors = []
        for i in range(1, output_sequence_size):
#             y_t_index = target_sequence[i] # Lab example use this as prediction
                
            # Step 1: Embed word and concat with previous context
            y_input_vector = self.target_embedding(y_t_index)
            rnn_input = torch.cat([y_input_vector, context_vectors], dim=1)
            
            # Step 2: Make a GRU step, getting a new hidden vector
            h_t = self.rnn(rnn_input, h_t)
            
            # Step 3: Use the current hidden to attend to the encoder state
            context_vectors, p_attn, _ = verbose_attention(encoder_state_vectors=encoder_state, query_vector=h_t)
            
            # Step 4: Use the current hidden and context vectors to make a prediction to the next word
            prediction_vector = torch.cat((context_vectors, h_t), dim=1)
            score_for_y_t_index = self.classifier1(F.dropout(prediction_vector, 0.1))
            score_for_y_t_index = self.classifier2(F.dropout(score_for_y_t_index, 0.1))
            '''
            I switch to this one
            due the understanding of Seq2Seq
            '''
            _, y_t_index = score_for_y_t_index.max(dim=1)
            if self.training and torch.rand(1) < 0.2: 
                y_t_index = target_sequence[i]
            output_vectors.append(score_for_y_t_index)
        return torch.stack(output_vectors).permute(1, 0, 2)

class Encoder(nn.Module):
    def __init__(self, num_embeddings, embedding_size, rnn_hidden_size):
        super(Encoder, self).__init__()
    
        self.source_embedding = nn.Embedding(num_embeddings, embedding_size, padding_idx=0)
        self.rnn = nn.GRU(embedding_size, rnn_hidden_size, bidirectional=True, batch_first=True)
    
    def forward(self, x_source, x_lengths):
        x_embedded = self.source_embedding(x_source)
        x_packed = pack_padded_sequence(x_embedded, x_lengths.detach().cpu().numpy(), batch_first=True)

        out, hidden  = self.rnn(x_packed)
        hidden = hidden.permute(1, 0, 2)
        hidden = hidden.contiguous().view(hidden.size(0), -1)
        out, _ = pad_packed_sequence(out, batch_first=True)
        return out, hidden

In [26]:
NERmodel = SeqToSeq(source_vocab_size=len(vectorizer.NARRATIVE_vocab), 
                 source_embedding_size=args.source_embedding_size, 
                 target_vocab_size=len(vectorizer.target_vocab),
                 target_embedding_size=args.target_embedding_size, 
                 encoding_size=args.encoding_size,
                 target_bos_index=vectorizer.target_vocab.begin_seq_index)

In [27]:
len(vectorizer.target_vocab)

180

In [28]:
NERmodel

SeqToSeq(
  (encoder): Encoder(
    (source_embedding): Embedding(13225, 128, padding_idx=0)
    (rnn): GRU(128, 128, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (target_embedding): Embedding(180, 128, padding_idx=0)
    (rnn): GRUCell(384, 256)
    (hidden_map): Linear(in_features=256, out_features=256, bias=True)
    (classifier1): Linear(in_features=512, out_features=360, bias=True)
    (classifier2): Linear(in_features=360, out_features=180, bias=True)
    (relu): ReLU()
  )
)

## Set up

In [29]:
train_state = make_train_state(args)
set_seed_everywhere(args.seed, args.cuda)
args.device = torch.device("cuda" if args.cuda else "cpu")
NERmodel = NERmodel.to(args.device)

## Train

Try to run if model dosent saved, then reset this kernel to free up GPU or CPU memory

In [30]:
optimizer = optim.Adam(NERmodel.parameters(), lr=args.lr)
mask_index = vectorizer.target_vocab.mask_index
# dataset.class_weights = dataset.class_weights.to(args.device)

for epoch_index in range(args.num_epochs):
    dataset.set_split('train')
    batch_generator = generate_batches(dataset, batch_size=args.batch_train, device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    NERmodel.train()

    for batch_index, batch_dict in enumerate(batch_generator):  
        # step 1. zero the gradients
        optimizer.zero_grad()

        # step 2. compute the output
        y_pred = NERmodel(batch_dict['x_source'],  batch_dict['x_length'], batch_dict['x_target'])

        # step 3. compute the loss
        loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index)

        # step 4. use loss to produce gradients
        loss.backward()

        # step 5. use optimizer to take gradient step
        optimizer.step()

        # compute the running loss and running accuracy
        running_loss += (loss.item() - running_loss) / (batch_index + 1)

        acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
        running_acc += (acc_t - running_acc) / (batch_index + 1)
    
    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)
    
    dataset.set_split('val')
    args.batch_valid = args.batch_valid if args.batch_valid else len(dataset)
    batch_generator = generate_batches(dataset,  batch_size=args.batch_valid,  device=args.device)
    running_loss = 0.
    running_acc = 0.
    NERmodel.eval()

    for batch_index, batch_dict in enumerate(batch_generator):
        # compute the output
        y_pred = NERmodel(batch_dict['x_source'],  batch_dict['x_length'],  batch_dict['x_target'])

        # step 1. compute the loss
        loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index)

        # compute the running loss and accuracy
        running_loss += (loss.item() - running_loss) / (batch_index + 1)

        acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
        running_acc += (acc_t - running_acc) / (batch_index + 1)

    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)
    try:
        if train_state['train_loss'][-1]< train_state['train_loss'][-2]:
            pretty_print(epoch_index, train_state['train_loss'][-1], train_state['train_acc'][-1], train_state['val_loss'][-1], train_state['val_acc'][-1])
    except:
        pretty_print(epoch_index, train_state['train_loss'][-1], train_state['train_acc'][-1], train_state['val_loss'][-1], train_state['val_acc'][-1])
    train_state = update_train_state(args=args, model=NERmodel, train_state=train_state)

Epochs: 000 | Train loss 3.3870 Train acc 75.156 Val loss 2.1036 Val acc 87.708
Epochs: 001 | Train loss 2.0260 Train acc 82.505 Val loss 1.9685 Val acc 87.956
Epochs: 002 | Train loss 1.8694 Train acc 87.647 Val loss 1.8300 Val acc 87.410
Epochs: 003 | Train loss 1.7977 Train acc 87.693 Val loss 1.7637 Val acc 87.958
Epochs: 004 | Train loss 1.7539 Train acc 87.791 Val loss 1.7213 Val acc 87.998
Epochs: 005 | Train loss 1.7164 Train acc 87.894 Val loss 1.7388 Val acc 87.638
Epochs: 006 | Train loss 1.6904 Train acc 88.078 Val loss 1.6670 Val acc 88.830
Epochs: 007 | Train loss 1.6525 Train acc 88.603 Val loss 1.6221 Val acc 88.916
Epochs: 008 | Train loss 1.6219 Train acc 88.590 Val loss 1.6116 Val acc 88.691
Epochs: 009 | Train loss 1.5931 Train acc 88.498 Val loss 1.6176 Val acc 88.454
Epochs: 010 | Train loss 1.5625 Train acc 88.486 Val loss 1.5743 Val acc 88.397
Epochs: 011 | Train loss 1.5230 Train acc 88.318 Val loss 1.5613 Val acc 87.922
Epochs: 012 | Train loss 1.4754 Train ac

## Test

In [31]:
args.device = torch.device("cpu")
# checkpoint = torch.load(args.model_state_file)
# NERmodel.load_state_dict(checkpoint['model_state_dict'])
NERmodel.load_state_dict(torch.load(train_state['model_filename']))
NERmodel = NERmodel.to(args.device)

dataset.set_split('test')
args.batch_test = len(dataset)
batch_generator = generate_batches(dataset,  batch_size=args.batch_test,  device=args.device)

running_loss = 0.
running_acc = 0.
NERmodel.eval()
mask_index = vectorizer.target_vocab.mask_index
for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred = NERmodel(batch_dict['x_source'],  batch_dict['x_length'],  batch_dict['x_target'])

    # step 1. compute the loss
    loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index)

    # compute the running loss and accuracy
    running_loss += (loss.item() - running_loss) / (batch_index + 1)

    acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
    running_acc += (acc_t - running_acc) / (batch_index + 1)
    y_true = batch_dict['y_target']

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

print('Test loss', str(running_loss)[: 6], 'Test acc', str(running_acc)[: 6])

Test loss 0.4125 Test acc 90.968


In [32]:
y_pred, y_true = normalize_sizes(y_pred, y_true)
y_true = y_true.cpu().detach().numpy()
_, y_pred_indices = y_pred.max(dim=1)

y_pred_indices = y_pred_indices.cpu().detach().numpy()
df = pd.DataFrame({'y_true':y_true, 'y_pred':y_pred_indices})
df = df[df['y_true'] != 0]
df = df[df['y_true'] != 179] # Remvoe "O" tag
accuracy_score(df['y_true'], df['y_pred'])

0.7278163374265884

Out of 179 classes (exclude "O" tag) the performance of the mode  is not good in terms of accuracy, But as it have 179 entiyies the performance is better than baseline (0.56%   1/179)

During our test we found our Performance improved from 0.27 to 0.72 which around 45% of performance improvement, this demonstrate Seq2Seq with self attention will perfrom model performance significantly

To view the confusion matrix comment out code below - long time to run

In [33]:
# Plot is too big

# from matplotlib.pyplot import figure
# cmp = ConfusionMatrixDisplay.from_predictions(df['y_true'], df['y_pred'], normalize='pred')
# fig, ax = plt.subplots(figsize=(80,80))
# cmp.plot(ax=ax)

# View the entity
By just look at the accuracy of prediciton will not make sense, as some labels in the sentinces may misclassified by human or some entites are similar, to check the actual performance we need to look at it result

Uncomment code out if needed, 

Bu viewing the text we can see the the model is catching up some newly tagged entities that make send like the one below
```
right         O                                        B-body_part/arm/shoulder/right_shoulder
shoulder      B-body_part/arm/shoulder                 I-body_part/arm/shoulder/right_shoulder


sudden                          O                       B-body_part/back
back                            B-body_part/back        B-body_part/back

high                 O                      B-accident_cause/fall
fall                 B-accident_cause/fall  B-accident_cause/fall
```
Which make sense 


But also sometime this mode is also catching up some information that suppose not be in the entities categories such as 
```
his                             O                                         B-body_part/arm/hand/left_hand
left                            B-body_part/arm/hand/left_hand            I-body_part/arm/hand/left_hand
hand                            I-body_part/arm/hand/left_hand            O
and                             O                                         B-body_part/neck
his                             O                                         B-body_part/neck
neck                            B-body_part/neck                          O
when                            O                                         B-body_part/neck


muscle                          B-injury/muscle         O
fatigue                         O                       B-injury/muscle
```
We can see there's slightly shift to the actual prediction

which it taged the words in font of hand as `B-body_part/arm/hand` but not the `hand` word

Also by looking at the prediction result we can see it actually catching `pronoun` following by `Adj`, `Noun` etc. which is understandable. 

```
his       O                                 B-body_part/leg/foot/left_foot
left      B-body_part/leg/foot/left_foot    B-body_part/leg/foot/left_foot
foot      I-body_part/leg/foot/left_foot    I-body_part/leg/foot/left_foot
```

# Print 50 samples

In [34]:
args.device = torch.device("cpu")
# checkpoint = torch.load(args.model_state_file)
# NERmodel.load_state_dict(checkpoint['model_state_dict'])
NERmodel.load_state_dict(torch.load(train_state['model_filename']))
NERmodel = NERmodel.to(args.device)

dataset.set_split('test')
batch_generator = generate_batches(dataset,  batch_size=1, shuffle=False, device=args.device)

running_loss = 0.
running_acc = 0.
NERmodel.eval()
mask_index = vectorizer.target_vocab.mask_index
i = 0
for batch_index, batch_dict in enumerate(batch_generator):
    y_pred = NERmodel(batch_dict['x_source'],  batch_dict['x_length'],  batch_dict['x_target'])
    origin = batch_dict['x_source'][0]
    y_true = batch_dict['y_target']
    pred, true = normalize_sizes(y_pred, y_true)
    _, y_pred_indices = pred.max(dim=1)
    
    final = []
    for j in range(len(true)):
        if true[j].item() != 0:
            tmp = []
            tmp.append(vectorizer.NARRATIVE_vocab.lookup_index(origin[j+1].item()))
            tmp.append(vectorizer.target_vocab.lookup_index(true[j].item()))
            tmp.append(vectorizer.target_vocab.lookup_index(y_pred_indices[j].item()))
            
            final.append(tmp)

    print(tabulate(final))

    i += 1
    if i == 50:
        break

------------------------------  ----------------------------------------  ----------------------------------------
employee                        B-person/employee                         B-person/employee
when                            O                                         O
work                            O                                         O
in                              O                                         O
his                             O                                         O
area                            O                                         O
cone                            O                                         O
crusher                         B-equipment/mechanical_equipment/crusher  B-equipment/mechanical_equipment/crusher
he                              O                                         O
be                              O                                         O
wear                            O                                     

# Comment 
We an clearly see Sequence to Sequence with Self attention perfrom better than naive Sequence to Sequence model.

Also we found out the loss function will affect the model performance significantly, to loss function we adopted from the lab with come modification, so it can make "O" tag less important, so ican can pay more attention to other tags.
```Python
def sequence_loss(y_pred, y_true, mask_index, weight=None):
    y_pred, y_true = normalize_sizes(y_pred, y_true)
    indexs = []
    # Make "O" less important
    for i in range(len(y_true)):
        if y_true[i].item() != 179 or i%4 ==0:
            indexs.append(i)
    return F.cross_entropy(y_pred[indexs], y_true[indexs], ignore_index=mask_index, weight=weight)
```

For future improvement we can investegate more one better loss fucntion that suitable for this dataset.

Also due in the data perpation stage we removed punctuation and any word length < 1, this may affected the model performance. 