# Import

In [1]:
import pandas as pd
import numpy as np
import string
import time 
import copy
import json

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
from torch import Tensor
from torch.nn import Transformer
from argparse import Namespace
import math

from torch.utils.data import Dataset, DataLoader
from collections import Counter
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
import codecs
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from sklearn.metrics import accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
from tabulate import tabulate

In [2]:
args = Namespace(
    # Data and Path hyper parameters
    data_file='./data/ner.csv',
    lable_file='labels.txt',
    vectorizer_file="nervectorizer.json",
    model_state_file="NERteachingForce.h5",
    lr = 0.0001,
    seed=666,
    dropout_p=0.1,
    batch_train=1024,
    batch_valid=512,
    batch_test=1024,
    num_epochs=50,
    cuda=True,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True,
    source_embedding_size=128, 
    target_embedding_size=128, 
    encoding_size=128,
)

best_val  = {
    'epoch': 0,
    'model_state_dict': 0,
    'optimizer_state_dict': 0,
    'loss': 0,
    'acc':0
}


np.random.seed(666)
torch.manual_seed(666)
if args.cuda:
    torch.cuda.manual_seed_all(666)

# Functions

In [3]:
def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
    for data_dict in dataloader:
        lengths = data_dict['x_length'].numpy()
        sorted_length_indices = lengths.argsort()[::-1].tolist()
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name][sorted_length_indices].to(device)
        yield out_data_dict["x_source"].permute(1, 0), out_data_dict["x_target"].permute(1, 0), out_data_dict["y_target"].permute(1, 0)

def sequence_loss(y_pred, y_true, mask_index, weight=None):
    y_pred, y_true = normalize_sizes(y_pred, y_true)
    return F.cross_entropy(y_pred, y_true, ignore_index=mask_index, weight=weight)

def compute_accuracy(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)

    _, y_pred_indices = y_pred.max(dim=1)
    
    correct_indices = torch.eq(y_pred_indices, y_true).float()
    valid_indices = torch.ne(y_true, mask_index).float()
    
    n_correct = (correct_indices * valid_indices).sum().item()
    n_valid = valid_indices.sum().item()

    return n_correct / n_valid * 100

def normalize_sizes(y_pred, y_true):
    if len(y_pred.size()) == 3:
        y_pred = y_pred.contiguous().view(-1, y_pred.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)
    return y_pred, y_true

def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def make_train_state(args):
    return {'stop_early': False, 'early_stopping_step': 0, 'early_stopping_best_val': 1e8, 'learning_rate': args.lr, 'epoch_index': 0, 'train_loss': [], 'train_acc': [], 'val_loss': [],
            'val_acc': [], 'test_loss': -1, 'test_acc': -1, 'model_filename': args.model_state_file}

def pretty_print(epoch, t_loss, v_loss):
    det = str(epoch)
    if epoch < 100:
        det = '0' + det
    
    if epoch < 10:
        det = '0' + det
    print("Epochs:", det, '| Train loss', str(t_loss)[: 6],  'Val loss', str(v_loss)[: 6])

# Dataset

In [4]:
class Vocabulary(object):
    def __init__(self, token_to_idx=None):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token for token, idx in self._token_to_idx.items()}

    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx}

    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)

    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def add_many(self, tokens):
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        return self._token_to_idx[token]

    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

class SequenceVocabulary(Vocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>", mask_token="<MASK>", begin_seq_token="<BEGIN>", end_seq_token="<END>"):

        super(SequenceVocabulary, self).__init__(token_to_idx)
        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

    def to_serializable(self):
        contents = super(SequenceVocabulary, self).to_serializable()
        contents.update({'unk_token': self._unk_token, 'mask_token': self._mask_token, 'begin_seq_token': self._begin_seq_token, 'end_seq_token': self._end_seq_token})
        return contents

    def lookup_token(self, token):
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

class NERVectorizer(object):
    def __init__(self, NARRATIVE_vocab, target_vocab):
        self.NARRATIVE_vocab = NARRATIVE_vocab
        self.target_vocab = target_vocab

    def vectorize(self, NARRATIVE, target, vector_length=-1, target_vector_length=-1):
        indices = [self.NARRATIVE_vocab.begin_seq_index]
        indices.extend(self.NARRATIVE_vocab.lookup_token(token) for token in NARRATIVE.split("|"))
        indices.append(self.NARRATIVE_vocab.end_seq_index)

        if vector_length < 0:
            vector_length = len(indices)

        x_source = np.zeros(vector_length, dtype=np.int64)
        x_source[:len(indices)] = indices
        x_source[len(indices):] = self.NARRATIVE_vocab.mask_index
        
        out_indices = [self.target_vocab.begin_seq_index]
        out_indices.extend(self.target_vocab.lookup_token(token) for token in target.split("|"))
        out_indices.append(self.target_vocab.end_seq_index)
        
        if target_vector_length < 0:
            target_vector_length = len(out_indices)
        
        x_target = np.zeros(target_vector_length-1, dtype=np.int64)
        x_target[:len(out_indices[:-1])] = out_indices[:-1]
        x_target[len(out_indices[:-1]):] = self.target_vocab.mask_index
        
        y_target = np.zeros(target_vector_length-1, dtype=np.int64)
        y_target[:len(out_indices[1:])] = out_indices[1:]
        y_target[len(out_indices[1:]):] = self.target_vocab.mask_index
        return x_source, x_target, y_target, len(indices), len(out_indices)

    @classmethod
    def from_dataframe(cls, df, targets):
        target_vocab = SequenceVocabulary()        
        for target in sorted(targets):
            target_vocab.add_token(target)

        word_counts = Counter()
        for NARRATIVE in df.NARRATIVE:
            for token in NARRATIVE.split("|"):
                word_counts[token] += 1

        NARRATIVE_vocab = SequenceVocabulary()
        for word, word_count in word_counts.items():
            NARRATIVE_vocab.add_token(word)
        return cls(NARRATIVE_vocab, target_vocab)

    @classmethod
    def from_serializable(cls, contents):
        title_vocab = SequenceVocabulary.from_serializable(contents['NARRATIVE_vocab'])
        category_vocab = Vocabulary.from_serializable(contents['target_vocab'])
        return cls(NARRATIVE_vocab=NARRATIVE_vocab, target_vocab=target_vocab)

    def to_serializable(self):
        return {'NARRATIVE_vocab': self.NARRATIVE_vocab.to_serializable(), 'target_vocab': self.target_vocab.to_serializable()}

class NERDataset(Dataset):
    def __init__(self, df, vectorizer, labels):
        self.df = df
        self._vectorizer = vectorizer
        self.labels=labels

        measure_len = lambda context: len(context.split("|"))
        self._max_seq_length = max(map(measure_len, df.NARRATIVE)) + 2
        
        self.train_df = self.df[self.df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.df[self.df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.df[self.df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size), 'val': (self.val_df, self.validation_size), 'test': (self.test_df, self.test_size)}
        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, news_csv, label_txt):
        labels=[]
        with codecs.open('labels.txt', 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                if len(line)>0:
                    labels.append(line.strip())
        
        df = pd.read_csv(news_csv)
        return cls(df, NERVectorizer.from_dataframe(df, labels), labels)

    @classmethod
    def load_dataset_and_load_vectorizer(cls, news_csv, vectorizer_filepath):
        df = pd.read_csv(news_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(news_csv, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return NameVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        NARRATIVE_vector, x_target, y_target, length1, length2 = self._vectorizer.vectorize(row.NARRATIVE, row.target, self._max_seq_length, self._max_seq_length)
        return {"x_source": NARRATIVE_vector, 
                "x_target": x_target,
                "y_target": y_target, 
                'x_length': length1,
                'y_length': length2}

    def get_num_batches(self, batch_size):
        return len(self) // batch_size

## Load data

In [5]:
dataset = NERDataset.load_dataset_and_make_vectorizer(args.data_file, args.lable_file)
dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

In [6]:
dataset.df.shape

(10000, 3)

In [7]:
dataset[0]

{'x_source': array([ 2,  4,  5,  6,  7,  8,  9, 10,  5, 11, 12, 13, 14,  5, 15,  8, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 13, 27, 23, 28,  5, 29, 30,
        31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 23, 37,  9,
         6,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]),
 'x_target': array([  2,   5, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179,
        179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179,
        179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179,
        179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0]),
 'y_target': array([  5, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179,
        179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 

# Attention is All You Need  - Transformer
https://github.com/bentrevett/pytorch-seq2seq/blob/master/6%20-%20Attention%20is%20All%20You%20Need.ipynb

# Model

In [8]:
# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None, src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding( self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding( self.tgt_tok_emb(tgt)), memory, tgt_mask)

In [9]:
SRC_VOCAB_SIZE = len(vectorizer.NARRATIVE_vocab)
TGT_VOCAB_SIZE = len(vectorizer.target_vocab)
EMB_SIZE = 128
NHEAD = 4
FFN_HID_DIM = 256
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE, NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
transformer = transformer.to(DEVICE)

In [10]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

def create_mask(src, tgt):
    PAD_IDX = vectorizer.target_vocab.mask_index
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

def make_train_state():
    return {'stop_early': False, 'early_stopping_step': 0, 'early_stopping_best_val': 1e8, 'learning_rate': 0.0001, 'epoch_index': 0, 'train_loss': [], 'train_acc': [], 'val_loss': [],
            'val_acc': [], 'test_loss': -1, 'test_acc': -1, 'model_filename': "transformer.h5"}

In [11]:
from torch.nn import functional as F
def _loss(hy, y):
    indexs= []
    for i in range(len(y)):
        if torch.rand(1) < 0.2 or y[i].item() != 179:
            indexs.append(i)
    
    return F.cross_entropy(hy[indexs], y[indexs], ignore_index=0)

def _loss_test(hy, y):
    return F.cross_entropy(hy, y, ignore_index=0)

In [12]:
def train_epoch(model, optimizer, epoch):
    model.train()
    losses = 0.
    dataset.set_split('train')
    batch_generator = generate_batches(dataset, batch_size=BATCH_SIZE, device=DEVICE)
    model.train()
#     loss_fn = nn.CrossEntropyLoss(ignore_index=0)
    for src, tgt_input, tgt_out  in batch_generator:
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()
        
        loss = _loss(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += (loss.item() - losses) / (epoch + 1)
    return losses

def evaluate(model, epoch):
    model.eval()
    losses = 0.

    dataset.set_split('val')
    batch_generator = generate_batches(dataset, batch_size=BATCH_SIZE, device=DEVICE)
    for src, tgt_input, tgt_out  in batch_generator:
        
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)
        
        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)
        
        loss = _loss_test(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        
        losses += (loss.item() - losses) / (epoch + 1)
    return losses

In [13]:
NUM_EPOCHS = 50
BATCH_SIZE = 256
lowest_val_loss =9999999.0
optimizer = optim.Adam(transformer.parameters(), lr=0.001)

for epoch in range(NUM_EPOCHS):
    train_loss = train_epoch(transformer, optimizer, epoch)
    val_loss = evaluate(transformer, epoch)
    
    if lowest_val_loss > val_loss:
        torch.save(transformer.state_dict(), "transformer.h5")
        lowest_val_loss = val_loss
        pretty_print(epoch, train_loss, val_loss)

Epochs: 000 | Train loss 1.9834 Val loss 0.8592
Epochs: 001 | Train loss 1.9090 Val loss 0.7767
Epochs: 002 | Train loss 1.6741 Val loss 0.5315
Epochs: 003 | Train loss 1.4786 Val loss 0.3910
Epochs: 004 | Train loss 1.4482 Val loss 0.3508
Epochs: 005 | Train loss 1.3498 Val loss 0.3085
Epochs: 006 | Train loss 1.2947 Val loss 0.2820
Epochs: 007 | Train loss 1.1948 Val loss 0.1854
Epochs: 009 | Train loss 0.8908 Val loss 0.1397
Epochs: 010 | Train loss 0.7430 Val loss 0.1169
Epochs: 011 | Train loss 0.6312 Val loss 0.0970
Epochs: 012 | Train loss 0.5465 Val loss 0.0863
Epochs: 013 | Train loss 0.4642 Val loss 0.0692
Epochs: 015 | Train loss 0.3642 Val loss 0.0609
Epochs: 016 | Train loss 0.3221 Val loss 0.0463
Epochs: 018 | Train loss 0.2527 Val loss 0.0406
Epochs: 020 | Train loss 0.2043 Val loss 0.0321
Epochs: 021 | Train loss 0.1793 Val loss 0.0239
Epochs: 022 | Train loss 0.1578 Val loss 0.0217
Epochs: 024 | Train loss 0.1189 Val loss 0.0153
Epochs: 025 | Train loss 0.1025 Val loss

## Testing

NOTE: Need 14 GB RAM

#### change `len(dataset)`to reduce RAM requirement

In [14]:
DEVICE = torch.device('cpu')

transformer.load_state_dict(torch.load("transformer.h5"))
transformer = transformer.to(DEVICE)

transformer.eval()
dataset.set_split('test')
batch_generator = generate_batches(dataset, batch_size=len(dataset), device=DEVICE)
for src, tgt_input, tgt_out  in batch_generator:
    src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)
    logits = transformer(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)
    break

### Test Lost

In [15]:
_loss_test(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))

tensor(0.0587, grad_fn=<NllLossBackward>)

### Test Acc

In [16]:
_, pred = logits.reshape(-1, logits.shape[-1]).max(1)
true = tgt_out.reshape(-1)
del src_mask
del tgt_mask
del src_padding_mask
del tgt_padding_mask
del logits
del _

In [17]:
tgt_out.reshape(-1)

tensor([112, 112, 179,  ...,   0,   0,   0])

In [18]:
accuracy_score(true, pred)

0.4566172839506173

In [19]:
count = 0
total = 0
for i in range(len(true)):
    if true[i].item() != vectorizer.target_vocab.lookup_token("O") and true[i].item() != vectorizer.target_vocab.lookup_token("<MASK>"):
        if true[i].item() == pred[i].item():
            count +=1
        total +=1

#### Actual accuracy in predicting taged values

In [20]:
count / total

0.9853710624666311

# View the entity
By just look at the accuracy of prediciton will not make sense, as some labels in the sentinces may misclassified by human or some entites are similar, to check the actual performance we need to look at it result

Almost all prediction results are correct. The those ones were misspredicted were caused by it actually identified entities that not indentified by human, such as below
```
leg                              O                  B-body_part/leg
<END>                            <END>              <END>

drive          O                  B-activity/driving
```
which those tage were actually make sense. And compare to `one slef-attention seq2seq` it dont have issues in `prediciton shift` (predict one above of below as the tag).

For the mode we also addopted the loss methods from one slef-attention seq2seq but with slightly change (to random) `torch.rand(1) < 0.2`
``` python
# For evaulate training 
def _loss(hy, y):
    indexs= []
    for i in range(len(y)):
        if torch.rand(1) < 0.2 or y[i].item() != 179:
            indexs.append(i)
    
    return F.cross_entropy(hy[indexs], y[indexs], ignore_index=0)

# For evaulate testing and valid 
def _loss_test(hy, y):
    return F.cross_entropy(hy, y, ignore_index=0)
```


In [21]:
DEVICE = torch.device('cpu')
transformer.load_state_dict(torch.load("transformer.h5"))
transformer = transformer.to(DEVICE)

dataset.set_split('test')
batch_generator = generate_batches(dataset,  batch_size=1, shuffle=False, device=DEVICE)

transformer.eval()
i = 0
for src, tgt_input, tgt_out  in batch_generator:
    src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)
    logits = transformer(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)
    
    origin = src.reshape(-1)
    _, pred = logits.reshape(-1, logits.shape[-1]).max(1)
    true = tgt_out.reshape(-1)
    
    final = []
    for j in range(len(true)):
        if true[j].item() != 0:
            tmp = []
            tmp.append(vectorizer.NARRATIVE_vocab.lookup_index(origin[j+1].item()))
            tmp.append(vectorizer.target_vocab.lookup_index(true[j].item()))
            tmp.append(vectorizer.target_vocab.lookup_index(pred[j].item()))
            final.append(tmp)
            
    print(tabulate(final))

    i += 1
    if i == 50:
        break

------------------------------  ----------------------------------------  ----------------------------------------
employee                        B-person/employee                         B-person/employee
when                            O                                         O
work                            O                                         O
in                              O                                         O
his                             O                                         O
area                            O                                         O
cone                            O                                         O
crusher                         B-equipment/mechanical_equipment/crusher  B-equipment/mechanical_equipment/crusher
he                              O                                         O
be                              O                                         O
wear                            O                                     