In [1]:
import pandas as pd
import numpy as np
import re
import string
import gensim
import time 
import json


import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns

import multiprocessing as mp
from spellchecker import SpellChecker

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
import math
from argparse import Namespace

from torch.utils.data import Dataset, DataLoader
from collections import Counter
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
import codecs

def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"): 
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict
        
class Vocabulary(object):
    def __init__(self, token_to_idx=None):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token for token, idx in self._token_to_idx.items()}

    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx}

    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)

    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def add_many(self, tokens):
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        return self._token_to_idx[token]

    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

class SequenceVocabulary(Vocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>", mask_token="<MASK>", begin_seq_token="<BEGIN>", end_seq_token="<END>"):

        super(SequenceVocabulary, self).__init__(token_to_idx)
        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

    def to_serializable(self):
        contents = super(SequenceVocabulary, self).to_serializable()
        contents.update({'unk_token': self._unk_token, 'mask_token': self._mask_token, 'begin_seq_token': self._begin_seq_token, 'end_seq_token': self._end_seq_token})
        return contents

    def lookup_token(self, token):
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

class NERVectorizer(object):
    def __init__(self, NARRATIVE_vocab, target_vocab):
        self.NARRATIVE_vocab = NARRATIVE_vocab
        self.target_vocab = target_vocab

    def vectorize(self, NARRATIVE, target, vector_length=-1, target_vector_length=-1):
        indices = [self.NARRATIVE_vocab.begin_seq_index]
        indices.extend(self.NARRATIVE_vocab.lookup_token(token) for token in NARRATIVE.split(","))
        indices.append(self.NARRATIVE_vocab.end_seq_index)

        if vector_length < 0:
            vector_length = len(indices)

        from_vector = np.zeros(vector_length, dtype=np.int64)
        from_vector[vector_length - len(indices):] = indices
        from_vector[:vector_length - len(indices)] = self.NARRATIVE_vocab.mask_index
        
        out_indices = [self.target_vocab.begin_seq_index]
        out_indices.extend(self.target_vocab.lookup_token(token) for token in target.split(","))
        out_indices.append(self.target_vocab.end_seq_index)
        
        if target_vector_length < 0:
            target_vector_length = len(out_indices)
        
        to_vector = np.zeros(target_vector_length, dtype=np.int64)
        to_vector[target_vector_length - len(out_indices):] = out_indices
        to_vector[:target_vector_length - len(out_indices)] = self.target_vocab.mask_index
        return from_vector, to_vector, vector_length, target_vector_length

    @classmethod
    def from_dataframe(cls, df, targets):
        target_vocab = SequenceVocabulary()        
        for target in sorted(targets):
            target_vocab.add_token(target)

        word_counts = Counter()
        for NARRATIVE in df.NARRATIVE:
            for token in NARRATIVE.split(","):
                word_counts[token] += 1

        NARRATIVE_vocab = SequenceVocabulary()
        for word, word_count in word_counts.items():
            NARRATIVE_vocab.add_token(word)
        return cls(NARRATIVE_vocab, target_vocab)

    @classmethod
    def from_serializable(cls, contents):
        title_vocab = SequenceVocabulary.from_serializable(contents['NARRATIVE_vocab'])
        category_vocab = Vocabulary.from_serializable(contents['target_vocab'])
        return cls(NARRATIVE_vocab=NARRATIVE_vocab, target_vocab=target_vocab)

    def to_serializable(self):
        return {'NARRATIVE_vocab': self.NARRATIVE_vocab.to_serializable(), 'target_vocab': self.target_vocab.to_serializable()}

class NERDataset(Dataset):
    def __init__(self, df, vectorizer, labels):
        self.df = df
        self._vectorizer = vectorizer
        self.labels=labels

        measure_len = lambda context: len(context.split(","))
        self._max_seq_length = max(map(measure_len, df.NARRATIVE)) + 2
        
        self._max_tag_length = max(len(labels), self._max_seq_length)
        
        self.train_df = self.df[self.df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.df[self.df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.df[self.df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size), 'val': (self.val_df, self.validation_size), 'test': (self.test_df, self.test_size)}
        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, news_csv, label_txt):
        labels=[]
        with codecs.open('labels.txt', 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                if len(line)>0:
                    labels.append(line.strip())
        
        df = pd.read_csv(news_csv)
        return cls(df, NERVectorizer.from_dataframe(df, labels), labels)

    @classmethod
    def load_dataset_and_load_vectorizer(cls, news_csv, vectorizer_filepath):
        df = pd.read_csv(news_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(news_csv, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return NameVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        NARRATIVE_vector, to_vector, length1, length2 = self._vectorizer.vectorize(row.NARRATIVE, row.target, self._max_seq_length, self._max_seq_length)
        return {'x_data': NARRATIVE_vector,'y_target': to_vector, 'x_length': length1, 'y_length': length2}

    def get_num_batches(self, batch_size):
        return len(self) // batch_size

In [2]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.outputs = None
        
    def forward(self, src, tag, teacher_forcing_ratio=0.5):
        
        batch_size = tag.shape[1]
        tag_len = tag.shape[0]
        tag_vocab_size = self.decoder.output_dim
        self.outputs = torch.zeros(tag_len, batch_size, tag_vocab_size).to(device)
        
        hidden, cell = self.encoder(src)
        x = tag[0,:]
        
        for t in range(1, tag_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            self.outputs[t] = output
            teacher_force = torch.rand(1) < teacher_forcing_ratio
            top1 = output.argmax(1) 
            x = tag[t] if teacher_force else top1
        
        return self.outputs

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers=1):
        super().__init__()
        
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, batch_first=False)
        
    def forward(self, x):
        embedded = self.embedding(x)
        
        outputs, (hidden, cell) = self.rnn(embedded)
        
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers=1):
        super().__init__()
        
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, batch_first=False)
        
        self.fc1 = nn.Linear(hid_dim, output_dim)
        
    def forward(self, x, hidden, cell):
        x = x.unsqueeze(0)
        
        embedded = self.embedding(x)
                
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        
        out = self.fc1(output.squeeze(0))

        
        return out, hidden, cell
    
def normalize_sizes(y_pred, y_true):
    if len(y_pred.size()) == 3:
        y_pred = y_pred.contiguous().view(-1, y_pred.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)
    return y_pred, y_true

def compute_accuracy(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)

    _, y_pred_indices = y_pred.max(dim=1)
    
    correct_indices = torch.eq(y_pred_indices, y_true).float()
    valid_indices = torch.ne(y_true, mask_index).float()
    
    n_correct = (correct_indices * valid_indices).sum().item()
    n_valid = valid_indices.sum().item()

    return n_correct / n_valid * 100

def sequence_loss(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)
    return F.cross_entropy(y_pred, y_true, ignore_index=mask_index)

In [3]:
dataset = NERDataset.load_dataset_and_make_vectorizer('data/ner.csv', 'label_txt')
dataset.save_vectorizer('nervectorizer.json')
vectorizer = dataset.get_vectorizer()

In [4]:
device = torch.device('cuda')
INPUT_DIM = len(vectorizer.NARRATIVE_vocab)
OUTPUT_DIM = len(vectorizer.target_vocab)
ENC_EMB_DIM = 50
DEC_EMB_DIM = 50
HID_DIM = 64

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM)

dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM)

model = Seq2Seq(enc, dec, device).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [5]:
model

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(11965, 50)
    (rnn): LSTM(50, 64)
  )
  (decoder): Decoder(
    (embedding): Embedding(180, 50)
    (rnn): LSTM(50, 64)
    (fc1): Linear(in_features=64, out_features=180, bias=True)
  )
)

In [6]:
for epoch_index in range(10):
    dataset.set_split('train')
    
    batch_generator = generate_batches(dataset, batch_size=512, device="cuda")
    running_loss = 0.0
    running_acc = 0.0
    model.train()
    
    for batch_index, batch_dict in enumerate(batch_generator):
        optimizer.zero_grad()
        y_pred = model(batch_dict['x_data'].float(), batch_dict['y_target'].float())
        loss = sequence_loss(y_pred, batch_dict['y_target'], vectorizer.target_vocab.mask_index)
        loss.backward()
        optimizer.step()
        
        running_loss += (loss.item() - running_loss) / (batch_index + 1)
        acc_t = compute_accuracy(y_pred, batch_dict['y_target'], vectorizer.target_vocab.mask_index)
        running_acc += (acc_t - running_acc) / (batch_index + 1)
    
    print("Train loss", running_loss, "Train acc", running_acc)
    
    dataset.set_split('val')
    batch_generator = generate_batches(dataset, batch_size=len(dataset), device="cuda")
    running_loss = 0.
    running_acc = 0.
    model.eval()
    
    for batch_index, batch_dict in enumerate(batch_generator):
        
        y_pred = model(batch_dict['x_data'].float(), batch_dict['y_target'].float())
        loss = sequence_loss(y_pred, batch_dict['y_target'], vectorizer.target_vocab.mask_index)
        
        running_loss += (loss.item() - running_loss) / (batch_index + 1)
        acc_t = compute_accuracy(y_pred, batch_dict['y_target'], vectorizer.target_vocab.mask_index)
        running_acc += (acc_t - running_acc) / (batch_index + 1)
    
    print("Val loss", running_loss, "Val acc", running_acc)

Train loss 4.384732127189636 Train acc 70.2216486235525
Val loss 3.3224072456359863 Val acc 89.26405461058926
Train loss 2.2976983189582825 Train acc 89.25764827421223
Val loss 1.3219678401947021 Val acc 89.27533779018928
Train loss 1.0050078417573656 Train acc 88.74998061146235
Val loss 0.8085257411003113 Val acc 89.34867845758934
Train loss 0.7824036266122545 Train acc 89.1278029759152
Val loss 0.7507113218307495 Val acc 89.36278243208936
Train loss 0.7406699785164425 Train acc 89.26284904259043
Val loss 0.7258737087249756 Val acc 89.35996163718936
Train loss 0.7210081986018588 Train acc 89.27608729560356
Val loss 0.7173876166343689 Val acc 89.26123381568925
Train loss 0.711839484316962 Train acc 89.28235719834866
Val loss 0.7157663702964783 Val acc 89.2019971227892
Train loss 0.7079586684703827 Train acc 89.26163781412859
Val loss 0.7100784778594971 Val acc 89.21892189218921
Train loss 0.7042489988463266 Train acc 89.28004704245849
Val loss 0.7053143978118896 Val acc 89.269696200389

In [9]:
dataset.set_split('test')
batch_generator = generate_batches(dataset, batch_size=len(dataset), device="cuda")

running_acc = 0.
running_loss=0.
model.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred = model(batch_dict['x_data'].long(), batch_dict['y_target'].long())

    # compute the loss
    loss = sequence_loss(y_pred, batch_dict['y_target'], vectorizer.target_vocab.mask_index)
    
    # compute the running loss and running accuracy
    running_loss += (loss.item() - running_loss) / (batch_index + 1)
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'], vectorizer.target_vocab.mask_index)
    running_acc += (acc_t - running_acc) / (batch_index + 1)
    
    y_true=batch_dict['y_target'].long()

In [11]:
y_true

tensor([[  0,   0,   0,  ..., 134, 179,   3],
        [  0,   0,   0,  ..., 179, 179,   3],
        [  0,   0,   0,  ..., 179, 179,   3],
        ...,
        [  0,   0,   0,  ..., 135, 179,   3],
        [  0,   0,   0,  ..., 179, 179,   3],
        [  0,   0,   0,  ..., 179, 179,   3]], device='cuda:0')

In [12]:
y_pred, y_true = normalize_sizes(y_pred, y_true)

In [16]:
_, y_pred_indices = y_pred.max(dim=1)
y_pred_indices

tensor([  0,   0,   0,  ..., 179, 179,   3], device='cuda:0')

In [17]:
y_true

tensor([  0,   0,   0,  ..., 179, 179,   3], device='cuda:0')