# Init

In [6]:
## for data
import os
import numpy as np

## for plotting
import matplotlib.pyplot as plt

## for processing
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import string

nltk.download('wordnet')
nltk.download('universal_tagset')
nltk.download('averaged_perceptron_tagger')

## for word embedding
import gensim
import gensim.downloader as gensim_api

## for model 
import torch
from torch import nn

##Other
import random
from torch.utils.data import Dataset, DataLoader
from gensim.models import KeyedVectors
from typing import List, Tuple
from seqeval.metrics import accuracy_score
from seqeval.metrics import f1_score
import seqeval

from utility import StudDataset
from utility import CRF
from utility import scatter_sum
from utility import StudentParams

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\orlan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\orlan\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\orlan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [7]:
def seed_all(seed: int = 42):
    """Seed every random number generator the notebook relies on.

    The same value is given to PyTorch (CPU and CUDA), NumPy's global
    generator and Python's ``random`` module. cuDNN is additionally switched
    to deterministic mode with benchmarking turned off.

    Args:
        seed: Value handed to every generator. Defaults to 42.
    """
    print("[ Using Seed : ", seed, " ]")

    # PyTorch generators: CPU first, then all CUDA devices, then the current one.
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all, torch.cuda.manual_seed):
        seed_fn(seed)

    # NumPy's global generator and the standard-library generator.
    np.random.seed(seed)
    random.seed(seed)

    # Deterministic cuDNN kernels, no autotuning.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [8]:
seed_all()

[ Using Seed :  42  ]


In [9]:
# Project root, relative to the notebook's working directory.
temp_path = "../../"

# Data and model directories sit directly under the root.
data_path, model_p = (os.path.join(temp_path, sub_dir) for sub_dir in ("data", "model"))

# Tab-separated train / dev splits inside the data directory.
train_path, dev_path = (os.path.join(data_path, file_name) for file_name in ("train.tsv", "dev.tsv"))

In [10]:
def read_dataset(path: str) -> Tuple[List[List[str]], List[List[str]]]:
    """Read a tab-separated token/label file into parallel per-sentence lists.

    Format, as parsed here (every line is whitespace-stripped first):

    * a line starting with ``#<TAB>`` opens a new sentence;
    * an empty line closes the current sentence;
    * any other line must be exactly ``token<TAB>label``.

    Compared with a strict "blank line closes the sentence" reader, this
    version also keeps a sentence that is still open when the file ends or
    when the next header arrives, and it does not append the same sentence
    again when several blank lines follow each other.

    NOTE(review): a token line whose token is the single character ``#``
    (i.e. ``#<TAB>label``) is indistinguishable from a header line and is
    treated as a header. Confirm the data never contains such a token.

    Args:
        path: Path of the UTF-8 encoded file to read.

    Returns:
        ``(tokens_s, labels_s)``: two lists of equal length; element ``i`` of
        each holds the tokens / labels of sentence ``i``.

    Raises:
        ValueError: If a token line does not consist of exactly two
            tab-separated fields.
    """
    tokens_s: List[List[str]] = []
    labels_s: List[List[str]] = []

    tokens: List[str] = []
    labels: List[str] = []
    # True once a header or a token has been seen since the last closed sentence.
    in_sentence = False

    with open(path, encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()

            if line.startswith("#\t"):
                # A header right after tokens means the blank separator was
                # missing: keep the unterminated sentence instead of dropping it.
                if tokens:
                    tokens_s.append(tokens)
                    labels_s.append(labels)
                tokens, labels = [], []
                in_sentence = True
            elif line == "":
                # Only the first blank line after a sentence closes it.
                if in_sentence:
                    tokens_s.append(tokens)
                    labels_s.append(labels)
                tokens, labels = [], []
                in_sentence = False
            else:
                fields = line.split("\t")
                if len(fields) != 2:
                    raise ValueError(
                        "{}:{}: expected 'token<TAB>label', got {!r}".format(path, line_no, line)
                    )
                tokens.append(fields[0])
                labels.append(fields[1])
                in_sentence = True

    # File ended without a trailing blank line: keep the last sentence.
    if tokens:
        tokens_s.append(tokens)
        labels_s.append(labels)

    assert len(tokens_s) == len(labels_s)

    return tokens_s, labels_s

In [11]:
class StudentModel(nn.Module):
    """Sequence tagger built from word, POS and (optionally) character features.

    Pipeline implemented by :meth:`forward`:

    1. word / POS embeddings, plus a character LSTM whose outputs are summed
       per word with ``scatter_sum`` when ``params.char`` is set;
    2. optionally (``params.global_vector``) a sentence-level vector, obtained
       by summing the outputs of a second LSTM over the sequence, prepended to
       every token's features;
    3. a feature LSTM followed by a linear layer, dropout, batch norm and SELU;
    4. one of two classification heads: the main one (``fc`` -> dropout ->
       ``fc2``, ``params.num_classes`` outputs) or the CoNLL one (``conll_fc``,
       ``params.conll_num_classes`` outputs);
    5. log-softmax over the classes and, when ``params.crf`` is set, Viterbi
       decoding with the CRF that belongs to the selected head.

    All sizes, switches and vocabularies are read from the ``StudentParams``
    instance passed to the constructor.
    """

    @staticmethod
    def _num_directions(bidirectional) -> int:
        """Return 2 for a bidirectional LSTM, 1 otherwise."""
        return 2 if bidirectional else 1

    def __init__(self, params: StudentParams):
        """Create all layers.

        Layers are created in a fixed order so that seeded initialisation and
        ``state_dict`` keys stay stable.

        Args:
            params: Hyper-parameters, vocabularies and target device.
        """
        super(StudentModel, self).__init__()
        self.params = params
        self.device = params.device

        # EMBEDDING LAYERS
        self.word_embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(self.params.word_vocab.vectors),
            freeze=self.params.words_freeze,
            padding_idx=self.params.word_vocab.key_to_index["<PAD>"],
        )
        self.pos_embedding = nn.Embedding(
            num_embeddings=self.params.pos_vocab_size,
            embedding_dim=self.params.pos_embedding_dim,
            padding_idx=self.params.pos_vocab_index["<PAD>"],
        )
        self.char_embedding = nn.Embedding(
            num_embeddings=self.params.char_vocab_size,
            embedding_dim=self.params.char_embedding_dim,
            padding_idx=self.params.char_vocab_index["<PAD>"],
        )

        # LAYER 1.1: Char BiLSTM
        # nn.LSTM only applies inter-layer dropout, hence the ">1 layers" guard.
        self.char_lstm = nn.LSTM(
            input_size=self.params.char_embedding_dim,
            hidden_size=self.params.char_hidden_dim,
            num_layers=self.params.char_lstm_layers,
            bidirectional=self.params.char_bidir,
            dropout=self.params.char_dropout if self.params.char_lstm_layers > 1 else 0,
            batch_first=True,
        )
        char_out_dim = self.params.char_hidden_dim * self._num_directions(self.params.char_bidir)
        # Not applied in forward(); kept so the parameter set does not change.
        self.dense_char_embedding = nn.Linear(char_out_dim, char_out_dim)
        self.char_dropout = nn.Dropout(self.params.char_dropout)

        # LAYER 1.2: Global BiLSTM
        self.global_contextual_embedding = nn.LSTM(
            input_size=self.params.global_embedding_dim,
            hidden_size=self.params.global_hidden_dim,
            num_layers=self.params.global_lstm_layers,
            bidirectional=self.params.global_bidir,
            dropout=self.params.global_dropout if self.params.global_lstm_layers > 1 else 0,
            batch_first=True,
        )
        global_out_dim = self.params.global_hidden_dim * self._num_directions(self.params.global_bidir)
        self.dense_global_embedding = nn.Linear(global_out_dim, global_out_dim)
        self.global_dropout = nn.Dropout(self.params.global_dropout)

        # LAYER 2: Feature BiLSTM
        self.combined_lstm = nn.LSTM(
            input_size=self.params.full_embedding_dim,
            hidden_size=self.params.combined_hidden_dim,
            num_layers=self.params.combined_lstm_layers,
            bidirectional=self.params.combined_bidir,
            dropout=self.params.combined_dropout if self.params.combined_lstm_layers > 1 else 0,
            batch_first=True,
        )
        combined_out_dim = self.params.combined_hidden_dim * self._num_directions(self.params.combined_bidir)
        self.dense_combined_embedding = nn.Linear(combined_out_dim, self.params.feature_dim_out)
        self.combined_dropout = nn.Dropout(self.params.combined_dropout)

        # LAYER 3.1: main classifier
        self.batchnorm = nn.BatchNorm1d(self.params.feature_dim_out)
        self.SELU = nn.SELU()

        self.fc = nn.Linear(self.params.feature_dim_out, 128)
        self.fc2 = nn.Linear(128, self.params.num_classes)
        self.fc_dropout = nn.Dropout(self.params.classificator_dropout)

        # LAYER 3.2: CoNLL classifier
        self.conll_fc = nn.Linear(self.params.feature_dim_out, self.params.conll_num_classes)
        # Not applied in forward(); kept for interface stability.
        self.conll_fc_dropout = nn.Dropout(self.params.conll_classificator_dropout)

        # Normalise over the class axis. Every LSTM is batch_first, so the
        # classifier output is (batch, seq, classes); dim=1 would normalise
        # across sequence positions and make each token's scores depend on the
        # padded length and on the other tokens of the sentence.
        # NOTE(review): weights trained with the previous dim=1 normalisation
        # will produce different scores; retrain after this change.
        self.softmax = nn.LogSoftmax(dim=-1)

        # LAYER 4: one CRF per classification head
        if self.params.crf:
            self.crf = CRF(self.params.num_classes).to(self.device)
            # LAYER 4.2
            self.conll_crf = CRF(self.params.conll_num_classes).to(self.device)

    def ner_loss(self, y_pred, y, mask, criterion=None, conll=False):
        """Compute the training loss for one batch.

        With ``params.crf`` the loss is the negated mean of the value returned
        by the CRF of the selected head. Otherwise ``criterion`` is applied to
        the flattened, non-padding positions.

        Args:
            y_pred: Per-token scores, i.e. the first value returned by ``forward``.
            y: Gold class indices.
            mask: Mask used to select non-padding positions; it is used as a
                boolean index in the non-CRF path.
            criterion: Loss for the non-CRF path. Defaults to a mean-reduced
                cross-entropy that ignores the ``<PAD>`` class.
            conll: Use the CoNLL CRF instead of the main one.

        Returns:
            Scalar loss tensor.
        """
        if self.params.crf:
            crf_layer = self.conll_crf if conll else self.crf
            return -(crf_layer(y_pred, y, mask)).mean()

        if criterion is None:
            criterion = nn.CrossEntropyLoss(ignore_index=StudDataset.encode_class("<PAD>"), reduction="mean")

        # labels  [[1,2,3], [18, 12, 3]] after the view(-1) [1,2,3, 18, 12, 3]
        y_pred = y_pred.view(-1, y_pred.shape[-1])
        y = y.view(-1)
        # FLATTENED MASK
        f_mask = mask.view(-1)

        # FILTER NOT PADDING
        return criterion(y_pred[f_mask], y[f_mask])

    def forward(self, x, conll=False, verbose=False):
        """Score (and, with a CRF, decode) a batch.

        Args:
            x: Indexable batch; ``x[0]`` word ids, ``x[1]`` POS ids, ``x[2]``
                character ids, ``x[3]`` the word index of every character
                (passed to ``scatter_sum``), ``x[4]`` the padding mask.
            conll: Use the CoNLL head (and CoNLL CRF) instead of the main one.
            verbose: Print intermediate tensor shapes.

        Returns:
            ``(logits, out)``. ``logits`` are log-softmax scores over the
            classes. ``out`` is the Viterbi decoding of ``logits`` when
            ``params.crf`` is set; otherwise it is the classifier output
            before the log-softmax.

            NOTE(review): ``predict`` iterates ``out`` as per-sentence
            sequences of class ids, which matches the CRF path only. Confirm
            the non-CRF path before relying on it.
        """
        # Unpack chars and sentence_char_len
        words = x[0]
        poses = x[1]
        chars = x[2]
        char_indexes = x[3]
        mask = x[4]

        # Embedding layers
        poses_out = self.pos_embedding(poses)
        words_out = self.word_embedding(words)

        # FIRST LSTM layer
        if self.params.char:
            chars_out = self.char_embedding(chars)
            char_lstm_out, _ = self.char_lstm(chars_out)
            char_lstm_out = self.char_dropout(char_lstm_out)

            if verbose: print("FIRST lstm out shape: {}".format(char_lstm_out.shape))

            # COMBINE CHARS FOR EACH WORD, then trim to the word sequence length
            batch_sentences = scatter_sum(char_lstm_out, char_indexes, dim=1)[:, :words_out.shape[1], :]

            if verbose: print("Scatter out shape: {}".format(batch_sentences.shape))
            if verbose: print("words out shape: {}".format(words_out.shape))
            if verbose: print("poses out shape: {}".format(poses_out.shape))

            combined_embeddings = torch.cat((words_out, batch_sentences, poses_out), 2)
        else:
            combined_embeddings = torch.cat((words_out, poses_out), 2)

        if self.params.global_vector:
            # Global Contextual Embedding: one vector per sentence (sum over the
            # sequence axis), repeated for every token and prepended to its features.
            global_lstm_out, _ = self.global_contextual_embedding(combined_embeddings)
            global_lstm_out = self.dense_global_embedding(global_lstm_out)
            global_lstm_out = self.global_dropout(global_lstm_out)

            global_context_emb = torch.sum(global_lstm_out, dim=1)
            global_context_emb = torch.unsqueeze(global_context_emb, dim=1)
            global_context_emb = global_context_emb.expand(-1, global_lstm_out.shape[1], -1)

            if verbose: print("global_context_emb embeddings shape: {}".format(global_context_emb.shape))

            combined_embeddings = torch.cat((global_context_emb, combined_embeddings), 2)

        if verbose: print("COMBINED embeddings shape: {}".format(combined_embeddings.shape))

        # SECOND LSTM LAYER
        full_lstm_out, _ = self.combined_lstm(combined_embeddings)

        if verbose: print("SECOND lstm shape: {}".format(full_lstm_out.shape))
        feature_lstm_out = self.dense_combined_embedding(full_lstm_out)
        feature_lstm_out = self.combined_dropout(feature_lstm_out)

        if verbose: print("FEATURE extractor shape: {}".format(feature_lstm_out.shape))
        # BatchNorm1d expects the feature axis second, so swap axes 1 and 2 ...
        feature_lstm_out = self.batchnorm(feature_lstm_out.permute(0, 2, 1))

        if verbose: print("BATCHNORM shape: {}".format(feature_lstm_out.shape))

        # ... and swap them back before the activation.
        out = self.SELU(feature_lstm_out.permute(0, 2, 1))

        # IF THE CONLL DATASET IS USED, USE THE CLASSIFIER WITH FEWER CLASSES
        if conll:
            out = self.conll_fc(out)
        else:
            out = self.fc(out)
            out = self.fc_dropout(out)
            out = self.fc2(out)

        logits = self.softmax(out)
        if verbose: print("LOGITS shape: {}".format(out.shape))

        if self.params.crf:
            crf_layer = self.conll_crf if conll else self.crf
            out = crf_layer.viterbi_decode(logits, mask)
        return logits, out

    def predict(self, tokens: List[List[str]], conll=False) -> List[List[str]]:
        """Tag raw sentences.

        Puts the model in eval mode (it is not switched back afterwards) and
        runs it, without gradients, over a ``StudDataset`` built from
        ``tokens``.

        Args:
            tokens: Sentences, each a list of token strings.
            conll: Use the CoNLL head and label set.

        Returns:
            One list of decoded class labels per decoded sentence.
        """
        all_predict = []
        # Same preprocessing switch (lemming) as the training / dev datasets,
        # so inference sees inputs prepared the way the model was trained on.
        dev_dataset = StudDataset(
            tokens,
            self.params.word_vocab.key_to_index,
            self.params.pos_vocab_index,
            self.params.char_vocab_index,
            lemming=self.params.lemming,
            device=self.params.device,
            conll_dataset=conll,
        )

        dataloader = DataLoader(dev_dataset, batch_size=self.params.batch_size, collate_fn=dev_dataset.collate_fn)
        self.eval()

        with torch.no_grad():
            for (xd, ys) in dataloader:
                mask = xd["mask"].to(self.device)
                x = (
                    xd["words"].to(self.device),
                    xd["poses"].to(self.device),
                    xd["chars"].to(self.device),
                    xd["scattered"].to(self.device),
                    mask,
                )

                hidden, out = self(x, conll)

                ### START EVAL PART ###
                for sentence in out:
                    all_predict.append([StudDataset.decode_class(tag_id, conll) for tag_id in sentence])
                ### END EVAL PART ###

        return all_predict


# Define the train and evaluation functions

In [12]:
def train(model, train_dataloader, optimizer, criterion, device, conll=False):
    """Run one optimisation pass over ``train_dataloader``.

    Each batch is a ``(features, targets)`` pair of dicts: the features carry
    ``"words"``, ``"poses"``, ``"chars"``, ``"scattered"`` and ``"mask"``, the
    targets carry ``"labels"``. All tensors are moved to ``device``.

    Args:
        model: Model exposing ``train()``, ``__call__(x, conll=...)`` and
            ``ner_loss(...)``.
        train_dataloader: Sized iterable of batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss forwarded to ``model.ner_loss``.
        device: Target device for the batch tensors.
        conll: Forwarded to the model and to its loss.

    Returns:
        Sum of the per-batch losses divided by ``len(train_dataloader)``.
    """
    model.train()
    batch_losses = []

    for features, targets in train_dataloader:
        gold = targets["labels"].to(device)
        mask = features["mask"].to(device)
        inputs = tuple(
            features[key].to(device) for key in ("words", "poses", "chars", "scattered")
        ) + (mask,)

        optimizer.zero_grad()

        scores, _ = model(inputs, conll=conll)
        batch_loss = model.ner_loss(scores, gold, mask, criterion, conll=conll)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(train_dataloader)

def evaluate(model, iterator, criterion, device, labels_s=None):
    """Evaluate ``model`` on ``iterator`` without gradient tracking.

    The model is put in eval mode and called with its default (non-CoNLL)
    head. Predictions are decoded with ``StudDataset.decode_class`` and, when
    gold labels are supplied, scored with seqeval.

    Args:
        model: Model exposing ``eval()``, ``__call__(x)`` and ``ner_loss(...)``.
        iterator: Sized iterable of ``(features, targets)`` dict pairs with the
            keys ``"words"``, ``"poses"``, ``"chars"``, ``"scattered"``,
            ``"mask"`` and ``"labels"``.
        criterion: Loss forwarded to ``model.ner_loss``.
        device: Target device for the batch tensors.
        labels_s: Gold labels per sentence, in iteration order. When ``None``
            no metrics are computed.

    Returns:
        ``(mean_loss, acc, f1)``. ``acc`` and ``f1`` (macro-averaged) are
        ``None`` when ``labels_s`` is ``None``.
    """
    all_predict = []
    epoch_loss = 0
    # Defaults for the "no gold labels" case; without them the return
    # statement would reference unbound names.
    acc = None
    f1 = None
    model.eval()

    with torch.no_grad():
        for (xd, ys) in iterator:

            mask = xd["mask"].to(device)
            x = (xd["words"].to(device), xd["poses"].to(device), xd["chars"].to(device), xd["scattered"].to(device), mask)
            y = ys["labels"].to(device)

            hidden, out = model(x)

            ### START EVAL PART ###
            for sentence in out:
                all_predict.append([StudDataset.decode_class(tag_id) for tag_id in sentence])
            ### END EVAL PART ###

            loss = model.ner_loss(hidden, y, mask, criterion)

            epoch_loss += loss.item()

    if labels_s is not None:
        acc = accuracy_score(labels_s, all_predict)
        f1 = f1_score(labels_s, all_predict, average="macro")

    return epoch_loss / len(iterator), acc, f1

# Initialize parameters, dataloaders, and model

In [13]:
# Run configuration built from StudentParams' defaults; later cells read the
# hyper-parameters, vocabularies and target device from it.
params = StudentParams()

In [14]:
# Main train / dev splits.
train_t, train_l = read_dataset(train_path)
dev_t, dev_l = read_dataset(dev_path)

if params.additional_dataset:
    # CoNLL stays separate: it gets its own dataset and dataloader below.
    add2_t, add2_l = read_dataset(os.path.join(data_path, "CoNLL.tsv"))

    # WNUT17 is merged into the main training data. Only sentences longer
    # than 10 tokens are kept, and their tokens are lower-cased.
    add_t, add_l = read_dataset(os.path.join(data_path, "WNUT17.tsv"))
    add_t = [[token.lower() for token in sentence] for sentence in add_t if len(sentence) > 10]
    add_l = [sentence_labels for sentence_labels in add_l if len(sentence_labels) > 10]
    train_t = [*train_t, *add_t]
    train_l = [*train_l, *add_l]

# Training data (shuffled every epoch).
train_dataset = StudDataset(
    train_t,
    params.word_vocab.key_to_index,
    params.pos_vocab_index,
    params.char_vocab_index,
    lemming=params.lemming,
    device=params.device,
    labels=train_l,
)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=params.batch_size,
    collate_fn=train_dataset.collate_fn,
    shuffle=True,
)

# Dev data (kept in file order so predictions line up with dev_l).
dev_dataset = StudDataset(
    dev_t,
    params.word_vocab.key_to_index,
    params.pos_vocab_index,
    params.char_vocab_index,
    lemming=params.lemming,
    device=params.device,
    labels=dev_l,
)
dev_dataloader = DataLoader(
    dev_dataset,
    batch_size=params.batch_size,
    collate_fn=dev_dataset.collate_fn,
)

if params.additional_dataset:
    # Auxiliary CoNLL data, flagged as such for the dataset.
    add2_t_dataset = StudDataset(
        add2_t,
        params.word_vocab.key_to_index,
        params.pos_vocab_index,
        params.char_vocab_index,
        lemming=params.lemming,
        device=params.device,
        labels=add2_l,
        conll_dataset=True,
    )

    add2_t_dataloader = DataLoader(
        add2_t_dataset,
        batch_size=params.batch_size,
        collate_fn=add2_t_dataset.collate_fn,
        shuffle=True,
    )

In [None]:
# Build the tagger and move it to the configured device.
model = StudentModel(params)
model = model.to(params.device)

# Adam over all model parameters, with the configured learning rate and weight decay.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=params.learning_rate,
    weight_decay=params.weight_decay,
)

# Cross-entropy that skips positions labelled with the padding class.
criterion = nn.CrossEntropyLoss(
    ignore_index=StudDataset.encode_class("<PAD>"),
)

print("Initiated")

# Train

In [18]:
import time

# Best dev F1 so far; starts at -inf so any finite F1 counts as an improvement.
best_valid_f1 = float("-inf")

# Per-epoch history, one list per split.
losses = dict(train=[], val=[])
f1ns = dict(train=[], val=[])
accs = dict(train=[], val=[])

# Loss of the most recent pass over the additional (CoNLL) data.
last_additional_train_loss = float("inf")

In [19]:
epochs = params.epochs
for epoch in range(epochs):
    start_time = time.time()

    print("EPOCH {}/{}".format(str(epoch + 1),epochs))

    # Auxiliary pass on the CoNLL data (CoNLL head). It runs each epoch only
    # while the loss of the previous auxiliary pass is still above 1.0.
    if params.additional_dataset and last_additional_train_loss > 1.0:
        last_additional_train_loss = train(model, add2_t_dataloader, optimizer, criterion, params.device, conll=True)
        print(f'\t Train Loss CoNLL: {last_additional_train_loss:.3f}')

    train_loss = train(model, train_dataloader, optimizer, criterion, params.device)
    print(f'\t Train Loss main Dataset: {train_loss:.3f}')

    valid_loss, valid_acc, valid_f1 = evaluate(model, dev_dataloader, criterion, params.device, dev_l)
    # Accuracy and F1 are fractions; both are scaled by 100 to match the "%" sign.
    print(f'\t Val. Loss: {valid_loss:.3f} | Val Acc: {valid_acc * 100:.2f}% | Val F1: {valid_f1 * 100:.2f}%')

    # Checkpoint whenever the dev F1 improves.
    if valid_f1 > best_valid_f1:
        best_valid_f1 = valid_f1
        torch.save(model.state_dict(), os.path.join(temp_path, 'model.ckpt'))
        print("     NEW BEST MODEL")

    # History used by the plotting cells (stored as fractions, not percentages).
    losses["train"].append(train_loss)
    losses["val"].append(valid_loss)
    f1ns["val"].append(valid_f1)
    accs["val"].append(valid_acc)
    print("     --- %s seconds ---" % (time.time() - start_time))

EPOCH 1/15


KeyboardInterrupt: 

## Plot train graph

In [None]:
# Train vs. dev loss; one point per completed epoch.
fig, ax = plt.subplots()
ax.plot(losses["train"], label="train")
ax.plot(losses["val"], label="val")
ax.set_title("Loss per epoch")
ax.set_xlabel("Epoch (0-based)")
ax.set_ylabel("Loss")
ax.legend()
plt.show()

In [None]:
# Dev accuracy and macro F1 (both stored as fractions); one point per completed epoch.
fig, ax = plt.subplots()
ax.plot(accs["val"], label="acc")
ax.plot(f1ns["val"], label="f1")
ax.set_title("Validation metrics per epoch")
ax.set_xlabel("Epoch (0-based)")
ax.set_ylabel("Score")
ax.legend()
plt.show()

In [26]:
# Tag the dev sentences with the model currently in memory.
# NOTE(review): these are the weights left by the last completed epoch. The
# best-F1 weights are only written to model.ckpt by the training loop and are
# not reloaded here. Confirm this is intended.
predictions_s = model.predict(dev_t)

In [None]:
# seqeval classification report comparing the dev predictions with the gold labels.
print(seqeval.metrics.classification_report(dev_l, predictions_s))

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Token-level gold and predicted tags (sentences flattened in order).
flat_label = [tag for sentence in dev_l for tag in sentence]
flat_predict = [tag for sentence in predictions_s for tag in sentence]

# Class names in the order StudDataset lists them, without the padding class.
# The same list fixes the row/column order of the matrix and labels the ticks.
labels = [k for k, v in StudDataset.get_class_labels().items() if k != "<PAD>"]
cf_counts = confusion_matrix(flat_label, flat_predict, labels=labels)

palette = ["#ffffff", "#e6f6ff", "#b8d5e6", "#8fb7cc", "#73a4bf", "#4d7e99", "#356a85", "#2d6480", "#185a7a", "#004c71"]

# Row-normalise so each row shows how the tokens of one gold class were tagged.
# A class that never occurs in the gold labels has a row total of 0; replacing
# that 0 by 1 keeps the row all-zero instead of producing NaN.
row_totals = cf_counts.sum(axis=1, keepdims=True)
safe_totals = row_totals + (row_totals == 0)
cf_matrix = cf_counts.astype('float') / safe_totals

f, ax = plt.subplots(figsize=(15, 15))

# Tick labels are handed to heatmap directly so they always match the cells.
ax = sns.heatmap(
    cf_matrix,
    annot=True,
    fmt='.2%',
    cmap=sns.color_palette(palette, 9),
    xticklabels=labels,
    yticklabels=labels,
    ax=ax,
)

ax.set_title('Confusion Matrix\n\n')
ax.set_xlabel('\nPredicted NER')
ax.set_ylabel('Actual NER')

## Display the visualization of the Confusion Matrix.
plt.show()