In [1]:
# --- Standard library -------------------------------------------------------
import codecs
import random
import sys
from dataclasses import dataclass
from typing import List, Dict, Any, Sequence, Callable

# --- Third party ------------------------------------------------------------
import gensim.models
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch import Tensor, nn

# Pre-trained word2vec vectors, loaded once and read as a module-level global
# by the model classes defined further down in this notebook.
GoogleEmbs = gensim.models.KeyedVectors.load_word2vec_format(
    'nlp_project/models/GoogleNews-50k.bin',
    binary=True,
)

# --- Reproducibility --------------------------------------------------------
# Seed every random number generator used in this notebook and switch cuDNN
# to its deterministic mode.
seed = 0
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(seed)
random.seed(seed)

# --- Project-local code -----------------------------------------------------
# The project directory is appended to sys.path before the project imports.
sys.path.append("nlp_project")
from nlp_project.scripts.read_write_data import read_processed_data, read_raw_data, load_data
from nlp_project.models.classes import (  # Train1BiLSTM is defined in this notebook instead
    Batch,
    DataIterator,
    PolyDataIterator,
    F1_evaluator,
    F1_error_evaluator,
    WeightedCrossEntropy,
)


In [2]:
class Train1BiLSTM(torch.nn.Module):
    """
    Bidirectional LSTM token tagger on top of pre-trained word vectors.

    Word vectors are looked up in the module-level ``GoogleEmbs`` object and
    are not trainable parameters of this module; only the LSTM and the final
    linear layer are trained.

    Parameters
    ----------
    hidden_size : number of LSTM units per direction.
    max_len : every sentence is truncated / padded to exactly this length.
    n_labels : number of output classes per token.
    batch_size : batch size handed to ``DataIterator``.
    pad_token : token used to pad sentences.
    pad_label : label used to pad label sequences; ignored by the loss.
    embedding_dim : length of the word vectors.
    """

    def __init__(self,
                hidden_size=20,
                max_len=100,
                n_labels=3,
                batch_size=32,
                pad_token="<PAD>",
                pad_label=2,
                embedding_dim=300
                ):
        super().__init__()

        self.embedding_dim = embedding_dim  # length of embedding vectors
        self.hidden_size = hidden_size  # number of LSTM cells
        self.max_len = max_len  # maximum input sentence length, will be padded to this size
        self.n_labels = n_labels
        self.batch_size = batch_size
        self.pad_token = pad_token
        self.pad_label = pad_label

        self.lstm = nn.LSTM(input_size=self.embedding_dim, hidden_size=self.hidden_size, batch_first=True, bidirectional=True)
        # Both LSTM directions are concatenated, hence 2 * hidden_size inputs.
        self.linear = nn.Linear(in_features=2 * self.hidden_size, out_features=n_labels)

        self.data_iterator = DataIterator(batch_size=self.batch_size)

        # Logs for performance in each training epoch
        self.train_f1_log = []
        self.dev_f1_log = []

    def _pad_inputs(self, collection: List[list], padding_token):
        """
        Return a copy of ``collection`` in which every sequence has exactly
        ``self.max_len`` items: longer sequences are truncated, shorter ones
        are right-padded with ``padding_token``.

        Implemented with plain lists so that item types are preserved and an
        empty collection simply yields ``[]``.
        """
        padded = []
        for sequence in collection:
            clipped = list(sequence)[:self.max_len]
            clipped.extend([padding_token] * (self.max_len - len(clipped)))
            padded.append(clipped)
        return padded

    def _pad_data(self, documents, labels):
        """Pad documents with ``pad_token`` and labels with ``pad_label``; labels are cast to int."""
        padded_documents = self._pad_inputs(documents, self.pad_token)
        padded_labels = self._pad_inputs(labels, self.pad_label)
        padded_labels = [list(map(int, sentence)) for sentence in padded_labels]
        return padded_documents, padded_labels

    def _logits(self, inputs):
        """
        Run embeddings -> BiLSTM -> linear layer and return the raw,
        unnormalised class scores of shape (batch, seq_len, n_labels).
        """
        word_embeds = self._get_google_embeds(inputs)
        lstm_result, _ = self.lstm(word_embeds)
        return self.linear(lstm_result)

    def forward(self, inputs):
        '''
        Implements a forward pass through the BiLSTM.
        inputs are a batch (list) of equally long sentences (lists of tokens).
        Returns per-token class probabilities (softmax over the last axis),
        shape (batch, seq_len, n_labels).
        '''
        return torch.nn.functional.softmax(self._logits(inputs), dim=2)

    def _get_google_embeds(self, inputs):
        """
        Look up the word vector of every token in a batch.

        Tokens missing from ``GoogleEmbs`` (this includes the padding token
        whenever it is not in the vocabulary) are mapped to the zero vector.

        Returns a float32 tensor of shape (len(inputs), seq_len, embedding_dim).
        Raises ValueError if the sentences do not all have the same length.
        """
        sentences = list(inputs)
        seq_len = len(sentences[0]) if sentences else 0
        # Pre-allocating one zero-filled array avoids repeated concatenation
        # and makes the zero vector the default for unknown words.
        embeddings = np.zeros((len(sentences), seq_len, self.embedding_dim), dtype=np.float32)
        for row, sentence in enumerate(sentences):
            if len(sentence) != seq_len:
                raise ValueError(
                    f"all sentences in a batch must have the same length: "
                    f"expected {seq_len}, got {len(sentence)} at index {row}"
                )
            for col, word in enumerate(sentence):
                if word in GoogleEmbs:
                    # Copies the values, so the vector stored in GoogleEmbs is never modified.
                    embeddings[row, col] = GoogleEmbs.get_vector(word)
        return torch.from_numpy(embeddings)

    def fit(self, train, dev=None, epochs=3, print_metrics=False, learning_rate=0.05):
        """
        Train the model.

        train : tuple (documents, labels) of un-padded token / label sequences.
        dev : optional tuple (documents, labels); when given, dev F1 is
              computed and logged after every epoch.
        epochs : number of passes over the training data.
        print_metrics : also print the evaluator's metric counts.
        learning_rate : learning rate of the Adam optimizer.

        Appends one value per epoch to ``self.train_f1_log`` (and to
        ``self.dev_f1_log`` when ``dev`` is given).
        """
        documents, labels = train
        padded_documents, padded_labels = self._pad_data(documents, labels)  # Padding training data

        loss_func = torch.nn.CrossEntropyLoss(reduction='sum', ignore_index=self.pad_label)  # ignores loss for padding label
        optimizer = torch.optim.Adam(params=self.parameters(), lr=learning_rate)

        for epoch in range(epochs):
            epoch_loss = 0
            self.train()
            # A fresh evaluator per epoch, mirroring _dev_evaluate, so the
            # logged train score describes this epoch only.
            evaluator = F1_evaluator(self.pad_label)
            for batch in self.data_iterator(padded_documents, padded_labels):
                targets = torch.tensor(batch.targets)
                logits = self._logits(batch.inputs)
                # The evaluator keeps receiving probabilities, exactly what forward() returns.
                evaluator.pass_batch(targets, torch.nn.functional.softmax(logits, dim=2))
                # CrossEntropyLoss applies log-softmax itself, so it must be fed
                # the raw logits; it expects the class axis in position 1.
                batch_loss = loss_func(logits.permute(0, 2, 1), targets)
                epoch_loss += batch_loss.item()
                batch_loss.backward()
                optimizer.step()
                optimizer.zero_grad()
            train_metrics = evaluator.metrics
            train_f1 = evaluator.f1_score()
            self.train_f1_log.append(train_f1)

            if dev is None:  # print performance and go to next epoch if no dev data is supplied
                if print_metrics:
                    COR, PAR, INC, MIS, SPU, ACT, POS = list(train_metrics)
                    print(f"Epoch {epoch}, train: {train_f1:.3f} \n train metrics: {ACT} ACT, {POS} POS, {COR} COR, {PAR} PAR, {INC} INC ({MIS} MIS, {SPU} SPU)")
                else:
                    print(f"Epoch {epoch}, train: {train_f1:.3f}, loss: {epoch_loss:.3f}")
            else:
                # Dev evaluation
                x_dev, y_dev = dev
                dev_f1, dev_metrics = self._dev_evaluate(x_dev, y_dev)
                self.dev_f1_log.append(dev_f1)
                if print_metrics:
                    COR, PAR, INC, MIS, SPU, ACT, POS = list(dev_metrics)
                    print(f"Epoch {epoch}, train: {train_f1:.3f}, dev: {dev_f1:.3f} \n dev metrics: {ACT} ACT, {POS} POS, {COR} COR, {INC} INC ({PAR} PAR, {MIS} MIS, {SPU} SPU)")
                else:
                    print(f"Epoch {epoch}, train: {train_f1:.3f}, dev: {dev_f1:.3f}")

    def _feed_evaluator(self, evaluator, documents, labels):
        """
        Pad the data, switch the model to eval mode and pass every batch's
        targets and predicted probabilities to ``evaluator`` without tracking
        gradients. Returns the same evaluator.
        """
        padded_docs, padded_labs = self._pad_data(documents, labels)  # Padding data
        self.eval()
        for batch in self.data_iterator(padded_docs, padded_labs):
            with torch.no_grad():
                predictions = self.forward(batch.inputs)
            evaluator.pass_batch(torch.tensor(batch.targets), predictions)
        return evaluator

    def _dev_evaluate(self, x_dev, y_dev):
        """
        Evaluates model performance on supplied data with a fresh F1_evaluator.
        Returns the tuple (f1, metrics). The model is left in eval mode.
        """
        evaluator = self._feed_evaluator(F1_evaluator(self.pad_label), x_dev, y_dev)
        dev_metrics = evaluator.metrics
        dev_f1 = evaluator.f1_score()
        return dev_f1, dev_metrics

    def evaluate(self, x_dev, y_dev, BIOlabels, domains, print_metrics=True, return_errors=True):
        """
        Evaluates model performance on supplied data.
        BIOlabels and domains are forwarded, together with the un-padded
        documents, to F1_error_evaluator.
        print_metrics set to print out f1, precision, recall and metrics by default.
        return_errors returns error DataFrame by default; otherwise None is returned.
        """
        evaluator = self._feed_evaluator(
            F1_error_evaluator(x_dev, BIOlabels, domains, pad_label=self.pad_label),
            x_dev,
            y_dev,
        )
        dev_metrics = evaluator.metrics
        error_df = evaluator.errors
        precision, recall, f1 = evaluator.f1_score(verbose=True)
        if print_metrics:
            print(f"F1: {f1:.3f} precision: {precision:.3f} recall: {recall:.3f}")
            COR, PAR, INC, MIS, SPU, ACT, POS = list(dev_metrics)
            print(f"Metrics: {ACT} ACT, {POS} POS, {COR} COR, {INC} INC ({PAR} PAR, {MIS} MIS, {SPU} SPU)")
        if return_errors:
            return error_df

In [3]:
class Train2BiLSTM(Train1BiLSTM):
    """
    Variant of Train1BiLSTM that is trained on two data sets at once: the
    labeled training data and a second (pseudo-labeled / paraphrased) set.
    Both sets are batched together by ``PolyDataIterator`` and their losses
    are combined by ``WeightedCrossEntropy``.

    The constructor takes the same arguments as Train1BiLSTM.
    """

    def __init__(self,
                hidden_size=20,
                max_len=100,
                n_labels=3,
                batch_size=32,
                pad_token="<PAD>",
                pad_label=2,
                embedding_dim=300
                ):
        # Forward the arguments so the parent builds the layers once, with the
        # requested sizes, instead of building default-sized layers that are
        # immediately replaced.
        super().__init__(hidden_size=hidden_size,
                         max_len=max_len,
                         n_labels=n_labels,
                         batch_size=batch_size,
                         pad_token=pad_token,
                         pad_label=pad_label,
                         embedding_dim=embedding_dim)

        # Iterator that batches several data sets side by side (used in fit).
        self.poly_data_iterator = PolyDataIterator(batch_size=self.batch_size)

    def fit(self, 
            train,
            train2, 
            dev=None,
            epochs=20, 
            print_metrics=False, 
            learning_rate=0.005,
            alpha=None):
        """
        Train on the labeled data and the second data set jointly.

        train : tuple (documents, labels) of the labeled training data.
        train2 : tuple (documents, labels) of the second training set.
        dev : optional tuple (documents, labels); when given, dev F1 is
              computed and logged after every epoch.
        epochs : number of passes over the training data.
        print_metrics : also print the evaluator's metric counts.
        learning_rate : learning rate of the Adam optimizer.
        alpha : value forwarded to the loss function as ``alpha``. When None,
                ``epoch / epochs`` is used, i.e. it grows from 0 towards 1
                during training. An explicit 0 is passed through as 0.

        Appends one value per epoch to ``self.train_f1_log`` (and to
        ``self.dev_f1_log`` when ``dev`` is given).
        """
        documents, labels = train
        pseudo_docs, pseudo_labels = train2

        # padding training data
        padded_documents, padded_labels = self._pad_data(documents, labels)
        pseudo_docs, pseudo_labels = self._pad_data(pseudo_docs, pseudo_labels)

        loss_func = WeightedCrossEntropy(epochs, pad_label=self.pad_label)
        optimizer = torch.optim.Adam(params=self.parameters(), lr=learning_rate)

        for epoch in range(epochs):
            epoch_loss = 0
            self.train()
            # A fresh evaluator per epoch, mirroring _dev_evaluate, so the
            # logged train score describes this epoch only.
            evaluator = F1_evaluator(self.pad_label)
            # `alpha or ...` would silently discard an explicit alpha of 0.
            epoch_alpha = alpha if alpha is not None else epoch / epochs
            for labeled, paraphrased in self.poly_data_iterator([padded_documents, pseudo_docs],
                                                                [padded_labels, pseudo_labels]):
                # Each element is indexed as [0] = inputs, [1] = targets.
                pred_tags = self.forward(inputs=labeled[0])
                pred_pseudo_tags = self.forward(inputs=paraphrased[0])
                # true label for each word
                targets = torch.tensor(labeled[1])
                pseudo_targets = torch.tensor(paraphrased[1])
                # passing all batch labels to evaluator
                evaluator.pass_batch(torch.cat((targets, pseudo_targets), 0),
                                     torch.cat((pred_tags, pred_pseudo_tags), 0))
                # NOTE(review): forward() returns softmax probabilities. If
                # WeightedCrossEntropy wraps torch.nn.CrossEntropyLoss it should
                # receive raw logits instead - confirm against its implementation.
                batch_loss = loss_func(pred_tags.permute(0, 2, 1),
                                       pred_pseudo_tags.permute(0, 2, 1),
                                       targets,
                                       pseudo_targets,
                                       alpha=epoch_alpha)
                epoch_loss += batch_loss.item()
                batch_loss.backward()
                optimizer.step()
                optimizer.zero_grad()
            train_metrics = evaluator.metrics
            _, _, train_f1 = evaluator.f1_score(verbose=True)  # (precision, recall, f1)
            self.train_f1_log.append(train_f1)

            if dev is None:  # print performance and go to next epoch if no dev data is supplied
                if print_metrics:
                    COR, PAR, INC, MIS, SPU, ACT, POS = list(train_metrics)
                    print(f"Epoch {epoch}, train: {train_f1:.3f} \n train metrics: {ACT} ACT, {POS} POS, {COR} COR, {PAR} PAR, {INC} INC ({MIS} MIS, {SPU} SPU)")
                else:
                    print(f"Epoch {epoch}, train: {train_f1:.3f}, loss: {epoch_loss:.3f}")
            else:
                # Dev evaluation
                x_dev, y_dev = dev
                dev_f1, dev_metrics = self._dev_evaluate(x_dev, y_dev)
                self.dev_f1_log.append(dev_f1)
                if print_metrics:
                    COR, PAR, INC, MIS, SPU, ACT, POS = list(dev_metrics)
                    print(f"Epoch {epoch}, train: {train_f1:.3f}, dev: {dev_f1:.3f} \n dev metrics: {ACT} ACT, {POS} POS, {COR} COR, {PAR} PAR, {INC} INC ({MIS} MIS, {SPU} SPU)")
                else:
                    print(f"Epoch {epoch}, train: {train_f1:.3f}, dev: {dev_f1:.3f}")

In [4]:
# Locations of the data sets used in this notebook.
TRAIN_SET_PATH = "nlp_project/data/processed/train_splits/labeled.conll"
DEV_SET_PATH = "nlp_project/data/processed/dev.conll"
PARAPHRASED_PATH = "nlp_project/data/paraphrased/train_labeled.conll"

# Labeled train / dev data: documents, labels, BIO labels and domains.
(train_docs, train_labels, train_bio, train_domain) = load_data(TRAIN_SET_PATH)
(dev_docs, dev_labels, dev_bio, dev_domain) = load_data(DEV_SET_PATH)

# Paraphrased training data is read as (words, labels) pairs and split into
# two parallel lists.
pp_pairs = list(read_raw_data(PARAPHRASED_PATH))
pp_docs = [pp_words for pp_words, _ in pp_pairs]
pp_labels = [pp_tags for _, pp_tags in pp_pairs]

# The paraphrased set reuses the BIO labels and domains of the labeled
# training data (same objects, not copies).
# NOTE(review): this presumes both sets are aligned sentence by sentence - confirm.
pp_bio, pp_domain = train_bio, train_domain

In [5]:
# model = Train1BiLSTM(hidden_size=10)
# model.fit(train = (train_docs, train_labels),
#           dev = (dev_docs, dev_labels),
#           print_metrics=True, 
#           learning_rate=0.005,
#           epochs=1)

In [6]:
# error_df = model.evaluate(dev_docs, dev_labels, dev_bio, dev_domain)
# error_df.head(5)

In [7]:
# Train the two-data-set model for a single epoch on the labeled and the
# paraphrased training data, reporting dev metrics after the epoch.
model2 = Train2BiLSTM(hidden_size=10)
model2.fit(
    train=(train_docs, train_labels),
    train2=(pp_docs, pp_labels),
    dev=(dev_docs, dev_labels),
    epochs=1,
    learning_rate=0.005,
    print_metrics=True,
)

Epoch 0, train: 0.001, dev: 0.000 
 dev metrics: 0 ACT, 1368 POS, 0 COR, 0 PAR, 1368 INC (1368 MIS, 0 SPU)


In [8]:
# Score the dev set, print the metrics and inspect the first five recorded errors.
error_df = model2.evaluate(
    x_dev=dev_docs,
    y_dev=dev_labels,
    BIOlabels=dev_bio,
    domains=dev_domain,
)
error_df.head()

F1: 0.000 precision: 0.000 recall: 0.000
Metrics: 0 ACT, 1368 POS, 0 COR, 1368 INC (0 PAR, 1368 MIS, 0 SPU)


Unnamed: 0,error_type,entity,entity_BIO,sentence,sentence_BIO,domain
0,MIS,Washington Post,"[B-ORG, I-ORG]","[Anthony, Shadid, of, the, Washington, Post, r...","[B-PER, I-PER, O, O, B-ORG, I-ORG, O, O, O, O,...",weblogs
1,MIS,Anthony Shadid,"[B-PER, I-PER]","[Anthony, Shadid, of, the, Washington, Post, r...","[B-PER, I-PER, O, O, B-ORG, I-ORG, O, O, O, O,...",weblogs
2,MIS,CPA,[B-ORG],"[I, 'm, not, even, in, Iraq, and, I, could, ha...","[O, O, O, O, O, B-LOC, O, O, O, O, O, O, O, O,...",weblogs
3,MIS,Iraq,[B-LOC],"[I, 'm, not, even, in, Iraq, and, I, could, ha...","[O, O, O, O, O, B-LOC, O, O, O, O, O, O, O, O,...",weblogs
4,MIS,Army,[B-ORG],"[The, Army, is, unlikely, to, forgive, or, for...","[O, B-ORG, O, O, O, O, O, O, O, O, O, O, O, O,...",weblogs


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b2f14aee-af04-4db5-af55-57a3a58b9f40' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>