In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gensim.downloader as api
import tqdm as tqdm

import json
import random

from mention import Mention, decode_bio
from score import ScoringCounts, score_mentions

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
UNK = '<unk>'
PAD = '<pad>'

In [4]:
def load_glove_embeddings(vocab, embedding_dim):
    glove_model = api.load(f"glove-wiki-gigaword-{embedding_dim}")
    embedding_matrix = np.zeros((len(vocab), embedding_dim))
    for index, word in enumerate(vocab):
        if word in glove_model:
            embedding_matrix[index] = glove_model[word]
        else:
            embedding_matrix[index] = np.random.normal(scale=0.1, size=(embedding_dim,))
    embedding_matrix[0] = np.zeros(embedding_dim)
    return embedding_matrix


In [5]:
class LSTMEncoderPreTrain(nn.Module):

    def __init__(self, vocab, embed_dim, hidden_dim, num_tags, num_layers, bidirectional=False):
        super().__init__()
        embedding_matrix = load_glove_embeddings(vocab, embed_dim)
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float), padding_idx=0, freeze=True)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, bidirectional=bidirectional)
        if bidirectional:
            self.linear = nn.Linear(hidden_dim*2, num_tags)
        else:
            self.linear = nn.Linear(hidden_dim, num_tags)

    def forward(self, input_ids):
        # (1, src_len, embed_dim)
        input_embeds = self.embedding(input_ids)

        # (1, src_len, hidden_dim)
        lstm_hidden, _ = self.lstm(input_embeds)

        # (1, src_len, num_tags) -> (n, src_len, num_tags)
        emission = self.linear(lstm_hidden)
        return emission # (n, src_len, num_tags)

    def decode(self, input_ids):
        emission = self(input_ids) # (n, src_len, num_tags)
        probs = nn.functional.softmax(emission, dim=1) # (n, src_len, num_tags)
        preds = torch.argmax(probs, dim=1) # (n, src_len)
        return preds

    def nll_loss(self, input_ids, labels):
        # (src_len, num_tags)
        emission = self(input_ids)
        preds = nn.functional.log_softmax(emission, dim=2) # (n, src_len, num_tags)
        N, d, c = preds.shape
        loss = nn.functional.nll_loss(preds.reshape(N*d, c), labels.reshape(N*d), ignore_index=0)
        return loss

In [6]:
class LSTMCRF(nn.Module):
    def __init__(self, src_vocabs, tgt_vocabs, embed_dim, hidden_dim, num_layers, bidirectional=False):
        super().__init__()

        self.src_vocabs = src_vocabs
        self.tgt_vocabs = tgt_vocabs
        self.num_tags = len(tgt_vocabs)

        self.lstm = LSTMEncoderPreTrain(
          vocab=src_vocabs,
          embed_dim=embed_dim,
          hidden_dim=hidden_dim,
          num_tags=len(tgt_vocabs),
          num_layers=num_layers,
          bidirectional=bidirectional
        )
        self.transitions = nn.Parameter(torch.rand(self.num_tags, self.num_tags))

    def forward(self, input_ids):
        return self.lstm(input_ids), self.create_mask(input_ids) # (n, src_len, num_tags)
    
    def create_mask(self, input_ids):
        mask = (input_ids != 0).float().to(device) 
        return mask

    def forward_alg(self, emission, mask):
        batch_size, seq_len, _ = emission.size()
        alpha = torch.full((batch_size, self.num_tags), -1000., device=device)
        alpha[:, 0] = 0
        for w in range(seq_len):
            alpha_w = []
            mask_w = mask[:, w].view(batch_size, 1).expand(batch_size, self.num_tags)
            for next_tag in range(self.num_tags):
                emit_score = emission[:, w, next_tag].view(batch_size, 1).expand(batch_size, self.num_tags)
                trans_score = self.transitions[next_tag].view(1, -1).expand(batch_size, self.num_tags)
                next_tag_var = alpha + trans_score + emit_score
                alpha_w.append(torch.logsumexp(next_tag_var, dim=1).view(batch_size, 1))
            new_alpha = torch.cat(alpha_w, dim=1)
            alpha = torch.where(mask_w.bool(), new_alpha, alpha) # (n, num_tags-1)
        alpha = torch.logsumexp(alpha, dim=1) # (n)
        return alpha

    def score(self, emission, labels, mask):
        batch_size, seq_len, _ = emission.size()
        score = torch.zeros(batch_size, device=device)
        for i in range(seq_len):
            mask_i = mask[:, i]
            if i == 0:
                score += emission[range(batch_size), i, labels[:, i]] * mask_i
            else:
                score += (
                    self.transitions[labels[:, i], labels[:, i-1]] +  # Transition
                    emission[range(batch_size), i, labels[:, i]]       # Emission
                ) * mask_i
    
        return score # (n)

    def decode(self, input_ids):
        is_single_instance = False
        if input_ids.dim() == 1:
            is_single_instance = True
            input_ids = input_ids.unsqueeze(0)
        emission, mask = self(input_ids)
    
        batch_size, seq_len, _ = emission.size()
        
        backpointers = []

        forward_vvars = torch.full((batch_size, self.num_tags), -1000., device=device) # (n, num_tags)
        forward_vvars[:, 0] = 0

        for w in range(seq_len):
            bptrs_t = []
            viterbivars_t = []
            for next_tag in range(self.num_tags):
                next_tag_var = forward_vvars + self.transitions[next_tag].view(1, -1).expand(batch_size, self.num_tags) # (n, num_tags)
                best_tag_id = torch.argmax(next_tag_var, dim=1) # (n)
                bptrs_t.append(best_tag_id) # (n)
                viterbivars_t.append(next_tag_var[range(batch_size), best_tag_id].view(batch_size, 1))
            forward_vvars = (torch.cat(viterbivars_t, dim=1) + emission[:, w]).view(batch_size, -1) * mask[:, w].view(batch_size, 1) + forward_vvars * (1 - mask[:, w].view(batch_size, 1))
            backpointers.append(bptrs_t)
            #backpointers.append(torch.stack(bptrs_t, dim=1))

        best_tag_id = torch.argmax(forward_vvars, dim=1)
        path_score = forward_vvars[range(batch_size), best_tag_id]

        best_paths = []
        for j in range(batch_size):
            seq_end = int(mask[j].sum().item())
            best_path = [best_tag_id[j].item()]
            for bptrs_t in reversed(backpointers[:seq_end]):
                best_tag_id = bptrs_t[best_path[-1]][j]
                best_path.append(best_tag_id.item())

            best_path.reverse()
            best_paths.append(best_path)
        
        if is_single_instance:
            return best_paths[0]
        else:
            return best_paths
        #return path_score, best_paths

    def nll_loss(self, input_ids, labels):
        # (src_len, num_tags)
        emission, mask = self(input_ids)
        forward = self.forward_alg(emission, mask) # (n)
        score = self.score(emission, labels, mask) # (n)
        return (forward - score).mean() # (1)

In [7]:
def collect_vocabs(training_data):
    src_vocabs_list, tgt_vocabs_list = set(), set()
    for item in training_data:
        for word in item['text']:
            src_vocabs_list.add(word)
        for label in item['label']:
            tgt_vocabs_list.add(label)

    src_vocabs_list = sorted(src_vocabs_list)
    tgt_vocabs_list = sorted(tgt_vocabs_list)

    # words (tokens) vocab
    src_vocabs = {x: i for i, x in enumerate([PAD, UNK] + src_vocabs_list)}

    # NER labels vocab (and its inverse)
    tgt_vocabs, tgt_vocabs_inv = dict(), dict() 
    for i, x in enumerate([PAD] + tgt_vocabs_list):
        tgt_vocabs[x] = i
        tgt_vocabs_inv[i] = x

    return src_vocabs, tgt_vocabs, tgt_vocabs_inv


In [8]:
def vectorize(data, src_vocabs, tgt_vocabs):
    pad_num = 30

    out = []
    for item in data:
        cur_data = []
        word_list = item['text']
        labels = item['label']

        tgt = [tgt_vocabs[x] for x in labels]
        if len(tgt) > pad_num:
            tgt = tgt[:pad_num]
        else:
            tgt += [0] * (pad_num - len(tgt))
        # (1, src_len+1)
        tgt_tensor = torch.tensor(tgt, dtype=torch.int64)


        # tensorized source data (sentence tokens)
        src = [src_vocabs.get(x, src_vocabs[UNK]) for x in word_list]
        if len(src) > pad_num:
                src = src[:pad_num]
        else:
            src += [0] * (pad_num - len(src))
        # (src_len)
        src_tensor = torch.tensor(src, dtype=torch.int64)

        cur_data.insert(0, src_tensor)
        cur_data.insert(1, tgt_tensor)
        out.append(cur_data)

    return out

In [9]:
def process_output(reference_labels, predict_labels, tgt_vocabs_inv):
    reference_labels = [x.item() if torch.is_tensor(x) else x for x in reference_labels]
    predict_labels = [x.item() if torch.is_tensor(x) else x for x in predict_labels]
    # convert to string labels
    reference_labels = [tgt_vocabs_inv[x] for x in reference_labels]
    predict_labels = [tgt_vocabs_inv[x] for x in predict_labels]

    # remove padding labels
    reference_labels = [x for x in reference_labels if x != PAD]
    predict_labels = [x for x in predict_labels if x != PAD]

    return reference_labels, predict_labels

In [10]:
def calculation_matrix(reference_labels, predict_labels):
    reference = decode_bio(reference_labels)
    predict = decode_bio(predict_labels)
    matrix = score_mentions(reference, predict)
    return matrix, reference, predict

In [11]:
def evaluate(tagger, eval_dataset, tgt_vocabs_inv, device, epoch):
    outputs_list = []
    TP = FP = FN = 0
    with torch.no_grad():
        for eval_data in tqdm.tqdm(eval_dataset):
            # dev has labels but test doesn't
            input_ids = eval_data[0].to(device)
            labels = eval_data[1] if eval_data[1] != [] else None

            preds = tagger.decode(input_ids) # (n, src_len)

            if torch.is_tensor(preds):
                if preds.ndim == 2:
                        preds = preds[0]
                preds = preds.tolist()

            reference_labels, predict_labels = process_output(labels, preds, tgt_vocabs_inv)
            matrix, reference, predict = calculation_matrix(reference_labels, predict_labels)
            outputs_list.append({'reference': reference, 'predict': predict})
            TP += matrix[0]
            FP += matrix[1]
            FN += matrix[2]

    # calculate precision, recall and f1 score
    print(f'TP: {TP}, FP: {FP}, FN: {FN}')
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    with open(f'CRF_Outputs/test_outputs_CRF_2_{epoch}.json', 'w') as f:
        json.dump(outputs_list, f)

    return f1

In [12]:
EMBED_DIM = 300
NUM_HIDDEN = 256
NUM_LAYERS = 2
BIDIRECTIONAL = True

LEARNING_RATE = 0.001
NUM_EPOCHS = 5
BATCH_SIZE = 16
SEED = 1334
EVAL_EVERY = 5

***Start Training***

In [13]:
train_data_path = 'dataset/train_BIO.jsonl'
dev_data_path = 'dataset/dev_BIO.jsonl'
test_data_path = 'dataset/test_BIO.jsonl'

In [13]:
instances = list(map(json.loads, open(train_data_path)))
dev_instances = list(map(json.loads, open(dev_data_path)))

In [14]:


with open('BiLSTM_vocab/src_vocabs.json', 'r') as src_file:
    src_vocabs = {k: int(v) for k, v in json.load(src_file).items()}

with open('BiLSTM_vocab/tgt_vocabs.json', 'r') as tgt_file:
    tgt_vocabs = {k: int(v) for k, v in json.load(tgt_file).items()}

with open('BiLSTM_vocab/tgt_vocabs_inv.json', 'r') as tgt_file:
    tgt_vocabs_inv = {int(k): v for k, v in json.load(tgt_file).items()}
'''

src_vocabs, tgt_vocabs, tgt_vocabs_inv = collect_vocabs(instances)
with open('src_vocabs.json', 'w') as src_file:
    json.dump(src_vocabs, src_file, indent=4)

with open('tgt_vocabs.json', 'w') as tgt_file:
    json.dump(tgt_vocabs, tgt_file, indent=4)

with open('tgt_vocabs_inv.json', 'w') as tgt_file:
    json.dump(tgt_vocabs_inv, tgt_file, indent=4)
'''


"\n\nsrc_vocabs, tgt_vocabs, tgt_vocabs_inv = collect_vocabs(instances)\nwith open('src_vocabs.json', 'w') as src_file:\n    json.dump(src_vocabs, src_file, indent=4)\n\nwith open('tgt_vocabs.json', 'w') as tgt_file:\n    json.dump(tgt_vocabs, tgt_file, indent=4)\n\nwith open('tgt_vocabs_inv.json', 'w') as tgt_file:\n    json.dump(tgt_vocabs_inv, tgt_file, indent=4)\n"

In [15]:
training_dataset = vectorize(instances, src_vocabs, tgt_vocabs)
dev_dataset = vectorize(dev_instances, src_vocabs, tgt_vocabs)

In [16]:
print(training_dataset[0])

[tensor([  39,   99, 3094, 2356, 2559, 3167, 3331, 3785,  813, 2867, 3770,  815,
        1029,   25,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0]), tensor([9, 3, 7, 9, 9, 9, 9, 9, 9, 9, 9, 3, 7, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0])]


In [16]:
tagger = LSTMEncoderPreTrain(
        vocab=src_vocabs,
        embed_dim=EMBED_DIM,
        hidden_dim=NUM_HIDDEN,
        num_tags=len(tgt_vocabs),
        num_layers=NUM_LAYERS,
        bidirectional=BIDIRECTIONAL
    )
tagger.to(device)

LSTMEncoderPreTrain(
  (embedding): Embedding(3831, 300, padding_idx=0)
  (lstm): LSTM(300, 128, num_layers=2, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=256, out_features=10, bias=True)
)

In [17]:
tagger = LSTMCRF(
        src_vocabs=src_vocabs,
        tgt_vocabs=tgt_vocabs,
        embed_dim=EMBED_DIM,
        hidden_dim=NUM_HIDDEN,
        num_layers=NUM_LAYERS,
        bidirectional=BIDIRECTIONAL
    )
tagger.to(device)

LSTMCRF(
  (lstm): LSTMEncoderPreTrain(
    (embedding): Embedding(3831, 300, padding_idx=0)
    (lstm): LSTM(300, 256, num_layers=2, batch_first=True, bidirectional=True)
    (linear): Linear(in_features=512, out_features=10, bias=True)
  )
)

In [18]:
optimizer = optim.Adam(tagger.parameters(), lr=LEARNING_RATE)

In [19]:
if len(training_dataset) % BATCH_SIZE == 0:
    batchCount = len(training_dataset) // BATCH_SIZE
else:
    batchCount = len(training_dataset) // BATCH_SIZE + 1

In [20]:
for epoch in range(1, NUM_EPOCHS + 1):
    epoch_loss = 0.

    random.shuffle(training_dataset)
    for i in tqdm.tqdm(range(batchCount), desc=f'[Epoch {epoch}/{NUM_EPOCHS}]'):
        if i == batchCount - 1:  # Last mini-batch
            mini_batch = training_dataset[i * BATCH_SIZE:]
        else:
            mini_batch = training_dataset[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]
        mini_input_ids = torch.stack([instance[0] for instance in mini_batch])
        mini_labels = torch.stack([instance[1] for instance in mini_batch])
        mini_input_ids, mini_labels = mini_input_ids.to(device), mini_labels.to(device)

        optimizer.zero_grad()

        loss = tagger.nll_loss(mini_input_ids, mini_labels)
        
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # display info at the end of epoch
    log = f'[Epoch {epoch}/{NUM_EPOCHS}] Loss: {epoch_loss:.2f}'
    if epoch % EVAL_EVERY == 0:
        # Save the model at the end of each epoch
        save_path = f"CRF_Outputs/outputs_CRF_2_{epoch}.pt"
        torch.save(tagger.state_dict(), save_path)
        print(f"Model saved to {save_path}")

        print("Begin evaluation on dev set")
        dev_f1 = evaluate(tagger, dev_dataset, tgt_vocabs_inv, device=device, epoch=epoch)
        log = f'{log} | Dev F1: {dev_f1}%'
    print(log)

[Epoch 1/10]: 100%|██████████| 92/92 [00:10<00:00,  8.50it/s]


[Epoch 1/10] Loss: 1151.83


[Epoch 2/10]: 100%|██████████| 92/92 [00:10<00:00,  8.50it/s]


[Epoch 2/10] Loss: 642.67


[Epoch 3/10]: 100%|██████████| 92/92 [00:09<00:00,  9.22it/s]


[Epoch 3/10] Loss: 460.38


[Epoch 4/10]: 100%|██████████| 92/92 [00:10<00:00,  8.87it/s]


[Epoch 4/10] Loss: 337.55


[Epoch 5/10]: 100%|██████████| 92/92 [00:10<00:00,  8.80it/s]


Model saved to CRF_Outputs/outputs_CRF_2_5.pt
Begin evaluation on dev set


100%|██████████| 184/184 [00:05<00:00, 36.49it/s]


TP: 173, FP: 205, FN: 89
[Epoch 5/10] Loss: 263.07 | Dev F1: 0.5406249999999999%


[Epoch 6/10]: 100%|██████████| 92/92 [00:10<00:00,  9.07it/s]


[Epoch 6/10] Loss: 194.36


[Epoch 7/10]: 100%|██████████| 92/92 [00:10<00:00,  9.14it/s]


[Epoch 7/10] Loss: 123.60


[Epoch 8/10]: 100%|██████████| 92/92 [00:10<00:00,  9.02it/s]


[Epoch 8/10] Loss: 82.12


[Epoch 9/10]: 100%|██████████| 92/92 [00:10<00:00,  9.10it/s]


[Epoch 9/10] Loss: 39.57


[Epoch 10/10]: 100%|██████████| 92/92 [00:10<00:00,  8.95it/s]


Model saved to CRF_Outputs/outputs_CRF_2_10.pt
Begin evaluation on dev set


100%|██████████| 184/184 [00:04<00:00, 37.79it/s]

TP: 164, FP: 222, FN: 98
[Epoch 10/10] Loss: 17.31 | Dev F1: 0.5061728395061728%





***Evaluation***

In [18]:
'''
tagger = LSTMEncoderPreTrain(
        vocab=src_vocabs,
        embed_dim=EMBED_DIM,
        hidden_dim=NUM_HIDDEN,
        num_tags=len(tgt_vocabs),
        num_layers=NUM_LAYERS,
        bidirectional=BIDIRECTIONAL
    )
tagger.to(device)
'''

tagger = LSTMCRF(
        src_vocabs=src_vocabs,
        tgt_vocabs=tgt_vocabs,
        embed_dim=EMBED_DIM,
        hidden_dim=NUM_HIDDEN,
        num_layers=NUM_LAYERS,
        bidirectional=BIDIRECTIONAL
    )
tagger.to(device)

# Load the saved model weights
checkpoint_path = "CRF_Outputs/outputs_CRF_2_5_model.pt"  # Replace with your .pt file path
tagger.load_state_dict(torch.load(checkpoint_path))

# Set the model to evaluation mode
tagger.eval()

  tagger.load_state_dict(torch.load(checkpoint_path))


LSTMCRF(
  (lstm): LSTMEncoderPreTrain(
    (embedding): Embedding(3831, 300, padding_idx=0)
    (lstm): LSTM(300, 256, num_layers=2, batch_first=True, bidirectional=True)
    (linear): Linear(in_features=512, out_features=10, bias=True)
  )
)

In [19]:
test_instances = list(map(json.loads, open(test_data_path)))
test_dataset = vectorize(test_instances, src_vocabs, tgt_vocabs)

In [20]:
test_f1 = evaluate(tagger, test_dataset, tgt_vocabs_inv, device=device, epoch=1)
print(f"Test F1: {test_f1}%")

100%|██████████| 184/184 [00:05<00:00, 36.30it/s]

TP: 159, FP: 204, FN: 89
Test F1: 0.5204582651391162%



