# Named Entity Recognition with Bidirectional LSTMs
- TUTORIAL: [Sequence Models and Long Short-Term Memory Networks](https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html#sequence-models-and-long-short-term-memory-networks)

In [1]:
# !pip install torcheval

In [2]:
# imports
# import pdb # for step by step debugging
# assert(False) # use to stop at a specific line (think of like stop, quit, exit, etc)
import gzip
import torch
import shutil

import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


from tqdm import tqdm
from datasets import Dataset


from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils import clip_grad_norm_
from sklearn.model_selection import train_test_split
from torcheval.metrics.functional import multiclass_f1_score

  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [3]:
# Load and Update Data
def load_data(file_path, new_sentence_col_name, word_col_name, ner_tag_col_name):
    sentence_idxs = []
    sentences = []
    sentences_ner_tags = []

    words = []
    ner_tags = []

    space_sentence_idxs = []
    space_words = []
    space_ner_tags = []
    
    with open(file_path, 'r') as f:
        lines = f.read().split("\n") # separate each sentence by new line
        # print(lines)
    sentence_idx = 0
    for line in lines:
        # text_idx, input_text, text_ner_tag = line.split("[")
        
        each_line = line.split(" ")
        # print(len(each_line))
        if len(each_line) != 1:
            sentence_idx += 1
            text_idx, input_text, ner_tag = each_line[0], each_line[1], each_line[2]
            # print(f"Line {text_idx} has word {input_text}, with NER Tag {ner_tag} --- {sentence_idx}")
            sentence_idxs.append(sentence_idx)

            words.append(input_text)
            ner_tags.append(ner_tag)
            

            space_sentence_idxs.append(sentence_idx)
            space_words.append(input_text)
            space_ner_tags.append(ner_tag)
        else:
            

            if words and ner_tags:
                
                # print(f"words {words}")
                sentences.append(words)
                # print(f"ner_tags {ner_tags}")
                sentences_ner_tags.append(ner_tags)
                
                words = []
                ner_tags = []

            sentence_idx = 0
            reset_sentence_idx, input_text, ner_tag = " ", " ", " "
            space_sentence_idxs.append(reset_sentence_idx)
            space_words.append(input_text)
            space_ner_tags.append(ner_tag)

    """Return as DF"""
    df = pd.DataFrame(zip(sentence_idxs, words, ner_tags), columns=[new_sentence_col_name, word_col_name, ner_tag_col_name])
    with_space_df = pd.DataFrame(zip(space_sentence_idxs, space_words, space_ner_tags), columns=[new_sentence_col_name, word_col_name, ner_tag_col_name])


    """Return as Dictionaries"""
    sentences_in_dict = {}
    ner_tags_in_dict = {}

    sentences_in_dict[word_col_name] = sentences
    ner_tags_in_dict[ner_tag_col_name] = sentences_ner_tags


    return df, with_space_df, sentences_in_dict, ner_tags_in_dict

In [4]:
train_file_path = "data/train"
new_sentence_col_name = 'New Sentence Index'
word_col_name = 'Word'
ner_tag_col_name = 'NER Tag'
ner_tag_idx_col_name = 'NER Tag Idx'

df, with_space_df, sentences_in_dict, ner_tags_in_dict = load_data(train_file_path, new_sentence_col_name, word_col_name, ner_tag_col_name)

In [5]:
# sentences_in_dict

In [6]:
# ner_tags_in_dict

In [7]:
def replace_ner_tag_with_idx(to_map, key_name, reordered_dict, ner_tag_idx_col_name):
    """Pair ner tag with corresponding index"""
    final_dict = {}
    per_sentence = []
    all_sentences = []

    ner_tags = to_map[key_name]

    # Iterate over the values_list
    for ner_tags_idx in range(len(ner_tags)):
        sentence_ner_tags = ner_tags[ner_tags_idx]
        # print(sentence_ner_tags)
        # assert(False)

        for sentence_ner_tags_idx in range(len(sentence_ner_tags)):
            ner_tag = sentence_ner_tags[sentence_ner_tags_idx]
            # print(ner_tag)

            # assert(False)

            for key, value in reordered_dict.items():
                # print("---", key, value)
                # assert(False)

                if ner_tag == value:
                    per_sentence.append(key)
                    # print("---", new_list) 
        # print(per_sentence)
        all_sentences.append(per_sentence)
        per_sentence = []
    # print(all_sentences)
    # assert(False)


    final_dict[ner_tag_idx_col_name] = all_sentences

    return final_dict


In [8]:
idx_at_ner_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}

ner_tags_idx_in_dict = replace_ner_tag_with_idx(ner_tags_in_dict, ner_tag_col_name, idx_at_ner_tag, ner_tag_idx_col_name)

In [9]:
# ner_tags_idx_in_dict

In [10]:
def create_dataset(sentences_dict, ner_tags_dict, ner_tags_idx_dict, word_col_name, ner_tag_idx_col_name, drop_ner_tags=True):


    train_dict = {}
    train_dict.update(sentences_dict)
    train_dict.update(ner_tags_dict)
    train_dict.update(ner_tags_idx_dict)

    train_df = pd.DataFrame(train_dict)

    if drop_ner_tags == True:
        sub_df = train_df.loc[:, [word_col_name, ner_tag_idx_col_name]]
        train_dataset = Dataset.from_dict(sub_df)
        return train_dataset
    else:
        print(f"Error with {drop_ner_tags}")


In [11]:
train_dataset = create_dataset(sentences_in_dict, ner_tags_in_dict, ner_tags_idx_in_dict, word_col_name, ner_tag_idx_col_name)

In [12]:
# train_dataset

### Load Glove Embeddings

In [13]:
# input_file = 'glove.6B.100d.gz'
# output_file = 'glove.6B.100d.txt'

# with gzip.open(input_file, 'rb') as f_in:
#     with open(output_file, 'wb') as f_out:
#         shutil.copyfileobj(f_in, f_out)

In [14]:
def load_glove_dataset(glove_file, embedding_dim, vocab_size=None):
    embeddings_index = {}
    vocabulary = []
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Loading GloVe embeddings", total=vocab_size):
            values = line.split()
            word = values[0]
            vocabulary.append(word)
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    # Create a matrix to hold the embeddings
    embedding_matrix = torch.randn((vocab_size, embedding_dim))
    for i, (word, embedding_vector) in tqdm(enumerate(embeddings_index.items()), desc="Creating embedding matrix"):
        if i < vocab_size:
            embedding_matrix[i] = torch.from_numpy(embedding_vector)

    return embedding_matrix, vocabulary

In [15]:
glove_file = 'glove.6B.100d.txt'
glove_embedding_dim = 100
glove_vocab_size = 400002

glove_embedding_matrix, glove_vocabulary = load_glove_dataset(glove_file, glove_embedding_dim, glove_vocab_size)

Loading GloVe embeddings: 100%|█████████▉| 400000/400002 [00:07<00:00, 50270.66it/s]
Creating embedding matrix: 400000it [00:02, 171450.49it/s]


In [16]:
# glove_embedding_matrix

In [17]:
glove_embeddings = glove_embedding_matrix.numpy()
# glove_embeddings

In [18]:
# glove_vocabulary

In [19]:
glove_vocabulary = ['PAD', 'UNK'] + glove_vocabulary
# glove_vocabulary

In [20]:
def build_vocab_glove(words):
    """Build vocabulary for GloVe embeddings.

    Parameters
    ----------
    words: `list`:
        List of vocabulary terms.

    Returns
    -------
    word_idx, idx_word: `tuple`
        A tuple containing two dictionaries mapping words to their corresponding indices and mapping indices to their corresponding words.
    """
    word_at_idx = {}

    for word_idx, word in enumerate(words):
        # print(word, word_idx)
        
        word_at_idx[word] = word_idx
        # assert(False)

    idx_at_word = {}

    for word, word_idx in word_at_idx.items():
        # print(word, idx)

        idx_at_word[word_idx] = word
        
    return word_at_idx, idx_at_word


In [21]:
words_with_idx, idx_with_words = build_vocab_glove(np.array(glove_vocabulary))

In [22]:
# words_with_idx

In [23]:
# idx_with_words

In [24]:
# Define the conversion function
def convert_text_to_input_ids_glove(row, words_with_idx, ids_of_inputs_col_name, capitalize_words_col_name):
    """Convert tokenized text into input IDs using GloVe embeddings.
    
    Parameters
    ----------
    row: `dict`
        A dictionary containing the tokens for a single data point.
        
    Returns
    -------
    dict: A dictionary containing the input IDs and capitalization vector for the tokens.
        - 'ids_of_inputs' (list): List of input IDs corresponding to each token.
        - 'capital_words_vector' (list): Binary vector indicating capitalization of each token.
    """
    capital_words_vector = []  # Initialize list to store capitalization vector
    ids_of_inputs = []  # Initialize list to store input IDs
    for Word in row['Word']:  # Iterate through each word in the row
        # Check if any character in the word is uppercase
        if any(x.isupper() for x in Word):
            capital_words_vector.append(1)  # Append 1 if uppercase character is found
        else:
            capital_words_vector.append(0)  # Append 0 if no uppercase character is found
        
        Word_lower = Word.lower()  # Convert word to lowercase
        # Check if lowercase word is in word2idx dictionary
        if Word_lower in words_with_idx:
            ids_of_inputs.append(words_with_idx[Word_lower])  # Append corresponding index from word2idx
        else:
            ids_of_inputs.append(1)  # Append index 1 as default if word not found in word2idx
    
    return {
        ids_of_inputs_col_name: ids_of_inputs,  # Return input IDs
        capitalize_words_col_name: capital_words_vector  # Return capitalization vector
    }

In [25]:
ids_of_inputs_col_name = 'ID of Input'
capitalize_words_col_name = 'Capital Word'
# Assign the conversion function to a variable
convert_function = convert_text_to_input_ids_glove

# Create an empty list to store converted data points
converted_data = []

# Iterate through each row in the dataset and apply the conversion function
for row in train_dataset:
    converted_data.append(convert_function(row, words_with_idx, ids_of_inputs_col_name, capitalize_words_col_name))


In [26]:
# converted_data

In [27]:
def convert_dataset_to_glove(dataset, convert_function, words_with_idx, word_col_name, ner_tag_idx_col_name, ids_of_inputs_col_name, capitalize_words_col_name):
    converted_data = []

    for row in dataset:
        converted_row = {}
        converted_row[word_col_name] = row[word_col_name]
        converted_row[ner_tag_idx_col_name] = row[ner_tag_idx_col_name]
        converted_result = convert_function(row, words_with_idx, ids_of_inputs_col_name, capitalize_words_col_name)
        # print(converted_result)
        # assert(False)
        converted_row[ids_of_inputs_col_name] = converted_result[ids_of_inputs_col_name]
        converted_row[capitalize_words_col_name] = converted_result[capitalize_words_col_name]
        converted_data.append(converted_row)


    converted_data_dict = {}
    for key in converted_data[0]:
        converted_data_dict[key] = []
        for row in converted_data:
            converted_data_dict[key].append(row[key])

    # Create the converted_dataset
    converted_dataset = Dataset.from_dict(converted_data_dict)

    return converted_dataset

In [28]:
train_dataset_glove = convert_dataset_to_glove(train_dataset, convert_text_to_input_ids_glove, words_with_idx, word_col_name, ner_tag_idx_col_name, ids_of_inputs_col_name, capitalize_words_col_name)


In [29]:
# train_dataset_glove

In [30]:
train_dataset_glove.set_format(
    type='torch',
    columns=[ner_tag_idx_col_name, ids_of_inputs_col_name, capitalize_words_col_name]
)
print(train_dataset_glove[0])

{'NER Tag Idx': tensor([3, 0, 7, 0, 0, 0, 7, 0, 0]), 'ID of Input': tensor([  646,  7580,   516,   582,     6,  5262,   299, 10240,     4]), 'Capital Word': tensor([1, 0, 1, 0, 0, 0, 1, 0, 0])}


In [31]:
from torch.nn.utils.rnn import pad_sequence

def pad_sequence_per_batch(batch, ner_tag_idx_col_name=ner_tag_idx_col_name, ids_of_inputs_col_name=ids_of_inputs_col_name, capitalize_words_col_name=capitalize_words_col_name):
    """Pad sequences per batch
    Combines individual samples into batches, padding sequences to the same length.
    
    Parameter
    ---------
    batch: `list`
        A list of dictionaries, each containing 'ID of Input', 'Capital Word', and NER Tag Idx.

    Return
    ------
    padded_data: `dict`
        A dictionary containing padded sequences of ids_of_inputs, capital_words_vector, and NER Tag Idx.

    """
    ids_of_inputs_list = []
    capital_words_vector_list = []
    ner_tag_idx_list = []

    # Iterate through each item in the batch
    for batch_item in batch:
        # Get ids_of_inputs, capital_words_vector, and NER Tag Idx from the current item
        ids_of_inputs = batch_item[ids_of_inputs_col_name]
        capital_words_vector = batch_item[capitalize_words_col_name]
        ner_tag_idx = batch_item[ner_tag_idx_col_name]

        ids_of_inputs_list.append(ids_of_inputs)
        capital_words_vector_list.append(capital_words_vector)
        ner_tag_idx_list.append(ner_tag_idx)

    padded_ids_of_inputs = pad_sequence(ids_of_inputs_list, batch_first=True)
    padded_capital_words_vector = pad_sequence(capital_words_vector_list, batch_first=True)
    padded_ner_tag_idx = pad_sequence(ner_tag_idx_list, batch_first=True, padding_value=0)

    padded_data = {
        ids_of_inputs_col_name: padded_ids_of_inputs,
        capitalize_words_col_name: padded_capital_words_vector,
        ner_tag_idx_col_name: padded_ner_tag_idx
    }

    # Return the dictionary
    return padded_data


# Tasks 2: Using GloVe word embeddings

# 1. Simple Bidirectional LSTM model

### Define Hyper-parameters

In [32]:
EMBEDDING_DIM = 100
LSTM_LAYERS = 1

HIDDEN_DIM = 256
DROPOUT = 0.33

OUTPUT_DIM = 128

TRAIN_BATCH_SIZE = 64

train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
}

### Create the BLSTM

In [33]:
train_dataloader_glove = torch.utils.data.DataLoader(train_dataset_glove, collate_fn=pad_sequence_per_batch, **train_params)

In [34]:
# train_dataloader_glove

In [35]:
class BLLSTM_NER_Tagger_GloVe(nn.Module):
  def __init__(self, embedding_dim, hidden_dim, output_dim, dropout_percentage, vocab_mappings, ner_tag_mappings, pretrained_embeddings):
    super(BLLSTM_NER_Tagger_GloVe, self).__init__()

    self.word_embeddings = nn.Embedding.from_pretrained(torch.from_numpy(pretrained_embeddings).float())
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True, num_layers=1)
    self.dropout = nn.Dropout(p=dropout_percentage)
    self.linear_1 = nn.Linear(hidden_dim * 2, output_dim)
    self.linear_elu = nn.ELU()
    self.linear_classifier = nn.Linear(output_dim, len(ner_tag_mappings))

  def forward(self, sentences, all_capitals):
    # print(f"sentences shape: {sentences.shape}")
    # assert(False)
    embeds = self.word_embeddings(sentences)
    # print(f"embeds shape: {embeds.shape}")

    all_capitals = all_capitals.unsqueeze(2)
    # print(f"all_capitals shape: {all_capitals.shape}")

    # concatenating captial vectors at the end of embeddings
    lstm_input = torch.cat([embeds, all_capitals], dim=2)
    # print(f"lstm_input shape: {lstm_input.shape}")

     # assert(False)
    lstm_out, _ = self.lstm(lstm_input)
    # print(f"lstm_out shape: {lstm_out.shape}")

    lstm_dropout = self.dropout(lstm_out)
    # print(f"lstm_dropout shape: {lstm_dropout.shape}")

    fc_layer = self.linear_1(lstm_dropout)
    # print(f"fc_layer shape: {fc_layer.shape}")

    elu_layer = self.linear_elu(fc_layer)
    # print(f"elu_layer shape: {elu_layer.shape}")

    tag_scores = self.linear_classifier(elu_layer)
    # print(f"tag_scores shape: {tag_scores.shape}")

    tag_scores = tag_scores.permute(0, 2, 1)
    # print(f"tag_scores shape: {tag_scores.shape}")
      
    return tag_scores


### TRAIN THE BLSTM

In [36]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print('Device: ', device.type)

In [37]:
# words_with_idx
# idx_at_ner_tag

In [38]:
gloVe_blstm_model_class = BLLSTM_NER_Tagger_GloVe(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, words_with_idx, idx_at_ner_tag, pretrained_embeddings=glove_embeddings)
gloVe_blstm_model_class


BLLSTM_NER_Tagger_GloVe(
  (word_embeddings): Embedding(400002, 100)
  (lstm): LSTM(100, 256, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.33, inplace=False)
  (linear_1): Linear(in_features=512, out_features=128, bias=True)
  (linear_elu): ELU(alpha=1.0)
  (linear_classifier): Linear(in_features=128, out_features=9, bias=True)
)

In [39]:
LEARNING_RATE = 1
LOSS_FUNCTION = nn.CrossEntropyLoss()
OPTIMIZER = optim.SGD(gloVe_blstm_model_class.parameters(), lr=LEARNING_RATE)

In [40]:
# sentences_in_dict[word_col_name]

In [41]:
import random

def sentence_batch_generator(sentences, batch_indices):
    for idx in batch_indices:
        yield sentences[idx]

In [42]:
import torch
from tqdm import tqdm

def train_glove(model, training_data, sentences, N_epochs, loss_function, optimizer, word_to_ix, batch_size, per_batch):
    ids_of_inputs_col_name, capitalize_words_col_name, ner_tag_idx_col_name = per_batch
    store_predicted_ner_tag_scores_per_epoch = []

    for epoch_index in tqdm(range(N_epochs), desc="Epochs"):
        running_loss = 0
        store_predicted_ner_tag_scores = []

        model.train(True)  # Set the model to training mode
        
        for batch_idx, batch_of_training_data in enumerate(tqdm(training_data, desc=f"Epoch {epoch_index + 1}", leave=False)):
            optimizer.zero_grad()

            sentences_scores = batch_of_training_data[ids_of_inputs_col_name]
            one_hot_capitals = batch_of_training_data[capitalize_words_col_name]
            true_ner_tag_mappings = batch_of_training_data[ner_tag_idx_col_name]

            pred_ner_tag_scores_per_batch = model(sentences_scores, one_hot_capitals)
            loss_per_batch = loss_function(pred_ner_tag_scores_per_batch, true_ner_tag_mappings)

            # Accumulate the loss for this batch
            running_loss += loss_per_batch.item()

            # Backpropagation and optimization
            loss_per_batch.backward()
            optimizer.step()

            # Append the predicted scores for this batch
            store_predicted_ner_tag_scores.append((epoch_index, batch_idx, pred_ner_tag_scores_per_batch.detach().cpu()))

        epoch_loss = running_loss / len(training_data)
        print(f"Epoch {epoch_index}: Average Loss: {epoch_loss}")

        store_predicted_ner_tag_scores_per_epoch.append(store_predicted_ner_tag_scores)

    return store_predicted_ner_tag_scores_per_epoch


In [43]:
# glove_vocabulary[1:]

In [44]:
per_batch = (ids_of_inputs_col_name, capitalize_words_col_name, ner_tag_idx_col_name)
predicted_ner_tag_scores_per_epoch = train_glove(gloVe_blstm_model_class, train_dataloader_glove, glove_vocabulary[2:], 2, LOSS_FUNCTION, OPTIMIZER, words_with_idx, TRAIN_BATCH_SIZE, per_batch)


Epochs:  50%|█████     | 1/2 [00:24<00:24, 24.77s/it]

Epoch 0: Average Loss: 0.17267637612654807


Epochs: 100%|██████████| 2/2 [00:51<00:00, 25.92s/it]

Epoch 1: Average Loss: 0.12586468667426007





In [45]:
# Get the length of store_predicted_ner_tag_scores_per_epoch and store_predicted_ner_tag_scores_per_epoch[-1]
length_of_epoch_scores = len(predicted_ner_tag_scores_per_epoch)
last_epoch_scores_length = len(predicted_ner_tag_scores_per_epoch[-1])

# Print the explanation
print(f"The length of store_predicted_ner_tag_scores_per_epoch is {length_of_epoch_scores}.")
print(f"The length of store_predicted_ner_tag_scores_per_epoch[-1] is {last_epoch_scores_length}.")
print("This indicates that during the last epoch of training:")
print("- The training data was divided into", last_epoch_scores_length, "batches,")
print("- And the model processed each batch, making predictions for each batch.")


The length of store_predicted_ner_tag_scores_per_epoch is 2.
The length of store_predicted_ner_tag_scores_per_epoch[-1] is 235.
This indicates that during the last epoch of training:
- The training data was divided into 235 batches,
- And the model processed each batch, making predictions for each batch.


In [46]:
# predicted_ner_tags_scores = predicted_ner_tag_scores_per_epoch[-1]
# type(predicted_ner_tags_scores), len(predicted_ner_tags_scores)

In [47]:
# predicted_ner_tags_scores

In [48]:
# def find_max_score_and_tag(predicted_ner_tags_scores, tag_to_ix):
#     """Find the maximum scores and corresponding tags"""
#     # print(ner_tag_prediction_scores)
#     store_predicted_ner_tags = []
# 
#     for predicted_ner_tags_scores_idx in range(len(predicted_ner_tags_scores)):
#         ner_tags_per_sentence = []
#         batch = predicted_ner_tags_scores[predicted_ner_tags_scores_idx]
#         # print(batch, type(batch))
        
#         batch_idx = batch[0]
#         batch_sentences = batch[1]
#         batch_scores = batch[2][0]
#         # print(f"batch_sentences: {batch_sentences}")
#         # print(f"   batch_scores: {batch_scores.shape}")
        
#         max_scores, max_tag_idxs = torch.max(batch_scores, dim=1)
#         # print(f"   max_tag_idxs: {max_tag_idxs}")
#         max_tag_idxs = max_tag_idxs.tolist()
#         max_tag_idxs = sum(max_tag_idxs, [])
#         # print(f"   max_tag_idxs: {max_tag_idxs}")
#         # assert(False)

#         pred_ner_tag_mapped = []
#         for idx in max_tag_idxs:
#             for key, value in tag_to_ix.items():
#                 if key == idx:
#                     # print(True)
#                     pred_ner_tag_mapped.append(value)
#                     # assert(False)
#                     break
#             ner_tags_per_sentence.append(pred_ner_tag_mapped)
#         # print("pred_ner_tag_mapped")
#         # print(pred_ner_tag_mapped)
#         # assert(False)
#         store_predicted_ner_tags.append((batch_idx, batch_sentences, ner_tags_per_sentence))

#     return store_predicted_ner_tags

In [49]:
# predicted_ner_tags = find_max_score_and_tag(predicted_ner_tags_scores, idx_at_ner_tag)

In [50]:
# def find_max_score_and_tag(predicted_ner_tags_scores, tag_to_ix):
#     """Find the maximum scores and corresponding tags"""
#     store_predicted_ner_tags = []

#     for batch_idx, batch_sentences, batch_scores in predicted_ner_tags_scores:
#         max_scores, max_tag_idxs = torch.max(batch_scores, dim=1)
#         max_tag_idxs = max_tag_idxs.tolist()

#         # Map tag indices to actual tags using tag_to_ix
#         ner_tags_per_sentence = [[tag_to_ix[idx] for idx in tag_idxs] for tag_idxs in max_tag_idxs]

#         store_predicted_ner_tags.append((batch_idx, batch_sentences, ner_tags_per_sentence))

#     return store_predicted_ner_tags


In [174]:
def find_max_score_and_tag(predicted_ner_tags_scores, tag_to_ix):
    """Find the maximum scores and corresponding tags"""
    store_predicted_ner_tags = []

    # print(f"predicted_ner_tags_scores: {len(predicted_ner_tags_scores)}")
    for epoch_idx in range(len(predicted_ner_tags_scores)):
        # print(f"epoch_idx: {epoch_idx}")

        predicted_ner_tags_score_per_epochs = predicted_ner_tags_scores[epoch_idx]
        # print(f"predicted_ner_tags_score_per_epochs: {len(predicted_ner_tags_score_per_epochs)}")

        for batch_idx in range(len(predicted_ner_tags_score_per_epochs)):
            predicted_ner_tags_score_per_batch = predicted_ner_tags_score_per_epochs[batch_idx]
            # print(f"predicted_ner_tags_score_per_batch: {type(predicted_ner_tags_score_per_batch)}")

            _, _, predicted_ner_tags_score = predicted_ner_tags_score_per_batch
            # print(f"predicted_ner_tags_score: {type(predicted_ner_tags_score)}")
            
        # assert(False)

            max_scores, max_tag_idxs = torch.max(predicted_ner_tags_score, dim=1)
            max_tag_idxs = max_tag_idxs.tolist()

            ner_tags_per_sentence = []
            for idxs in max_tag_idxs:
                for idx in idxs:
                    for key, value in tag_to_ix.items():
                        # print(key, value, idx)
                        # assert(False)
                        if key == idx:
                            # print(value)
                            # assert(False)
                            ner_tags_per_sentence.append(value)
                            break
                 store_predicted_ner_tags.append((batch_idx, ner_tags_per_sentence))

    return store_predicted_ner_tags

In [175]:
# predicted_ner_tag_scores_per_epoch[0][1][2]

In [176]:
max_score_and_tag_predictions = find_max_score_and_tag(predicted_ner_tag_scores_per_epoch, idx_at_ner_tag)



In [177]:
len(max_score_and_tag_predictions)

470

In [169]:
len(glove_vocabulary)

400002

In [None]:
def create_ner_dataframe(predicted_ner_tags):
    
    data_for_df = []
    
    for idx, batch in enumerate(predicted_ner_tags):
        # print(batch)
        # assert(False)
        batch_idx = batch[0]
        batch_sentences = batch[1]
        batch_ner_tags = batch[2]
        # print(batch_ner_tags)
        # assert(False)

        
        for sentence_tag_idx, (sentence, ner_tags) in enumerate(zip(batch_sentences, batch_ner_tags)):
            # print(sentence_tag_idx)
            # print(sentence, ner_tags)

            for row_idx, (word, tag) in enumerate(zip(sentence, ner_tags)):
                # print(row_idx, word, tag)
                # if word == ".":
                    # print(word, tag)
                data_for_df.append((row_idx + 1, word, tag))
            data_for_df.append((" ", " ", " "))
        # print(data_for_df)
    results_df = pd.DataFrame(data_for_df, columns=['New Sentence Index', 'Word', 'NER Tag'], index=range(1, len(data_for_df)+1))
    return results_df

In [None]:
results_df = create_ner_dataframe(predicted_ner_tags)

In [None]:
results_df

In [None]:
results_df.head(20)

In [None]:
import csv

file_path = 'results/train2.out'

# when saving dataframe as txt with df.to_csv, my single quotes (") are turning into triple quotes ("""). To NOT do this, add quoting=csv.QUOTE_NONE

results_df.to_csv(file_path, sep=' ', header=False, quoting=csv.QUOTE_NONE, index=False, escapechar=' ')
# updated_results_df.to_csv(file_path, sep=' ', header=False, quoting=csv.QUOTE_NONE, index=False, escapechar=' ')