# Named Entity Recognition with Bidirectional LSTMs
- TUTORIAL: [Sequence Models and Long Short-Term Memory Networks](https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html#sequence-models-and-long-short-term-memory-networks)

In [63]:
# imports
# import pdb # for step by step debugging
# assert(False) # use to stop at a specific line (think of like stop, quit, exit, etc)
import torch

import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


from tqdm import tqdm
from datasets import Dataset

from torch.nn.utils import clip_grad_norm_
from torch.nn.utils.rnn import pad_sequence


## Load Data

In [64]:
# Load and Update Data
def load_data(file_path, word_col_name, ner_tag_col_name):
    sentences = []
    sentences_ner_tags = []
    words = []
    ner_tags = []
    
    with open(file_path, 'r') as f:
        lines = f.read().split("\n") # separate each sentence by new line
        for line in lines:            
            each_line = line.split(" ")
            if len(each_line) != 1:
                text_idx, input_text, ner_tag = each_line[0], each_line[1], each_line[2]
                # print(f"Line {text_idx} has word {input_text}, with NER Tag {ner_tag} --- {sentence_idx}")
  
                words.append(input_text)
                ner_tags.append(ner_tag)
            else:
                if words and ner_tags:
                    # print(f"words {words}")
                    sentences.append(words)
                    # print(f"ner_tags {ner_tags}")
                    sentences_ner_tags.append(ner_tags)
                    
                    words = []
                    ner_tags = []

    """Return as Dictionaries"""
    sentences_in_dict = {}
    ner_tags_in_dict = {}

    sentences_in_dict[word_col_name] = sentences
    ner_tags_in_dict[ner_tag_col_name] = sentences_ner_tags


    return sentences_in_dict, ner_tags_in_dict

In [65]:
train_file_path = "data/train"
dev_file_path = "data/dev"
# Add test file; eval.py -p results/test1.out -g data/test

new_sentence_col_name = 'New Sentence Index'
word_col_name = 'Word'
ner_tag_col_name = 'NER Tag'
ner_tag_idx_col_name = 'NER Tag Idx'


train_sentences_in_dict, train_ner_tags_in_dict = load_data(train_file_path, word_col_name, ner_tag_col_name)

dev_sentences_in_dict, dev_ner_tags_in_dict = load_data(dev_file_path, word_col_name, ner_tag_col_name)

In [66]:
train_sentences_in_dict

{'Word': [['EU',
   'rejects',
   'German',
   'call',
   'to',
   'boycott',
   'British',
   'lamb',
   '.'],
  ['Peter', 'Blackburn'],
  ['BRUSSELS', '1996-08-22'],
  ['The',
   'European',
   'Commission',
   'said',
   'on',
   'Thursday',
   'it',
   'disagreed',
   'with',
   'German',
   'advice',
   'to',
   'consumers',
   'to',
   'shun',
   'British',
   'lamb',
   'until',
   'scientists',
   'determine',
   'whether',
   'mad',
   'cow',
   'disease',
   'can',
   'be',
   'transmitted',
   'to',
   'sheep',
   '.'],
  ['Germany',
   "'s",
   'representative',
   'to',
   'the',
   'European',
   'Union',
   "'s",
   'veterinary',
   'committee',
   'Werner',
   'Zwingmann',
   'said',
   'on',
   'Wednesday',
   'consumers',
   'should',
   'buy',
   'sheepmeat',
   'from',
   'countries',
   'other',
   'than',
   'Britain',
   'until',
   'the',
   'scientific',
   'advice',
   'was',
   'clearer',
   '.'],
  ['"',
   'We',
   'do',
   "n't",
   'support',
   'any',
  

In [67]:
train_ner_tags_in_dict

{'NER Tag': [['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'],
  ['B-PER', 'I-PER'],
  ['B-LOC', 'O'],
  ['O',
   'B-ORG',
   'I-ORG',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-MISC',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-MISC',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O'],
  ['B-LOC',
   'O',
   'O',
   'O',
   'O',
   'B-ORG',
   'I-ORG',
   'O',
   'O',
   'O',
   'B-PER',
   'I-PER',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-LOC',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O'],
  ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-ORG',
   'O',
   'O',
   'O',
   'B-PER',
   'I-PER',
   'I-PER',
   'I-PER',
   'O',
   'O',
   'O',
   'O',
   'O'],
  ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O

In [68]:
dev_sentences_in_dict

{'Word': [['CRICKET',
   '-',
   'LEICESTERSHIRE',
   'TAKE',
   'OVER',
   'AT',
   'TOP',
   'AFTER',
   'INNINGS',
   'VICTORY',
   '.'],
  ['LONDON', '1996-08-30'],
  ['West',
   'Indian',
   'all-rounder',
   'Phil',
   'Simmons',
   'took',
   'four',
   'for',
   '38',
   'on',
   'Friday',
   'as',
   'Leicestershire',
   'beat',
   'Somerset',
   'by',
   'an',
   'innings',
   'and',
   '39',
   'runs',
   'in',
   'two',
   'days',
   'to',
   'take',
   'over',
   'at',
   'the',
   'head',
   'of',
   'the',
   'county',
   'championship',
   '.'],
  ['Their',
   'stay',
   'on',
   'top',
   ',',
   'though',
   ',',
   'may',
   'be',
   'short-lived',
   'as',
   'title',
   'rivals',
   'Essex',
   ',',
   'Derbyshire',
   'and',
   'Surrey',
   'all',
   'closed',
   'in',
   'on',
   'victory',
   'while',
   'Kent',
   'made',
   'up',
   'for',
   'lost',
   'time',
   'in',
   'their',
   'rain-affected',
   'match',
   'against',
   'Nottinghamshire',
   '.'],
  

In [69]:
dev_ner_tags_in_dict

{'NER Tag': [['O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
  ['B-LOC', 'O'],
  ['B-MISC',
   'I-MISC',
   'O',
   'B-PER',
   'I-PER',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-ORG',
   'O',
   'B-ORG',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O'],
  ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-ORG',
   'O',
   'B-ORG',
   'O',
   'B-ORG',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-ORG',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-ORG',
   'O'],
  ['O',
   'O',
   'B-ORG',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-LOC',
   'I-LOC',
   'O',
   'B-ORG',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-LOC',
   'O',
   'B-PER',
   '

## Data Preprocessing

- `replace_ner_tag_with_idx`: function
    - Map ner tag with index
    
- `create_dataset`: function
- `create_ner_tag_mappings`: function

In [70]:
def replace_ner_tag_with_idx(to_map, key_name, reordered_dict, ner_tag_idx_col_name):
    """Pair ner tag with corresponding index"""
    final_dict = {}
    per_sentence = []
    all_sentences = []

    ner_tags = to_map[key_name]

    for ner_tags_idx in range(len(ner_tags)):
        sentence_ner_tags = ner_tags[ner_tags_idx]

        for sentence_ner_tags_idx in range(len(sentence_ner_tags)):
            ner_tag = sentence_ner_tags[sentence_ner_tags_idx]

            for key, value in reordered_dict.items():
                if ner_tag == value:
                    per_sentence.append(key)

        all_sentences.append(per_sentence)
        per_sentence = []

    final_dict[ner_tag_idx_col_name] = all_sentences

    return final_dict


In [71]:
idx_at_ner_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}

train_ner_tags_idx_in_dict = replace_ner_tag_with_idx(train_ner_tags_in_dict, ner_tag_col_name, idx_at_ner_tag, ner_tag_idx_col_name)

dev_ner_tags_idx_in_dict = replace_ner_tag_with_idx(dev_ner_tags_in_dict, ner_tag_col_name, idx_at_ner_tag, ner_tag_idx_col_name)

In [72]:
train_ner_tags_idx_in_dict

{'NER Tag Idx': [[3, 0, 7, 0, 0, 0, 7, 0, 0],
  [1, 2],
  [5, 0],
  [0,
   3,
   4,
   0,
   0,
   0,
   0,
   0,
   0,
   7,
   0,
   0,
   0,
   0,
   0,
   7,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [5,
   0,
   0,
   0,
   0,
   3,
   4,
   0,
   0,
   0,
   1,
   2,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   5,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   3,
   0,
   0,
   0,
   1,
   2,
   2,
   2,
   0,
   0,
   0,
   0,
   0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0],
  [0,
   0,
   0,
   0,
   0,
   0,
   0,
   3,
   0,
   0,
   1,
   2,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [1,
   0,
   7,
   0,
   0,
   0,
   0,
   5,

In [73]:
dev_ner_tags_idx_in_dict

{'NER Tag Idx': [[0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0],
  [5, 0],
  [7,
   8,
   0,
   1,
   2,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   3,
   0,
   3,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   3,
   0,
   3,
   0,
   3,
   0,
   0,
   0,
   0,
   0,
   0,
   3,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   3,
   0],
  [0,
   0,
   3,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   5,
   6,
   0,
   3,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   5,
   0,
   1,
   2,
   0,
   0,
   0,
   0,
   0],
  [0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [3,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   2,
   0,
   1,
   2,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   3,
   0,
   5,
   0],
  [1,
   0,
   0,


In [74]:
def create_dataset(sentences_dict, ner_tags_dict, ner_tags_idx_dict, word_col_name, ner_tag_idx_col_name, drop_ner_tags=True):


    train_dict = {}
    train_dict.update(sentences_dict)
    train_dict.update(ner_tags_dict)
    train_dict.update(ner_tags_idx_dict)

    train_df = pd.DataFrame(train_dict)

    if drop_ner_tags == True:
        sub_df = train_df.loc[:, [word_col_name, ner_tag_idx_col_name]]
        train_dataset = Dataset.from_dict(sub_df)
        return train_dataset
    else:
        print(f"Error with {drop_ner_tags}")


In [75]:
train_dataset = create_dataset(train_sentences_in_dict, train_ner_tags_in_dict, train_ner_tags_idx_in_dict, word_col_name, ner_tag_idx_col_name)

In [76]:
train_dataset

Dataset({
    features: ['Word', 'NER Tag Idx'],
    num_rows: 14987
})

In [77]:
dev_dataset = create_dataset(dev_sentences_in_dict, dev_ner_tags_in_dict, dev_ner_tags_idx_in_dict, word_col_name, ner_tag_idx_col_name)

In [78]:
dev_dataset

Dataset({
    features: ['Word', 'NER Tag Idx'],
    num_rows: 3466
})

### Load Glove Embeddings

In [79]:
# input_file = 'glove.6B.100d.gz'
# output_file = 'glove.6B.100d.txt'

# with gzip.open(input_file, 'rb') as f_in:
#     with open(output_file, 'wb') as f_out:
#         shutil.copyfileobj(f_in, f_out)

In [81]:
def load_glove_dataset(glove_file, embedding_dim, vocab_size=None):
    embeddings_index = {}
    vocabulary = []
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Loading GloVe embeddings", total=vocab_size):
            values = line.split()
            word = values[0]
            vocabulary.append(word)
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    # Create a matrix to hold the embeddings
    embedding_matrix = torch.randn((vocab_size, embedding_dim))
    for i, (word, embedding_vector) in tqdm(enumerate(embeddings_index.items()), desc="Creating embedding matrix"):
        if i < vocab_size:
            embedding_matrix[i] = torch.from_numpy(embedding_vector)

    return embedding_matrix, vocabulary

In [82]:
glove_file = 'glove.6B.100d.txt'
glove_embedding_dim = 100
glove_vocab_size = 400002

glove_embedding_matrix, glove_vocabulary = load_glove_dataset(glove_file, glove_embedding_dim, glove_vocab_size)

Loading GloVe embeddings: 100%|█████████▉| 400000/400002 [00:07<00:00, 52279.02it/s]
Creating embedding matrix: 400000it [00:02, 182726.99it/s]


In [83]:
glove_embeddings = glove_embedding_matrix.numpy()
# glove_embeddings

In [84]:
glove_vocabulary = ['PAD', 'UNK'] + glove_vocabulary
# glove_vocabulary

In [85]:
def build_vocab_glove(words):
    """Build vocabulary for GloVe embeddings.

    Parameters
    ----------
    words: `list`:
        List of vocabulary terms.

    Returns
    -------
    word_idx, idx_word: `tuple`
        A tuple containing two dictionaries mapping words to their corresponding indices and mapping indices to their corresponding words.
    """
    word_at_idx = {}

    for word_idx, word in enumerate(words):
        # print(word, word_idx)
        
        word_at_idx[word] = word_idx
        # assert(False)

    idx_at_word = {}

    for word, word_idx in word_at_idx.items():
        # print(word, idx)

        idx_at_word[word_idx] = word
        
    return word_at_idx, idx_at_word


In [86]:
words_with_idx, idx_with_words = build_vocab_glove(np.array(glove_vocabulary))

In [87]:
# Define the conversion function
def convert_text_to_input_ids_glove(row, words_with_idx, ids_of_inputs_col_name, capitalize_words_col_name):
    """Convert tokenized text into input IDs using GloVe embeddings.
    
    Parameters
    ----------
    row: `dict`
        A dictionary containing the tokens for a single data point.
        
    Returns
    -------
    dict: A dictionary containing the input IDs and capitalization vector for the tokens.
        - 'ids_of_inputs' (list): List of input IDs corresponding to each token.
        - 'capital_words_vector' (list): Binary vector indicating capitalization of each token.
    """
    capital_words_vector = []  # Initialize list to store capitalization vector
    ids_of_inputs = []  # Initialize list to store input IDs
    for Word in row['Word']:  # Iterate through each word in the row
        # Check if any character in the word is uppercase
        if any(x.isupper() for x in Word):
            capital_words_vector.append(1)  # Append 1 if uppercase character is found
        else:
            capital_words_vector.append(0)  # Append 0 if no uppercase character is found
        
        Word_lower = Word.lower()  # Convert word to lowercase
        # Check if lowercase word is in word2idx dictionary
        if Word_lower in words_with_idx:
            ids_of_inputs.append(words_with_idx[Word_lower])  # Append corresponding index from word2idx
        else:
            ids_of_inputs.append(1)  # Append index 1 as default if word not found in word2idx
    
    return {
        ids_of_inputs_col_name: ids_of_inputs,  # Return input IDs
        capitalize_words_col_name: capital_words_vector  # Return capitalization vector
    }

In [88]:
ids_of_inputs_col_name = 'ID of Input'
capitalize_words_col_name = 'Capital Word'
# Assign the conversion function to a variable
convert_function = convert_text_to_input_ids_glove

# Create an empty list to store converted data points
converted_data = []

# Iterate through each row in the dataset and apply the conversion function
for row in train_dataset:
    converted_data.append(convert_function(row, words_with_idx, ids_of_inputs_col_name, capitalize_words_col_name))


In [89]:
def convert_dataset_to_glove(dataset, convert_function, words_with_idx, word_col_name, ner_tag_idx_col_name, ids_of_inputs_col_name, capitalize_words_col_name):
    converted_data = []

    for row in dataset:
        converted_row = {}
        converted_row[word_col_name] = row[word_col_name]
        converted_row[ner_tag_idx_col_name] = row[ner_tag_idx_col_name]
        converted_result = convert_function(row, words_with_idx, ids_of_inputs_col_name, capitalize_words_col_name)
        # print(converted_result)
        # assert(False)
        converted_row[ids_of_inputs_col_name] = converted_result[ids_of_inputs_col_name]
        converted_row[capitalize_words_col_name] = converted_result[capitalize_words_col_name]
        converted_data.append(converted_row)


    converted_data_dict = {}
    for key in converted_data[0]:
        converted_data_dict[key] = []
        for row in converted_data:
            converted_data_dict[key].append(row[key])

    # Create the converted_dataset
    converted_dataset = Dataset.from_dict(converted_data_dict)

    return converted_dataset

In [90]:
train_dataset_glove = convert_dataset_to_glove(train_dataset, convert_text_to_input_ids_glove, words_with_idx, word_col_name, ner_tag_idx_col_name, ids_of_inputs_col_name, capitalize_words_col_name)


In [91]:
train_dataset_glove.set_format(
    type='torch',
    columns=[ner_tag_idx_col_name, ids_of_inputs_col_name, capitalize_words_col_name]
)
print(train_dataset_glove[0])

{'NER Tag Idx': tensor([3, 0, 7, 0, 0, 0, 7, 0, 0]), 'ID of Input': tensor([  646,  7580,   516,   582,     6,  5262,   299, 10240,     4]), 'Capital Word': tensor([1, 0, 1, 0, 0, 0, 1, 0, 0])}


## 1. Simple Bidirectional LSTM model

- `pad_sequences_in_batch`: function
- `BLSTM_NER_TAGGER`: class
    - Create the BLSTM
- `train_with_eval_model`: function
    - Train THE BLSTM

In [93]:
def pad_sequence_in_batch(batch, ner_tag_idx_col_name=ner_tag_idx_col_name, ids_of_inputs_col_name=ids_of_inputs_col_name, capitalize_words_col_name=capitalize_words_col_name):
    """Pad sequences per batch
    Combines individual samples into batches, padding sequences to the same length.
    
    Parameter
    ---------
    batch: `list`
        A list of dictionaries, each containing 'ID of Input', 'Capital Word', and NER Tag Idx.

    Return
    ------
    padded_data: `dict`
        A dictionary containing padded sequences of ids_of_inputs, capital_words_vector, and NER Tag Idx.

    """
    ids_of_inputs_list = []
    capital_words_vector_list = []
    ner_tag_idx_list = []

    # Iterate through each item in the batch
    for batch_item in batch:
        # Get ids_of_inputs, capital_words_vector, and NER Tag Idx from the current item
        ids_of_inputs = batch_item[ids_of_inputs_col_name]
        capital_words_vector = batch_item[capitalize_words_col_name]
        ner_tag_idx = batch_item[ner_tag_idx_col_name]

        ids_of_inputs_list.append(ids_of_inputs)
        capital_words_vector_list.append(capital_words_vector)
        ner_tag_idx_list.append(ner_tag_idx)

    padded_ids_of_inputs = pad_sequence(ids_of_inputs_list, batch_first=True)
    padded_capital_words_vector = pad_sequence(capital_words_vector_list, batch_first=True)
    padded_ner_tag_idx = pad_sequence(ner_tag_idx_list, batch_first=True, padding_value=0)

    padded_data = {
        ids_of_inputs_col_name: padded_ids_of_inputs,
        capitalize_words_col_name: padded_capital_words_vector,
        ner_tag_idx_col_name: padded_ner_tag_idx
    }

    # Return the dictionary
    return padded_data


### Create the BLSTM

In [98]:
class BLLSTM_NER_Tagger_GloVe(nn.Module):
  def __init__(self, embedding_dim, hidden_dim, output_dim, dropout_percentage, vocab_mappings, ner_tag_mappings, pretrained_embeddings):
    super(BLLSTM_NER_Tagger_GloVe, self).__init__()

    self.word_embeddings = nn.Embedding.from_pretrained(torch.from_numpy(pretrained_embeddings).float())
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True, num_layers=1)
    self.dropout = nn.Dropout(p=dropout_percentage)
    self.linear_1 = nn.Linear(hidden_dim * 2, output_dim)
    self.linear_elu = nn.ELU()
    self.linear_classifier = nn.Linear(output_dim, len(ner_tag_mappings))

  def forward(self, sentences, all_capitals):
    # print(f"sentences shape: {sentences.shape}")
    # assert(False)
    embeds = self.word_embeddings(sentences)
    # print(f"embeds shape: {embeds.shape}")

    all_capitals = all_capitals.unsqueeze(2)
    # print(f"all_capitals shape: {all_capitals.shape}")

    # concatenating captial vectors at the end of embeddings
    lstm_input = torch.cat([embeds, all_capitals], dim=2)
    # print(f"lstm_input shape: {lstm_input.shape}")

     # assert(False)
    lstm_out, _ = self.lstm(lstm_input)
    # print(f"lstm_out shape: {lstm_out.shape}")

    lstm_dropout = self.dropout(lstm_out)
    # print(f"lstm_dropout shape: {lstm_dropout.shape}")

    fc_layer = self.linear_1(lstm_dropout)
    # print(f"fc_layer shape: {fc_layer.shape}")

    elu_layer = self.linear_elu(fc_layer)
    # print(f"elu_layer shape: {elu_layer.shape}")

    tag_scores = self.linear_classifier(elu_layer)
    # print(f"tag_scores shape: {tag_scores.shape}")

    tag_scores = tag_scores.permute(0, 2, 1)
    # print(f"tag_scores shape: {tag_scores.shape}")
      
    return tag_scores


In [99]:
EMBEDDING_DIM = 100
LSTM_LAYERS = 1

HIDDEN_DIM = 256
DROPOUT = 0.33

OUTPUT_DIM = 128

TRAIN_BATCH_SIZE = 64

train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
}

In [100]:
train_dataloader_glove = torch.utils.data.DataLoader(train_dataset_glove, collate_fn=pad_sequence_in_batch, **train_params)

In [101]:
gloVe_blstm_model_class = BLLSTM_NER_Tagger_GloVe(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, words_with_idx, idx_at_ner_tag, pretrained_embeddings=glove_embeddings)
gloVe_blstm_model_class


BLLSTM_NER_Tagger_GloVe(
  (word_embeddings): Embedding(400002, 100)
  (lstm): LSTM(100, 256, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.33, inplace=False)
  (linear_1): Linear(in_features=512, out_features=128, bias=True)
  (linear_elu): ELU(alpha=1.0)
  (linear_classifier): Linear(in_features=128, out_features=9, bias=True)
)

### TRAIN THE BLSTM

In [102]:
LEARNING_RATE = 1
LOSS_FUNCTION = nn.CrossEntropyLoss()
OPTIMIZER = optim.SGD(gloVe_blstm_model_class.parameters(), lr=LEARNING_RATE)

In [104]:
def train_glove(model, training_data, dev_data, N_epochs, loss_function, optimizer, train_word_to_ix, dev_word_to_ix, dev_ner_tag_to_idx, batch_size, per_batch):
    ids_of_inputs_col_name, capitalize_words_col_name, ner_tag_idx_col_name = per_batch
    train_losses = []
    dev_losses = []
    best_val_score = 0.3

    for epoch_index in tqdm(range(N_epochs), desc="Epochs"):
        running_loss = 0

        model.train(True)  # Set the model to training mode
        
        for batch_idx, batch_of_training_data in enumerate(tqdm(training_data, desc=f"Epoch {epoch_index + 1}", leave=False)):
            optimizer.zero_grad()

            sentences_scores = batch_of_training_data[ids_of_inputs_col_name]
            one_hot_capitals = batch_of_training_data[capitalize_words_col_name]
            true_ner_tag_mappings = batch_of_training_data[ner_tag_idx_col_name]

            pred_ner_tag_scores_per_batch = model(sentences_scores)
            loss = loss_function(pred_ner_tag_scores_per_batch, true_ner_tag_mappings)
            loss.backward()

            # Clip gradients to prevent exploding gradients
            clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            # Accumulate the loss for this batch
            training_loss += loss.item() * batch_size

        training_loss /= len(training_data)
        print('Epoch: {} \tTraining Loss: {:.7f}'.format(epoch_index+1, training_loss))
        train_losses.append(training_loss)

        # Evaluate the model on development data
        dev_loss = evaluate_model(model, dev_data, loss_function, dev_word_to_ix, dev_ner_tag_to_idx, batch_size)
        dev_losses.append(dev_loss)

        # Save the model if it has the best validation score so far
        if dev_loss < best_val_score:
            best_val_score = dev_loss
            torch.save(model.state_dict(), 'best_model.pth')

    return train_losses, dev_losses

In [105]:
per_batch = (ids_of_inputs_col_name, capitalize_words_col_name, ner_tag_idx_col_name)

predicted_ner_tag_scores_per_epoch = train_glove(gloVe_blstm_model_class, train_dataloader_glove, glove_vocabulary[2:], 2, LOSS_FUNCTION, OPTIMIZER, words_with_idx, TRAIN_BATCH_SIZE, per_batch)


Epochs:  50%|█████     | 1/2 [00:22<00:22, 22.36s/it]

Epoch 0: Average Loss: nan


Epochs: 100%|██████████| 2/2 [00:45<00:00, 22.80s/it]

Epoch 1: Average Loss: nan





# IGNORE

In [None]:
# predicted_ner_tag_scores_per_epoch

In [None]:
# predicted_ner_tags_scores = predicted_ner_tag_scores_per_epoch[-1][1]
# type(predicted_ner_tags_scores), len(predicted_ner_tags_scores)

In [None]:
# predicted_ner_tags_scores[0][2][0]

In [None]:
# def train_model_refactored(model, training_data, N_epochs, loss_function, optimizer, word_to_ix, tag_to_ix, hidden_dim, embed_dim, with_batch, batch_size): 
#     train_losses = [] 
#     best_val_score = 0.3

#     for epoch_index in tqdm(range(N_epochs)):
#         print(f"EPOCH {epoch_index}")
        

#         trainining_loss = 0.0 
        
#         # Make sure gradient tracking is on, and do a pass over the data
#         model.train(True)
        
#         # Batch the sentences and tags
#         for batch_idx, start_index in enumerate((range(0, len(training_data), batch_size))):
#             store_pred_tag_scores_per_batch = []
#             end_index = min(start_index + batch_size, len(training_data))
#             actual_batch_size = end_index - start_index
#             sentences_in_batch, ner_tags_batched = batch_sentence(training_data, start_index, end_index, with_batch)
#             # print(f"Batch: {batch_idx} {sentences_in_batch}")
#             # assert(False)

#             optimizer.zero_grad()

#             input_sentences_idx = prepare_sequence(sentences_in_batch, word_to_ix, with_batch)
#             true_ner_tag_mappings, seq_in_batch_max_length = prepare_sequence(ner_tags_batched, tag_to_ix, with_batch)

#             pred_ner_tag_scores_per_batch = model(input_sentences_idx)
#             # print(f"{pred_ner_tag_scores_per_batch.shape}, {selected_labels.shape}")

#             # print(f"{pred_ner_tag_scores.shape}, {true_ner_tag_mappings.shape}")
#             loss = loss_function(pred_ner_tag_scores_per_batch, true_ner_tag_mappings)
#             loss.backward()

#             # Clip gradients to prevent exploding gradients
#             clip_grad_norm_(model.parameters(), max_norm=1.0)
#             optimizer.step()

#             # Accumulate the loss for this batch
#             training_loss += loss.item()
#         training_loss = training_loss / len(training_data)
#         print('Epoch: {} \tTraining Loss: {:.7f}'.format(epoch_index+1, training_loss))

#         train_losses.append(training_loss)

#         # Print or log the average loss for this epoch
#         epoch_loss = training_loss / (len(training_data) / batch_size)
#         print(f"Epoch {epoch_index}: Average Loss: {epoch_loss}")
#         assert(False)



In [None]:
def find_max_score_and_tag(ner_tag_prediction_scores, tag_to_ix, with_batch_bool):
    """Find the maximum scores and corresponding tags"""
    # print(ner_tag_prediction_scores)
    store_predicted_ner_tags = []

    if with_batch_bool == True:
        for predicted_ner_tags_scores_idx in range(len(predicted_ner_tags_scores)):
            ner_tags_per_sentence = []
            batch = predicted_ner_tags_scores[predicted_ner_tags_scores_idx]
            # print(batch_idx, type(batch_idx))
            batch_idx = batch[0]
            batch_sentences = batch[1]
            batch_scores = batch[2][0]
            # print(f"batch_sentences: {batch_sentences}")
            # print(f"   batch_scores: {batch_scores.shape}")
            max_scores, max_tag_idxs = torch.max(batch_scores, dim=1)
            # print(f"   max_tag_idxs: {max_tag_idxs}")
            max_tag_idxs = max_tag_idxs.tolist()
            max_tag_idxs = sum(max_tag_idxs, [])
            # print(f"   max_tag_idxs: {max_tag_idxs}")
            pred_ner_tag_mapped = []
            for idx in max_tag_idxs:
                for key, value in tag_to_ix.items():
                    if value == idx:
                        pred_ner_tag_mapped.append(key)
                        break
                ner_tags_per_sentence.append(pred_ner_tag_mapped)
            # print(ner_tags_per_sentence)
            store_predicted_ner_tags.append((batch_idx, batch_sentences, ner_tags_per_sentence))
            # assert(False)
        # print(store_predicted_ner_tags)
        # assert(False)
        
    else:

        for sequence_idx, sentences, pred_ner_tag_scores in ner_tag_prediction_scores:
            # print(sequence_idx, ner_tag_prediction_scores)
            index = ner_tag_prediction_scores[sequence_idx][0]
            sentence = ner_tag_prediction_scores[sequence_idx][1]
            scores = ner_tag_prediction_scores[sequence_idx][2]
            # print(index, sentence, scores)

            max_scores, max_tag_idxs = torch.max(scores, dim=1)
            max_tag_idxs = max_tag_idxs.tolist()
            max_tag_idxs = sum(max_tag_idxs, [])
            # print(max_tag_idxs)
            pred_ner_tag_mapped = []
            for idx in max_tag_idxs:
                for key, value in tag_to_ix.items():
                    if value == idx:
                        pred_ner_tag_mapped.append(key)
                        break

            store_predicted_ner_tags.append((index, sentence, pred_ner_tag_mapped))
    return store_predicted_ner_tags

In [None]:
predicted_ner_tags = find_max_score_and_tag(predicted_ner_tag_scores_per_epoch, tag_to_ix, with_batch)
# print(f"{index}, {sentence}, {pred_ner_tag}")
# store_predicted_ner_tags.append((index, sentence, pred_ner_tag))  # Store index, sentence, and predicted tags



In [None]:
# predicted_ner_tags

In [None]:
def create_ner_dataframe(predicted_ner_tags):
    
    data_for_df = []
    
    for idx, batch in enumerate(predicted_ner_tags):
        # print(batch)
        # assert(False)
        batch_idx = batch[0]
        batch_sentences = batch[1]
        batch_ner_tags = batch[2]
        # print(batch_ner_tags)

        
        for sentence_tag_idx, (sentence, ner_tags) in enumerate(zip(batch_sentences, batch_ner_tags)):
            # print(sentence_tag_idx)
            # print(sentence, ner_tags)

            for row_idx, (word, tag) in enumerate(zip(sentence, ner_tags)):
                # print(row_idx, word, tag)
                # if word == ".":
                    # print(word, tag)
                data_for_df.append((row_idx + 1, word, tag))
            data_for_df.append((" ", " ", " "))
        # print(data_for_df)
    results_df = pd.DataFrame(data_for_df, columns=['New Sentence Index', 'Word', 'NER Tag'], index=range(1, len(data_for_df)+1))
    return results_df

In [None]:
results_df = create_ner_dataframe(predicted_ner_tags)

In [None]:
results_df

In [None]:
results_df.head(20)

In [None]:
import csv

train_file_path = 'results/train1.out'
# dev_file_path = 'results/dev1.out'
# Add test file; python eval.py -p results/test1.out -g data/test


# when saving dataframe as txt with df.to_csv, my single quotes (") are turning into triple quotes ("""). To NOT do this, add quoting=csv.QUOTE_NONE

results_df.to_csv(train_file_path, sep=' ', header=False, quoting=csv.QUOTE_NONE, index=False, escapechar=' ')
# updated_results_df.to_csv(file_path, sep=' ', header=False, quoting=csv.QUOTE_NONE, index=False, escapechar=' ')

In [None]:
# def process_data(df):
#     sentence_index = 0
#     sentence_count = 1
#     rows = []

#     for index, row in df.iterrows():
#         if row['New Sentence Index'] != sentence_index:
#             sentence_count = 1
#             sentence_index = row['New Sentence Index']
#             rows.append([" ", " ", " "])
#         rows.append([sentence_count, row['Word'], row['NER Tag']])
        
#         sentence_count += 1

#     new_df = pd.DataFrame(rows, columns=['New Sentence Index', 'Word', 'NER Tag'])
#     return new_df

In [None]:
# updated_results_df = process_data(results_df)


In [None]:
# updated_results_df = updated_results_df.reset_index(drop=True)
# updated_results_df.index += 1
# updated_results_df.head(20)

In [None]:

            
            # lstm_input = embeds.view(batch_size, max_sentence_length, -1)
            # lstm_input = embeds.permute(1, 0, 2)  # Swap batch_size and sequence_length dimensions
            # print(f"lstm_input: {lstm_input.shape}")

            # lstm_input = embeds.view(len(sentences), sentences_in_batch, embed_dim)
            # print(f"lstm_input: {lstm_input.shape}")

            # lstm_out = lstm_out.view(len(sentences), -1)
            # lstm_out = lstm_out.permute(1, 0, 2).contiguous().view(-1, lstm_out.size(2))
            # print(f"2-lstm_out: {lstm_out.shape}")

            # lstm_dropout = self.dropout(lstm_out)
            # print(f"lstm_dropout: {lstm_dropout.shape}")

            # elu_input = self.linear_1(lstm_dropout)
            # print(f"elu_input: {elu_input.shape}")

            # tag_space_input = self.linear_elu(elu_input)
            # print(f"tag_space_input: {tag_space_input.shape}")
            
            # tag_scores = self.hidden2tag(tag_space_input)
            # print(f"tag_scores: {tag_scores.shape}")

            # # tag_scores = F.log_softmax(tag_space)
            # tag_scores = tag_scores.view(batch_size, N_ner_tags, N_words)
            # batch_size, num_outputs = tag_scores.size()[:2]
            # print(f"tag_scores: {tag_scores.shape}")
            