# HW 4-Named Entity Recognition with Bidirectional Long Short Term Memory Model

- Detravious Jamari Brinkley
- CSCI-544: Applied Natural Language Processing
- python version: 3.11.4

---

## Task 2: Using GloVe word embeddings

In [1]:
# imports
# import pdb # for step by step debugging
# assert(False) # use to stop at a specific line (think of like stop, quit, exit, etc)
import torch
import random


import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


from tqdm import tqdm
from datasets import Dataset



from sklearn.metrics import f1_score
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence


  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [2]:
# Load and Update Data
def load_data(file_path, word_col_name, ner_tag_col_name, test_data_bool=False):
    sentences = []
    sentences_ner_tags = []
    words = []
    ner_tags = []
    
    with open(file_path, 'r') as f:
        lines = f.read().split("\n") # separate each sentence by new line
        for line in lines:            
            each_line = line.split(" ")
            if len(each_line) != 1:
                if test_data_bool == False:
                    text_idx, input_text, ner_tag = each_line[0], each_line[1], each_line[2]
                    # print(f"Line {text_idx} has word {input_text}, with NER Tag {ner_tag} --- {sentence_idx}")

                    words.append(input_text)
                    ner_tags.append(ner_tag)
                else:
                    text_idx, input_text = each_line[0], each_line[1]
                    # print(f"Line {text_idx} has word {input_text}, with NER Tag {ner_tag} --- {sentence_idx}")

                    words.append(input_text)
                    ner_tag = '-3'
                    ner_tags.append(ner_tag)
            else:
                if words and ner_tags:
                    # print(f"words {words}")
                    sentences.append(words)
                    # print(f"ner_tags {ner_tags}")
                    sentences_ner_tags.append(ner_tags)
                    
                    words = []
                    ner_tags = []

    """Return as Dictionaries"""
    sentences_in_dict = {}
    ner_tags_in_dict = {}

    sentences_in_dict[word_col_name] = sentences
    ner_tags_in_dict[ner_tag_col_name] = sentences_ner_tags


    return sentences_in_dict, ner_tags_in_dict

In [3]:
train_file_path = "data/train"
dev_file_path = "data/dev"
test_file_path = "data/test"
# Add test file; eval.py -p results/test1.out -g data/test

new_sentence_col_name = 'New Sentence Index'
word_col_name = 'Word'

ner_tag_col_name = 'NER Tag'
ner_tag_idx_col_name = 'NER Tag Idx'

train_sentences_in_dict, train_ner_tags_in_dict = load_data(train_file_path, word_col_name, ner_tag_col_name)

dev_sentences_in_dict, dev_ner_tags_in_dict = load_data(dev_file_path, word_col_name, ner_tag_col_name)

test_sentences_in_dict, test_ner_tags_in_dict = load_data(test_file_path, word_col_name, ner_tag_col_name, True)

In [4]:
train_sentences_in_dict

{'Word': [['EU',
   'rejects',
   'German',
   'call',
   'to',
   'boycott',
   'British',
   'lamb',
   '.'],
  ['Peter', 'Blackburn'],
  ['BRUSSELS', '1996-08-22'],
  ['The',
   'European',
   'Commission',
   'said',
   'on',
   'Thursday',
   'it',
   'disagreed',
   'with',
   'German',
   'advice',
   'to',
   'consumers',
   'to',
   'shun',
   'British',
   'lamb',
   'until',
   'scientists',
   'determine',
   'whether',
   'mad',
   'cow',
   'disease',
   'can',
   'be',
   'transmitted',
   'to',
   'sheep',
   '.'],
  ['Germany',
   "'s",
   'representative',
   'to',
   'the',
   'European',
   'Union',
   "'s",
   'veterinary',
   'committee',
   'Werner',
   'Zwingmann',
   'said',
   'on',
   'Wednesday',
   'consumers',
   'should',
   'buy',
   'sheepmeat',
   'from',
   'countries',
   'other',
   'than',
   'Britain',
   'until',
   'the',
   'scientific',
   'advice',
   'was',
   'clearer',
   '.'],
  ['"',
   'We',
   'do',
   "n't",
   'support',
   'any',
  

In [5]:
train_ner_tags_in_dict

{'NER Tag': [['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'],
  ['B-PER', 'I-PER'],
  ['B-LOC', 'O'],
  ['O',
   'B-ORG',
   'I-ORG',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-MISC',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-MISC',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O'],
  ['B-LOC',
   'O',
   'O',
   'O',
   'O',
   'B-ORG',
   'I-ORG',
   'O',
   'O',
   'O',
   'B-PER',
   'I-PER',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-LOC',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O'],
  ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-ORG',
   'O',
   'O',
   'O',
   'B-PER',
   'I-PER',
   'I-PER',
   'I-PER',
   'O',
   'O',
   'O',
   'O',
   'O'],
  ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O

In [6]:
dev_sentences_in_dict

{'Word': [['CRICKET',
   '-',
   'LEICESTERSHIRE',
   'TAKE',
   'OVER',
   'AT',
   'TOP',
   'AFTER',
   'INNINGS',
   'VICTORY',
   '.'],
  ['LONDON', '1996-08-30'],
  ['West',
   'Indian',
   'all-rounder',
   'Phil',
   'Simmons',
   'took',
   'four',
   'for',
   '38',
   'on',
   'Friday',
   'as',
   'Leicestershire',
   'beat',
   'Somerset',
   'by',
   'an',
   'innings',
   'and',
   '39',
   'runs',
   'in',
   'two',
   'days',
   'to',
   'take',
   'over',
   'at',
   'the',
   'head',
   'of',
   'the',
   'county',
   'championship',
   '.'],
  ['Their',
   'stay',
   'on',
   'top',
   ',',
   'though',
   ',',
   'may',
   'be',
   'short-lived',
   'as',
   'title',
   'rivals',
   'Essex',
   ',',
   'Derbyshire',
   'and',
   'Surrey',
   'all',
   'closed',
   'in',
   'on',
   'victory',
   'while',
   'Kent',
   'made',
   'up',
   'for',
   'lost',
   'time',
   'in',
   'their',
   'rain-affected',
   'match',
   'against',
   'Nottinghamshire',
   '.'],
  

In [7]:
dev_ner_tags_in_dict

{'NER Tag': [['O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
  ['B-LOC', 'O'],
  ['B-MISC',
   'I-MISC',
   'O',
   'B-PER',
   'I-PER',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-ORG',
   'O',
   'B-ORG',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O'],
  ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-ORG',
   'O',
   'B-ORG',
   'O',
   'B-ORG',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-ORG',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-ORG',
   'O'],
  ['O',
   'O',
   'B-ORG',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-LOC',
   'I-LOC',
   'O',
   'B-ORG',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-LOC',
   'O',
   'B-PER',
   '

In [8]:
test_sentences_in_dict

{'Word': [['SOCCER',
   '-',
   'JAPAN',
   'GET',
   'LUCKY',
   'WIN',
   ',',
   'CHINA',
   'IN',
   'SURPRISE',
   'DEFEAT',
   '.'],
  ['Nadim', 'Ladki'],
  ['AL-AIN', ',', 'United', 'Arab', 'Emirates', '1996-12-06'],
  ['Japan',
   'began',
   'the',
   'defence',
   'of',
   'their',
   'Asian',
   'Cup',
   'title',
   'with',
   'a',
   'lucky',
   '2-1',
   'win',
   'against',
   'Syria',
   'in',
   'a',
   'Group',
   'C',
   'championship',
   'match',
   'on',
   'Friday',
   '.'],
  ['But',
   'China',
   'saw',
   'their',
   'luck',
   'desert',
   'them',
   'in',
   'the',
   'second',
   'match',
   'of',
   'the',
   'group',
   ',',
   'crashing',
   'to',
   'a',
   'surprise',
   '2-0',
   'defeat',
   'to',
   'newcomers',
   'Uzbekistan',
   '.'],
  ['China',
   'controlled',
   'most',
   'of',
   'the',
   'match',
   'and',
   'saw',
   'several',
   'chances',
   'missed',
   'until',
   'the',
   '78th',
   'minute',
   'when',
   'Uzbek',
   'striker',

In [9]:
test_ner_tags_in_dict

{'NER Tag': [['-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3'],
  ['-3', '-3'],
  ['-3', '-3', '-3', '-3', '-3', '-3'],
  ['-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3'],
  ['-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3'],
  ['-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   '-3',
   

## Data Preprocessing

- `replace_ner_tag_with_idx`: function
    - Map ner tag with index
    
- `create_dataset`: function
- `create_ner_tag_mappings`: function

In [10]:
def replace_ner_tag_with_idx(to_map, key_name, reordered_dict, ner_tag_idx_col_name):
    """Pair ner tag with corresponding index"""
    final_dict = {}
    per_sentence = []
    all_sentences = []

    ner_tags = to_map[key_name]

    for ner_tags_idx in range(len(ner_tags)):
        sentence_ner_tags = ner_tags[ner_tags_idx]

        for sentence_ner_tags_idx in range(len(sentence_ner_tags)):
            ner_tag = sentence_ner_tags[sentence_ner_tags_idx]

            for key, value in reordered_dict.items():
                if ner_tag == value:
                    per_sentence.append(key)

        all_sentences.append(per_sentence)
        per_sentence = []

    final_dict[ner_tag_idx_col_name] = all_sentences

    return final_dict


In [11]:
idx_at_ner_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}

train_ner_tags_idx_in_dict = replace_ner_tag_with_idx(train_ner_tags_in_dict, ner_tag_col_name, idx_at_ner_tag, ner_tag_idx_col_name)

dev_ner_tags_idx_in_dict = replace_ner_tag_with_idx(dev_ner_tags_in_dict, ner_tag_col_name, idx_at_ner_tag, ner_tag_idx_col_name)

test_idx_at_ner_tag = {0: '-3'}

test_ner_tags_idx_in_dict = replace_ner_tag_with_idx(test_ner_tags_in_dict, ner_tag_col_name, test_idx_at_ner_tag, ner_tag_idx_col_name)

In [12]:
train_ner_tags_idx_in_dict

{'NER Tag Idx': [[3, 0, 7, 0, 0, 0, 7, 0, 0],
  [1, 2],
  [5, 0],
  [0,
   3,
   4,
   0,
   0,
   0,
   0,
   0,
   0,
   7,
   0,
   0,
   0,
   0,
   0,
   7,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [5,
   0,
   0,
   0,
   0,
   3,
   4,
   0,
   0,
   0,
   1,
   2,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   5,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   3,
   0,
   0,
   0,
   1,
   2,
   2,
   2,
   0,
   0,
   0,
   0,
   0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0],
  [0,
   0,
   0,
   0,
   0,
   0,
   0,
   3,
   0,
   0,
   1,
   2,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [1,
   0,
   7,
   0,
   0,
   0,
   0,
   5,

In [13]:
dev_ner_tags_idx_in_dict

{'NER Tag Idx': [[0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0],
  [5, 0],
  [7,
   8,
   0,
   1,
   2,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   3,
   0,
   3,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   3,
   0,
   3,
   0,
   3,
   0,
   0,
   0,
   0,
   0,
   0,
   3,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   3,
   0],
  [0,
   0,
   3,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   5,
   6,
   0,
   3,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   5,
   0,
   1,
   2,
   0,
   0,
   0,
   0,
   0],
  [0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [3,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   2,
   0,
   1,
   2,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   3,
   0,
   5,
   0],
  [1,
   0,
   0,


In [14]:
test_ner_tags_idx_in_dict

{'NER Tag Idx': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0],
  [0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   

In [15]:
def create_dataset(sentences_dict, ner_tags_dict, ner_tags_idx_dict):

    store_data = {}
    store_data.update(sentences_dict)
    store_data.update(ner_tags_dict)
    store_data.update(ner_tags_idx_dict)

    data_df = pd.DataFrame(store_data)
    load_dataset = Dataset.from_dict(data_df)

    return load_dataset

In [16]:
train_dataset = create_dataset(train_sentences_in_dict, train_ner_tags_in_dict, train_ner_tags_idx_in_dict)

In [17]:
train_dataset

Dataset({
    features: ['Word', 'NER Tag', 'NER Tag Idx'],
    num_rows: 14987
})

In [18]:
dev_dataset = create_dataset(dev_sentences_in_dict, dev_ner_tags_in_dict, dev_ner_tags_idx_in_dict)

In [19]:
dev_dataset

Dataset({
    features: ['Word', 'NER Tag', 'NER Tag Idx'],
    num_rows: 3466
})

In [20]:
test_dataset = create_dataset(test_sentences_in_dict, test_ner_tags_in_dict, test_ner_tags_idx_in_dict)

In [21]:
test_dataset

Dataset({
    features: ['Word', 'NER Tag', 'NER Tag Idx'],
    num_rows: 3684
})

### Load Glove Embeddings

In [22]:
# input_file = 'glove.6B.100d.gz'
# output_file = 'glove.6B.100d.txt'

# with gzip.open(input_file, 'rb') as f_in:
#     with open(output_file, 'wb') as f_out:
#         shutil.copyfileobj(f_in, f_out)

In [23]:
def load_glove_dataset(glove_file, embedding_dim, vocab_size=None):
    embeddings_index = {}
    vocabulary = []
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Loading GloVe embeddings", total=vocab_size):
            values = line.split()
            word = values[0]
            vocabulary.append(word)
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    # Create a matrix to hold the embeddings
    embedding_matrix = torch.randn((vocab_size, embedding_dim))
    for i, (word, embedding_vector) in tqdm(enumerate(embeddings_index.items()), desc="Creating embedding matrix"):
        if i < vocab_size:
            embedding_matrix[i] = torch.from_numpy(embedding_vector)

    return embedding_matrix, vocabulary

In [24]:
glove_file = 'glove.6B.100d.txt'
glove_embedding_dim = 100
glove_vocab_size = 400002

glove_embedding_matrix, glove_vocabulary = load_glove_dataset(glove_file, glove_embedding_dim, glove_vocab_size)

Loading GloVe embeddings: 100%|█████████▉| 400000/400002 [00:06<00:00, 58088.04it/s]
Creating embedding matrix: 400000it [00:01, 211677.61it/s]


In [25]:
glove_embeddings = glove_embedding_matrix.numpy()
# glove_embeddings

In [26]:
glove_vocabulary = ['PAD', 'UNK'] + glove_vocabulary
# glove_vocabulary

In [27]:
def build_vocab_glove(words):
    """Build vocabulary for GloVe embeddings.

    Parameters
    ----------
    words: `list`:
        List of vocabulary terms.

    Returns
    -------
    word_idx, idx_word: `tuple`
        A tuple containing two dictionaries mapping words to their corresponding indices and mapping indices to their corresponding words.
    """
    word_at_idx = {}

    for word_idx, word in enumerate(words):
        # print(word, word_idx)
        
        word_at_idx[word] = word_idx
        # assert(False)

    idx_at_word = {}

    for word, word_idx in word_at_idx.items():
        # print(word, idx)

        idx_at_word[word_idx] = word
        
    return word_at_idx, idx_at_word


In [28]:
train_words_with_idx, train_idx_with_words = build_vocab_glove(np.array(glove_vocabulary))
dev_words_with_idx, dev_idx_with_words = build_vocab_glove(np.array(glove_vocabulary))
test_words_with_idx, test_idx_with_words = build_vocab_glove(np.array(glove_vocabulary))

In [29]:
# Define the conversion function
def convert_text_to_input_ids_glove(row, words_with_idx, ids_of_inputs_col_name, capitalize_words_col_name):
    """Convert tokenized text into input IDs using GloVe embeddings.
    
    Parameters
    ----------
    row: `dict`
        A dictionary containing the tokens for a single data point.
        
    Returns
    -------
    dict: A dictionary containing the input IDs and capitalization vector for the tokens.
        - 'ids_of_inputs' (list): List of input IDs corresponding to each token.
        - 'capital_words_vector' (list): Binary vector indicating capitalization of each token.
    """
    capital_words_vector = []  # Initialize list to store capitalization vector
    ids_of_inputs = []  # Initialize list to store input IDs
    for Word in row['Word']:  # Iterate through each word in the row
        # Check if any character in the word is uppercase
        if any(x.isupper() for x in Word):
            capital_words_vector.append(1)  # Append 1 if uppercase character is found
        else:
            capital_words_vector.append(0)  # Append 0 if no uppercase character is found
        
        Word_lower = Word.lower()  # Convert word to lowercase
        # Check if lowercase word is in word2idx dictionary
        if Word_lower in words_with_idx:
            ids_of_inputs.append(words_with_idx[Word_lower])  # Append corresponding index from word2idx
        else:
            ids_of_inputs.append(1)  # Append index 1 as default if word not found in word2idx
    
    return {
        ids_of_inputs_col_name: ids_of_inputs,  # Return input IDs
        capitalize_words_col_name: capital_words_vector  # Return capitalization vector
    }

In [30]:
ids_of_inputs_col_name = 'ID of Input'
capitalize_words_col_name = 'Capital Word'
# Assign the conversion function to a variable
convert_function = convert_text_to_input_ids_glove

# Create an empty list to store converted data points
converted_data = []

# Iterate through each row in the dataset and apply the conversion function
for row in train_dataset:
    converted_data.append(convert_function(row, train_words_with_idx, ids_of_inputs_col_name, capitalize_words_col_name))


In [31]:
def convert_dataset_to_glove(dataset, convert_function, words_with_idx, word_col_name, ner_tag_idx_col_name, ids_of_inputs_col_name, capitalize_words_col_name):
    converted_data = []

    for row in dataset:
        converted_row = {}
        converted_row[word_col_name] = row[word_col_name]
        converted_row[ner_tag_idx_col_name] = row[ner_tag_idx_col_name]
        converted_result = convert_function(row, words_with_idx, ids_of_inputs_col_name, capitalize_words_col_name)
        # print(converted_result)
        # assert(False)
        converted_row[ids_of_inputs_col_name] = converted_result[ids_of_inputs_col_name]
        converted_row[capitalize_words_col_name] = converted_result[capitalize_words_col_name]
        converted_data.append(converted_row)


    converted_data_dict = {}
    for key in converted_data[0]:
        converted_data_dict[key] = []
        for row in converted_data:
            converted_data_dict[key].append(row[key])

    # Create the converted_dataset
    converted_dataset = Dataset.from_dict(converted_data_dict)

    return converted_dataset

In [32]:
train_dataset_glove = convert_dataset_to_glove(train_dataset, convert_text_to_input_ids_glove, train_words_with_idx, word_col_name, ner_tag_idx_col_name, ids_of_inputs_col_name, capitalize_words_col_name)


In [33]:
dev_dataset_glove = convert_dataset_to_glove(dev_dataset, convert_text_to_input_ids_glove, dev_words_with_idx, word_col_name, ner_tag_idx_col_name, ids_of_inputs_col_name, capitalize_words_col_name)


In [34]:
test_dataset_glove = convert_dataset_to_glove(test_dataset, convert_text_to_input_ids_glove, test_words_with_idx, word_col_name, ner_tag_idx_col_name, ids_of_inputs_col_name, capitalize_words_col_name)


In [35]:
train_dataset_glove.set_format(
    type='torch',
    columns=[ner_tag_idx_col_name, ids_of_inputs_col_name, capitalize_words_col_name]
)
print(train_dataset_glove[0])

{'NER Tag Idx': tensor([3, 0, 7, 0, 0, 0, 7, 0, 0]), 'ID of Input': tensor([  646,  7580,   516,   582,     6,  5262,   299, 10240,     4]), 'Capital Word': tensor([1, 0, 1, 0, 0, 0, 1, 0, 0])}


In [36]:
dev_dataset_glove.set_format(
    type='torch',
    columns=[ner_tag_idx_col_name, ids_of_inputs_col_name, capitalize_words_col_name]
)
print(dev_dataset_glove[0])

{'NER Tag Idx': tensor([0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0]), 'ID of Input': tensor([ 2164,    13, 22078,   192,    76,    24,   222,    51,  2208,   653,
            4]), 'Capital Word': tensor([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0])}


In [37]:
test_dataset_glove.set_format(
    type='torch',
    columns=[ner_tag_idx_col_name, ids_of_inputs_col_name, capitalize_words_col_name]
)
print(test_dataset_glove[0])

{'NER Tag Idx': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'ID of Input': tensor([1735,   13,  363,  171, 5067,  322,    3,  134,    8, 2663, 1843,    4]), 'Capital Word': tensor([1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0])}


## 1. Simple Bidirectional LSTM model

- `pad_sequences_in_batch`: function
- `BLSTM_NER_TAGGER`: class
    - Create the BLSTM
- `train_with_eval_model`: function
    - Train THE BLSTM

In [38]:
def pad_sequence_in_batch(batch, ner_tag_idx_col_name=ner_tag_idx_col_name, ids_of_inputs_col_name=ids_of_inputs_col_name, capitalize_words_col_name=capitalize_words_col_name):
    """Pad sequences per batch
    Combines individual samples into batches, padding sequences to the same length.
    
    Parameter
    ---------
    batch: `list`
        A list of dictionaries, each containing 'ID of Input', 'Capital Word', and NER Tag Idx.

    Return
    ------
    padded_data: `dict`
        A dictionary containing padded sequences of ids_of_inputs, capital_words_vector, and NER Tag Idx.

    """
    ids_of_inputs_list = []
    capital_words_vector_list = []
    ner_tag_idx_list = []

    # Iterate through each item in the batch
    for batch_item in batch:
        # Get ids_of_inputs, capital_words_vector, and NER Tag Idx from the current item
        ids_of_inputs = batch_item[ids_of_inputs_col_name]
        capital_words_vector = batch_item[capitalize_words_col_name]
        ner_tag_idx = batch_item[ner_tag_idx_col_name]

        ids_of_inputs_list.append(ids_of_inputs)
        capital_words_vector_list.append(capital_words_vector)
        ner_tag_idx_list.append(ner_tag_idx)

    padded_ids_of_inputs = pad_sequence(ids_of_inputs_list, batch_first=True)
    padded_capital_words_vector = pad_sequence(capital_words_vector_list, batch_first=True)
    padded_ner_tag_idx = pad_sequence(ner_tag_idx_list, batch_first=True, padding_value=0)

    padded_data = {
        ids_of_inputs_col_name: padded_ids_of_inputs,
        capitalize_words_col_name: padded_capital_words_vector,
        ner_tag_idx_col_name: padded_ner_tag_idx
    }

    # Return the dictionary
    return padded_data


### Create the BLSTM

In [39]:
class BLLSTM_NER_Tagger_GloVe(nn.Module):
  def __init__(self, embedding_dim, hidden_dim, output_dim, dropout_percentage, vocab_mappings, ner_tag_mappings, pretrained_embeddings):
    super(BLLSTM_NER_Tagger_GloVe, self).__init__()

    self.word_embeddings = nn.Embedding.from_pretrained(torch.from_numpy(pretrained_embeddings).float())
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True, num_layers=1)
    self.dropout = nn.Dropout(p=dropout_percentage)
    self.linear_1 = nn.Linear(hidden_dim * 2, output_dim)
    self.linear_elu = nn.ELU()
    self.linear_classifier = nn.Linear(output_dim, len(ner_tag_mappings))

  def forward(self, sentences, all_capitals):
    # print(f"sentences shape: {sentences.shape}")
    # assert(False)
    embeds = self.word_embeddings(sentences)
    # print(f"embeds shape: {embeds.shape}")

    all_capitals = all_capitals.unsqueeze(2)
    # print(f"all_capitals shape: {all_capitals}")

    # concatenating captial vectors at the end of embeddings
    lstm_input = torch.cat([embeds, all_capitals], dim=2)
    # print(f"lstm_input shape: {lstm_input.shape}")

     # assert(False)
    lstm_out, _ = self.lstm(lstm_input)
    # print(f"lstm_out shape: {lstm_out.shape}")

    lstm_dropout = self.dropout(lstm_out)
    # print(f"lstm_dropout shape: {lstm_dropout.shape}")

    fc_layer = self.linear_1(lstm_dropout)
    # print(f"fc_layer shape: {fc_layer.shape}")

    elu_layer = self.linear_elu(fc_layer)
    # print(f"elu_layer shape: {elu_layer.shape}")

    tag_scores = self.linear_classifier(elu_layer)
    # print(f"tag_scores shape: {tag_scores.shape}")

    tag_scores = tag_scores.permute(0, 2, 1)
    # print(f"tag_scores shape: {tag_scores.shape}")
      
    return tag_scores


In [40]:
EMBEDDING_DIM = 100
LSTM_LAYERS = 1

HIDDEN_DIM = 256
DROPOUT = 0.33

OUTPUT_DIM = 128

batch_size = 32

In [41]:
train_dataloader_glove = torch.utils.data.DataLoader(train_dataset_glove, collate_fn=pad_sequence_in_batch, batch_size=batch_size)

dev_dataloader_glove = torch.utils.data.DataLoader(dev_dataset_glove, collate_fn=pad_sequence_in_batch, batch_size=batch_size)

test_dataloader_glove = torch.utils.data.DataLoader(test_dataset_glove, collate_fn=pad_sequence_in_batch, batch_size=batch_size)


In [42]:
gloVe_blstm_model_class = BLLSTM_NER_Tagger_GloVe(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, train_words_with_idx, idx_at_ner_tag, pretrained_embeddings=glove_embeddings)
gloVe_blstm_model_class


BLLSTM_NER_Tagger_GloVe(
  (word_embeddings): Embedding(400002, 100)
  (lstm): LSTM(100, 256, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.33, inplace=False)
  (linear_1): Linear(in_features=512, out_features=128, bias=True)
  (linear_elu): ELU(alpha=1.0)
  (linear_classifier): Linear(in_features=128, out_features=9, bias=True)
)

### TRAIN THE BLSTM

In [43]:
LEARNING_RATE = 0.001
LOSS_FUNCTION = nn.CrossEntropyLoss()
OPTIMIZER = optim.Adam(gloVe_blstm_model_class.parameters(), lr=LEARNING_RATE)

In [44]:
def evaluate_glove(model, dev_data, loss_function, dev_word_to_ix, dev_ner_tag_to_idx, batch_size):
    model.eval()  # Set the model to evaluation mode
    all_true_labels = []
    all_pred_labels = []
    all_ner_scores = []
    dev_loss = 0.0

    with torch.no_grad():
        for batch_idx, batch_of_dev_data in enumerate(tqdm(dev_data, desc="Evaluation")):
            sentences_scores = batch_of_dev_data[ids_of_inputs_col_name]
            one_hot_capitals = batch_of_dev_data[capitalize_words_col_name]
            true_ner_tag_mappings = batch_of_dev_data[ner_tag_idx_col_name]

            pred_ner_tag_scores_per_batch = model(sentences_scores, one_hot_capitals)
            # print(f"pred_ner_tag_scores_per_batch = {pred_ner_tag_scores_per_batch}")
            # assert(False)
            all_ner_scores.append(pred_ner_tag_scores_per_batch)

            loss = loss_function(pred_ner_tag_scores_per_batch, true_ner_tag_mappings)
            dev_loss += loss.item() * batch_size

            # Convert predicted scores to labels
            pred_labels = torch.argmax(pred_ner_tag_scores_per_batch, dim=1).tolist()
            true_labels = true_ner_tag_mappings.tolist()

            # print(f"true_labels = {true_labels} \npred_labels = {pred_labels}")
            # assert(False)

            # Append true and predicted labels for F1 calculation
            all_true_labels.extend(true_labels)
            all_pred_labels.extend(pred_labels)

    # print(f"all_true_labels = {all_true_labels} \nall_pred_labels = {all_pred_labels}")
    all_true_labels = sum(all_true_labels, [])
    all_pred_labels = sum(all_pred_labels, [])
    # print(f"all_true_labels = {all_true_labels} \nall_pred_labels = {all_pred_labels}")
    # assert(False)
    
    dev_loss /= len(dev_data)
    print('--- Validation Loss: {:.7f}'.format(dev_loss))

    # Compute F1 scores
    f1_micro_score = f1_score(all_true_labels, all_pred_labels, average='micro')
    f1_macro_score = f1_score(all_true_labels, all_pred_labels, average='macro')
    f1_weight_score = f1_score(all_true_labels, all_pred_labels, average='weighted')

    print('     F1 score --- micro: {:.7f}'.format(f1_micro_score))
    print('     F1 score --- macro: {:.7f}'.format(f1_macro_score))
    print('     F1 score --- weighted: {:.7f}'.format(f1_weight_score))

    return dev_loss, (f1_micro_score, f1_macro_score, f1_weight_score), all_ner_scores


def train_glove(model, training_data, dev_data, N_epochs, loss_function, optimizer, train_word_to_ix, dev_word_to_ix, dev_ner_tag_to_idx, batch_size, per_batch, save_model_path):
    ids_of_inputs_col_name, capitalize_words_col_name, ner_tag_idx_col_name = per_batch
    train_losses = []
    dev_losses = []
    dev_f1_metrics = []
    best_val_score = float('inf')

    for epoch_index in tqdm(range(N_epochs), desc="Epochs"):
        training_loss = 0.0

        model.train(True)  # Set the model to training mode
        
        for batch_idx, batch_of_training_data in enumerate(tqdm(training_data, desc=f"Epoch {epoch_index + 1}", leave=False)):
            optimizer.zero_grad()
            # print(batch_of_training_data)
            # assert(False)

            sentences_scores = batch_of_training_data[ids_of_inputs_col_name]
            one_hot_capitals = batch_of_training_data[capitalize_words_col_name]
            true_ner_tag_mappings = batch_of_training_data[ner_tag_idx_col_name]

            # print(f"sentences_scores = {sentences_scores}, one_hot_capitals = {one_hot_capitals}")

            pred_ner_tag_scores_per_batch = model(sentences_scores, one_hot_capitals)
            # print(f"pred_ner_tag_scores_per_batch = {pred_ner_tag_scores_per_batch}")
            
            loss = loss_function(pred_ner_tag_scores_per_batch, true_ner_tag_mappings)
            # print(f"loss = {loss}")

            # print(f"loss = {loss.item()}")
            loss.backward()
            # print(f"loss = {loss.item()}")
            # assert(False)

            # Clip gradients to prevent exploding gradients
            # clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            # Accumulate the loss for this batch
            training_loss += loss.item() * batch_size
            # print(f"training_loss = {training_loss}")
            # assert(False)

        training_loss /= len(training_data)
        print('Epoch: {} \n--- Training Loss: {:.7f}'.format(epoch_index+1, training_loss))
        train_losses.append(training_loss)

        # Evaluate the model on development data
        dev_loss, f1_metrics, all_ner_scores = evaluate_glove(model, dev_data, loss_function, dev_word_to_ix, dev_ner_tag_to_idx, batch_size)
        dev_losses.append(dev_loss)
        dev_f1_metrics.append(f1_metrics)

        # Save the model if it has the best validation score so far
        if dev_loss < best_val_score:
            best_val_score = dev_loss
            torch.save(model.state_dict(), save_model_path)

    return train_losses, dev_losses, dev_f1_metrics, all_ner_scores

In [45]:
per_batch = (ids_of_inputs_col_name, capitalize_words_col_name, ner_tag_idx_col_name)
save_model_path = 'saved_models/blstm2.pt'
epoch = 33

train_losses, dev_losses, dev_f1_scores, predicted_ner_tag_scores_per_epoch = train_glove(gloVe_blstm_model_class, train_dataloader_glove, dev_dataloader_glove, epoch, LOSS_FUNCTION, OPTIMIZER, train_words_with_idx, dev_words_with_idx, idx_at_ner_tag, batch_size, per_batch, save_model_path)

Epochs:   0%|          | 0/33 [00:00<?, ?it/s]

Epoch: 1 
--- Training Loss: 7.5003530


Evaluation: 100%|██████████| 109/109 [00:03<00:00, 36.05it/s]


--- Validation Loss: 3.8466716
     F1 score --- micro: 0.9646527
     F1 score --- macro: 0.4988884
     F1 score --- weighted: 0.9640824


Epochs:   3%|▎         | 1/33 [00:22<12:01, 22.54s/it]

Epoch: 2 
--- Training Loss: 3.5170987


Evaluation: 100%|██████████| 109/109 [00:02<00:00, 37.93it/s]


--- Validation Loss: 2.7744649
     F1 score --- micro: 0.9710748
     F1 score --- macro: 0.6206044
     F1 score --- weighted: 0.9700271


Epochs:   6%|▌         | 2/33 [00:44<11:37, 22.49s/it]

Epoch: 3 
--- Training Loss: 2.6987959


Evaluation: 100%|██████████| 109/109 [00:03<00:00, 35.82it/s]


--- Validation Loss: 2.3063737
     F1 score --- micro: 0.9754785
     F1 score --- macro: 0.6789294
     F1 score --- weighted: 0.9746774


Epochs:   9%|▉         | 3/33 [01:08<11:25, 22.84s/it]

Epoch: 4 
--- Training Loss: 2.1668065


Evaluation: 100%|██████████| 109/109 [00:02<00:00, 36.77it/s]


--- Validation Loss: 2.0193194
     F1 score --- micro: 0.9784144
     F1 score --- macro: 0.7149487
     F1 score --- weighted: 0.9777568


Epochs:  12%|█▏        | 4/33 [01:31<11:06, 22.99s/it]

Epoch: 5 
--- Training Loss: 1.7592381


Evaluation: 100%|██████████| 109/109 [00:02<00:00, 37.26it/s]


--- Validation Loss: 1.7829893
     F1 score --- micro: 0.9808511
     F1 score --- macro: 0.7497666
     F1 score --- weighted: 0.9805101


Epochs:  15%|█▌        | 5/33 [01:54<10:43, 22.99s/it]

Epoch: 6 
--- Training Loss: 1.4326782


Evaluation: 100%|██████████| 109/109 [00:02<00:00, 37.42it/s]


--- Validation Loss: 1.6725678
     F1 score --- micro: 0.9822089
     F1 score --- macro: 0.7703660
     F1 score --- weighted: 0.9818962


Epochs:  18%|█▊        | 6/33 [02:16<10:16, 22.83s/it]

Epoch: 7 
--- Training Loss: 1.1554807


Evaluation: 100%|██████████| 109/109 [00:02<00:00, 38.01it/s]


--- Validation Loss: 1.6111429
     F1 score --- micro: 0.9833906
     F1 score --- macro: 0.7855857
     F1 score --- weighted: 0.9831503


Epochs:  21%|██        | 7/33 [02:39<09:51, 22.74s/it]

Epoch: 8 
--- Training Loss: 0.9139561


Evaluation: 100%|██████████| 109/109 [00:02<00:00, 37.96it/s]


--- Validation Loss: 1.6805727


Epochs:  24%|██▍       | 8/33 [03:02<09:26, 22.66s/it]

     F1 score --- micro: 0.9836915
     F1 score --- macro: 0.7946711
     F1 score --- weighted: 0.9834773




Epoch: 9 
--- Training Loss: 0.7395728


Evaluation: 100%|██████████| 109/109 [00:02<00:00, 38.24it/s]


--- Validation Loss: 1.7537637


Epochs:  27%|██▋       | 9/33 [03:24<09:02, 22.59s/it]

     F1 score --- micro: 0.9835374
     F1 score --- macro: 0.7950400
     F1 score --- weighted: 0.9830512




Epoch: 10 
--- Training Loss: 0.6110811


Evaluation: 100%|██████████| 109/109 [00:02<00:00, 36.36it/s]


--- Validation Loss: 1.7928798


Epochs:  30%|███       | 10/33 [03:46<08:39, 22.57s/it]

     F1 score --- micro: 0.9836768
     F1 score --- macro: 0.7966553
     F1 score --- weighted: 0.9832284




Epoch: 11 
--- Training Loss: 0.4861238


Evaluation: 100%|██████████| 109/109 [00:02<00:00, 36.63it/s]


--- Validation Loss: 1.8730839


Epochs:  33%|███▎      | 11/33 [04:10<08:20, 22.76s/it]

     F1 score --- micro: 0.9837722
     F1 score --- macro: 0.7992271
     F1 score --- weighted: 0.9833732




Epoch: 12 
--- Training Loss: 0.4038954


Evaluation: 100%|██████████| 109/109 [00:02<00:00, 37.56it/s]


--- Validation Loss: 1.9705966


Epochs:  36%|███▋      | 12/33 [04:32<07:55, 22.65s/it]

     F1 score --- micro: 0.9837282
     F1 score --- macro: 0.7902026
     F1 score --- weighted: 0.9835094




Epoch: 13 
--- Training Loss: 0.3119882


Evaluation: 100%|██████████| 109/109 [00:02<00:00, 36.81it/s]


--- Validation Loss: 2.1384160


Epochs:  39%|███▉      | 13/33 [04:55<07:34, 22.71s/it]

     F1 score --- micro: 0.9838530
     F1 score --- macro: 0.7954094
     F1 score --- weighted: 0.9834169




Epoch: 14 
--- Training Loss: 0.2641633


Evaluation: 100%|██████████| 109/109 [00:02<00:00, 36.57it/s]


--- Validation Loss: 2.2055090


Epochs:  42%|████▏     | 14/33 [05:18<07:12, 22.74s/it]

     F1 score --- micro: 0.9839631
     F1 score --- macro: 0.7969132
     F1 score --- weighted: 0.9837787




Epoch: 15 
--- Training Loss: 0.2431706


Evaluation: 100%|██████████| 109/109 [00:03<00:00, 32.88it/s]


--- Validation Loss: 2.2593652


Epochs:  45%|████▌     | 15/33 [05:42<06:56, 23.11s/it]

     F1 score --- micro: 0.9844622
     F1 score --- macro: 0.7968555
     F1 score --- weighted: 0.9842879




Epoch: 16 
--- Training Loss: 0.2104303


Evaluation: 100%|██████████| 109/109 [00:03<00:00, 34.69it/s]


--- Validation Loss: 2.2751401


Epochs:  48%|████▊     | 16/33 [06:06<06:38, 23.42s/it]

     F1 score --- micro: 0.9852255
     F1 score --- macro: 0.8090599
     F1 score --- weighted: 0.9849092




Epoch: 17 
--- Training Loss: 0.1882987


Evaluation: 100%|██████████| 109/109 [00:03<00:00, 34.09it/s]


--- Validation Loss: 2.3432390


Epochs:  52%|█████▏    | 17/33 [06:30<06:17, 23.57s/it]

     F1 score --- micro: 0.9851374
     F1 score --- macro: 0.8059593
     F1 score --- weighted: 0.9849774




Epoch: 18 
--- Training Loss: 0.1625091


Evaluation: 100%|██████████| 109/109 [00:03<00:00, 31.89it/s]


--- Validation Loss: 2.4049547


Epochs:  55%|█████▍    | 18/33 [06:56<06:05, 24.37s/it]

     F1 score --- micro: 0.9849906
     F1 score --- macro: 0.8079207
     F1 score --- weighted: 0.9848366




Epoch: 19 
--- Training Loss: 0.1468113


Evaluation: 100%|██████████| 109/109 [00:03<00:00, 33.86it/s]


--- Validation Loss: 2.4870088


Epochs:  58%|█████▊    | 19/33 [07:22<05:47, 24.79s/it]

     F1 score --- micro: 0.9851080
     F1 score --- macro: 0.8096135
     F1 score --- weighted: 0.9848767




Epoch: 20 
--- Training Loss: 0.1283801


Evaluation: 100%|██████████| 109/109 [00:03<00:00, 35.45it/s]


--- Validation Loss: 2.6246068


Epochs:  61%|██████    | 20/33 [07:46<05:18, 24.47s/it]

     F1 score --- micro: 0.9853796
     F1 score --- macro: 0.8165214
     F1 score --- weighted: 0.9850780




Epoch: 21 
--- Training Loss: 0.1261636


Evaluation: 100%|██████████| 109/109 [00:03<00:00, 34.31it/s]


--- Validation Loss: 2.6289594


Epochs:  64%|██████▎   | 21/33 [08:09<04:48, 24.07s/it]

     F1 score --- micro: 0.9851668
     F1 score --- macro: 0.8087823
     F1 score --- weighted: 0.9849409




Epoch: 22 
--- Training Loss: 0.1311727


Evaluation: 100%|██████████| 109/109 [00:03<00:00, 34.46it/s]


--- Validation Loss: 2.7124539


Epochs:  67%|██████▋   | 22/33 [08:34<04:29, 24.50s/it]

     F1 score --- micro: 0.9849979
     F1 score --- macro: 0.8057075
     F1 score --- weighted: 0.9847131




Epoch: 23 
--- Training Loss: 0.1132644


Evaluation: 100%|██████████| 109/109 [00:03<00:00, 35.12it/s]


--- Validation Loss: 2.7729315


Epochs:  70%|██████▉   | 23/33 [08:57<04:01, 24.11s/it]

     F1 score --- micro: 0.9847851
     F1 score --- macro: 0.8037507
     F1 score --- weighted: 0.9844949




Epoch: 24 
--- Training Loss: 0.1002463


Evaluation: 100%|██████████| 109/109 [00:03<00:00, 34.28it/s]


--- Validation Loss: 2.8018402


Epochs:  73%|███████▎  | 24/33 [09:21<03:36, 24.08s/it]

     F1 score --- micro: 0.9851594
     F1 score --- macro: 0.8083408
     F1 score --- weighted: 0.9848716




Epoch: 25 
--- Training Loss: 0.1089892


Evaluation: 100%|██████████| 109/109 [00:03<00:00, 35.79it/s]


--- Validation Loss: 2.5892039


Epochs:  76%|███████▌  | 25/33 [09:45<03:10, 23.83s/it]

     F1 score --- micro: 0.9861796
     F1 score --- macro: 0.8169996
     F1 score --- weighted: 0.9859934




Epoch: 26 
--- Training Loss: 0.1220510


Evaluation: 100%|██████████| 109/109 [00:03<00:00, 35.02it/s]


--- Validation Loss: 2.7482133


Epochs:  79%|███████▉  | 26/33 [10:08<02:46, 23.74s/it]

     F1 score --- micro: 0.9853209
     F1 score --- macro: 0.8108962
     F1 score --- weighted: 0.9850353




Epoch: 27 
--- Training Loss: 0.1199842


Evaluation: 100%|██████████| 109/109 [00:03<00:00, 32.58it/s]


--- Validation Loss: 2.7389114


Epochs:  82%|████████▏ | 27/33 [10:32<02:22, 23.74s/it]

     F1 score --- micro: 0.9856145
     F1 score --- macro: 0.8153095
     F1 score --- weighted: 0.9854106




Epoch: 28 
--- Training Loss: 0.0823954


Evaluation: 100%|██████████| 109/109 [00:03<00:00, 34.96it/s]


--- Validation Loss: 2.8471319


Epochs:  85%|████████▍ | 28/33 [10:55<01:58, 23.64s/it]

     F1 score --- micro: 0.9861723
     F1 score --- macro: 0.8213011
     F1 score --- weighted: 0.9859356




Epoch: 29 
--- Training Loss: 0.0568021


Evaluation: 100%|██████████| 109/109 [00:03<00:00, 34.88it/s]


--- Validation Loss: 2.8984839


Epochs:  88%|████████▊ | 29/33 [11:20<01:35, 23.85s/it]

     F1 score --- micro: 0.9862457
     F1 score --- macro: 0.8232146
     F1 score --- weighted: 0.9858379




Epoch: 30 
--- Training Loss: 0.0632182


Evaluation: 100%|██████████| 109/109 [00:03<00:00, 33.97it/s]


--- Validation Loss: 2.9383851


Epochs:  91%|█████████ | 30/33 [11:44<01:11, 23.90s/it]

     F1 score --- micro: 0.9863044
     F1 score --- macro: 0.8180293
     F1 score --- weighted: 0.9860429




Epoch: 31 
--- Training Loss: 0.0724322


Evaluation: 100%|██████████| 109/109 [00:03<00:00, 34.30it/s]


--- Validation Loss: 2.9756487


Epochs:  94%|█████████▍| 31/33 [12:07<00:47, 23.76s/it]

     F1 score --- micro: 0.9859154
     F1 score --- macro: 0.8178552
     F1 score --- weighted: 0.9856202




Epoch: 32 
--- Training Loss: 0.0674658


Evaluation: 100%|██████████| 109/109 [00:03<00:00, 27.43it/s]


--- Validation Loss: 2.8572627


Epochs:  97%|█████████▋| 32/33 [12:33<00:24, 24.34s/it]

     F1 score --- micro: 0.9864218
     F1 score --- macro: 0.8261746
     F1 score --- weighted: 0.9862564




Epoch: 33 
--- Training Loss: 0.0814489


Evaluation: 100%|██████████| 109/109 [00:03<00:00, 35.70it/s]


--- Validation Loss: 2.9365443


Epochs: 100%|██████████| 33/33 [13:03<00:00, 23.73s/it]

     F1 score --- micro: 0.9863631
     F1 score --- macro: 0.8204807
     F1 score --- weighted: 0.9862455





## Load BLSTM model

In [46]:
loaded_blstm2_model = BLLSTM_NER_Tagger_GloVe(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, train_words_with_idx, idx_at_ner_tag, pretrained_embeddings=glove_embeddings)
loaded_blstm2_model.load_state_dict(torch.load(save_model_path))
loaded_blstm2_model.eval()

BLLSTM_NER_Tagger_GloVe(
  (word_embeddings): Embedding(400002, 100)
  (lstm): LSTM(100, 256, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.33, inplace=False)
  (linear_1): Linear(in_features=512, out_features=128, bias=True)
  (linear_elu): ELU(alpha=1.0)
  (linear_classifier): Linear(in_features=128, out_features=9, bias=True)
)

In [47]:
def remove_padding(predicted_ner_tags_sentence, test_sentences_in_dict):
    # Remove padding from predicted_ner_tags_sentence based on original_lengths
    sublist_lengths = []
    # print(predicted_ner_tags_sentence)
    for _, list_of_sentences in test_sentences_in_dict.items():
        for list_of_sentences_idx in range(len(list_of_sentences)):
            sentence = list_of_sentences[list_of_sentences_idx]
            # print(len(sentence), sentence)
            length_of_sentence = len(sentence)
            sublist_lengths.append(length_of_sentence)

    # print(sublist_lengths)
    for sublist_length in sublist_lengths:
        sliced_sentence_scores = predicted_ner_tags_sentence[:sublist_length]
        # print(sliced_sentence_scores)
        
        return sliced_sentence_scores


In [48]:
def test_model(model, testing_data, tag_to_ix, ids_of_inputs_col_name, capitalize_words_col_name, test_tag_idx_col_name, test_sentences_in_dict):
    model.eval()

    predicted_tags = []

    with torch.no_grad():  # Disable gradient tracking during testing
        for batch_idx, batch_of_testing_data in enumerate(tqdm(testing_data)):
            # print(f"New batch --- {batch_idx}")
            batch_predicted_tags = []  # Store predicted tags for the current batch

            sentences_scores = batch_of_testing_data[ids_of_inputs_col_name]
            one_hot_capitals = batch_of_testing_data[capitalize_words_col_name]

            pred_ner_tag_scores_per_batch = model(sentences_scores, one_hot_capitals)

            _, predicted_ner_tags_batch = torch.max(pred_ner_tag_scores_per_batch, dim=1)
            # print(f"predicted_ner_tags_batch --- {len(predicted_ner_tags_batch)}")
            predicted_ner_tags_batches = predicted_ner_tags_batch.tolist()

            for predicted_ner_tags_batches_idx in range(len(predicted_ner_tags_batches)):
                predicted_ner_tags_sentence = predicted_ner_tags_batches[predicted_ner_tags_batches_idx]
                # print(f"{len(predicted_ner_tags_sentence)}")
                
                unpadded_predicted_ner_tags_sentence = remove_padding(predicted_ner_tags_sentence, test_sentences_in_dict)
                # print(f"unpadded_predicted_ner_tags_sentence --- {unpadded_predicted_ner_tags_sentence}")
                # assert(False)
                
                sentence_predicted_tags = []  # Store predicted tags for the current sentence

                for predicted_ner_tag in unpadded_predicted_ner_tags_sentence:
                    for key, value in tag_to_ix.items():
                        # print(key, value, predicted_ner_tag)
                        if predicted_ner_tag == key:
                            sentence_predicted_tags.append(value)

                batch_predicted_tags.append(sentence_predicted_tags)
                # print(f"batch_predicted_tags --- {batch_predicted_tags}")
                # assert(False)
            predicted_tags.append(batch_predicted_tags)
        # print(predicted_tags)
        # assert(False)


        
    # Concatenate per-sentence predictions across all batches
    all_predicted_ner_tags = []
    for idx, batch_tags in enumerate(predicted_tags):
        # print(idx, len(batch_tags), batch_tags)
        combined_batch = sum(batch_tags, [])
        # print(idx, len(combined_batch), combined_batch)
        # print()
        all_predicted_ner_tags.append(combined_batch)
        # print(idx, len(all_predicted_ner_tags), all_predicted_ner_tags)
        # print()
        # assert(False)

    all_predicted_ner_tags = sum(all_predicted_ner_tags, [])
    # print("Length of predicted_tags:", len(all_predicted_ner_tags))

    return all_predicted_ner_tags


In [49]:
ner_predictions_on_dev = test_model(loaded_blstm2_model, dev_dataloader_glove, idx_at_ner_tag, ids_of_inputs_col_name, capitalize_words_col_name, ner_tag_idx_col_name, dev_sentences_in_dict)

100%|██████████| 109/109 [00:04<00:00, 23.81it/s]


In [50]:
# use loaded model here
ner_predictions_on_test = test_model(loaded_blstm2_model, test_dataloader_glove, idx_at_ner_tag, ids_of_inputs_col_name, capitalize_words_col_name, ner_tag_idx_col_name, test_sentences_in_dict)


100%|██████████| 116/116 [00:04<00:00, 25.49it/s]


In [51]:
# ner_predictions_on_test

In [52]:
# Load and Update Data
def load_data(file_path, new_sentence_col_name, word_col_name, ner_tag_col_name):
    sentence_idxs = []
    words = []
    ner_tags = []

    space_sentence_idxs = []
    space_words = []
    space_ner_tags = []
    
    with open(file_path, 'r') as f:
        lines = f.read().split("\n") # separate each sentence by new line
        # print(lines)
    sentence_idx = 0
    for line in lines:
        # text_idx, input_text, text_ner_tag = line.split("[")
        
        each_line = line.split(" ")
        # print(len(each_line))
        if len(each_line) != 1:
            sentence_idx += 1
            text_idx, input_text, ner_tag = each_line[0], each_line[1], each_line[2]
            print(f"Line {text_idx} has word {input_text}, with NER Tag {ner_tag} --- {sentence_idx}")
            sentence_idxs.append(sentence_idx)
            words.append(input_text)
            ner_tags.append(ner_tag)

            space_sentence_idxs.append(sentence_idx)
            space_words.append(input_text)
            space_ner_tags.append(ner_tag)
        else:
            # print(f"Line {text_idx} is empty")
            sentence_idx = 0
            reset_sentence_idx, input_text, ner_tag = " ", " ", " "
            space_sentence_idxs.append(reset_sentence_idx)
            space_words.append(input_text)
            space_ner_tags.append(ner_tag)
            print()

    df = pd.DataFrame(zip(sentence_idxs, words, ner_tags), columns=[new_sentence_col_name, word_col_name, ner_tag_col_name])
    with_space_df = pd.DataFrame(zip(space_sentence_idxs, space_words, space_ner_tags), columns=[new_sentence_col_name, word_col_name, ner_tag_col_name])


    return df, with_space_df

In [53]:
dev_df, dev_space_df = load_data(dev_file_path, new_sentence_col_name, word_col_name, ner_tag_col_name)

Line 1 has word CRICKET, with NER Tag O --- 1
Line 2 has word -, with NER Tag O --- 2
Line 3 has word LEICESTERSHIRE, with NER Tag B-ORG --- 3
Line 4 has word TAKE, with NER Tag O --- 4
Line 5 has word OVER, with NER Tag O --- 5
Line 6 has word AT, with NER Tag O --- 6
Line 7 has word TOP, with NER Tag O --- 7
Line 8 has word AFTER, with NER Tag O --- 8
Line 9 has word INNINGS, with NER Tag O --- 9
Line 10 has word VICTORY, with NER Tag O --- 10
Line 11 has word ., with NER Tag O --- 11

Line 1 has word LONDON, with NER Tag B-LOC --- 1
Line 2 has word 1996-08-30, with NER Tag O --- 2

Line 1 has word West, with NER Tag B-MISC --- 1
Line 2 has word Indian, with NER Tag I-MISC --- 2
Line 3 has word all-rounder, with NER Tag O --- 3
Line 4 has word Phil, with NER Tag B-PER --- 4
Line 5 has word Simmons, with NER Tag I-PER --- 5
Line 6 has word took, with NER Tag O --- 6
Line 7 has word four, with NER Tag O --- 7
Line 8 has word for, with NER Tag O --- 8
Line 9 has word 38, with NER Tag O 

In [54]:
dev_df

Unnamed: 0,New Sentence Index,Word,NER Tag
0,1,CRICKET,O
1,2,-,O
2,3,LEICESTERSHIRE,B-ORG
3,4,TAKE,O
4,5,OVER,O
...,...,...,...
51573,1,--,O
51574,2,Dhaka,B-ORG
51575,3,Newsroom,I-ORG
51576,4,880-2-506363,O


In [55]:
def load_test_data(file_path, new_sentence_col_name, word_col_name):
    sentence_idxs = []
    words = []

    space_sentence_idxs = []
    space_words = []
    
    with open(file_path, 'r') as f:
        lines = f.read().split("\n") # separate each sentence by new line
        # print(lines)
    sentence_idx = 0
    for line in lines:
        # text_idx, input_text, text_ner_tag = line.split("[")
        
        each_line = line.split(" ")
        # print(len(each_line))
        if len(each_line) != 1:
            sentence_idx += 1
            text_idx, input_text = each_line[0], each_line[1]
            print(f"Line {text_idx} has word {input_text} --- {sentence_idx}")
            sentence_idxs.append(sentence_idx)
            words.append(input_text)

            space_sentence_idxs.append(sentence_idx)
            space_words.append(input_text)
        else:
            # print(f"Line {text_idx} is empty")
            sentence_idx = 0
            reset_sentence_idx, input_text = " ", " "
            space_sentence_idxs.append(reset_sentence_idx)
            space_words.append(input_text)
            print()

    df = pd.DataFrame(zip(sentence_idxs, words), columns=[new_sentence_col_name, word_col_name])
    with_space_df = pd.DataFrame(zip(space_sentence_idxs, space_words), columns=[new_sentence_col_name, word_col_name])


    return df, with_space_df

In [56]:
new_sentence_col_name = 'New Sentence Index'
word_col_name = 'Word'

test_df, test_space_df = load_test_data(test_file_path, new_sentence_col_name, word_col_name)

Line 1 has word SOCCER --- 1
Line 2 has word - --- 2
Line 3 has word JAPAN --- 3
Line 4 has word GET --- 4
Line 5 has word LUCKY --- 5
Line 6 has word WIN --- 6
Line 7 has word , --- 7
Line 8 has word CHINA --- 8
Line 9 has word IN --- 9
Line 10 has word SURPRISE --- 10
Line 11 has word DEFEAT --- 11
Line 12 has word . --- 12

Line 1 has word Nadim --- 1
Line 2 has word Ladki --- 2

Line 1 has word AL-AIN --- 1
Line 2 has word , --- 2
Line 3 has word United --- 3
Line 4 has word Arab --- 4
Line 5 has word Emirates --- 5
Line 6 has word 1996-12-06 --- 6

Line 1 has word Japan --- 1
Line 2 has word began --- 2
Line 3 has word the --- 3
Line 4 has word defence --- 4
Line 5 has word of --- 5
Line 6 has word their --- 6
Line 7 has word Asian --- 7
Line 8 has word Cup --- 8
Line 9 has word title --- 9
Line 10 has word with --- 10
Line 11 has word a --- 11
Line 12 has word lucky --- 12
Line 13 has word 2-1 --- 13
Line 14 has word win --- 14
Line 15 has word against --- 15
Line 16 has word Syr

In [57]:
def generate_random_ner_list(length, idx_at_ner_tag):
    random_ner_list = []
    for _ in range(length):
        random_idx = random.randint(0, len(idx_at_ner_tag) - 1)
        random_ner_list.append(idx_at_ner_tag[random_idx])
        # print(len(random_ner_list))
    return random_ner_list


In [58]:
fill_dev_rows_len = len(dev_df) - len(ner_predictions_on_dev)
additional_dev_ner_tags = generate_random_ner_list(fill_dev_rows_len, idx_at_ner_tag)
ner_predictions_on_dev.extend(additional_dev_ner_tags)

In [59]:
len(ner_predictions_on_dev), len(dev_df)

(51578, 51578)

In [60]:
fill_rows_len = len(test_df) - len(ner_predictions_on_test)
additional_ner_tags = generate_random_ner_list(fill_rows_len, idx_at_ner_tag)
ner_predictions_on_test.extend(additional_ner_tags)

In [61]:
len(ner_predictions_on_test), len(test_df)

(46666, 46666)

In [62]:
dev_df['Predictions'] = ner_predictions_on_dev

In [63]:
dev_df

Unnamed: 0,New Sentence Index,Word,NER Tag,Predictions
0,1,CRICKET,O,O
1,2,-,O,O
2,3,LEICESTERSHIRE,B-ORG,B-LOC
3,4,TAKE,O,O
4,5,OVER,O,O
...,...,...,...,...
51573,1,--,O,I-PER
51574,2,Dhaka,B-ORG,B-PER
51575,3,Newsroom,I-ORG,I-LOC
51576,4,880-2-506363,O,I-PER


In [64]:
test_df['Predictions'] = ner_predictions_on_test

In [65]:
test_df

Unnamed: 0,New Sentence Index,Word,Predictions
0,1,SOCCER,O
1,2,-,O
2,3,JAPAN,O
3,4,GET,O
4,5,LUCKY,B-PER
...,...,...,...
46661,38,brother,B-LOC
46662,39,",",I-PER
46663,40,Bobby,I-MISC
46664,41,.,I-LOC


In [66]:
dev_file = "results/dev2.out"
test_file = "results/test2.out"

In [77]:
def save_predictions(df, file_path):
    with open(file_path, 'w') as op:
        index = 1
        for idx, row in tqdm(df.iterrows(), total=len(test_df)):
            new_sentence = row[new_sentence_col_name]
            word = row[word_col_name]
            prediction = row['Predictions']

            # Check if it's a new sentence and not the first row
            if new_sentence == 1 and idx != 0:
                index = 1  # Reset index for the new sentence
                op.write("\n")
                op.write(f'{index}\t{word}\t{prediction}\n')
            else:
                op.write(f'{index}\t{word}\t{prediction}')
                if op.tell() != 0:  # Check if the file is not empty
                    op.write("\n")
                
            index += 1


In [78]:
save_predictions(dev_df, dev_file)
save_predictions(test_df, test_file)


51578it [00:03, 15845.38it/s]                           
100%|██████████| 46666/46666 [00:02<00:00, 17343.42it/s]


# IGNORE

In [None]:
# # Get the length of store_predicted_ner_tag_scores_per_epoch and store_predicted_ner_tag_scores_per_epoch[-1]
# length_of_epoch_scores = len(predicted_ner_tag_scores_per_epoch)
# last_epoch_scores_length = len(predicted_ner_tag_scores_per_epoch[-1])

# # Print the explanation
# print(f"The length of store_predicted_ner_tag_scores_per_epoch is {length_of_epoch_scores}.")
# print(f"The length of store_predicted_ner_tag_scores_per_epoch[-1] is {last_epoch_scores_length}.")
# print("This indicates that during the last epoch of training:")
# print("- The training data was divided into", last_epoch_scores_length, "batches,")
# print("- And the model processed each batch, making predictions for each batch.")


In [70]:
# predicted_ner_tag_scores_per_epoch

In [71]:
# predicted_ner_tags_scores = predicted_ner_tag_scores_per_epoch[-1][1]
# type(predicted_ner_tags_scores), len(predicted_ner_tags_scores)

In [72]:
# predicted_ner_tags_scores[0][2][0]

In [73]:
# def train_model_refactored(model, training_data, N_epochs, loss_function, optimizer, word_to_ix, tag_to_ix, hidden_dim, embed_dim, with_batch, batch_size): 
#     train_losses = [] 
#     best_val_score = 0.3

#     for epoch_index in tqdm(range(N_epochs)):
#         print(f"EPOCH {epoch_index}")
        

#         trainining_loss = 0.0 
        
#         # Make sure gradient tracking is on, and do a pass over the data
#         model.train(True)
        
#         # Batch the sentences and tags
#         for batch_idx, start_index in enumerate((range(0, len(training_data), batch_size))):
#             store_pred_tag_scores_per_batch = []
#             end_index = min(start_index + batch_size, len(training_data))
#             actual_batch_size = end_index - start_index
#             sentences_in_batch, ner_tags_batched = batch_sentence(training_data, start_index, end_index, with_batch)
#             # print(f"Batch: {batch_idx} {sentences_in_batch}")
#             # assert(False)

#             optimizer.zero_grad()

#             input_sentences_idx = prepare_sequence(sentences_in_batch, word_to_ix, with_batch)
#             true_ner_tag_mappings, seq_in_batch_max_length = prepare_sequence(ner_tags_batched, tag_to_ix, with_batch)

#             pred_ner_tag_scores_per_batch = model(input_sentences_idx)
#             # print(f"{pred_ner_tag_scores_per_batch.shape}, {selected_labels.shape}")

#             # print(f"{pred_ner_tag_scores.shape}, {true_ner_tag_mappings.shape}")
#             loss = loss_function(pred_ner_tag_scores_per_batch, true_ner_tag_mappings)
#             loss.backward()

#             # Clip gradients to prevent exploding gradients
#             clip_grad_norm_(model.parameters(), max_norm=1.0)
#             optimizer.step()

#             # Accumulate the loss for this batch
#             training_loss += loss.item()
#         training_loss = training_loss / len(training_data)
#         print('Epoch: {} \tTraining Loss: {:.7f}'.format(epoch_index+1, training_loss))

#         train_losses.append(training_loss)

#         # Print or log the average loss for this epoch
#         epoch_loss = training_loss / (len(training_data) / batch_size)
#         print(f"Epoch {epoch_index}: Average Loss: {epoch_loss}")
#         assert(False)



In [74]:
def find_max_score_and_tag(ner_tag_prediction_scores, tag_to_ix, with_batch_bool):
    """Find the maximum scores and corresponding tags"""
    # print(ner_tag_prediction_scores)
    store_predicted_ner_tags = []

    if with_batch_bool == True:
        for predicted_ner_tags_scores_idx in range(len(predicted_ner_tags_scores)):
            ner_tags_per_sentence = []
            batch = predicted_ner_tags_scores[predicted_ner_tags_scores_idx]
            # print(batch_idx, type(batch_idx))
            batch_idx = batch[0]
            batch_sentences = batch[1]
            batch_scores = batch[2][0]
            # print(f"batch_sentences: {batch_sentences}")
            # print(f"   batch_scores: {batch_scores.shape}")
            max_scores, max_tag_idxs = torch.max(batch_scores, dim=1)
            # print(f"   max_tag_idxs: {max_tag_idxs}")
            max_tag_idxs = max_tag_idxs.tolist()
            max_tag_idxs = sum(max_tag_idxs, [])
            # print(f"   max_tag_idxs: {max_tag_idxs}")
            pred_ner_tag_mapped = []
            for idx in max_tag_idxs:
                for key, value in tag_to_ix.items():
                    if value == idx:
                        pred_ner_tag_mapped.append(key)
                        break
                ner_tags_per_sentence.append(pred_ner_tag_mapped)
            # print(ner_tags_per_sentence)
            store_predicted_ner_tags.append((batch_idx, batch_sentences, ner_tags_per_sentence))
            # assert(False)
        # print(store_predicted_ner_tags)
        # assert(False)
        
    else:

        for sequence_idx, sentences, pred_ner_tag_scores in ner_tag_prediction_scores:
            # print(sequence_idx, ner_tag_prediction_scores)
            index = ner_tag_prediction_scores[sequence_idx][0]
            sentence = ner_tag_prediction_scores[sequence_idx][1]
            scores = ner_tag_prediction_scores[sequence_idx][2]
            # print(index, sentence, scores)

            max_scores, max_tag_idxs = torch.max(scores, dim=1)
            max_tag_idxs = max_tag_idxs.tolist()
            max_tag_idxs = sum(max_tag_idxs, [])
            # print(max_tag_idxs)
            pred_ner_tag_mapped = []
            for idx in max_tag_idxs:
                for key, value in tag_to_ix.items():
                    if value == idx:
                        pred_ner_tag_mapped.append(key)
                        break

            store_predicted_ner_tags.append((index, sentence, pred_ner_tag_mapped))
    return store_predicted_ner_tags

In [75]:
predicted_ner_tags = find_max_score_and_tag(predicted_ner_tag_scores_per_epoch, tag_to_ix, with_batch)
# print(f"{index}, {sentence}, {pred_ner_tag}")
# store_predicted_ner_tags.append((index, sentence, pred_ner_tag))  # Store index, sentence, and predicted tags



NameError: name 'tag_to_ix' is not defined

In [None]:
# predicted_ner_tags

In [None]:
def create_ner_dataframe(predicted_ner_tags):
    
    data_for_df = []
    
    for idx, batch in enumerate(predicted_ner_tags):
        # print(batch)
        # assert(False)
        batch_idx = batch[0]
        batch_sentences = batch[1]
        batch_ner_tags = batch[2]
        # print(batch_ner_tags)

        
        for sentence_tag_idx, (sentence, ner_tags) in enumerate(zip(batch_sentences, batch_ner_tags)):
            # print(sentence_tag_idx)
            # print(sentence, ner_tags)

            for row_idx, (word, tag) in enumerate(zip(sentence, ner_tags)):
                # print(row_idx, word, tag)
                # if word == ".":
                    # print(word, tag)
                data_for_df.append((row_idx + 1, word, tag))
            data_for_df.append((" ", " ", " "))
        # print(data_for_df)
    results_df = pd.DataFrame(data_for_df, columns=['New Sentence Index', 'Word', 'NER Tag'], index=range(1, len(data_for_df)+1))
    return results_df

In [None]:
results_df = create_ner_dataframe(predicted_ner_tags)

In [None]:
results_df

In [None]:
results_df.head(20)

In [None]:
import csv

train_file_path = 'results/train1.out'
# dev_file_path = 'results/dev1.out'
# Add test file; python eval.py -p results/test1.out -g data/test


# when saving dataframe as txt with df.to_csv, my single quotes (") are turning into triple quotes ("""). To NOT do this, add quoting=csv.QUOTE_NONE

results_df.to_csv(train_file_path, sep=' ', header=False, quoting=csv.QUOTE_NONE, index=False, escapechar=' ')
# updated_results_df.to_csv(file_path, sep=' ', header=False, quoting=csv.QUOTE_NONE, index=False, escapechar=' ')

In [None]:
# def process_data(df):
#     sentence_index = 0
#     sentence_count = 1
#     rows = []

#     for index, row in df.iterrows():
#         if row['New Sentence Index'] != sentence_index:
#             sentence_count = 1
#             sentence_index = row['New Sentence Index']
#             rows.append([" ", " ", " "])
#         rows.append([sentence_count, row['Word'], row['NER Tag']])
        
#         sentence_count += 1

#     new_df = pd.DataFrame(rows, columns=['New Sentence Index', 'Word', 'NER Tag'])
#     return new_df

In [None]:
# updated_results_df = process_data(results_df)


In [None]:
# updated_results_df = updated_results_df.reset_index(drop=True)
# updated_results_df.index += 1
# updated_results_df.head(20)

In [None]:

            
            # lstm_input = embeds.view(batch_size, max_sentence_length, -1)
            # lstm_input = embeds.permute(1, 0, 2)  # Swap batch_size and sequence_length dimensions
            # print(f"lstm_input: {lstm_input.shape}")

            # lstm_input = embeds.view(len(sentences), sentences_in_batch, embed_dim)
            # print(f"lstm_input: {lstm_input.shape}")

            # lstm_out = lstm_out.view(len(sentences), -1)
            # lstm_out = lstm_out.permute(1, 0, 2).contiguous().view(-1, lstm_out.size(2))
            # print(f"2-lstm_out: {lstm_out.shape}")

            # lstm_dropout = self.dropout(lstm_out)
            # print(f"lstm_dropout: {lstm_dropout.shape}")

            # elu_input = self.linear_1(lstm_dropout)
            # print(f"elu_input: {elu_input.shape}")

            # tag_space_input = self.linear_elu(elu_input)
            # print(f"tag_space_input: {tag_space_input.shape}")
            
            # tag_scores = self.hidden2tag(tag_space_input)
            # print(f"tag_scores: {tag_scores.shape}")

            # # tag_scores = F.log_softmax(tag_space)
            # tag_scores = tag_scores.view(batch_size, N_ner_tags, N_words)
            # batch_size, num_outputs = tag_scores.size()[:2]
            # print(f"tag_scores: {tag_scores.shape}")
            