# Assignment 3
Training a neural named entity recognition (NER) tagger 

In [None]:
# ! pip install alive-progress

In [34]:

import torch
import torch.nn as nn
import os 
import numpy as np
from random import shuffle
from sklearn.metrics import classification_report
from tqdm import tqdm
import pandas as pd
from google.colab import drive
from tqdm.notebook import tqdm_notebook
from IPython.display import display
import warnings
warnings.filterwarnings("ignore")

In [2]:

# general folder path
drive_path = '/content/gdrive'
drive_saving_path = '/content/gdrive/My Drive'
drive.mount(drive_path)

Mounted at /content/gdrive


In [3]:
USER = 'DRIVE'  # 'OR' / 'ROY'
if USER == 'OR':
    main_folder =main_folder = r'C:\MSC\NLP2\HW3'
elif USER == 'DRIVE' :
    main_folder = folder_path = os.path.join(drive_saving_path, 'NLP3')


In this assignment you are required to build a full training and testing pipeline for a neural sequentail tagger for named entities, using LSTM.

The dataset that you will be working on is called ReCoNLL 2003, which is a corrected version of the CoNLL 2003 dataset: https://www.clips.uantwerpen.be/conll2003/ner/

[Train data](https://drive.google.com/file/d/1hG66e_OoezzeVKho1w7ysyAx4yp0ShDz/view?usp=sharing)

[Dev data](https://drive.google.com/file/d/1EAF-VygYowU1XknZhvzMi2CID65I127L/view?usp=sharing)

[Test data](https://drive.google.com/file/d/16gug5wWnf06JdcBXQbcICOZGZypgr4Iu/view?usp=sharing)

As you can see, the annotated texts are labeled according to the IOB annotation scheme, for 3 entity types: Person, Organization, Location.

**Task 1:** Write a funtion for reading the data from a single file (of the ones that are provided above). The function recieves a filepath and then it encodes every sentence individually using a pair of lists, one list contains the words and one list contains the tags. Each list pair will be added to a general list (data), which will be returned back from the function.

## set path's

In [4]:
train_path = os.path.join(main_folder, 'connl03_train.txt')
test_path = os.path.join(main_folder, 'connl03_test.txt')
dev_path = os.path.join(main_folder, 'connl03_dev.txt')

In [5]:
def read_data(filepath):
    data = []
    with open(filepath) as file:
        words = []
        labels = []

        for index, line in enumerate(file, start=1):
            if line != '\n':
                word, label = line.split()
                words.append(word)
                labels.append(label)
            else:
                data.append((words, labels))
                words = []
                labels = []
    
    return data

train = read_data(train_path)
dev = read_data(test_path)
test = read_data(dev_path)

The following Vocab class can be served as a dictionary that maps words and tags into Ids. The UNK_TOKEN should be used for words that are not part of the training data.

In [6]:

UNK_TOKEN = 0


class Vocab:
    def __init__(self):
        """
        tag2id/id2tag  - tags to each other from label to integer number
        n_words - count the # of word in sentence
        """
        self.word2id = {"__unk__": UNK_TOKEN}
        self.id2word = {UNK_TOKEN: "__unk__"}
        self.n_words = 1
        
        self.tag2id = {"O":0, "B-PER":1, "I-PER": 2, "B-LOC": 3, "I-LOC": 4, "B-ORG": 5, "I-ORG": 6}
        self.id2tag = {0:"O", 1:"B-PER", 2:"I-PER", 3:"B-LOC", 4:"I-LOC", 5:"B-ORG", 6:"I-ORG"}
    
    
    def index_words(self, words):
        """
        for given token list get token index in sentence
        """
        word_indexes = [self.index_word(w) for w in words]
        return word_indexes


    def index_tags(self, tags):
        """
        for given label list get label index
        """
        tag_indexes = [self.tag2id[t] for t in tags]
        return tag_indexes
    

    def index_word(self, w):
        """
     
        """
        if w not in self.word2id:
            self.word2id[w] = self.n_words
            self.id2word[self.n_words] = w
            self.n_words += 1
        
        return self.word2id[w]
    

**Task 2:** Write a function prepare_data that takes one of the [train, dev, test] and the Vocab instance, for converting each pair of (words,tags) to a pair of indexes. Each pair should be added to data_sequences, which will be returned back from the function.

In [7]:
vocab = Vocab()

def prepare_data(data, vocab):
    data_sequences = []
    # TODO - your code...
    """
    this loop run on the data, for each sequence we generating tesor to
    contain the token of sequence
    """
    for i_words, i_tags in data:
        
        words_indexes_tensor = torch.tensor(vocab.index_words(i_words), dtype=torch.long)
        tags_indexes_tensor = torch.tensor(vocab.index_tags(i_tags), dtype=torch.long)
        # append data and label tensors
        data_sequences.append((words_indexes_tensor, tags_indexes_tensor))

    return data_sequences, vocab

train_sequences, vocab = prepare_data(train, vocab)
dev_sequences, vocab = prepare_data(dev, vocab)
test_sequences, vocab = prepare_data(test, vocab)

**Task 3:** Write NERNet, a PyTorch Module for labeling words with NER tags. 

*input_size:* the size of the vocabulary

*embedding_size:* the size of the embeddings

*hidden_size:* the LSTM hidden size

*output_size:* the number tags we are predicting for

*n_layers:* the number of layers we want to use in LSTM

*directions:* could 1 or 2, indicating unidirectional or bidirectional LSTM, respectively

The input for your forward function should be a single sentence tensor.

*note:* the embeddings in this section are learned embedding. That means that you don't need to use pretrained embedding like the one used in class. You will use them in part 5

In [8]:

class NERNet(nn.Module):
    
    def __init__(self, input_size, embedding_size, hidden_size, output_size, n_layers, directions):
        super(NERNet, self).__init__()
        # TODO: your code...
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers, bidirectional=(True if directions==2 else False))
        self.out = nn.Linear(hidden_size*directions, output_size)

        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.directions = directions


    def forward(self, input_sentence):
        # TODO: your code...
        
        # get sentence token numbers to understand output & input size
        dimension = len(input_sentence)
        
        # initial the hidden to None because none sentence inter
        hidden = None

        # 1. foward input sentence into the embeding
        embedded = self.embedding(input_sentence)

        # 2. foward embedding to LSTM
        lstm_output, _ = self.lstm(embedded.view(dimension, 1, -1), hidden) # The view function is meant to reshape the tensor https://stackoverflow.com/a/48650355/7786691

        # 3. foward to get predictions  - linear transformation to the incoming data
        output = self.out(lstm_output.view(dimension, -1)) 

        return output


## help function to train % evaluate 

In [9]:
def get_model_results(model, test_sequences):
    """
    

    Parameters
    ----------
    model : Torch model  - 
        DESCRIPTION: LSTM model.
    test_sequences : list
        DESCRIPTION: input list of coupels [[word_tensor, lebel_tensor] , ...]
    
    the function get model results
    
    Returns
    -------
    all_test_words_pred : list
    all_test_words_true : list
    binary_test_words_pred : list
    binary_test_words_true : list
    """
    # generate test tokens prediction
    all_test_words_pred = []
    all_test_words_true = []

    # generate test binnary prediction
    binary_test_words_pred = []
    binary_test_words_true = []
    for sentence, labels in test_sequences:
        sentence_tensor = torch.LongTensor(sentence).cuda()
        labels_tensor = torch.LongTensor(labels).cuda()

        _, pred_labels = model(sentence_tensor).T.max(0)

        all_test_words_pred += pred_labels.tolist()
        all_test_words_true += labels.tolist()

        binary_test_words_pred += [1 if i >= 1 else i for i in all_test_words_pred]
        binary_test_words_true += [1 if i >= 1 else i for i in all_test_words_true]
    return all_test_words_pred, all_test_words_true, binary_test_words_pred, binary_test_words_true

**Task 4:** write a training loop, which takes a model (instance of NERNet) and number of epochs to train on. The loss is always CrossEntropyLoss and the optimizer is always Adam.

In [30]:

def train_loop(model, n_epochs, train_sequences):
    #
    all_target_names = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]
    binary_target_names = ["O", "OTHERS"]
    
    # Loss function
    criterion = nn.CrossEntropyLoss()

    # Optimizer (ADAM is a fancy version of SGD)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
  
    # shuffle data before training phase
    shuffle(train_sequences)
    STEP = 400 
    curr_f1_accuracy_result = 0
    best_f1_accuracy_result = 0
    best_df = pd.DataFrame()
    for e in range(1, n_epochs + 1):
        # print('start ephoc #' + str(e),  flush = True)
        # TODO - your code goes here...
        """
        tqdm - add progress bar
        """
        desc = ('Ephoc #' + str(e))
        for sequence_idx in tqdm_notebook(range(train_sequences.__len__()), desc = desc):
            
            
            # get sentence tokens, and labels 
            sentence, labels = train_sequences[sequence_idx]
            
            # check if there is empty sentence
            if labels.__len__() == 0:
                continue
            
            # insert sentence tokens into tensor
            sentence_tensor = torch.LongTensor(sentence).cuda()
            
            # insert sentence labels into tensor
            labels_tensor = torch.LongTensor(labels).cuda()
            
            # Sets the gradients of all optimized to zero.
            model.zero_grad()
            
            # foward sentence to model
            scores = model(sentence_tensor)
            
            # Computes the gradient of current tensor
            criterion(scores, labels_tensor).backward()
            
            # once the gradients are computed use them to optimize model
            optimizer.step()

        
        # print('finshed ephoc #' + str(e) + ', ephoch results:' , flush = True)
        all_train_words_pred, all_train_words_true, \
        binary_train_words_pred, binary_train_words_true = get_model_results(model, train_sequences)
        train_Results_df = pd.DataFrame(classification_report(all_train_words_true, all_train_words_pred, target_names=all_target_names, output_dict = True))
        curr_f1_accuracy_result = train_Results_df.iloc[2]['accuracy']

        
        if curr_f1_accuracy_result > best_f1_accuracy_result:
          improve_string = 'f1-accuracy-score improve from ' + str(best_f1_accuracy_result) + ' to ' + str(curr_f1_accuracy_result) 
          best_f1_accuracy_result = curr_f1_accuracy_result
          best_df = train_Results_df
        else:
          improve_string = 'f1-accuracy-score did not improve from ' + str(best_f1_accuracy_result)  
        print(improve_string, flush = True)
       
    best_string = 'best-f1-accuracy-score is '+ str(best_f1_accuracy_result)  
    print(best_string, flush = True)
    return best_df


**Task 5:** write an evaluation loop on a trained model, using the dev and test datasets. This function print the true positive rate (TPR), also known as Recall and the opposite to false positive rate (FPR), also known as precision, of each label seperately (7 labels in total), and for all the 6 labels (except O) together. The caption argument for the function should be served for printing, so that when you print include it as a prefix.

In [35]:

def evaluate(model, caption, test_sequences, dev_sequences):
    # TODO - your code goes here
    # from Piazza: https://piazza.com/class/klxc3m1tzqz2o8?cid=59

    all_target_names = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]
    binary_target_names = ["O", "OTHERS"]
    
    # self.tag2id = {"O":0, "B-PER":1, "I-PER": 2, "B-LOC": 3, "I-LOC": 4, "B-ORG": 5, "I-ORG": 6}
    # self.id2tag = {0:"O", 1:"B-PER", 2:"I-PER", 3:"B-LOC", 4:"I-LOC", 5:"B-ORG", 6:"I-ORG"}
    
    print(f"****************    Results for {caption}    ****************")



    # generate dev tokens prediction 
    all_dev_words_pred = []
    all_dev_words_true = []
    
    # generate dev binnary prediction 
    binary_dev_words_pred = []
    binary_dev_words_true = []

    # get test results
    all_test_words_pred, all_test_words_true, \
        binary_test_words_pred, binary_test_words_true = get_model_results(model, test_sequences)

    # get dev results
    all_dev_words_pred, all_dev_words_true, \
        binary_dev_words_pred, binary_dev_words_true = get_model_results(model, dev_sequences)

    print("Test Results:")
    Test_Results_dict = pd.DataFrame(classification_report(all_test_words_true, all_test_words_pred, target_names=all_target_names,  output_dict = True))
    display(Test_Results_dict.T)
    print("Dev Results:")
    Dev_Results_dict = pd.DataFrame(classification_report(all_dev_words_true, all_dev_words_pred, target_names=all_target_names, output_dict = True))
    display(Dev_Results_dict.T)

    print("Binary Test Results:")
    Binary_Test_Results = pd.DataFrame(classification_report(binary_test_words_true, binary_test_words_pred, target_names=binary_target_names, output_dict = True))
    display(Binary_Test_Results.T)

    print("Binary Dev Results:")
    Binary_Dev_Results  = pd.DataFrame(classification_report(binary_dev_words_true, binary_dev_words_pred, target_names=binary_target_names, output_dict = True))
    display(Binary_Dev_Results.T)

    return 

**Task 6:** Train and evaluate a few models, all with embedding_size=300, and with the following hyper parameters (you may use that as captions for the models as well):

Model 1: (hidden_size: 500, n_layers: 1, directions: 1)

Model 2: (hidden_size: 500, n_layers: 2, directions: 1)

Model 3: (hidden_size: 500, n_layers: 3, directions: 1)

Model 4: (hidden_size: 500, n_layers: 1, directions: 2)

Model 5: (hidden_size: 500, n_layers: 2, directions: 2)

Model 6: (hidden_size: 500, n_layers: 3, directions: 2)

Model 4: (hidden_size: 800, n_layers: 1, directions: 2)

Model 5: (hidden_size: 800, n_layers: 2, directions: 2)

Model 6: (hidden_size: 800, n_layers: 3, directions: 2)

In [31]:


# TODO - your code goes here...
EMBEDDING_SIZE = 300
EPOCHS = 10
HIDDEN_SIZE  = 500 
INPUT_SIZE = len(vocab.word2id) # 8955
OUTPUT_SIZE = len(vocab.tag2id) # 7

n_layers_array = np.arange(1,4)
directions_array = np.arange(1,3)

# n_layers_array = np.arange(1,2)
# directions_array = np.arange(1,2)
model_list  = []
train_res_list = [] 
for i_n_layers in n_layers_array:
    for i_directions in directions_array:
        print('----------------------------------------------------------')
        print('Train model using:\n' + \
              '  1)hidden_size = ' + str(HIDDEN_SIZE)+'\n'+ \
              '  2)n_layers = ' + str(i_n_layers) + '\n'+ \
              '  3)directions = ' + str(i_directions) , flush = True)
        
        model = NERNet(INPUT_SIZE, EMBEDDING_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, int(i_n_layers), int(i_directions)).cuda()
        train_res = train_loop(model, EPOCHS, train_sequences)
        model_list.append(model)
        train_res_list.append(train_res)

DIRECTION = 2
HIDDEN_SIZE = 800
for i_n_layers in n_layers_array:
        print('----------------------------------------------------------')

        print('Train model using:\n'+ \
              '  1)hidden_size = ' + str(HIDDEN_SIZE)+'\n'+ \
              '  2)n_layers = ' + str(i_n_layers) + '\n'+ \
              '  3)directions = ' + str(i_directions) , flush = True )
        model = NERNet(INPUT_SIZE, EMBEDDING_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, i_n_layers, DIRECTION).cuda()
        train_res = train_loop(model, EPOCHS, train_sequences)
        model_list.append(model)

        

----------------------------------------------------------
Train model using:
  1)hidden_size = 500
  2)n_layers = 1
  3)directions = 1


Ephoc #1:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0 to 0.8406118143459915


Ephoc #2:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.8406118143459915 to 0.8886075949367088


Ephoc #3:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.8886075949367088 to 0.9168424753867792


Ephoc #4:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9168424753867792 to 0.9396272855133615


Ephoc #5:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9396272855133615 to 0.9593881856540084


Ephoc #6:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9593881856540084 to 0.9748945147679325


Ephoc #7:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9748945147679325 to 0.9835794655414909


Ephoc #8:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9835794655414909 to 0.990295358649789


Ephoc #9:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.990295358649789 to 0.9924050632911392


Ephoc #10:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9924050632911392 to 0.9937412095639944
----------------------------------------------------------
Train model using:
  1)hidden_size = 500
  2)n_layers = 1
  3)directions = 2


Ephoc #1:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0 to 0.8717299578059071


Ephoc #2:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.8717299578059071 to 0.9202180028129395


Ephoc #3:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9202180028129395 to 0.9528832630098453


Ephoc #4:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9528832630098453 to 0.9739099859353024


Ephoc #5:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9739099859353024 to 0.9857594936708861


Ephoc #6:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9857594936708861 to 0.9940576652601969


Ephoc #7:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9940576652601969 to 0.9945147679324895


Ephoc #8:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9945147679324895 to 0.9984177215189873


Ephoc #9:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score did not improve from 0.9984177215189873


Ephoc #10:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9984177215189873 to 0.9984880450070324
----------------------------------------------------------
Train model using:
  1)hidden_size = 500
  2)n_layers = 2
  3)directions = 1


Ephoc #1:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0 to 0.8475738396624473


Ephoc #2:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.8475738396624473 to 0.8968354430379747


Ephoc #3:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.8968354430379747 to 0.9321729957805908


Ephoc #4:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9321729957805908 to 0.9640998593530239


Ephoc #5:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9640998593530239 to 0.9816807313642757


Ephoc #6:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9816807313642757 to 0.9899789029535865


Ephoc #7:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9899789029535865 to 0.9940225035161744


Ephoc #8:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score did not improve from 0.9940225035161744


Ephoc #9:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9940225035161744 to 0.9962025316455696


Ephoc #10:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9962025316455696 to 0.9974331926863572
----------------------------------------------------------
Train model using:
  1)hidden_size = 500
  2)n_layers = 2
  3)directions = 2


Ephoc #1:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0 to 0.8950070323488045


Ephoc #2:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.8950070323488045 to 0.9456399437412095


Ephoc #3:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9456399437412095 to 0.97514064697609


Ephoc #4:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.97514064697609 to 0.9874120956399437


Ephoc #5:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9874120956399437 to 0.989732770745429


Ephoc #6:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.989732770745429 to 0.9922644163150492


Ephoc #7:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9922644163150492 to 0.9924050632911392


Ephoc #8:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9924050632911392 to 0.9964135021097047


Ephoc #9:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9964135021097047 to 0.9995428973277074


Ephoc #10:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score did not improve from 0.9995428973277074
----------------------------------------------------------
Train model using:
  1)hidden_size = 500
  2)n_layers = 3
  3)directions = 1


Ephoc #1:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0 to 0.8413150492264416


Ephoc #2:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.8413150492264416 to 0.894479606188467


Ephoc #3:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.894479606188467 to 0.9278481012658227


Ephoc #4:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9278481012658227 to 0.9595288326300985


Ephoc #5:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9595288326300985 to 0.9684247538677918


Ephoc #6:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9684247538677918 to 0.9761603375527426


Ephoc #7:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9761603375527426 to 0.9890998593530239


Ephoc #8:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9890998593530239 to 0.9919479606188467


Ephoc #9:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9919479606188467 to 0.995182841068917


Ephoc #10:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.995182841068917 to 0.9965189873417721
----------------------------------------------------------
Train model using:
  1)hidden_size = 500
  2)n_layers = 3
  3)directions = 2


Ephoc #1:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0 to 0.8853023909985935


Ephoc #2:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.8853023909985935 to 0.9328410689170182


Ephoc #3:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9328410689170182 to 0.9649789029535865


Ephoc #4:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9649789029535865 to 0.9786919831223628


Ephoc #5:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9786919831223628 to 0.9841068917018284


Ephoc #6:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9841068917018284 to 0.9921940928270042


Ephoc #7:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9921940928270042 to 0.9922995780590718


Ephoc #8:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9922995780590718 to 0.9964135021097047


Ephoc #9:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9964135021097047 to 0.9971870604781997


Ephoc #10:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9971870604781997 to 0.9981364275668073
----------------------------------------------------------
Train model using:
  1)hidden_size = 800
  2)n_layers = 1
  3)directions = 2


Ephoc #1:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0 to 0.8828762306610408


Ephoc #2:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.8828762306610408 to 0.9220112517580872


Ephoc #3:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9220112517580872 to 0.9550632911392405


Ephoc #4:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9550632911392405 to 0.9779535864978903


Ephoc #5:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9779535864978903 to 0.9875879043600563


Ephoc #6:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9875879043600563 to 0.9924050632911392


Ephoc #7:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9924050632911392 to 0.9964838255977496


Ephoc #8:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9964838255977496 to 0.9973277074542898


Ephoc #9:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9973277074542898 to 0.9990154711673699


Ephoc #10:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9990154711673699 to 0.9992616033755274
----------------------------------------------------------
Train model using:
  1)hidden_size = 800
  2)n_layers = 2
  3)directions = 2


Ephoc #1:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0 to 0.8963431786216597


Ephoc #2:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.8963431786216597 to 0.9464135021097047


Ephoc #3:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9464135021097047 to 0.9748945147679325


Ephoc #4:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9748945147679325 to 0.9841772151898734


Ephoc #5:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9841772151898734 to 0.9941631504922644


Ephoc #6:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9941631504922644 to 0.9943389592123769


Ephoc #7:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score did not improve from 0.9943389592123769


Ephoc #8:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9943389592123769 to 0.9959563994374121


Ephoc #9:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9959563994374121 to 0.9964486638537271


Ephoc #10:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9964486638537271 to 0.9996835443037975
----------------------------------------------------------
Train model using:
  1)hidden_size = 800
  2)n_layers = 3
  3)directions = 2


Ephoc #1:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0 to 0.8886427566807313


Ephoc #2:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.8886427566807313 to 0.9365682137834036


Ephoc #3:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9365682137834036 to 0.9662095639943741


Ephoc #4:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9662095639943741 to 0.9758087201125176


Ephoc #5:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9758087201125176 to 0.9836849507735583


Ephoc #6:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9836849507735583 to 0.9930028129395218


Ephoc #7:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9930028129395218 to 0.995534458509142


Ephoc #8:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score did not improve from 0.995534458509142


Ephoc #9:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score did not improve from 0.995534458509142


Ephoc #10:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.995534458509142 to 0.9971518987341772


## evaluate 


In [36]:
for i, model in enumerate(model_list):
    model_name = "model_"+str(i)
    evaluate(model, model_name, test_sequences, dev_sequences)

****************    Results for model_0    ****************
Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.930814,0.93863,0.934706,3096.0
B-PER,0.602679,0.675,0.636792,200.0
I-PER,0.735714,0.656051,0.693603,157.0
B-LOC,0.791411,0.704918,0.745665,183.0
I-LOC,1.0,0.521739,0.685714,23.0
B-ORG,0.520833,0.595238,0.555556,168.0
I-ORG,0.6,0.465517,0.524272,116.0
accuracy,0.872179,0.872179,0.872179,0.872179
macro avg,0.740207,0.651013,0.68233,3943.0
weighted avg,0.873135,0.872179,0.871539,3943.0


Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.933424,0.939394,0.9364,6567.0
B-PER,0.664319,0.652074,0.65814,434.0
I-PER,0.744526,0.689189,0.715789,296.0
B-LOC,0.794118,0.708455,0.748844,343.0
I-LOC,0.931034,0.509434,0.658537,53.0
B-ORG,0.521739,0.617143,0.565445,350.0
I-ORG,0.448649,0.415,0.431169,200.0
accuracy,0.876501,0.876501,0.876501,0.876501
macro avg,0.719687,0.647241,0.673475,8243.0
weighted avg,0.877418,0.876501,0.876227,8243.0


Binary Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.928054,0.937219,0.932614,380877.0
OTHERS,0.782393,0.7565,0.769229,113647.0
accuracy,0.895688,0.895688,0.895688,0.895688
macro avg,0.855224,0.846859,0.850921,494524.0
weighted avg,0.89458,0.895688,0.895066,494524.0


Binary Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.932444,0.939585,0.936001,1616269.0
OTHERS,0.762764,0.740497,0.751466,423980.0
accuracy,0.898213,0.898213,0.898213,0.8982129
macro avg,0.847604,0.840041,0.843733,2040249.0
weighted avg,0.897184,0.898213,0.897653,2040249.0


****************    Results for model_1    ****************
Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.942015,0.960271,0.951056,3096.0
B-PER,0.770053,0.72,0.744186,200.0
I-PER,0.816327,0.764331,0.789474,157.0
B-LOC,0.759358,0.775956,0.767568,183.0
I-LOC,0.846154,0.478261,0.611111,23.0
B-ORG,0.672956,0.636905,0.654434,168.0
I-ORG,0.56383,0.456897,0.504762,116.0
accuracy,0.90033,0.90033,0.90033,0.90033
macro avg,0.767242,0.68466,0.717513,3943.0
weighted avg,0.896662,0.90033,0.897862,3943.0


Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.945682,0.957058,0.951336,6567.0
B-PER,0.787013,0.698157,0.739927,434.0
I-PER,0.802974,0.72973,0.764602,296.0
B-LOC,0.728,0.795918,0.760446,343.0
I-LOC,0.755556,0.641509,0.693878,53.0
B-ORG,0.646884,0.622857,0.634643,350.0
I-ORG,0.526882,0.49,0.507772,200.0
accuracy,0.901007,0.901007,0.901007,0.901007
macro avg,0.741856,0.705033,0.7218,8243.0
weighted avg,0.899074,0.901007,0.899692,8243.0


Binary Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.939888,0.964474,0.952023,380877.0
OTHERS,0.869498,0.793272,0.829638,113647.0
accuracy,0.92513,0.92513,0.92513,0.92513
macro avg,0.904693,0.878873,0.89083,494524.0
weighted avg,0.923712,0.92513,0.923897,494524.0


Binary Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.946854,0.960408,0.953583,1616269.0
OTHERS,0.840357,0.7945,0.816785,423980.0
accuracy,0.925931,0.925931,0.925931,0.9259306
macro avg,0.893605,0.877454,0.885184,2040249.0
weighted avg,0.924723,0.925931,0.925155,2040249.0


****************    Results for model_2    ****************
Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.922185,0.976098,0.948376,3096.0
B-PER,0.770115,0.67,0.716578,200.0
I-PER,0.911504,0.656051,0.762963,157.0
B-LOC,0.795181,0.721311,0.756447,183.0
I-LOC,0.818182,0.391304,0.529412,23.0
B-ORG,0.673469,0.589286,0.628571,168.0
I-ORG,0.763636,0.362069,0.491228,116.0
accuracy,0.898047,0.898047,0.898047,0.898047
macro avg,0.807753,0.623731,0.690511,3943.0
weighted avg,0.892284,0.898047,0.890809,3943.0


Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.932649,0.96787,0.949933,6567.0
B-PER,0.757576,0.691244,0.722892,434.0
I-PER,0.829876,0.675676,0.744879,296.0
B-LOC,0.774481,0.760933,0.767647,343.0
I-LOC,0.966667,0.54717,0.698795,53.0
B-ORG,0.664516,0.588571,0.624242,350.0
I-ORG,0.649123,0.37,0.471338,200.0
accuracy,0.900886,0.900886,0.900886,0.900886
macro avg,0.796412,0.657352,0.711389,8243.0
weighted avg,0.895113,0.900886,0.895975,8243.0


Binary Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.920024,0.980705,0.949396,380877.0
OTHERS,0.916985,0.714291,0.803045,113647.0
accuracy,0.91948,0.91948,0.91948,0.91948
macro avg,0.918505,0.847498,0.87622,494524.0
weighted avg,0.919326,0.91948,0.915763,494524.0


Binary Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.930327,0.967825,0.948705,1616269.0
OTHERS,0.855075,0.72369,0.783915,423980.0
accuracy,0.917091,0.917091,0.917091,0.9170915
macro avg,0.892701,0.845757,0.86631,2040249.0
weighted avg,0.914689,0.917091,0.914461,2040249.0


****************    Results for model_3    ****************
Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.955234,0.937339,0.946201,3096.0
B-PER,0.741627,0.775,0.757946,200.0
I-PER,0.739884,0.815287,0.775758,157.0
B-LOC,0.777174,0.781421,0.779292,183.0
I-LOC,0.521739,0.521739,0.521739,23.0
B-ORG,0.6,0.678571,0.636872,168.0
I-ORG,0.52381,0.568966,0.545455,116.0
accuracy,0.892721,0.892721,0.892721,0.892721
macro avg,0.69421,0.725475,0.709037,3943.0
weighted avg,0.897204,0.892721,0.894674,3943.0


Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.955339,0.941374,0.948305,6567.0
B-PER,0.746479,0.732719,0.739535,434.0
I-PER,0.741433,0.804054,0.771475,296.0
B-LOC,0.741573,0.769679,0.755365,343.0
I-LOC,0.681818,0.566038,0.618557,53.0
B-ORG,0.602151,0.64,0.620499,350.0
I-ORG,0.462451,0.585,0.516556,200.0
accuracy,0.894456,0.894456,0.894456,0.894456
macro avg,0.704463,0.719838,0.710042,8243.0
weighted avg,0.899052,0.894456,0.89642,8243.0


Binary Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.954451,0.941438,0.9479,380877.0
OTHERS,0.812311,0.849428,0.830455,113647.0
accuracy,0.920293,0.920293,0.920293,0.920293
macro avg,0.883381,0.895433,0.889177,494524.0
weighted avg,0.921786,0.920293,0.92091,494524.0


Binary Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.954555,0.940589,0.947521,1616269.0
OTHERS,0.785483,0.829294,0.806794,423980.0
accuracy,0.917461,0.917461,0.917461,0.9174611
macro avg,0.870019,0.884941,0.877157,2040249.0
weighted avg,0.919421,0.917461,0.918277,2040249.0


****************    Results for model_4    ****************
Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.932203,0.959302,0.945559,3096.0
B-PER,0.785311,0.695,0.737401,200.0
I-PER,0.814815,0.700637,0.753425,157.0
B-LOC,0.8375,0.73224,0.781341,183.0
I-LOC,0.769231,0.434783,0.555556,23.0
B-ORG,0.547872,0.613095,0.578652,168.0
I-ORG,0.642857,0.465517,0.54,116.0
accuracy,0.892721,0.892721,0.892721,0.892721
macro avg,0.761398,0.657225,0.698847,3943.0
weighted avg,0.889845,0.892721,0.88989,3943.0


Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.946852,0.954926,0.950872,6567.0
B-PER,0.709302,0.702765,0.706019,434.0
I-PER,0.730375,0.722973,0.726655,296.0
B-LOC,0.874564,0.731778,0.796825,343.0
I-LOC,0.842105,0.603774,0.703297,53.0
B-ORG,0.55198,0.637143,0.591512,350.0
I-ORG,0.52381,0.44,0.478261,200.0
accuracy,0.89579,0.89579,0.89579,0.89579
macro avg,0.739856,0.684766,0.707634,8243.0
weighted avg,0.895859,0.89579,0.895201,8243.0


Binary Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.930278,0.961022,0.9454,380877.0
OTHERS,0.853097,0.758612,0.803085,113647.0
accuracy,0.914506,0.914506,0.914506,0.914506
macro avg,0.891688,0.859817,0.874243,494524.0
weighted avg,0.912541,0.914506,0.912695,494524.0


Binary Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.946648,0.953942,0.950281,1616269.0
OTHERS,0.819108,0.795049,0.806899,423980.0
accuracy,0.920923,0.920923,0.920923,0.9209229
macro avg,0.882878,0.874496,0.87859,2040249.0
weighted avg,0.920144,0.920923,0.920485,2040249.0


****************    Results for model_5    ****************
Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.939988,0.976421,0.957858,3096.0
B-PER,0.776042,0.745,0.760204,200.0
I-PER,0.860294,0.745223,0.798635,157.0
B-LOC,0.790055,0.781421,0.785714,183.0
I-LOC,0.6,0.521739,0.55814,23.0
B-ORG,0.777778,0.625,0.693069,168.0
I-ORG,0.825397,0.448276,0.581006,116.0
accuracy,0.913264,0.913264,0.913264,0.913264
macro avg,0.79565,0.691869,0.733518,3943.0
weighted avg,0.909274,0.913264,0.908803,3943.0


Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.946724,0.976854,0.961553,6567.0
B-PER,0.835411,0.771889,0.802395,434.0
I-PER,0.843066,0.780405,0.810526,296.0
B-LOC,0.798834,0.798834,0.798834,343.0
I-LOC,0.744681,0.660377,0.7,53.0
B-ORG,0.771429,0.617143,0.685714,350.0
I-ORG,0.713115,0.435,0.540373,200.0
accuracy,0.921145,0.921145,0.921145,0.921145
macro avg,0.807608,0.720072,0.757056,8243.0
weighted avg,0.916577,0.921145,0.917366,8243.0


Binary Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.941171,0.976011,0.958274,380877.0
OTHERS,0.908215,0.795542,0.848153,113647.0
accuracy,0.934537,0.934537,0.934537,0.934537
macro avg,0.924693,0.885776,0.903214,494524.0
weighted avg,0.933597,0.934537,0.932967,494524.0


Binary Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.946625,0.979406,0.962736,1616269.0
OTHERS,0.909551,0.789481,0.845273,423980.0
accuracy,0.939938,0.939938,0.939938,0.9399377
macro avg,0.928088,0.884443,0.904005,2040249.0
weighted avg,0.938921,0.939938,0.938327,2040249.0


****************    Results for model_6    ****************
Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.928748,0.968346,0.948134,3096.0
B-PER,0.759777,0.68,0.717678,200.0
I-PER,0.808511,0.726115,0.765101,157.0
B-LOC,0.684729,0.759563,0.720207,183.0
I-LOC,0.785714,0.478261,0.594595,23.0
B-ORG,0.756098,0.553571,0.639175,168.0
I-ORG,0.727273,0.344828,0.467836,116.0
accuracy,0.895511,0.895511,0.895511,0.895511
macro avg,0.778693,0.644383,0.693247,3943.0
weighted avg,0.889947,0.895511,0.889222,3943.0


Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.932446,0.971067,0.951365,6567.0
B-PER,0.791123,0.698157,0.741738,434.0
I-PER,0.843373,0.709459,0.770642,296.0
B-LOC,0.74124,0.801749,0.770308,343.0
I-LOC,0.815789,0.584906,0.681319,53.0
B-ORG,0.721831,0.585714,0.646688,350.0
I-ORG,0.759494,0.3,0.430108,200.0
accuracy,0.905132,0.905132,0.905132,0.905132
macro avg,0.800757,0.664436,0.713167,8243.0
weighted avg,0.899961,0.905132,0.898984,8243.0


Binary Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.928236,0.967522,0.947472,380877.0
OTHERS,0.873163,0.749311,0.80651,113647.0
accuracy,0.917375,0.917375,0.917375,0.917375
macro avg,0.9007,0.858417,0.876991,494524.0
weighted avg,0.91558,0.917375,0.915078,494524.0


Binary Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.928393,0.970765,0.949106,1616269.0
OTHERS,0.865076,0.714564,0.78265,423980.0
accuracy,0.917524,0.917524,0.917524,0.9175243
macro avg,0.896735,0.842665,0.865878,2040249.0
weighted avg,0.915235,0.917524,0.914515,2040249.0


****************    Results for model_7    ****************
Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.935723,0.978036,0.956412,3096.0
B-PER,0.762431,0.69,0.724409,200.0
I-PER,0.838235,0.726115,0.778157,157.0
B-LOC,0.853801,0.797814,0.824859,183.0
I-LOC,0.733333,0.478261,0.578947,23.0
B-ORG,0.737589,0.619048,0.673139,168.0
I-ORG,0.809524,0.439655,0.569832,116.0
accuracy,0.910981,0.910981,0.910981,0.910981
macro avg,0.810091,0.675561,0.729394,3943.0
weighted avg,0.905914,0.910981,0.905797,3943.0


Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.941963,0.976245,0.958798,6567.0
B-PER,0.826425,0.735023,0.778049,434.0
I-PER,0.842105,0.756757,0.797153,296.0
B-LOC,0.820433,0.772595,0.795796,343.0
I-LOC,0.914286,0.603774,0.727273,53.0
B-ORG,0.75,0.617143,0.677116,350.0
I-ORG,0.705036,0.49,0.578171,200.0
accuracy,0.917748,0.917748,0.917748,0.917748
macro avg,0.828607,0.707362,0.758908,8243.0
weighted avg,0.91316,0.917748,0.91401,8243.0


Binary Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.935916,0.978006,0.956498,380877.0
OTHERS,0.913208,0.775568,0.838779,113647.0
accuracy,0.931484,0.931484,0.931484,0.931484
macro avg,0.924562,0.876787,0.897639,494524.0
weighted avg,0.930697,0.931484,0.929445,494524.0


Binary Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.940241,0.977561,0.958538,1616269.0
OTHERS,0.899207,0.763149,0.82561,423980.0
accuracy,0.933004,0.933004,0.933004,0.9330043
macro avg,0.919724,0.870355,0.892074,2040249.0
weighted avg,0.931714,0.933004,0.930914,2040249.0


****************    Results for model_8    ****************
Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.936124,0.975129,0.955229,3096.0
B-PER,0.760417,0.73,0.744898,200.0
I-PER,0.855072,0.751592,0.8,157.0
B-LOC,0.851852,0.754098,0.8,183.0
I-LOC,0.833333,0.434783,0.571429,23.0
B-ORG,0.697987,0.619048,0.656151,168.0
I-ORG,0.738462,0.413793,0.530387,116.0
accuracy,0.908699,0.908699,0.908699,0.908699
macro avg,0.810464,0.668349,0.722585,3943.0
weighted avg,0.903512,0.908699,0.903695,3943.0


Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.946574,0.973961,0.960072,6567.0
B-PER,0.800512,0.721198,0.758788,434.0
I-PER,0.817857,0.773649,0.795139,296.0
B-LOC,0.833333,0.758017,0.793893,343.0
I-LOC,0.681818,0.566038,0.618557,53.0
B-ORG,0.702454,0.654286,0.677515,350.0
I-ORG,0.714286,0.475,0.570571,200.0
accuracy,0.916171,0.916171,0.916171,0.916171
macro avg,0.785262,0.703164,0.739219,8243.0
weighted avg,0.911846,0.916171,0.912993,8243.0


Binary Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.934804,0.973443,0.953732,380877.0
OTHERS,0.896685,0.772471,0.829956,113647.0
accuracy,0.927257,0.927257,0.927257,0.927257
macro avg,0.915744,0.872957,0.891844,494524.0
weighted avg,0.926044,0.927257,0.925287,494524.0


Binary Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.945277,0.97485,0.959836,1616269.0
OTHERS,0.891143,0.784862,0.834633,423980.0
accuracy,0.935369,0.935369,0.935369,0.9353692
macro avg,0.91821,0.879856,0.897234,2040249.0
weighted avg,0.934028,0.935369,0.933818,2040249.0


**Task 6:** Download the GloVe embeddings from https://nlp.stanford.edu/projects/glove/ (use the 300-dim vectors from glove.6B.zip). Then intialize the nn.Embedding module in your NERNet with these embeddings, so that you can start your training with pre-trained vectors. Repeat Task 6 and print the results for each model.

Note: make sure that vectors are aligned with the IDs in your Vocab, in other words, make sure that for example the word with ID 0 is the first vector in the GloVe matrix of vectors that you initialize nn.Embedding with. For a dicussion on how to do that, check it this link:
https://discuss.pytorch.org/t/can-we-use-pre-trained-word-embeddings-for-weight-initialization-in-nn-embedding/1222

## move to drive folder

In [60]:
! cd "/content/gdrive/My Drive/NLP3" 
GLOVE_PATH  = os.path.join(main_folder, 'glove.6B.300d.txt')
GLOVE_PATH = 'glove.6B.300d.txt'

In [59]:
!ls

gdrive		   glove.6B.200d.txt  glove.6B.50d.txt	glove.6B.zip.1
glove.6B.100d.txt  glove.6B.300d.txt  glove.6B.zip	sample_data


## get glove data set

In [57]:
# TODO - your code goes here...
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zipy

--2022-05-24 14:02:50--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-05-24 14:02:50--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-05-24 14:02:50--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2022

# get embding weights

In [52]:
def get_glove_pre_trained_embeddings_weights(input_size, embedding_size, word2id = vocab.word2id):
    # generate zeros initilized embeding mask 
    weights = np.zeros((input_size, embedding_size))

    # parse embeding
    with open(GLOVE_PATH) as glove:
        for line in glove.readlines():
            split = line.split()
            word = split[0]
            word_id = word2id.get(word)

        if word_id:
            weights[word_id] = split[1:]
    tensor = torch.from_numpy(weights).float()
    return tensor

## define Glove net

In [53]:
class GloveNERNet(nn.Module):
    
    def __init__(self, input_size, embedding_size, hidden_size, output_size, n_layers, directions):
        super(GloveNERNet, self).__init__()
        
        # TODO: your code...

        # add first layer - the embding layer
        self.embedding = nn.Embedding(input_size, embedding_size)

        # get trained embding weights from the data we loaded
        pre_trained_weights = get_glove_pre_trained_embeddings_weights(input_size, embedding_size)
        
        # load embding weights 
        self.embedding.weight = nn.Parameter(pre_trained_weights)

        # add LSTM layer
        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers, bidirectional=(True if directions==2 else False))
        
        # Add FC layer
        self.out = nn.Linear(hidden_size*directions, output_size)
    
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.directions = directions

    def forward(self, input_sentence):
        # TODO: your code...
        
        # get sentence token numbers to understand output & input size
        dimension = len(input_sentence)
        
        # initial the hidden to None because none sentence inter
        hidden = None

        # 1. foward input sentence into the embeding
        embedded = self.embedding(input_sentence)

        # 2. foward embedding to LSTM
        lstm_output, _ = self.lstm(embedded.view(dimension, 1, -1), hidden) # The view function is meant to reshape the tensor https://stackoverflow.com/a/48650355/7786691

        # 3. foward to get predictions  - linear transformation to the incoming data
        output = self.out(lstm_output.view(dimension, -1)) 

        return output

## train model

In [63]:


# TODO - your code goes here...
EMBEDDING_SIZE = 300
EPOCHS = 10
HIDDEN_SIZE  = 500 
INPUT_SIZE = len(vocab.word2id) # 8955
OUTPUT_SIZE = len(vocab.tag2id) # 7

n_layers_array = np.arange(1,4)
directions_array = np.arange(1,3)

# n_layers_array = np.arange(1,2)
# directions_array = np.arange(1,2)
model_list  = []
train_res_list = [] 
for i_n_layers in n_layers_array:
    for i_directions in directions_array:
        print('----------------------------------------------------------')
        print('Train model using:\n' + \
              '  1)hidden_size = ' + str(HIDDEN_SIZE)+'\n'+ \
              '  2)n_layers = ' + str(i_n_layers) + '\n'+ \
              '  3)directions = ' + str(i_directions) , flush = True)
        model = GloveNERNet(INPUT_SIZE, EMBEDDING_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, 1, 1).cuda()
        train_res = train_loop(model, EPOCHS, train_sequences)   
        model_list.append(model)
        train_res_list.append(train_res)

DIRECTION = 2
HIDDEN_SIZE = 800
for i_n_layers in n_layers_array:
        print('----------------------------------------------------------')

        print('Train model using:\n'+ \
              '  1)hidden_size = ' + str(HIDDEN_SIZE)+'\n'+ \
              '  2)n_layers = ' + str(i_n_layers) + '\n'+ \
              '  3)directions = ' + str(i_directions) , flush = True )
        model = GloveNERNet(INPUT_SIZE, EMBEDDING_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, 1, 1).cuda()
        train_res = train_loop(model, EPOCHS, train_sequences)   

       
        model_list.append(model)

        

----------------------------------------------------------
Train model using:
  1)hidden_size = 500
  2)n_layers = 1
  3)directions = 1


Ephoc #1:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0 to 0.8534458509142053


Ephoc #2:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.8534458509142053 to 0.9197960618846694


Ephoc #3:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9197960618846694 to 0.9458860759493671


Ephoc #4:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9458860759493671 to 0.9596343178621659


Ephoc #5:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9596343178621659 to 0.9709915611814346


Ephoc #6:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9709915611814346 to 0.9753164556962025


Ephoc #7:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9753164556962025 to 0.9825949367088608


Ephoc #8:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9825949367088608 to 0.9856540084388186


Ephoc #9:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9856540084388186 to 0.9921237693389592


Ephoc #10:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9921237693389592 to 0.9927566807313642
----------------------------------------------------------
Train model using:
  1)hidden_size = 500
  2)n_layers = 1
  3)directions = 2


Ephoc #1:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0 to 0.8534106891701828


Ephoc #2:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.8534106891701828 to 0.9184599156118144


Ephoc #3:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9184599156118144 to 0.9426160337552743


Ephoc #4:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9426160337552743 to 0.9532700421940928


Ephoc #5:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9532700421940928 to 0.9680028129395218


Ephoc #6:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9680028129395218 to 0.9708509142053446


Ephoc #7:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9708509142053446 to 0.9809071729957806


Ephoc #8:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9809071729957806 to 0.9863572433192687


Ephoc #9:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9863572433192687 to 0.990506329113924


Ephoc #10:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.990506329113924 to 0.9927566807313642
----------------------------------------------------------
Train model using:
  1)hidden_size = 500
  2)n_layers = 2
  3)directions = 1


Ephoc #1:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0 to 0.8576652601969058


Ephoc #2:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.8576652601969058 to 0.9173699015471167


Ephoc #3:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9173699015471167 to 0.944338959212377


Ephoc #4:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.944338959212377 to 0.959845288326301


Ephoc #5:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.959845288326301 to 0.9742967651195499


Ephoc #6:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9742967651195499 to 0.980309423347398


Ephoc #7:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.980309423347398 to 0.9837201125175808


Ephoc #8:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9837201125175808 to 0.9880450070323488


Ephoc #9:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9880450070323488 to 0.990295358649789


Ephoc #10:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.990295358649789 to 0.9913853727144867
----------------------------------------------------------
Train model using:
  1)hidden_size = 500
  2)n_layers = 2
  3)directions = 2


Ephoc #1:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0 to 0.8526019690576653


Ephoc #2:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.8526019690576653 to 0.9188115330520393


Ephoc #3:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9188115330520393 to 0.9514767932489452


Ephoc #4:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9514767932489452 to 0.9673347398030943


Ephoc #5:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9673347398030943 to 0.9735935302390999


Ephoc #6:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9735935302390999 to 0.9815049226441631


Ephoc #7:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9815049226441631 to 0.9841420534458509


Ephoc #8:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9841420534458509 to 0.9884317862165963


Ephoc #9:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9884317862165963 to 0.9929324894514768


Ephoc #10:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9929324894514768 to 0.9939873417721519
----------------------------------------------------------
Train model using:
  1)hidden_size = 500
  2)n_layers = 3
  3)directions = 1


Ephoc #1:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0 to 0.8506680731364276


Ephoc #2:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.8506680731364276 to 0.920253164556962


Ephoc #3:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.920253164556962 to 0.9506680731364275


Ephoc #4:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9506680731364275 to 0.9667018284106892


Ephoc #5:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9667018284106892 to 0.9725035161744022


Ephoc #6:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9725035161744022 to 0.9783052039381154


Ephoc #7:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9783052039381154 to 0.9828410689170183


Ephoc #8:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9828410689170183 to 0.9888185654008439


Ephoc #9:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9888185654008439 to 0.9893108298171589


Ephoc #10:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9893108298171589 to 0.9933192686357243
----------------------------------------------------------
Train model using:
  1)hidden_size = 500
  2)n_layers = 3
  3)directions = 2


Ephoc #1:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0 to 0.8549226441631504


Ephoc #2:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.8549226441631504 to 0.9221870604781998


Ephoc #3:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9221870604781998 to 0.959493670886076


Ephoc #4:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.959493670886076 to 0.9738045007032349


Ephoc #5:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9738045007032349 to 0.9785161744022504


Ephoc #6:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9785161744022504 to 0.9828059071729958


Ephoc #7:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9828059071729958 to 0.9864978902953586


Ephoc #8:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9864978902953586 to 0.9901195499296765


Ephoc #9:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9901195499296765 to 0.9922292545710267


Ephoc #10:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9922292545710267 to 0.9929324894514768
----------------------------------------------------------
Train model using:
  1)hidden_size = 800
  2)n_layers = 1
  3)directions = 2


Ephoc #1:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0 to 0.8623769338959212


Ephoc #2:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.8623769338959212 to 0.9218354430379747


Ephoc #3:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9218354430379747 to 0.9516877637130802


Ephoc #4:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9516877637130802 to 0.9627988748241912


Ephoc #5:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9627988748241912 to 0.9702180028129396


Ephoc #6:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9702180028129396 to 0.9760548523206751


Ephoc #7:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9760548523206751 to 0.9822081575246132


Ephoc #8:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9822081575246132 to 0.9880098452883262


Ephoc #9:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9880098452883262 to 0.9887834036568214


Ephoc #10:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9887834036568214 to 0.9922644163150492
----------------------------------------------------------
Train model using:
  1)hidden_size = 800
  2)n_layers = 2
  3)directions = 2


Ephoc #1:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0 to 0.8636075949367089


Ephoc #2:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.8636075949367089 to 0.9278832630098452


Ephoc #3:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9278832630098452 to 0.9611462728551337


Ephoc #4:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9611462728551337 to 0.9742616033755275


Ephoc #5:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9742616033755275 to 0.9784458509142053


Ephoc #6:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9784458509142053 to 0.9839310829817159


Ephoc #7:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9839310829817159 to 0.9868846694796062


Ephoc #8:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score did not improve from 0.9868846694796062


Ephoc #9:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9868846694796062 to 0.9937412095639944


Ephoc #10:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9937412095639944 to 0.9940225035161744
----------------------------------------------------------
Train model using:
  1)hidden_size = 800
  2)n_layers = 3
  3)directions = 2


Ephoc #1:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0 to 0.8614627285513361


Ephoc #2:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.8614627285513361 to 0.9213431786216596


Ephoc #3:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9213431786216596 to 0.9559423347398031


Ephoc #4:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9559423347398031 to 0.9677215189873418


Ephoc #5:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9677215189873418 to 0.9751758087201126


Ephoc #6:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9751758087201126 to 0.9802039381153306


Ephoc #7:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9802039381153306 to 0.9854078762306611


Ephoc #8:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9854078762306611 to 0.989521800281294


Ephoc #9:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.989521800281294 to 0.9911040787623067


Ephoc #10:   0%|          | 0/1750 [00:00<?, ?it/s]

f1-accuracy-score improve from 0.9911040787623067 to 0.9924050632911392


## evaluate 


In [64]:
for i, model in enumerate(model_list):
    model_name = "model_"+str(i)
    evaluate(model, model_name, test_sequences, dev_sequences)

****************    Results for model_0    ****************
Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.982728,0.900517,0.939828,3096.0
B-PER,0.90604,0.675,0.773639,200.0
I-PER,0.923077,0.611465,0.735632,157.0
B-LOC,0.939597,0.765027,0.843373,183.0
I-LOC,0.833333,0.434783,0.571429,23.0
B-ORG,0.366234,0.839286,0.509946,168.0
I-ORG,0.302932,0.801724,0.439716,116.0
accuracy,0.863048,0.863048,0.863048,0.863048
macro avg,0.750563,0.718257,0.687652,3943.0
weighted avg,0.927324,0.863048,0.883613,3943.0


Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.988673,0.890513,0.937029,6567.0
B-PER,0.892857,0.691244,0.779221,434.0
I-PER,0.886878,0.662162,0.758221,296.0
B-LOC,0.939271,0.676385,0.786441,343.0
I-LOC,0.8125,0.490566,0.611765,53.0
B-ORG,0.356882,0.837143,0.500427,350.0
I-ORG,0.256334,0.86,0.394948,200.0
accuracy,0.857333,0.857333,0.857333,0.857333
macro avg,0.733342,0.729716,0.68115,8243.0
weighted avg,0.93219,0.857333,0.882251,8243.0


Binary Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.982314,0.896271,0.937322,380877.0
OTHERS,0.731255,0.94592,0.82485,113647.0
accuracy,0.907681,0.907681,0.907681,0.907681
macro avg,0.856784,0.921096,0.881086,494524.0
weighted avg,0.924618,0.907681,0.911475,494524.0


Binary Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.988485,0.888634,0.935903,1616269.0
OTHERS,0.693488,0.960536,0.805454,423980.0
accuracy,0.903575,0.903575,0.903575,0.9035755
macro avg,0.840986,0.924585,0.870679,2040249.0
weighted avg,0.927182,0.903575,0.908795,2040249.0


****************    Results for model_1    ****************
Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.977078,0.922481,0.948995,3096.0
B-PER,0.884615,0.69,0.775281,200.0
I-PER,0.929825,0.675159,0.782288,157.0
B-LOC,0.933775,0.770492,0.844311,183.0
I-LOC,0.846154,0.478261,0.611111,23.0
B-ORG,0.39403,0.785714,0.524851,168.0
I-ORG,0.358566,0.775862,0.490463,116.0
accuracy,0.881055,0.881055,0.881055,0.881055
macro avg,0.760578,0.728281,0.711043,3943.0
weighted avg,0.924695,0.881055,0.895155,3943.0


Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.985262,0.906045,0.943995,6567.0
B-PER,0.895522,0.691244,0.780234,434.0
I-PER,0.876596,0.695946,0.775895,296.0
B-LOC,0.925373,0.723032,0.811784,343.0
I-LOC,0.733333,0.622642,0.673469,53.0
B-ORG,0.387618,0.822857,0.52699,350.0
I-ORG,0.288927,0.835,0.429306,200.0
accuracy,0.872498,0.872498,0.872498,0.872498
macro avg,0.727519,0.756681,0.705953,8243.0
weighted avg,0.930252,0.872498,0.891902,8243.0


Binary Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.977226,0.920216,0.947864,380877.0
OTHERS,0.77634,0.928128,0.845476,113647.0
accuracy,0.922034,0.922034,0.922034,0.922034
macro avg,0.876783,0.924172,0.89667,494524.0
weighted avg,0.93106,0.922034,0.924334,494524.0


Binary Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.985458,0.905756,0.943928,1616269.0
OTHERS,0.725396,0.949047,0.822285,423980.0
accuracy,0.914753,0.914753,0.914753,0.9147526
macro avg,0.855427,0.927402,0.883106,2040249.0
weighted avg,0.931415,0.914753,0.918649,2040249.0


****************    Results for model_2    ****************
Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.981709,0.901486,0.939889,3096.0
B-PER,0.86755,0.655,0.746439,200.0
I-PER,0.886792,0.598726,0.714829,157.0
B-LOC,0.924658,0.737705,0.820669,183.0
I-LOC,0.888889,0.347826,0.5,23.0
B-ORG,0.360215,0.797619,0.496296,168.0
I-ORG,0.310127,0.844828,0.453704,116.0
accuracy,0.860005,0.860005,0.860005,0.860005
macro avg,0.745706,0.697599,0.667404,3943.0
weighted avg,0.922713,0.860005,0.879813,3943.0


Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.984652,0.88899,0.934379,6567.0
B-PER,0.883792,0.665899,0.759527,434.0
I-PER,0.869369,0.652027,0.745174,296.0
B-LOC,0.923077,0.699708,0.79602,343.0
I-LOC,0.75,0.509434,0.606742,53.0
B-ORG,0.36514,0.82,0.505282,350.0
I-ORG,0.250366,0.855,0.387316,200.0
accuracy,0.854665,0.854665,0.854665,0.854665
macro avg,0.718057,0.727294,0.676348,8243.0
weighted avg,0.92701,0.854665,0.879022,8243.0


Binary Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.981362,0.902052,0.940037,380877.0
OTHERS,0.741698,0.942585,0.830162,113647.0
accuracy,0.911367,0.911367,0.911367,0.911367
macro avg,0.86153,0.922319,0.8851,494524.0
weighted avg,0.926285,0.911367,0.914787,494524.0


Binary Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.985117,0.888686,0.93442,1616269.0
OTHERS,0.690974,0.948818,0.799624,423980.0
accuracy,0.901182,0.901182,0.901182,0.9011822
macro avg,0.838045,0.918752,0.867022,2040249.0
weighted avg,0.923992,0.901182,0.906409,2040249.0


****************    Results for model_3    ****************
Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.988372,0.878553,0.930233,3096.0
B-PER,0.943548,0.585,0.722222,200.0
I-PER,0.969072,0.598726,0.740157,157.0
B-LOC,0.620408,0.830601,0.71028,183.0
I-LOC,0.355556,0.695652,0.470588,23.0
B-ORG,0.320755,0.809524,0.459459,168.0
I-ORG,0.339844,0.75,0.467742,116.0
accuracy,0.842506,0.842506,0.842506,0.842506
macro avg,0.648222,0.735437,0.642955,3943.0
weighted avg,0.917037,0.842506,0.86556,3943.0


Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.993669,0.860362,0.922223,6567.0
B-PER,0.963899,0.615207,0.751055,434.0
I-PER,0.897059,0.618243,0.732,296.0
B-LOC,0.590517,0.798834,0.679058,343.0
I-LOC,0.455556,0.773585,0.573427,53.0
B-ORG,0.324355,0.825714,0.465753,350.0
I-ORG,0.270998,0.855,0.411552,200.0
accuracy,0.834041,0.834041,0.834041,0.834041
macro avg,0.642293,0.763849,0.647867,8243.0
weighted avg,0.922443,0.834041,0.862247,8243.0


Binary Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.987401,0.877409,0.929161,380877.0
OTHERS,0.700836,0.96248,0.81108,113647.0
accuracy,0.89696,0.89696,0.89696,0.89696
macro avg,0.844119,0.919945,0.870121,494524.0
weighted avg,0.921546,0.89696,0.902025,494524.0


Binary Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.994068,0.859901,0.92213,1616269.0
OTHERS,0.647361,0.98044,0.779824,423980.0
accuracy,0.88495,0.88495,0.88495,0.8849498
macro avg,0.820715,0.92017,0.850977,2040249.0
weighted avg,0.92202,0.88495,0.892558,2040249.0


****************    Results for model_4    ****************
Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.971313,0.929587,0.949992,3096.0
B-PER,0.927536,0.64,0.757396,200.0
I-PER,0.925234,0.630573,0.75,157.0
B-LOC,0.935252,0.710383,0.807453,183.0
I-LOC,0.6875,0.478261,0.564103,23.0
B-ORG,0.367123,0.797619,0.502814,168.0
I-ORG,0.395349,0.732759,0.513595,116.0
accuracy,0.878773,0.878773,0.878773,0.878773
macro avg,0.744187,0.70274,0.692193,3943.0
weighted avg,0.921241,0.878773,0.891502,3943.0


Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.976846,0.918684,0.946873,6567.0
B-PER,0.933333,0.677419,0.785047,434.0
I-PER,0.89083,0.689189,0.777143,296.0
B-LOC,0.927126,0.667638,0.776271,343.0
I-LOC,0.75,0.566038,0.645161,53.0
B-ORG,0.381333,0.817143,0.52,350.0
I-ORG,0.314815,0.765,0.446064,200.0
accuracy,0.876987,0.876987,0.876987,0.876987
macro avg,0.739183,0.72873,0.699508,8243.0
weighted avg,0.92659,0.876987,0.892943,8243.0


Binary Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.969901,0.927016,0.947974,380877.0
OTHERS,0.786969,0.903587,0.841256,113647.0
accuracy,0.921632,0.921632,0.921632,0.921632
macro avg,0.878435,0.915302,0.894615,494524.0
weighted avg,0.927861,0.921632,0.923449,494524.0


Binary Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.975697,0.91842,0.946192,1616269.0
OTHERS,0.745876,0.912791,0.820935,423980.0
accuracy,0.91725,0.91725,0.91725,0.9172503
macro avg,0.860786,0.915605,0.883563,2040249.0
weighted avg,0.927938,0.91725,0.920163,2040249.0


****************    Results for model_5    ****************
Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.987027,0.88469,0.933061,3096.0
B-PER,0.917293,0.61,0.732733,200.0
I-PER,0.92233,0.605096,0.730769,157.0
B-LOC,0.661905,0.759563,0.707379,183.0
I-LOC,0.47619,0.434783,0.454545,23.0
B-ORG,0.310748,0.791667,0.446309,168.0
I-ORG,0.322344,0.758621,0.452442,116.0
accuracy,0.84352,0.84352,0.84352,0.84352
macro avg,0.656834,0.69206,0.636748,3943.0
weighted avg,0.914476,0.84352,0.866701,3943.0


Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.990064,0.864931,0.923277,6567.0
B-PER,0.930314,0.615207,0.740638,434.0
I-PER,0.877358,0.628378,0.732283,296.0
B-LOC,0.667488,0.790087,0.723632,343.0
I-LOC,0.521127,0.698113,0.596774,53.0
B-ORG,0.31978,0.831429,0.461905,350.0
I-ORG,0.264516,0.82,0.4,200.0
accuracy,0.836589,0.836589,0.836589,0.836589
macro avg,0.65295,0.749735,0.654073,8243.0
weighted avg,0.920369,0.836589,0.864109,8243.0


Binary Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.984594,0.881127,0.929992,380877.0
OTHERS,0.705372,0.953796,0.810986,113647.0
accuracy,0.897827,0.897827,0.897827,0.897827
macro avg,0.844983,0.917461,0.870489,494524.0
weighted avg,0.920426,0.897827,0.902643,494524.0


Binary Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.990132,0.86471,0.923181,1616269.0
OTHERS,0.652204,0.967147,0.779049,423980.0
accuracy,0.885997,0.885997,0.885997,0.8859972
macro avg,0.821168,0.915929,0.851115,2040249.0
weighted avg,0.919908,0.885997,0.893229,2040249.0


****************    Results for model_6    ****************
Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.982667,0.897287,0.938038,3096.0
B-PER,0.941667,0.565,0.70625,200.0
I-PER,0.957447,0.573248,0.717131,157.0
B-LOC,0.904,0.617486,0.733766,183.0
I-LOC,0.5,0.521739,0.510638,23.0
B-ORG,0.305987,0.821429,0.44588,168.0
I-ORG,0.317881,0.827586,0.45933,116.0
accuracy,0.847071,0.847071,0.847071,0.847071
macro avg,0.701378,0.689111,0.644434,3943.0
weighted avg,0.924728,0.847071,0.870459,3943.0


Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.986371,0.881681,0.931093,6567.0
B-PER,0.970149,0.599078,0.740741,434.0
I-PER,0.918782,0.611486,0.73428,296.0
B-LOC,0.919492,0.632653,0.749568,343.0
I-LOC,0.666667,0.566038,0.612245,53.0
B-ORG,0.307286,0.831429,0.448728,350.0
I-ORG,0.244118,0.83,0.377273,200.0
accuracy,0.84132,0.84132,0.84132,0.84132
macro avg,0.716123,0.707481,0.656275,8243.0
weighted avg,0.931408,0.84132,0.870481,8243.0


Binary Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.981809,0.896019,0.936954,380877.0
OTHERS,0.730453,0.944363,0.823747,113647.0
accuracy,0.907129,0.907129,0.907129,0.907129
macro avg,0.856131,0.920191,0.880351,494524.0
weighted avg,0.924045,0.907129,0.910938,494524.0


Binary Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.986278,0.88201,0.931235,1616269.0
OTHERS,0.679409,0.953219,0.793354,423980.0
accuracy,0.896808,0.896808,0.896808,0.8968082
macro avg,0.832844,0.917615,0.862294,2040249.0
weighted avg,0.922508,0.896808,0.902582,2040249.0


****************    Results for model_7    ****************
Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.988727,0.87823,0.930209,3096.0
B-PER,0.904412,0.615,0.732143,200.0
I-PER,0.907407,0.624204,0.739623,157.0
B-LOC,0.691176,0.770492,0.728682,183.0
I-LOC,0.423077,0.478261,0.44898,23.0
B-ORG,0.299781,0.815476,0.4384,168.0
I-ORG,0.343511,0.775862,0.47619,116.0
accuracy,0.841745,0.841745,0.841745,0.841745
macro avg,0.651156,0.708218,0.642032,3943.0
weighted avg,0.915768,0.841745,0.866102,3943.0


Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.991211,0.858687,0.920202,6567.0
B-PER,0.945763,0.642857,0.765432,434.0
I-PER,0.886256,0.631757,0.737673,296.0
B-LOC,0.658477,0.781341,0.714667,343.0
I-LOC,0.607143,0.641509,0.623853,53.0
B-ORG,0.308012,0.845714,0.451564,350.0
I-ORG,0.267628,0.835,0.40534,200.0
accuracy,0.833434,0.833434,0.833434,0.833434
macro avg,0.666356,0.748124,0.659819,8243.0
weighted avg,0.922169,0.833434,0.86265,8243.0


Binary Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.987014,0.875474,0.927904,380877.0
OTHERS,0.697305,0.961398,0.808327,113647.0
accuracy,0.89522,0.89522,0.89522,0.89522
macro avg,0.84216,0.918436,0.868116,494524.0
weighted avg,0.920436,0.89522,0.900424,494524.0


Binary Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.991404,0.859721,0.920879,1616269.0
OTHERS,0.644994,0.971584,0.775299,423980.0
accuracy,0.882967,0.882967,0.882967,0.8829672
macro avg,0.818199,0.915652,0.848089,2040249.0
weighted avg,0.919417,0.882967,0.890626,2040249.0


****************    Results for model_8    ****************
Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.979613,0.900194,0.938226,3096.0
B-PER,0.85906,0.64,0.733524,200.0
I-PER,0.96,0.611465,0.747082,157.0
B-LOC,0.798883,0.781421,0.790055,183.0
I-LOC,0.423077,0.478261,0.44898,23.0
B-ORG,0.341146,0.779762,0.474638,168.0
I-ORG,0.330769,0.741379,0.457447,116.0
accuracy,0.857723,0.857723,0.857723,0.857723
macro avg,0.670364,0.70464,0.655707,3943.0
weighted avg,0.914792,0.857723,0.876605,3943.0


Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.984941,0.886402,0.933077,6567.0
B-PER,0.908497,0.640553,0.751351,434.0
I-PER,0.888889,0.621622,0.73161,296.0
B-LOC,0.812883,0.772595,0.792227,343.0
I-LOC,0.560606,0.698113,0.621849,53.0
B-ORG,0.340144,0.808571,0.478849,350.0
I-ORG,0.266779,0.795,0.399497,200.0
accuracy,0.852481,0.852481,0.852481,0.852481
macro avg,0.680391,0.746122,0.672637,8243.0
weighted avg,0.922776,0.852481,0.876179,8243.0


Binary Test Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.979905,0.901094,0.938849,380877.0
OTHERS,0.738904,0.938071,0.82666,113647.0
accuracy,0.909592,0.909592,0.909592,0.909592
macro avg,0.859404,0.919583,0.882754,494524.0
weighted avg,0.924521,0.909592,0.913067,494524.0


Binary Dev Results:


Unnamed: 0,precision,recall,f1-score,support
O,0.983756,0.884647,0.931573,1616269.0
OTHERS,0.68228,0.944313,0.792191,423980.0
accuracy,0.897046,0.897046,0.897046,0.8970459
macro avg,0.833018,0.91448,0.861882,2040249.0
weighted avg,0.921107,0.897046,0.902608,2040249.0


**Good luck!**