<center><h1>Project 4</h1></center>
<br>
<center><font size="5">Name - Spandan Patil</font></center>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import precision_recall_fscore_support
from torch.optim.lr_scheduler import StepLR,OneCycleLR
from gensim.models import Word2Vec
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Function to read the dataset
def read_Dataset(file_path):
    # List of store all the sentences and the labels list.
    sents, lbls = [], []
    # Variable to store the sentece and the labels for each token in the sentence.
    sent, lbl = [], []
    # Here we open our dataset file.
    with open(file_path, 'r', encoding='utf-8') as c:
        # We are reading line by line
        for w_dtls in c:
            # Here we are removing all the leading and trailing white spaces
            w_dtls = w_dtls.strip()
            # If the line is not blank
            if w_dtls:
                # We are spliting the the line
                w_dtls_lst = w_dtls.split()
                if len(w_dtls_lst) == 3:
                    # We are getting the index, word and its tag
                    _, w, tag = w_dtls_lst
                    sent.append(w)
                    lbl.append(tag)
                elif len(w_dtls_lst) == 2:
                    # We are getting the index and word
                    _, w = w_dtls_lst
                    sent.append(w)
            else:
                # If its end of the sentences add it to sentences list.
                if sent:
                    sents.append(sent)
                    lbls.append(lbl)
                    sent, lbl = [], []
        # For adding the last sentence.
        if sent:
            sents.append(sent)
            lbls.append(lbl)
    if file_path == "./data/test":
        return {"sentences": sents}
    else:
        return {"sentences": sents, "labels": lbls}



In [3]:
# Here we are loading our train, dev and test data.
train_data = read_Dataset("./data/train")
dev_data = read_Dataset("./data/dev")
test_data = read_Dataset("./data/test")

## Task 1


In [4]:
# Here is our BiLSTM model for performing the Name Entity Recognition task.
class BiLSTM(nn.Module):
    def __init__(self, v_size, t_size, embedding_dim=100, hidden_dim=256, linear_dim=128, lstm_layers=1, dropout=0.33):
        super(BiLSTM, self).__init__()

        self.embedding = nn.Embedding(v_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=lstm_layers,
            bidirectional=True,
            batch_first=True,
            dropout=dropout
        ).to(device)
        
        self.fc = nn.Linear(hidden_dim * 2, linear_dim).to(device)
        self.elu = nn.ELU(alpha=0.01)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(linear_dim, t_size).to(device)
    
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        fc_out = self.fc(lstm_out)
        elu_out = self.elu(fc_out)
        drop_out = self.dropout(elu_out)
        scores = self.classifier(drop_out)
        return scores

In [5]:
# This is the custom dataset to load our datasets.
class NERDataset(Dataset):
    def __init__(self, sents, lbls, word2idx, tag2idx):
        # Here we are storing the sentences and there labels
        self.sents = [[word2idx.get(word, 1) for word in sent] for sent in sents]
        self.lbls = [[tag2idx[tag] for tag in label] for label in lbls]
    
    def __len__(self):
        return len(self.sents)
    
    def __getitem__(self, idx):
        return torch.tensor(self.sents[idx]), torch.tensor(self.lbls[idx])

In [6]:
# This is the custom dataset to load our test datasets.
class NERDataset_TEST(Dataset):
    def __init__(self, sents, word2idx):
        # Here we are storing the sentences
        self.sents = [[word2idx.get(word, 1) for word in sent] for sent in sents]
    
    def __len__(self):
        return len(self.sents)
    
    def __getitem__(self, idx):
        return torch.tensor(self.sents[idx])

In [7]:
# This is the function to evaluate our Model and writing the prediction to output file.
def evaluate(model, dataloader, data, word2idx, tag2idx, batch_size, output_file):
    model.to(device)
    model.eval()
    # The idx to tag dictionary
    idx2tag = {idx: tag for tag, idx in tag2idx.items()}
    # The idx to word dictionary
    idx2word = {idx: word for word, idx in word2idx.items()}
    # List to store the true labels
    true_lbls = []
    # List to store the predictions
    preds_lst = []
    
    with open(output_file, "w", encoding="utf-8") as f:
        with torch.no_grad():
            # Here we are evaluating the datapoints in batches.
            for batch_idx, (sents, lbls) in enumerate(dataloader):
                sents = sents.to(device)
                output = model(sents)
                preds = torch.argmax(output, dim=-1)
                
                # Here we are writing our predictions to the output file.
                for sent_idx, (sent, lbl, pred) in enumerate(zip(sents, lbls, preds)):
                    for w_idx, (w_id, pred_id, _) in enumerate(zip(sent, pred, lbl)):
                        w = idx2word.get(w_id.item(), "<UNK>")
                        pred_tag = idx2tag[pred_id.item()]
                        # This is for skipping the padding.
                        if w != "<PAD>" and w != "<S>" and w != "</S>":
                            og_w = data["sentences"][batch_idx * batch_size + sent_idx][w_idx-1]
                            f.write(f"{w_idx} {og_w} {pred_tag}\n")
                    f.write("\n")
                
                # Collect the true labels and predictions
                for true, pred in zip(lbl, pred):
                    # This is for skipping the padding.
                    if true.item() != -1:
                        true_lbls.append(true.item())
                        preds_lst.append(pred.item())
    
    # Here we are calculating Precision, Recall, F1 Score
    precision, recall, f1, _ = precision_recall_fscore_support(true_lbls, preds_lst, average='macro', zero_division=0)
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
    
    # Here we are calculate Accuracy
    correct = (torch.tensor(preds_lst) == torch.tensor(true_lbls)).sum().item()
    total = len(true_lbls)
    print(total)
    print(f"Evaluation Accuracy: {correct / total:.4f}")

    return f1

In [8]:
# This is the function to evaluate our Model and writing the prediction to output file.
def evaluate_test(model, dataloader, data, word2idx, tag2idx, batch_size, output_file):
    model.eval()
    # The idx to tag dictionary
    idx2tag = {idx: tag for tag, idx in tag2idx.items()}
    # The idx to word dictionary
    idx2word = {idx: word for word, idx in word2idx.items()}
    # List to store the true labels
    
    with open(output_file, "w", encoding="utf-8") as f:
        with torch.no_grad():
            # Here we are evaluating the datapoints in batches.
            for batch_idx, sents in enumerate(dataloader):
                sents = sents.to(device)
                output = model(sents)
                preds = torch.argmax(output, dim=-1)
                
                # Here we are writing our predictions to the output file.
                for sent_idx, (sent, pred) in enumerate(zip(sents, preds)):
                    for w_idx, (w_id, pred_id) in enumerate(zip(sent, pred)):
                        w = idx2word.get(w_id.item(), "<UNK>")
                        pred_tag = idx2tag[pred_id.item()]
                        # This is for skipping the padding.
                        if w != "<PAD>" and w != "<S>" and w != "</S>":
                            og_w = data["sentences"][batch_idx * batch_size + sent_idx][w_idx-1]
                            f.write(f"{w_idx} {og_w} {pred_tag}\n")
                    f.write("\n")

In [9]:
# Here is the function to train our model
def train_model(model, train_loader, dev_loader, data, optimizer, criterion, word2idx, tag2idx, batch_size, output_file="dev1.out", epochs=10, accum_steps=1):
    model.train()
    # This is the learning rate scheduler here we are reducing the learning by 0.5 factor if we dont see improvement in validation f1-score for 4 epoches.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=4, verbose=True)
    model.to(device)
    # Variable to store the best f1
    best_f1 = float("-inf")
    best_model = None
    # Paitences counter.
    counter = 0
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        optimizer.zero_grad()
        
        # This is to give details regarding each epoches
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)
        
        # Here we are training our model in batches.
        for step, (sents, lbls) in enumerate(progress_bar):
            sents, lbls = sents.to(device), lbls.to(device)
            out = model(sents)
            
            loss = criterion(out.view(-1, out.shape[-1]), lbls.view(-1)) / accum_steps
            loss.backward()

            # Here we are doing Gradient accumulation (if accum_steps > 1)
            if (step + 1) % accum_steps == 0 or (step + 1) == len(train_loader):
                # This is to preventing the exploding gradients problem.
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0) 
                optimizer.step()
                optimizer.zero_grad()
            
            # Here we are calculating the total loss for this epoch.
            total_loss += loss.item() * accum_steps
            progress_bar.set_postfix(loss=loss.item())

        # Calculating the avg loss for the epoch
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs}, Avg Loss: {avg_loss:.4f}, LR: {scheduler.get_last_lr()[0]}")
        
        # Here we are running the validation step on our model for the current epoch
        f1 = evaluate(model, dev_loader, data, word2idx, tag2idx, batch_size, output_file)
        # Updating the learning rate based on the F1 score.
        scheduler.step(f1)
        if f1 > best_f1:
            best_f1 = f1
            torch.save(model, "blstm1.pt")
            best_model = model
            counter = 0
        else:
            counter += 1
        if counter >= 10:
            print("Early stopping triggered.")
            break
        
    return best_model


In [10]:
def collate_fn(batch, word_to_index, tag_to_index, pad_token='<PAD>', init_token='<S>', eos_token='</S>'):
    sentences, labels = zip(*batch)
    
    # Add <s> and </s> tokens for both sentences and labels
    sentences_padded = [([word_to_index[init_token]] + list(sentence) + [word_to_index[eos_token]]) for sentence in sentences]
    labels_padded = [([tag_to_index[pad_token]] + list(label) + [tag_to_index[pad_token]]) for label in labels]
    
    # Pad the sequences
    max_len = max(len(sentence) for sentence in sentences_padded)
    sentences_padded = [sentence + [word_to_index[pad_token]] * (max_len - len(sentence)) for sentence in sentences_padded]
    labels_padded = [label + [tag_to_index[pad_token]] * (max_len - len(label)) for label in labels_padded]
    
    # Convert to tensors
    sentences_padded = torch.tensor(sentences_padded, dtype=torch.long).to(device)
    labels_padded = torch.tensor(labels_padded, dtype=torch.long).to(device)

    return sentences_padded, labels_padded

def collate_fn_test(batch, word_to_index, pad_token='<PAD>', init_token='<S>', eos_token='</S>'):
    
    
    # Add <s> and </s> tokens for both sentences and labels
    sentences_padded = [([word_to_index[init_token]] + list(sentence) + [word_to_index[eos_token]]) for sentence in batch]
    
    # Pad the sequences
    max_len = max(len(sentence) for sentence in sentences_padded)
    sentences_padded = [sentence + [word_to_index[pad_token]] * (max_len - len(sentence)) for sentence in sentences_padded]
    
    # Convert to tensors
    sentences_padded = torch.tensor(sentences_padded, dtype=torch.long).to(device)

    return sentences_padded

In [11]:
# Setting the Hyperparameters for the model
embedding_dim = 100
hidden_dim = 256
linear_dim = 128
dropout = 0.33
batch_size = 16
learning_rate = 0.25
epochs = 50


# Here we are building our Vocabulary from Training Data
word2idx = {word: idx for idx, word in enumerate(set(word for sent in train_data['sentences'] for word in sent), start=4)}
# Here we are adding the indices for padding and unknown words.
word2idx["<PAD>"] = 0
word2idx["<UNK>"] = 1
word2idx["<S>"] = 2
word2idx["</S>"] = 3
# Here we are creating our numeric label mapping.
tag2idx = {tag: idx for idx, tag in enumerate(set(tag for label in train_data['labels'] for tag in label))}
tag2idx["<PAD>"] = -1
vocab_size = len(word2idx)
tagset_size = len(tag2idx)-1


# Here we are creating our BLSTM model for training.
model = BiLSTM(vocab_size, tagset_size, embedding_dim, hidden_dim, linear_dim, dropout=dropout).to(device)

# Here we are creating the opitmizer SGD.
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=0.00005, nesterov=True)
# Here we are defining the loss function.
criterion = nn.CrossEntropyLoss(ignore_index=-1).to(device)

# Here we are loading our datasets.
dataset_train = NERDataset(train_data['sentences'], train_data['labels'], word2idx, tag2idx)
dataset_dev = NERDataset(dev_data['sentences'], dev_data['labels'], word2idx, tag2idx)

train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, collate_fn=lambda batch: collate_fn(batch, word2idx, tag2idx))
dev_loader = DataLoader(dataset_dev, batch_size=batch_size, collate_fn=lambda batch: collate_fn(batch, word2idx, tag2idx))

In [12]:
# Training and evaluating the Model
best_model = train_model(model, train_loader, dev_loader, dev_data, optimizer, criterion, word2idx, tag2idx, batch_size, "./dev1.out", epochs)

                                                                          

Epoch 1/50, Avg Loss: 0.4828, LR: 0.25
Precision: 0.7947, Recall: 0.5033, F1 Score: 0.5939
3324
Evaluation Accuracy: 0.8995


                                                                           

Epoch 2/50, Avg Loss: 0.2386, LR: 0.25
Precision: 0.8754, Recall: 0.5731, F1 Score: 0.6782
3324
Evaluation Accuracy: 0.9161


                                                                           

Epoch 3/50, Avg Loss: 0.1560, LR: 0.25
Precision: 0.8413, Recall: 0.6377, F1 Score: 0.7163
3324
Evaluation Accuracy: 0.9269


                                                                           

Epoch 4/50, Avg Loss: 0.1111, LR: 0.25
Precision: 0.8542, Recall: 0.6664, F1 Score: 0.7447
3324
Evaluation Accuracy: 0.9293


                                                                            

Epoch 5/50, Avg Loss: 0.0847, LR: 0.25
Precision: 0.9028, Recall: 0.6762, F1 Score: 0.7636
3324
Evaluation Accuracy: 0.9335


                                                                            

Epoch 6/50, Avg Loss: 0.0651, LR: 0.25
Precision: 0.8874, Recall: 0.7191, F1 Score: 0.7905
3324
Evaluation Accuracy: 0.9377


                                                                            

Epoch 7/50, Avg Loss: 0.0321, LR: 0.125
Precision: 0.8971, Recall: 0.7036, F1 Score: 0.7825
3324
Evaluation Accuracy: 0.9395


                                                                            

Epoch 8/50, Avg Loss: 0.0216, LR: 0.125
Precision: 0.8820, Recall: 0.7444, F1 Score: 0.7992
3324
Evaluation Accuracy: 0.9419


                                                                            

Epoch 9/50, Avg Loss: 0.0192, LR: 0.125
Precision: 0.8958, Recall: 0.7060, F1 Score: 0.7831
3324
Evaluation Accuracy: 0.9395


                                                                              

Epoch 10/50, Avg Loss: 0.0209, LR: 0.125
Precision: 0.9025, Recall: 0.6827, F1 Score: 0.7706
3324
Evaluation Accuracy: 0.9335


                                                                             

Epoch 11/50, Avg Loss: 0.0215, LR: 0.125
Precision: 0.8253, Recall: 0.7250, F1 Score: 0.7694
3324
Evaluation Accuracy: 0.9380


                                                                              

Epoch 12/50, Avg Loss: 0.0120, LR: 0.0625
Precision: 0.8941, Recall: 0.7368, F1 Score: 0.8012
3324
Evaluation Accuracy: 0.9440


                                                                              

Epoch 13/50, Avg Loss: 0.0088, LR: 0.0625
Precision: 0.9021, Recall: 0.7229, F1 Score: 0.7960
3324
Evaluation Accuracy: 0.9422


                                                                              

Epoch 14/50, Avg Loss: 0.0086, LR: 0.0625
Precision: 0.9048, Recall: 0.7107, F1 Score: 0.7890
3324
Evaluation Accuracy: 0.9416


                                                                              

Epoch 15/50, Avg Loss: 0.0084, LR: 0.0625
Precision: 0.9197, Recall: 0.7358, F1 Score: 0.8132
3324
Evaluation Accuracy: 0.9452


                                                                              

Epoch 16/50, Avg Loss: 0.0088, LR: 0.0625
Precision: 0.9244, Recall: 0.7155, F1 Score: 0.7992
3324
Evaluation Accuracy: 0.9428


                                                                              

Epoch 17/50, Avg Loss: 0.0068, LR: 0.03125
Precision: 0.9178, Recall: 0.7287, F1 Score: 0.8046
3324
Evaluation Accuracy: 0.9437


                                                                              

Epoch 18/50, Avg Loss: 0.0060, LR: 0.03125
Precision: 0.8967, Recall: 0.7323, F1 Score: 0.8015
3324
Evaluation Accuracy: 0.9440


                                                                              

Epoch 19/50, Avg Loss: 0.0063, LR: 0.03125
Precision: 0.9160, Recall: 0.7289, F1 Score: 0.8055
3324
Evaluation Accuracy: 0.9455


                                                                              

Epoch 20/50, Avg Loss: 0.0068, LR: 0.03125
Precision: 0.9015, Recall: 0.7297, F1 Score: 0.8005
3324
Evaluation Accuracy: 0.9437


                                                                              

Epoch 21/50, Avg Loss: 0.0069, LR: 0.03125
Precision: 0.9185, Recall: 0.7187, F1 Score: 0.8002
3324
Evaluation Accuracy: 0.9431


                                                                              

Epoch 22/50, Avg Loss: 0.0059, LR: 0.015625
Precision: 0.9190, Recall: 0.7251, F1 Score: 0.8033
3324
Evaluation Accuracy: 0.9437


                                                                             

Epoch 23/50, Avg Loss: 0.0058, LR: 0.015625
Precision: 0.9126, Recall: 0.7313, F1 Score: 0.8066
3324
Evaluation Accuracy: 0.9443


                                                                              

Epoch 24/50, Avg Loss: 0.0059, LR: 0.015625
Precision: 0.9038, Recall: 0.7346, F1 Score: 0.8051
3324
Evaluation Accuracy: 0.9449


                                                                              

Epoch 25/50, Avg Loss: 0.0060, LR: 0.015625
Precision: 0.9293, Recall: 0.7238, F1 Score: 0.8059
3324
Evaluation Accuracy: 0.9443
Early stopping triggered.


In [14]:

# Load the state dictionary
best_model = torch.load('./blstm1.pt')
best_model.to(device)
evaluate(best_model, dev_loader, dev_data, word2idx, tag2idx, batch_size, "./dev1.out")

Precision: 0.9197, Recall: 0.7358, F1 Score: 0.8132
3324
Evaluation Accuracy: 0.9452


0.8131798357408604

In [15]:
# Evaluating METRICS with eval.py
!python eval.py -p dev1.out -g data/dev

processed 51578 tokens with 5942 phrases; found: 5029 phrases; correct: 4153.
accuracy:  94.93%; precision:  82.58%; recall:  69.89%; FB1:  75.71
              LOC: precision:  92.38%; recall:  77.19%; FB1:  84.10  1535
             MISC: precision:  84.25%; recall:  76.03%; FB1:  79.93  832
              ORG: precision:  76.99%; recall:  64.13%; FB1:  69.98  1117
              PER: precision:  75.99%; recall:  63.74%; FB1:  69.32  1545


In [16]:
# Load the state dictionary
best_model = torch.load('./blstm1.pt')
best_model.to(device)
dataset_test = NERDataset_TEST(test_data['sentences'], word2idx)
test_loader = DataLoader(dataset_test, batch_size=batch_size, collate_fn=lambda batch: collate_fn_test(batch, word2idx))
evaluate_test(best_model, test_loader, test_data, word2idx, tag2idx, batch_size, "./test1.out")

## Task 2

In [17]:
# Here is our BiLSTM model for performing the Name Entity Recognition task.
class BiLSTM_Glove(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_matrix=None, embedding_dim=100, hidden_dim=256, linear_dim=128, lstm_layers=1, dropout=0.33, capitalization_dim=6):
        super(BiLSTM_Glove, self).__init__()
        
        if embedding_matrix is not None:
            self.embedding = nn.Embedding.from_pretrained(
                torch.from_numpy(embedding_matrix).float(),
                padding_idx=0,
                freeze=False
            )
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # Here we are adding a new embedding layer for capitalization feature
        self.capitalization_embedding = nn.Embedding(capitalization_dim, embedding_dim, padding_idx=0)  

        self.lstm = nn.LSTM(
            embedding_dim * 2,  # multiple by 2 is to account for both word and capitalization embeddings
            hidden_dim,
            num_layers=lstm_layers,
            bidirectional=True,
            batch_first=True,
            dropout=dropout  
        ).to(device)

        
        self.fc = nn.Linear(hidden_dim * 2, linear_dim).to(device)
        self.elu = nn.ELU(alpha=0.01)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(linear_dim, tagset_size).to(device)
    
    def forward(self, x, capitalization_features):
        embedded = self.embedding(x)
        # Here we are getting capitalization embeddings
        capitalization_embeds = self.capitalization_embedding(capitalization_features)  
        # Here we are concatenating the word and capitalization embeddings
        combined_embeds = torch.cat((embedded, capitalization_embeds), dim=-1)  
        lstm_out, _ = self.lstm(combined_embeds)
        fc_out = self.fc(lstm_out)
        elu_out = self.elu(fc_out)
        drop_out = self.dropout(elu_out)
        scores = self.classifier(drop_out)
        return scores


In [18]:
# In this function we are determining the capitalization feature label for each word.
def get_capitalization_feature(word):
    # Check if the word is fully uppercase
    if word.isupper():
        return 'allCaps'
    # Check if the word starts with an uppercase letter and the rest are lowercase
    elif word[0].isupper() and word[1:].islower():
        return 'upperInitial'
    # Check if the word is entirely lowercase
    elif word.islower():
        return 'lowercase'
    # Check if the word has mixed capitalization
    elif any(c.isupper() for c in word[1:]) and any(c.islower() for c in word[1:]):
        return 'mixedCaps'
    else:
        return 'noinfo'

# Here we are mapping each of the capitalization feature label to numeric label.
def capitalization_to_index(capitalization):
    lookup = {
        '<PAD>' : 0,
        'allCaps': 1,
        'upperInitial': 2,
        'lowercase': 3,
        'mixedCaps': 4,
        'noinfo': 5
    }
    return lookup.get(capitalization, 4)  # default to 'noinfo' if invalid

# In this function we are checking for capitalization
def adjust_case(word):
    if word.islower():
        return word.lower()
    elif word.isupper():
        return word.upper()
    elif word.istitle():
        return word.title()
    else:
        return word.lower()

In [19]:
# This is the custom dataset to load our datasets with capitalization.
class NERDataset_Glove(Dataset):
    def __init__(self, sentences, labels, word2idx, tag2idx):
        self.sentences = [[word2idx.get(word, 1) for word in sent] for sent in sentences]
        self.capitalization_features = [[capitalization_to_index(get_capitalization_feature(word)) for word in sent] for sent in sentences]
        self.labels = [[tag2idx[tag] for tag in label] for label in labels]
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        return torch.tensor(self.sentences[idx]), torch.tensor(self.capitalization_features[idx]), torch.tensor(self.labels[idx])


In [20]:
# This is the custom dataset to load our datasets with capitalization.
class NERDataset_Glove_test(Dataset):
    def __init__(self, sentences, word2idx):
        self.sentences = [[word2idx.get(word, 1) for word in sent] for sent in sentences]
        self.capitalization_features = [[capitalization_to_index(get_capitalization_feature(word)) for word in sent] for sent in sentences]
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        return torch.tensor(self.sentences[idx]), torch.tensor(self.capitalization_features[idx])


In [21]:
# This is the function to evaluate our Model and writing the prediction to output file.
def evaluate_glove(model, dataloader, data, word2idx, tag2idx, batch_size, output_file):
    model.to(device)
    model.eval()
    # The idx to tag dictionary
    idx2tag = {idx: tag for tag, idx in tag2idx.items()}
    # The idx to word dictionary
    idx2word = {idx: word for word, idx in word2idx.items()}
    # List to store the true labels
    true_lbls = []
    # List to store the predictions
    preds_lst = []
    
    with open(output_file, "w", encoding="utf-8") as f:
        with torch.no_grad():
            # Here we are evaluating the datapoints in batches.
            for batch_idx, (sents, cps, lbls) in enumerate(dataloader):
                sents = sents.to(device)
                cps = cps.to(device)
                output = model(sents, cps)
                preds = torch.argmax(output, dim=-1)
                # Here we are writing our predictions to the output file.
                for sent_idx, (sent, lbl, pred) in enumerate(zip(sents, lbls, preds)):
                    for w_idx, (w_id, pred_id, _) in enumerate(zip(sent, pred, lbl)):
                        w = idx2word.get(w_id.item(), "<UNK>")
                        pred_tag = idx2tag[pred_id.item()]
                        # This is for skipping the padding.
                        if w != "<PAD>" and w != "<S>" and w != "</S>":
                            og_w = data["sentences"][batch_idx * batch_size + sent_idx][w_idx-1]
                            f.write(f"{w_idx} {og_w} {pred_tag}\n")
                    f.write("\n")
                
                # Collect the true labels and predictions
                for true, pred in zip(lbl, pred):
                    # This is for skipping the padding.
                    if true.item() != -1:
                        true_lbls.append(true.item())
                        preds_lst.append(pred.item())
    
    # Here we are calculating Precision, Recall, F1 Score
    precision, recall, f1, _ = precision_recall_fscore_support(true_lbls, preds_lst, average='macro', zero_division=0)
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
    
    # Here we are calculate Accuracy
    correct = (torch.tensor(preds_lst) == torch.tensor(true_lbls)).sum().item()
    total = len(true_lbls)
    print(f"Evaluation Accuracy: {correct / total:.4f}")

    return f1

In [22]:
# This is the function to evaluate our Model and writing the prediction to output file.
def evaluate_glove_test(model, dataloader, data, word2idx, tag2idx, batch_size, output_file):
    model.eval()
    # The idx to tag dictionary
    idx2tag = {idx: tag for tag, idx in tag2idx.items()}
    # The idx to word dictionary
    idx2word = {idx: word for word, idx in word2idx.items()}
    # List to store the true labels
    
    with open(output_file, "w", encoding="utf-8") as f:
        with torch.no_grad():
            # Here we are evaluating the datapoints in batches.
            for batch_idx, (sents, cps) in enumerate(dataloader):
                sents = sents.to(device)
                cps = cps.to(device)
                output = model(sents, cps)
                preds = torch.argmax(output, dim=-1)
                
                # Here we are writing our predictions to the output file.
                for sent_idx, (sent, pred) in enumerate(zip(sents, preds)):
                    for w_idx, (w_id, pred_id) in enumerate(zip(sent, pred)):
                        w = idx2word.get(w_id.item(), "<UNK>")
                        pred_tag = idx2tag[pred_id.item()]
                        # This is for skipping the padding.
                        if w != "<PAD>" and w != "<S>" and w != "</S>":
                            og_w = data["sentences"][batch_idx * batch_size + sent_idx][w_idx-1]
                            f.write(f"{w_idx} {og_w} {pred_tag}\n")
                    f.write("\n")

In [23]:
# Here is the function to train our model
def train_model_glove(model, train_loader, dev_loader, data, optimizer, criterion, word2idx, tag2idx, batch_size, output_file="./dev2.out", epochs=10, accum_steps=1):
    model.train()
    # This is the learning rate scheduler here we are reducing the learning by 0.5 factor if we dont see improvement in validation f1-score for paitence number of epoches.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=4)
    model.to(device)
    # Variable to store the best f1
    best_f1 = float("-inf")
    # Variable to store best model
    best_model = None
    # Paitences counter.
    counter = 0
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        optimizer.zero_grad()

        # This is to give details regarding each epoches
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)

        # Here we are training our model in batches.
        for step, (sentences, cps, labels) in enumerate(progress_bar):
            sentences, cps, labels = sentences.to(device), cps.to(device), labels.to(device)
            output = model(sentences, cps)

            loss = criterion(output.view(-1, output.shape[-1]), labels.view(-1)) / accum_steps
            loss.backward()

            # Here we are doing Gradient accumulation (if accum_steps > 1)
            if (step + 1) % accum_steps == 0 or (step + 1) == len(train_loader):
                # This is to preventing the exploding gradients problem.
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
                optimizer.step()
                optimizer.zero_grad()

            total_loss += loss.item() * accum_steps
            progress_bar.set_postfix(loss=loss.item())

        # Calculating the avg loss for the epoch
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs}, Avg Loss: {avg_loss:.4f}, LR: {scheduler.get_last_lr()[0]}")
        # Here we are running the validation step on our model for the current epoch
        f1 = evaluate_glove(model, dev_loader, data, word2idx, tag2idx, batch_size, output_file)  
        # Updating the learning rate based on the F1 score.
        scheduler.step(f1)
        if f1 > best_f1:
            best_f1 = f1
            torch.save(model, "blstm2.pt")
            best_model = model
            counter = 0
        else:
            counter += 1
        if counter >= 10:
            print("Early stopping triggered.")
            break
    return best_model


In [24]:
# This Function is used to create the embedding matrix from the glove embedding.
def create_embedding_matrix(vocab, embeddings, embedding_dim=100):
    # Here we are intializing a zero matrix of the correct dimensions.
    embedding_matrix = np.zeros((len(vocab), embedding_dim))
    # Here we are updating the initalized matrix with embedding vectors.
    for word, idx in vocab.items():
        if word in embeddings:
            embedding_matrix[idx] = embeddings[word]
        elif word.lower() in embeddings:
            embedding_matrix[idx] = embeddings[word.lower()]
        elif word.upper() in embeddings:
            embedding_matrix[idx] = embeddings[word.upper()] 
        else:
            # Initialize randomly with small values for OOV words
            embedding_matrix[idx] = np.random.uniform(-0.25, 0.25, embedding_dim)
    
    return embedding_matrix

# This Function is to to load GloVe embeddings
def load_glove_embeddings(glove_file_path):
    embeddings = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            word = parts[0]
            vector = np.array(parts[1:], dtype=np.float32)
            embeddings[word] = vector
    return embeddings




In [25]:
def collate_fn_glove(batch, word_to_index, tag_to_index, pad_token='<PAD>', init_token='<S>', eos_token='</S>'):
    sentences, cps, labels = zip(*batch)
    
    # Add <s> and </s> tokens for both sentences and labels
    sentences_padded = [([word_to_index[init_token]] + list(sentence) + [word_to_index[eos_token]]) for sentence in sentences]
    labels_padded = [([tag_to_index[pad_token]] + list(label) + [tag_to_index[pad_token]]) for label in labels]
    cp_padded = [([capitalization_to_index(pad_token)] + list(cp) + [capitalization_to_index(pad_token)]) for cp in cps]
    # Pad the sequences
    max_len = max(len(sentence) for sentence in sentences_padded)
    sentences_padded = [sentence + [word_to_index[pad_token]] * (max_len - len(sentence)) for sentence in sentences_padded]
    labels_padded = [label + [tag_to_index[pad_token]] * (max_len - len(label)) for label in labels_padded]
    cp_padded = [cp + [capitalization_to_index(pad_token)] * (max_len - len(cp)) for cp in cp_padded]
    
    # Convert to tensors
    sentences_padded = torch.tensor(sentences_padded, dtype=torch.long).to(device)
    labels_padded = torch.tensor(labels_padded, dtype=torch.long).to(device)
    cp_padded = torch.tensor(cp_padded, dtype=torch.long).to(device)

    return sentences_padded, cp_padded, labels_padded 

def collate_fn_glove_test(batch, word_to_index, pad_token='<PAD>', init_token='<S>', eos_token='</S>'):
    sentences, cps = zip(*batch)
    
    # Add <s> and </s> tokens for both sentences and labels
    sentences_padded = [([word_to_index[init_token]] + list(sentence) + [word_to_index[eos_token]]) for sentence in sentences]
    cp_padded = [([capitalization_to_index(pad_token)] + list(cp) + [capitalization_to_index(pad_token)]) for cp in cps]
    
    # Pad the sequences
    max_len = max(len(sentence) for sentence in sentences_padded)
    sentences_padded = [sentence + [word_to_index[pad_token]] * (max_len - len(sentence)) for sentence in sentences_padded]
    cp_padded = [cp + [capitalization_to_index(pad_token)] * (max_len - len(cp)) for cp in cp_padded]
    
    # Convert to tensors
    sentences_padded = torch.tensor(sentences_padded, dtype=torch.long).to(device)
    cp_padded = torch.tensor(cp_padded, dtype=torch.long).to(device)

    return sentences_padded, cp_padded

In [26]:
# Hyperparameters
embedding_dim = 100
hidden_dim = 256
linear_dim = 128
dropout = 0.33
batch_size = 16
learning_rate = 0.25
epochs = 80


# Build Vocabulary from Training Data
word2idx = {word: idx for idx, word in enumerate(set(word for sent in train_data['sentences'] for word in sent), start=4)}
word2idx["<PAD>"] = 0
word2idx["<UNK>"] = 1
word2idx["<S>"] = 2
word2idx["</S>"] = 3
tag2idx = {tag: idx for idx, tag in enumerate(set(tag for label in train_data['labels'] for tag in label))}
tag2idx["<PAD>"] = -1
vocab_size = len(word2idx)
tagset_size = len(tag2idx)-1


glove_embeddings = load_glove_embeddings("./glove.6B.100d/glove.6B.100d.txt")

# Create the embedding matrix from the trained Word2Vec model
embedding_matrix = create_embedding_matrix(word2idx, glove_embeddings, embedding_dim)

# Model, Optimizer, and Loss Function
model = BiLSTM_Glove(vocab_size, tagset_size, embedding_matrix, embedding_dim, hidden_dim, linear_dim, dropout=dropout).to(device)

optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=1e-5, nesterov=True)

criterion = nn.CrossEntropyLoss(ignore_index=-1).to(device)

# Load Data
dataset_train = NERDataset_Glove(train_data['sentences'], train_data['labels'], word2idx, tag2idx)
dataset_dev = NERDataset_Glove(dev_data['sentences'], dev_data['labels'], word2idx, tag2idx)
train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, collate_fn=lambda batch: collate_fn_glove(batch, word2idx, tag2idx))
dev_loader = DataLoader(dataset_dev, batch_size=batch_size, collate_fn=lambda batch: collate_fn_glove(batch, word2idx, tag2idx))

In [27]:
# Train the Model
best_model = train_model_glove(model, train_loader, dev_loader, dev_data, optimizer, criterion, word2idx, tag2idx, batch_size, output_file="./dev2.out", epochs=epochs)

                                                                            

Epoch 1/80, Avg Loss: 0.1546, LR: 0.25
Precision: 0.7901, Recall: 0.7417, F1 Score: 0.7585
Evaluation Accuracy: 0.9510


                                                                            

Epoch 2/80, Avg Loss: 0.0758, LR: 0.25
Precision: 0.8355, Recall: 0.7737, F1 Score: 0.7977
Evaluation Accuracy: 0.9573


                                                                            

Epoch 3/80, Avg Loss: 0.0556, LR: 0.25
Precision: 0.8654, Recall: 0.7898, F1 Score: 0.8188
Evaluation Accuracy: 0.9600


                                                                             

Epoch 4/80, Avg Loss: 0.0415, LR: 0.25
Precision: 0.8666, Recall: 0.8113, F1 Score: 0.8344
Evaluation Accuracy: 0.9618


                                                                             

Epoch 5/80, Avg Loss: 0.0322, LR: 0.25
Precision: 0.8655, Recall: 0.8121, F1 Score: 0.8331
Evaluation Accuracy: 0.9633


                                                                             

Epoch 6/80, Avg Loss: 0.0259, LR: 0.25
Precision: 0.8646, Recall: 0.8471, F1 Score: 0.8514
Evaluation Accuracy: 0.9654


                                                                             

Epoch 7/80, Avg Loss: 0.0197, LR: 0.25
Precision: 0.8970, Recall: 0.8647, F1 Score: 0.8775
Evaluation Accuracy: 0.9702


                                                                             

Epoch 8/80, Avg Loss: 0.0161, LR: 0.25
Precision: 0.9188, Recall: 0.8240, F1 Score: 0.8633
Evaluation Accuracy: 0.9669


                                                                             

Epoch 9/80, Avg Loss: 0.0120, LR: 0.25
Precision: 0.8649, Recall: 0.8248, F1 Score: 0.8335
Evaluation Accuracy: 0.9636


                                                                              

Epoch 10/80, Avg Loss: 0.0099, LR: 0.25
Precision: 0.8887, Recall: 0.8588, F1 Score: 0.8657
Evaluation Accuracy: 0.9693


                                                                              

Epoch 11/80, Avg Loss: 0.0086, LR: 0.25
Precision: 0.8930, Recall: 0.8666, F1 Score: 0.8742
Evaluation Accuracy: 0.9705


                                                                              

Epoch 12/80, Avg Loss: 0.0061, LR: 0.25
Precision: 0.8832, Recall: 0.8372, F1 Score: 0.8549
Evaluation Accuracy: 0.9672


                                                                              

Epoch 13/80, Avg Loss: 0.0037, LR: 0.125
Precision: 0.9052, Recall: 0.8575, F1 Score: 0.8778
Evaluation Accuracy: 0.9705


                                                                              

Epoch 14/80, Avg Loss: 0.0025, LR: 0.125
Precision: 0.8948, Recall: 0.8626, F1 Score: 0.8745
Evaluation Accuracy: 0.9708


                                                                              

Epoch 15/80, Avg Loss: 0.0021, LR: 0.125
Precision: 0.9043, Recall: 0.8515, F1 Score: 0.8714
Evaluation Accuracy: 0.9699


                                                                              

Epoch 16/80, Avg Loss: 0.0018, LR: 0.125
Precision: 0.8973, Recall: 0.8705, F1 Score: 0.8779
Evaluation Accuracy: 0.9714


                                                                              

Epoch 17/80, Avg Loss: 0.0015, LR: 0.125
Precision: 0.9109, Recall: 0.8509, F1 Score: 0.8746
Evaluation Accuracy: 0.9714


                                                                              

Epoch 18/80, Avg Loss: 0.0016, LR: 0.125
Precision: 0.8898, Recall: 0.8540, F1 Score: 0.8672
Evaluation Accuracy: 0.9696


                                                                              

Epoch 19/80, Avg Loss: 0.0016, LR: 0.125
Precision: 0.8958, Recall: 0.8652, F1 Score: 0.8769
Evaluation Accuracy: 0.9708


                                                                              

Epoch 20/80, Avg Loss: 0.0014, LR: 0.125
Precision: 0.8980, Recall: 0.8661, F1 Score: 0.8785
Evaluation Accuracy: 0.9708


                                                                              

Epoch 21/80, Avg Loss: 0.0013, LR: 0.125
Precision: 0.9002, Recall: 0.8603, F1 Score: 0.8725
Evaluation Accuracy: 0.9690


                                                                              

Epoch 22/80, Avg Loss: 0.0012, LR: 0.125
Precision: 0.9023, Recall: 0.8514, F1 Score: 0.8720
Evaluation Accuracy: 0.9705


                                                                              

Epoch 23/80, Avg Loss: 0.0012, LR: 0.125
Precision: 0.8944, Recall: 0.8530, F1 Score: 0.8706
Evaluation Accuracy: 0.9699


                                                                              

Epoch 24/80, Avg Loss: 0.0012, LR: 0.125
Precision: 0.9018, Recall: 0.8636, F1 Score: 0.8793
Evaluation Accuracy: 0.9714


                                                                              

Epoch 25/80, Avg Loss: 0.0011, LR: 0.125
Precision: 0.9026, Recall: 0.8683, F1 Score: 0.8807
Evaluation Accuracy: 0.9711


                                                                              

Epoch 26/80, Avg Loss: 0.0011, LR: 0.125
Precision: 0.8874, Recall: 0.8646, F1 Score: 0.8715
Evaluation Accuracy: 0.9699


                                                                              

Epoch 27/80, Avg Loss: 0.0010, LR: 0.125
Precision: 0.9065, Recall: 0.8600, F1 Score: 0.8794
Evaluation Accuracy: 0.9717


                                                                              

Epoch 28/80, Avg Loss: 0.0008, LR: 0.125
Precision: 0.9070, Recall: 0.8605, F1 Score: 0.8803
Evaluation Accuracy: 0.9723


                                                                              

Epoch 29/80, Avg Loss: 0.0012, LR: 0.125
Precision: 0.9005, Recall: 0.8617, F1 Score: 0.8779
Evaluation Accuracy: 0.9711


                                                                              

Epoch 30/80, Avg Loss: 0.0009, LR: 0.125
Precision: 0.9004, Recall: 0.8813, F1 Score: 0.8882
Evaluation Accuracy: 0.9732


                                                                              

Epoch 31/80, Avg Loss: 0.0011, LR: 0.125
Precision: 0.9034, Recall: 0.8624, F1 Score: 0.8801
Evaluation Accuracy: 0.9720


                                                                              

Epoch 32/80, Avg Loss: 0.0012, LR: 0.125
Precision: 0.9147, Recall: 0.8634, F1 Score: 0.8823
Evaluation Accuracy: 0.9735


                                                                              

Epoch 33/80, Avg Loss: 0.0010, LR: 0.125
Precision: 0.9000, Recall: 0.8666, F1 Score: 0.8814
Evaluation Accuracy: 0.9720


                                                                              

Epoch 34/80, Avg Loss: 0.0015, LR: 0.125
Precision: 0.8912, Recall: 0.8640, F1 Score: 0.8718
Evaluation Accuracy: 0.9702


                                                                              

Epoch 35/80, Avg Loss: 0.0011, LR: 0.125
Precision: 0.9049, Recall: 0.8762, F1 Score: 0.8867
Evaluation Accuracy: 0.9729


                                                                              

Epoch 36/80, Avg Loss: 0.0007, LR: 0.0625
Precision: 0.9063, Recall: 0.8668, F1 Score: 0.8828
Evaluation Accuracy: 0.9729


                                                                              

Epoch 37/80, Avg Loss: 0.0006, LR: 0.0625
Precision: 0.8992, Recall: 0.8855, F1 Score: 0.8886
Evaluation Accuracy: 0.9747


                                                                              

Epoch 38/80, Avg Loss: 0.0006, LR: 0.0625
Precision: 0.8995, Recall: 0.8741, F1 Score: 0.8829
Evaluation Accuracy: 0.9723


                                                                              

Epoch 39/80, Avg Loss: 0.0005, LR: 0.0625
Precision: 0.9057, Recall: 0.8721, F1 Score: 0.8841
Evaluation Accuracy: 0.9729


                                                                              

Epoch 40/80, Avg Loss: 0.0006, LR: 0.0625
Precision: 0.9025, Recall: 0.8750, F1 Score: 0.8857
Evaluation Accuracy: 0.9729


                                                                              

Epoch 41/80, Avg Loss: 0.0005, LR: 0.0625
Precision: 0.8962, Recall: 0.8731, F1 Score: 0.8810
Evaluation Accuracy: 0.9720


                                                                              

Epoch 42/80, Avg Loss: 0.0005, LR: 0.0625
Precision: 0.8976, Recall: 0.8754, F1 Score: 0.8829
Evaluation Accuracy: 0.9729


                                                                              

Epoch 43/80, Avg Loss: 0.0005, LR: 0.03125
Precision: 0.9020, Recall: 0.8752, F1 Score: 0.8852
Evaluation Accuracy: 0.9726


                                                                              

Epoch 44/80, Avg Loss: 0.0005, LR: 0.03125
Precision: 0.9059, Recall: 0.8742, F1 Score: 0.8871
Evaluation Accuracy: 0.9729


                                                                              

Epoch 45/80, Avg Loss: 0.0004, LR: 0.03125
Precision: 0.9025, Recall: 0.8746, F1 Score: 0.8854
Evaluation Accuracy: 0.9729


                                                                              

Epoch 46/80, Avg Loss: 0.0005, LR: 0.03125
Precision: 0.9093, Recall: 0.8754, F1 Score: 0.8894
Evaluation Accuracy: 0.9732


                                                                              

Epoch 47/80, Avg Loss: 0.0005, LR: 0.03125
Precision: 0.8996, Recall: 0.8716, F1 Score: 0.8821
Evaluation Accuracy: 0.9720


                                                                              

Epoch 48/80, Avg Loss: 0.0004, LR: 0.03125
Precision: 0.9005, Recall: 0.8722, F1 Score: 0.8833
Evaluation Accuracy: 0.9726


                                                                              

Epoch 49/80, Avg Loss: 0.0004, LR: 0.03125
Precision: 0.9046, Recall: 0.8750, F1 Score: 0.8869
Evaluation Accuracy: 0.9729


                                                                              

Epoch 50/80, Avg Loss: 0.0004, LR: 0.03125
Precision: 0.9018, Recall: 0.8713, F1 Score: 0.8833
Evaluation Accuracy: 0.9723


                                                                              

Epoch 51/80, Avg Loss: 0.0004, LR: 0.03125
Precision: 0.8988, Recall: 0.8746, F1 Score: 0.8835
Evaluation Accuracy: 0.9726


                                                                              

Epoch 52/80, Avg Loss: 0.0004, LR: 0.015625
Precision: 0.8981, Recall: 0.8741, F1 Score: 0.8830
Evaluation Accuracy: 0.9723


                                                                              

Epoch 53/80, Avg Loss: 0.0004, LR: 0.015625
Precision: 0.8986, Recall: 0.8728, F1 Score: 0.8825
Evaluation Accuracy: 0.9720


                                                                              

Epoch 54/80, Avg Loss: 0.0004, LR: 0.015625
Precision: 0.9006, Recall: 0.8737, F1 Score: 0.8837
Evaluation Accuracy: 0.9726


                                                                              

Epoch 55/80, Avg Loss: 0.0004, LR: 0.015625
Precision: 0.9024, Recall: 0.8750, F1 Score: 0.8854
Evaluation Accuracy: 0.9729


                                                                              

Epoch 56/80, Avg Loss: 0.0004, LR: 0.015625
Precision: 0.9027, Recall: 0.8729, F1 Score: 0.8844
Evaluation Accuracy: 0.9723
Early stopping triggered.


In [28]:
# Load the state dictionary
best_model = torch.load('./blstm2.pt')
best_model.to(device)
evaluate_glove(best_model, dev_loader, dev_data, word2idx, tag2idx, batch_size, "./dev2.out")

Precision: 0.9093, Recall: 0.8754, F1 Score: 0.8894
Evaluation Accuracy: 0.9732


0.8893633722790155

In [29]:
# Evaluating METRICS with eval.py
!python eval.py -p dev2.out -g data/dev

processed 51578 tokens with 5942 phrases; found: 5867 phrases; correct: 5151.
accuracy:  97.76%; precision:  87.80%; recall:  86.69%; FB1:  87.24
              LOC: precision:  95.40%; recall:  90.31%; FB1:  92.79  1739
             MISC: precision:  81.38%; recall:  82.97%; FB1:  82.17  940
              ORG: precision:  76.04%; recall:  86.88%; FB1:  81.10  1532
              PER: precision:  94.32%; recall:  84.80%; FB1:  89.31  1656


In [30]:

# Load the state dictionary
best_model = torch.load('./blstm2.pt')
best_model.to(device)

dataset_test = NERDataset_Glove_test(test_data['sentences'], word2idx)
test_loader = DataLoader(dataset_test, batch_size=batch_size, collate_fn=lambda batch: collate_fn_glove_test(batch, word2idx))
evaluate_glove_test(best_model, test_loader, test_data, word2idx, tag2idx, batch_size, "./test2.out")

## Bonus

In [31]:
class BiLSTM_CNN(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_matrix=None, embedding_dim=100, hidden_dim=256, linear_dim=128, lstm_layers=1, dropout=0.33, capitalization_dim=6, char_embedding_dim=25, char_cnn_kernel_size=3, char_cnn_output_dim=53):
        super(BiLSTM_CNN, self).__init__()
        
        if embedding_matrix is not None:
            self.embedding = nn.Embedding.from_pretrained(
                torch.from_numpy(embedding_matrix).float(),
                padding_idx=0,
                freeze=False
            )
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # Character-level CNN
        self.char_embedding = nn.Embedding(num_embeddings=100, embedding_dim=char_embedding_dim, padding_idx=0)
        self.char_cnn = nn.Conv1d(in_channels=char_embedding_dim, out_channels=char_cnn_output_dim, kernel_size=char_cnn_kernel_size, padding=1)
        self.char_pool = nn.AdaptiveMaxPool1d(1)

        # Capitalization embedding
        self.capitalization_embedding = nn.Embedding(capitalization_dim, embedding_dim, padding_idx=0)  

        # BiLSTM
        self.lstm = nn.LSTM(
            embedding_dim * 2 + char_cnn_output_dim,  # word + capitalization + character-level CNN output
            hidden_dim,
            num_layers=lstm_layers,
            bidirectional=True,
            batch_first=True,
            dropout=dropout  
        ).to(device)

        # Fully connected layer
        self.fc = nn.Linear(hidden_dim * 2, linear_dim).to(device)
        self.elu = nn.ELU(alpha=0.01)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(linear_dim, tagset_size).to(device)
    
    def forward(self, x, capitalization_features, char_sequences):
        # Word embeddings
        embedded = self.embedding(x)
        
        # Character-level embeddings
        char_embeds = self.char_embedding(char_sequences)  # (batch_size, seq_len, word_len, char_embedding_dim)
        batch_size, seq_len, word_len, char_embed_dim = char_embeds.size()
        char_embeds = char_embeds.view(batch_size * seq_len, word_len, char_embed_dim)  # (batch_size * seq_len, word_len, char_embedding_dim)
        char_embeds = char_embeds.permute(0, 2, 1)  # (batch_size * seq_len, char_embedding_dim, word_len)
        char_cnn_out = self.char_cnn(char_embeds)  # (batch_size * seq_len, char_cnn_output_dim, word_len)
        char_pool_out = self.char_pool(char_cnn_out).squeeze(-1)  # (batch_size * seq_len, char_cnn_output_dim)
        char_pool_out = char_pool_out.view(batch_size, seq_len, -1)  # (batch_size, seq_len, char_cnn_output_dim)
        
        # Capitalization embeddings
        capitalization_embeds = self.capitalization_embedding(capitalization_features)  
        
        # Concatenate word, capitalization, and character-level embeddings
        combined_embeds = torch.cat((embedded, capitalization_embeds, char_pool_out), dim=-1)  
        
        # BiLSTM
        lstm_out, _ = self.lstm(combined_embeds)
        
        # Fully connected layer
        fc_out = self.fc(lstm_out)
        elu_out = self.elu(fc_out)
        drop_out = self.dropout(elu_out)
        scores = self.classifier(drop_out)
        return scores

In [32]:
class NERDataset_CNN(Dataset):
    def __init__(self, sentences, labels, word2idx, tag2idx, char2idx, max_word_len=20):
        self.sentences = [[word2idx.get(word, 1) for word in sent] for sent in sentences]
        self.capitalization_features = [[capitalization_to_index(get_capitalization_feature(word)) for word in sent] for sent in sentences]
        self.labels = [[tag2idx.get(tag, -1) for tag in label] for label in labels]
        
        # Pad character sequences to a fixed length (max_word_len)
        self.char_sequences = []
        for sent in sentences:
            char_seq_sent = []
            for word in sent:
                char_seq_word = [char2idx.get(char, 1) for char in word][:max_word_len]  
                char_seq_word += [char2idx["<PAD>"]] * (max_word_len - len(char_seq_word))
                char_seq_sent.append(char_seq_word)
            self.char_sequences.append(char_seq_sent)
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        return (
            torch.tensor(self.sentences[idx], dtype=torch.long),
            torch.tensor(self.capitalization_features[idx], dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.long),
            torch.tensor(self.char_sequences[idx], dtype=torch.long) 
        )

class NERDataset_CNN_test(Dataset):
    def __init__(self, sentences, word2idx, char2idx, max_word_len=20):
        self.sentences = [[word2idx.get(word, 1) for word in sent] for sent in sentences]
        self.capitalization_features = [[capitalization_to_index(get_capitalization_feature(word)) for word in sent] for sent in sentences]
        
        # Pad character sequences to a fixed length (max_word_len)
        self.char_sequences = []
        for sent in sentences:
            char_seq_sent = []
            for word in sent:
                char_seq_word = [char2idx.get(char, 1) for char in word][:max_word_len]  
                char_seq_word += [char2idx["<PAD>"]] * (max_word_len - len(char_seq_word))  
                char_seq_sent.append(char_seq_word)
            self.char_sequences.append(char_seq_sent)
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        return (
            torch.tensor(self.sentences[idx], dtype=torch.long),
            torch.tensor(self.capitalization_features[idx], dtype=torch.long),
            torch.tensor(self.char_sequences[idx], dtype=torch.long) 
        )

In [33]:
def collate_fn_cnn(batch, word_to_index, tag_to_index, char_to_index, pad_token='<PAD>', init_token='<S>', eos_token='</S>'):
    sentences, cps, labels, char_sequences = zip(*batch)
    
    # Add <s> and </s> tokens for both sentences and labels
    sentences_padded = [([word_to_index[init_token]] + list(sentence) + [word_to_index[eos_token]]) for sentence in sentences]
    labels_padded = [([tag_to_index[pad_token]] + list(label) + [tag_to_index[pad_token]]) for label in labels]
    cp_padded = [([capitalization_to_index(pad_token)] + list(cp) + [capitalization_to_index(pad_token)]) for cp in cps]
    
    # Pad the sentences to the length of the longest sentence in the batch
    max_len = max(len(sentence) for sentence in sentences_padded)
    sentences_padded = [sentence + [word_to_index[pad_token]] * (max_len - len(sentence)) for sentence in sentences_padded]
    labels_padded = [label + [tag_to_index[pad_token]] * (max_len - len(label)) for label in labels_padded]
    cp_padded = [cp + [capitalization_to_index(pad_token)] * (max_len - len(cp)) for cp in cp_padded]
    
    # Convert to tensors
    sentences_padded = torch.tensor(sentences_padded, dtype=torch.long).to(device)
    labels_padded = torch.tensor(labels_padded, dtype=torch.long).to(device)
    cp_padded = torch.tensor(cp_padded, dtype=torch.long).to(device)
    
    # Pad character sequences to match the padded sentences
    max_word_len = char_sequences[0].size(1)  # Length of the first word's character sequence
    char_sequences_padded = []
    for char_seq_sent in char_sequences:
        # Move char_seq_sent to the same device as padding
        char_seq_sent = char_seq_sent.to(device)
        # Calculate padding needed
        padding_length = max_len - char_seq_sent.size(0)
        if padding_length > 0:
            # Create padding tensor
            padding = torch.full((padding_length, max_word_len), char_to_index[pad_token], dtype=torch.long).to(device)
            # Concatenate padding to the character sequences
            char_seq_sent_padded = torch.cat([char_seq_sent, padding], dim=0)
        else:
            char_seq_sent_padded = char_seq_sent
        char_sequences_padded.append(char_seq_sent_padded)
    
    # Stack all character sequences into a single tensor
    char_sequences_padded = torch.stack(char_sequences_padded).to(device)

    return sentences_padded, labels_padded, cp_padded, char_sequences_padded

def collate_fn_cnn_test(batch, word_to_index, char_to_index, pad_token='<PAD>', init_token='<S>', eos_token='</S>'):
    sentences, cps, char_sequences = zip(*batch)
    
    # Add <s> and </s> tokens for both sentences and labels
    sentences_padded = [([word_to_index[init_token]] + list(sentence) + [word_to_index[eos_token]]) for sentence in sentences]
    cp_padded = [([capitalization_to_index(pad_token)] + list(cp) + [capitalization_to_index(pad_token)]) for cp in cps]
    
    # Pad the sentences to the length of the longest sentence in the batch
    max_len = max(len(sentence) for sentence in sentences_padded)
    sentences_padded = [sentence + [word_to_index[pad_token]] * (max_len - len(sentence)) for sentence in sentences_padded]
    cp_padded = [cp + [capitalization_to_index(pad_token)] * (max_len - len(cp)) for cp in cp_padded]
    
    # Convert to tensors
    sentences_padded = torch.tensor(sentences_padded, dtype=torch.long).to(device)
    cp_padded = torch.tensor(cp_padded, dtype=torch.long).to(device)
    
    # Pad character sequences to match the padded sentences
    max_word_len = char_sequences[0].size(1)  # Length of the first word's character sequence
    char_sequences_padded = []
    for char_seq_sent in char_sequences:
        # Move char_seq_sent to the same device as padding
        char_seq_sent = char_seq_sent.to(device)
        # Calculate padding needed
        padding_length = max_len - char_seq_sent.size(0)
        if padding_length > 0:
            # Create padding tensor
            padding = torch.full((padding_length, max_word_len), char_to_index[pad_token], dtype=torch.long).to(device)
            # Concatenate padding to the character sequences
            char_seq_sent_padded = torch.cat([char_seq_sent, padding], dim=0)
        else:
            char_seq_sent_padded = char_seq_sent
        char_sequences_padded.append(char_seq_sent_padded)
    
    # Stack all character sequences into a single tensor
    char_sequences_padded = torch.stack(char_sequences_padded).to(device)

    return sentences_padded, cp_padded, char_sequences_padded

In [34]:
def train_model_cnn(model, train_loader, dev_loader, data, optimizer, criterion, word2idx, tag2idx, char2idx, batch_size, output_file="./dev2.out", epochs=10, accum_steps=1):
    model.train()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=4)
    model.to(device)
    best_f1 = float("-inf")
    best_model = None
    counter = 0
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        optimizer.zero_grad()

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)

        for step, (sentences, labels, cps, char_sequences) in enumerate(progress_bar):
            sentences, cps, labels, char_sequences = sentences.to(device), cps.to(device), labels.to(device), char_sequences.to(device)
            output = model(sentences, cps, char_sequences)

            loss = criterion(output.view(-1, output.shape[-1]), labels.view(-1)) / accum_steps
            loss.backward()

            if (step + 1) % accum_steps == 0 or (step + 1) == len(train_loader):
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                optimizer.zero_grad()

            total_loss += loss.item() * accum_steps
            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs}, Avg Loss: {avg_loss:.4f}, LR: {scheduler.get_last_lr()[0]}")
        f1 = evaluate_cnn(model, dev_loader, data, word2idx, tag2idx, batch_size, output_file)  
        scheduler.step(f1)
        if f1 > best_f1:
            best_f1 = f1
            torch.save(model, "blstm_bonus.pt")
            best_model = model
            counter = 0
        else:
            counter += 1
        if counter >= 10:
            print("Early stopping triggered.")
            break
    return best_model

In [35]:
def evaluate_cnn(model, dataloader, data, word2idx, tag2idx, batch_size, output_file):
    model.to(device)
    model.eval()
    idx2tag = {idx: tag for tag, idx in tag2idx.items()}
    idx2word = {idx: word for word, idx in word2idx.items()}
    true_lbls = []
    preds_lst = []
    
    with open(output_file, "w", encoding="utf-8") as f:
        with torch.no_grad():
            for batch_idx, (sents, lbls, cps, char_sequences) in enumerate(dataloader):
                sents = sents.to(device)
                cps = cps.to(device)
                char_sequences = char_sequences.to(device)
                output = model(sents, cps, char_sequences)
                preds = torch.argmax(output, dim=-1)
                
                for sent_idx, (sent, lbl, pred) in enumerate(zip(sents, lbls, preds)):
                    for w_idx, (w_id, pred_id, _) in enumerate(zip(sent, pred, lbl)):
                        w = idx2word.get(w_id.item(), "<UNK>")
                        pred_tag = idx2tag[pred_id.item()]
                        if w != "<PAD>" and w != "<S>" and w != "</S>":
                            og_w = data["sentences"][batch_idx * batch_size + sent_idx][w_idx-1]
                            f.write(f"{w_idx} {og_w} {pred_tag}\n")
                    f.write("\n")
                
                for true, pred in zip(lbl, pred):
                    if true.item() != -1:
                        true_lbls.append(true.item())
                        preds_lst.append(pred.item())
    
    precision, recall, f1, _ = precision_recall_fscore_support(true_lbls, preds_lst, average='macro', zero_division=0)
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
    
    correct = (torch.tensor(preds_lst) == torch.tensor(true_lbls)).sum().item()
    total = len(true_lbls)
    print(total)
    print(f"Evaluation Accuracy: {correct / total:.4f}")

    return f1

In [36]:
def evaluate_cnn_test(model, dataloader, data, word2idx, tag2idx, batch_size, output_file):
    model.to(device)
    model.eval()
    idx2tag = {idx: tag for tag, idx in tag2idx.items()}
    idx2word = {idx: word for word, idx in word2idx.items()}
    
    with open(output_file, "w", encoding="utf-8") as f:
        with torch.no_grad():
            for batch_idx, (sents, cps, char_sequences) in enumerate(dataloader):
                sents = sents.to(device)
                cps = cps.to(device)
                char_sequences = char_sequences.to(device)
                output = model(sents, cps, char_sequences)
                preds = torch.argmax(output, dim=-1)
                
                for sent_idx, (sent, pred) in enumerate(zip(sents, preds)):
                    for w_idx, (w_id, pred_id) in enumerate(zip(sent, pred)):
                        w = idx2word.get(w_id.item(), "<UNK>")
                        pred_tag = idx2tag[pred_id.item()]
                        if w != "<PAD>" and w != "<S>" and w != "</S>":
                            og_w = data["sentences"][batch_idx * batch_size + sent_idx][w_idx-1]
                            f.write(f"{w_idx} {og_w} {pred_tag}\n")
                    f.write("\n")

In [37]:
# This Function is used to create the embedding matrix from the glove embedding.
def create_embedding_matrix(vocab, embeddings, embedding_dim=100):
    # Here we are intializing a zero matrix of the correct dimensions.
    embedding_matrix = np.zeros((len(vocab), embedding_dim))
    # Here we are updating the initalized matrix with embedding vectors.
    for word, idx in vocab.items():
        if word in embeddings:
            embedding_matrix[idx] = embeddings[word]
        elif word.lower() in embeddings:
            embedding_matrix[idx] = embeddings[word.lower()]
        elif word.upper() in embeddings:
            embedding_matrix[idx] = embeddings[word.upper()] 
        else:
            # Initialize randomly with small values for OOV words
            embedding_matrix[idx] = np.random.uniform(-0.25, 0.25, embedding_dim)
    
    return embedding_matrix

# This Function is to to load GloVe embeddings
def load_glove_embeddings(glove_file_path):
    embeddings = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            word = parts[0]
            vector = np.array(parts[1:], dtype=np.float32)
            embeddings[word] = vector
    return embeddings




In [38]:
# In this function we are determining the capitalization feature label for each word.
def get_capitalization_feature(word):
    # Check if the word is fully uppercase
    if word.isupper():
        return 'allCaps'
    # Check if the word starts with an uppercase letter and the rest are lowercase
    elif word[0].isupper() and word[1:].islower():
        return 'upperInitial'
    # Check if the word is entirely lowercase
    elif word.islower():
        return 'lowercase'
    # Check if the word has mixed capitalization
    elif any(c.isupper() for c in word[1:]) and any(c.islower() for c in word[1:]):
        return 'mixedCaps'
    else:
        return 'noinfo'

# Here we are mapping each of the capitalization feature label to numeric label.
def capitalization_to_index(capitalization):
    lookup = {
        '<PAD>' : 0,
        'allCaps': 1,
        'upperInitial': 2,
        'lowercase': 3,
        'mixedCaps': 4,
        'noinfo': 5
    }
    return lookup.get(capitalization, 4)  # default to 'noinfo' if invalid

# In this function we are checking for capitalization
def adjust_case(word):
    if word.islower():
        return word.lower()
    elif word.isupper():
        return word.upper()
    elif word.istitle():
        return word.title()
    else:
        return word.lower()

In [39]:
# Hyperparameters
embedding_dim = 100
hidden_dim = 256
linear_dim = 128
dropout = 0.33
batch_size = 16
learning_rate = 0.25
epochs = 80

# Build Vocabulary from Training Data
word2idx = {word: idx for idx, word in enumerate(set(word for sent in train_data['sentences'] for word in sent), start=4)}
word2idx["<PAD>"] = 0
word2idx["<UNK>"] = 1
word2idx["<S>"] = 2
word2idx["</S>"] = 3

# Build Character Vocabulary
char2idx = {char: idx for idx, char in enumerate(set(char for sent in train_data['sentences'] for word in sent for char in word), start=2)}
char2idx["<PAD>"] = 0
char2idx["<UNK>"] = 1

tag2idx = {tag: idx for idx, tag in enumerate(set(tag for label in train_data['labels'] for tag in label))}
tag2idx["<PAD>"] = -1
vocab_size = len(word2idx)
tagset_size = len(tag2idx)-1

glove_embeddings = load_glove_embeddings("./glove.6B.100d/glove.6B.100d.txt")

# Create the embedding matrix from the trained Word2Vec model
embedding_matrix = create_embedding_matrix(word2idx, glove_embeddings, embedding_dim)

# Model, Optimizer, and Loss Function
model = BiLSTM_CNN(vocab_size, tagset_size, embedding_matrix, embedding_dim, hidden_dim, linear_dim, dropout=dropout).to(device)

optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=1e-5, nesterov=True)

criterion = nn.CrossEntropyLoss(ignore_index=-1).to(device)

# Load Data
dataset_train = NERDataset_CNN(train_data['sentences'], train_data['labels'], word2idx, tag2idx, char2idx)
dataset_dev = NERDataset_CNN(dev_data['sentences'], dev_data['labels'], word2idx, tag2idx, char2idx)
train_loader = DataLoader(dataset_train, batch_size=batch_size, collate_fn=lambda batch: collate_fn_cnn(batch, word2idx, tag2idx, char2idx))
dev_loader = DataLoader(dataset_dev, batch_size=batch_size, collate_fn=lambda batch: collate_fn_cnn(batch, word2idx, tag2idx, char2idx))

In [40]:
# Train the Model
best_model = train_model_cnn(model, train_loader, dev_loader, dev_data, optimizer, criterion, word2idx, tag2idx, char2idx, batch_size, output_file="./dev_bonus.out", epochs=epochs)

                                                                            

Epoch 1/80, Avg Loss: 0.2547, LR: 0.25
Precision: 0.7582, Recall: 0.6976, F1 Score: 0.7091
3324
Evaluation Accuracy: 0.9443


                                                                             

Epoch 2/80, Avg Loss: 0.1185, LR: 0.25
Precision: 0.8100, Recall: 0.7597, F1 Score: 0.7736
3324
Evaluation Accuracy: 0.9537


                                                                             

Epoch 3/80, Avg Loss: 0.0775, LR: 0.25
Precision: 0.8386, Recall: 0.8054, F1 Score: 0.8187
3324
Evaluation Accuracy: 0.9612


                                                                             

Epoch 4/80, Avg Loss: 0.0548, LR: 0.25
Precision: 0.8540, Recall: 0.8171, F1 Score: 0.8314
3324
Evaluation Accuracy: 0.9633


                                                                             

Epoch 5/80, Avg Loss: 0.0394, LR: 0.25
Precision: 0.8621, Recall: 0.8240, F1 Score: 0.8399
3324
Evaluation Accuracy: 0.9645


                                                                             

Epoch 6/80, Avg Loss: 0.0274, LR: 0.25
Precision: 0.8529, Recall: 0.8347, F1 Score: 0.8404
3324
Evaluation Accuracy: 0.9657


                                                                             

Epoch 7/80, Avg Loss: 0.0214, LR: 0.25
Precision: 0.8581, Recall: 0.8618, F1 Score: 0.8564
3324
Evaluation Accuracy: 0.9669


                                                                             

Epoch 8/80, Avg Loss: 0.0162, LR: 0.25
Precision: 0.8759, Recall: 0.8675, F1 Score: 0.8684
3324
Evaluation Accuracy: 0.9693


                                                                             

Epoch 9/80, Avg Loss: 0.0126, LR: 0.25
Precision: 0.8744, Recall: 0.8709, F1 Score: 0.8683
3324
Evaluation Accuracy: 0.9708


                                                                              

Epoch 10/80, Avg Loss: 0.0098, LR: 0.25
Precision: 0.8702, Recall: 0.8545, F1 Score: 0.8586
3324
Evaluation Accuracy: 0.9675


                                                                              

Epoch 11/80, Avg Loss: 0.0088, LR: 0.25
Precision: 0.8775, Recall: 0.8529, F1 Score: 0.8625
3324
Evaluation Accuracy: 0.9684


                                                                              

Epoch 12/80, Avg Loss: 0.0071, LR: 0.25
Precision: 0.8873, Recall: 0.8669, F1 Score: 0.8741
3324
Evaluation Accuracy: 0.9714


                                                                              

Epoch 13/80, Avg Loss: 0.0058, LR: 0.25
Precision: 0.8880, Recall: 0.8483, F1 Score: 0.8660
3324
Evaluation Accuracy: 0.9690


                                                                              

Epoch 14/80, Avg Loss: 0.0061, LR: 0.25
Precision: 0.8845, Recall: 0.8655, F1 Score: 0.8709
3324
Evaluation Accuracy: 0.9714


                                                                              

Epoch 15/80, Avg Loss: 0.0050, LR: 0.25
Precision: 0.8755, Recall: 0.8737, F1 Score: 0.8715
3324
Evaluation Accuracy: 0.9708


                                                                              

Epoch 16/80, Avg Loss: 0.0043, LR: 0.25
Precision: 0.8885, Recall: 0.8816, F1 Score: 0.8828
3324
Evaluation Accuracy: 0.9723


                                                                              

Epoch 17/80, Avg Loss: 0.0041, LR: 0.25
Precision: 0.8884, Recall: 0.8759, F1 Score: 0.8797
3324
Evaluation Accuracy: 0.9720


                                                                              

Epoch 18/80, Avg Loss: 0.0039, LR: 0.25
Precision: 0.8903, Recall: 0.8728, F1 Score: 0.8789
3324
Evaluation Accuracy: 0.9723


                                                                              

Epoch 19/80, Avg Loss: 0.0028, LR: 0.25
Precision: 0.9036, Recall: 0.8935, F1 Score: 0.8965
3324
Evaluation Accuracy: 0.9765


                                                                              

Epoch 20/80, Avg Loss: 0.0019, LR: 0.25
Precision: 0.9027, Recall: 0.8848, F1 Score: 0.8929
3324
Evaluation Accuracy: 0.9756


                                                                              

Epoch 21/80, Avg Loss: 0.0018, LR: 0.25
Precision: 0.8823, Recall: 0.8786, F1 Score: 0.8789
3324
Evaluation Accuracy: 0.9741


                                                                              

Epoch 22/80, Avg Loss: 0.0018, LR: 0.25
Precision: 0.8915, Recall: 0.8806, F1 Score: 0.8843
3324
Evaluation Accuracy: 0.9741


                                                                              

Epoch 23/80, Avg Loss: 0.0024, LR: 0.25
Precision: 0.8658, Recall: 0.8744, F1 Score: 0.8668
3324
Evaluation Accuracy: 0.9702


                                                                              

Epoch 24/80, Avg Loss: 0.0028, LR: 0.25
Precision: 0.8958, Recall: 0.8729, F1 Score: 0.8827
3324
Evaluation Accuracy: 0.9738


                                                                              

Epoch 25/80, Avg Loss: 0.0017, LR: 0.125
Precision: 0.8900, Recall: 0.8747, F1 Score: 0.8807
3324
Evaluation Accuracy: 0.9732


                                                                              

Epoch 26/80, Avg Loss: 0.0009, LR: 0.125
Precision: 0.8929, Recall: 0.8724, F1 Score: 0.8811
3324
Evaluation Accuracy: 0.9741


                                                                              

Epoch 27/80, Avg Loss: 0.0008, LR: 0.125
Precision: 0.8924, Recall: 0.8799, F1 Score: 0.8847
3324
Evaluation Accuracy: 0.9750


                                                                              

Epoch 28/80, Avg Loss: 0.0008, LR: 0.125
Precision: 0.8885, Recall: 0.8727, F1 Score: 0.8784
3324
Evaluation Accuracy: 0.9735


                                                                              

Epoch 29/80, Avg Loss: 0.0007, LR: 0.125
Precision: 0.8891, Recall: 0.8804, F1 Score: 0.8831
3324
Evaluation Accuracy: 0.9741
Early stopping triggered.


In [41]:
# Load the state dictionary
best_model = torch.load('./blstm_bonus.pt')
best_model.to(device)

# Test the Model
evaluate_cnn(best_model, dev_loader, dev_data, word2idx, tag2idx, batch_size, "./dev_bonus.out")

Precision: 0.9036, Recall: 0.8935, F1 Score: 0.8965
3324
Evaluation Accuracy: 0.9765


0.8965035703194965

In [42]:
# Evaluating METRICS with eval.py
!python eval.py -p dev_bonus.out -g data/dev

processed 51578 tokens with 5942 phrases; found: 6084 phrases; correct: 5259.
accuracy:  97.92%; precision:  86.44%; recall:  88.51%; FB1:  87.46
              LOC: precision:  94.00%; recall:  92.92%; FB1:  93.46  1816
             MISC: precision:  77.57%; recall:  83.62%; FB1:  80.48  994
              ORG: precision:  75.65%; recall:  86.65%; FB1:  80.78  1536
              PER: precision:  93.15%; recall:  87.89%; FB1:  90.45  1738


In [43]:
# Load the state dictionary
best_model = torch.load('./blstm_bonus.pt')
best_model.to(device)

# Test the Model
dataset_test = NERDataset_CNN_test(test_data['sentences'], word2idx, char2idx)
test_loader = DataLoader(dataset_test, batch_size=batch_size, collate_fn=lambda batch: collate_fn_cnn_test(batch, word2idx, char2idx))
evaluate_cnn_test(best_model, test_loader, test_data, word2idx, tag2idx, batch_size, "./test_bonus.out")