Just using this notebook to write and easily test the code for the baseline model. The final implementation will be in a .py script, so that it can be run from the command line using a GPU.


# To do!
- create function to extract data to train model Done
- create function to output tags into appropriate format
- make model - Sort of done
- train model
- define hyperparameter space and run a random search to optimize on the dev dataset
- submit results

In [2]:
# Extracts the data into two lists of lists (one with the tokens, another with the tags) plus a list of per-sentence metadata


def extract_data(file_path):
    """
    Read an IOB2-style annotated file and group its tokens and tags by sentence.

    Lines starting with "# newdoc id", "# sent_id" or "# text" are metadata comments; the
    value is everything after the first "= " on the line, so values that themselves contain
    "= " are kept whole. Any other line starting with "#" that contains no tab is ignored.
    Every remaining non-empty line is a tab-separated token line whose second column is the
    word and whose third column is the tag.

    A new sentence starts after a blank line or after a "# sent_id" comment, so two
    consecutive sentences that happen to share the same sent_id are not merged.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        tuple: A tuple containing input data (list of lists of words), tags (list of lists
        of tags), and metadata (list of tuples containing newdoc_id, sent_id, and text).
        A metadata field is None if no matching comment was seen before the sentence;
        otherwise it holds the value of the most recent matching comment.

    Raises:
        IndexError: If a token line has fewer than three tab-separated columns.
    """
    words_data = []
    tags_data = []
    metadata = []
    # Initialised so that files without metadata comments do not hit unbound names.
    newdoc_id = None
    sent_id = None
    text = None
    at_sentence_start = True
    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # Blank lines separate sentences.
                at_sentence_start = True
            elif line.startswith("# newdoc id"):
                newdoc_id = line.partition("= ")[2]
            elif line.startswith("# sent_id"):
                sent_id = line.partition("= ")[2]
                at_sentence_start = True
            elif line.startswith("# text"):
                # partition keeps any later "= " inside the sentence text.
                text = line.partition("= ")[2]
            elif line.startswith("#") and '\t' not in line:
                # Unrecognised comment line: nothing to extract.
                continue
            else:
                parts = line.split('\t')
                word = parts[1]
                tag = parts[2]
                if at_sentence_start:
                    words_data.append([word])
                    tags_data.append([tag])
                    metadata.append((newdoc_id, sent_id, text))
                    at_sentence_start = False
                else:
                    words_data[-1].append(word)
                    tags_data[-1].append(tag)
    return words_data, tags_data, metadata

# Example usage: parse the en_ewt training file of the UniversalNER data.
# The path is relative, so it depends on the directory the notebook is run from.
file_path = "../Data/UniversalNER/train/en_ewt-ud-train.iob2"
# Three parallel per-sentence lists: words, tags and (newdoc_id, sent_id, text) metadata.
words_data, tags_data, metadata = extract_data(file_path)
# for words, tags, meta in zip(words_data, tags_data, metadata):
#     print("Words:", words)
#     print("Tags:", tags)
#     print("Metadata:", meta)
#     print()


In [8]:
#Converts the Data into a tensor for use by the model

def convertDataShape(data, vocabulary=None, labels=None, training=True, paddingToken='<PAD>'):
    """
    Convert sentences of words and labels into padded index tensors.

    If training is enabled, a vocabulary and a label dictionary are built from the data, with
    paddingToken always at index 0 and the remaining entries in sorted order. Otherwise the
    vocabulary and labels passed in are used as they are.
    Two matrices of shape (number of sentences, longest sentence) are created: one holding the
    vocabulary index of each word, the other the label index of each label. Positions past the
    end of a sentence hold the index of paddingToken. paddingToken also acts as the unknown
    token: words or labels that are not in the dictionaries are mapped to its index.

    Input:
    data         - (string list * string list) iterable - Sentences. Each sentence is a pair of two lists: the words and their labels.
    vocabulary   - string : int dictionary              - Word -> index. Must be provided (and contain paddingToken) if not training. Defaults to None.
    labels       - string : int dictionary, or list     - Label -> index. Must be provided (and contain paddingToken) if not training. A list is
                                                          converted to a dictionary using list positions as indices. Defaults to None.
    training     - boolean                              - Whether to build a new vocabulary and label dictionary from data. Defaults to True.
    paddingToken - string                               - Token used as padding and as unknown. Default is provided.

    Output (in this order):
    Xmatrix      - 2D torch.tensor (long)               - Index in the vocabulary of every word of every sentence.
    Ymatrix      - 2D torch.tensor (long)               - Index in the labels of every label of every sentence.
    vocabulary   - string : int dictionary              - The vocabulary that was built or passed in.
    labels       - string : int dictionary              - The label dictionary that was built or passed in.

    Raises:
    KeyError - if paddingToken is missing from the vocabulary or the labels when not training.
    """
    # Materialise once: data is traversed several times and may be a one-shot iterator (e.g. zip).
    data = list(data)

    if training:
        words = {word for sentence, _ in data for word in sentence}
        seenLabels = {label for _, sentence_labels in data for label in sentence_labels}
        # Remove paddingToken before sorting so it cannot get a second, non-zero index.
        vocabList = [paddingToken] + sorted(words - {paddingToken})
        labelList = [paddingToken] + sorted(seenLabels - {paddingToken})
        vocabulary = {word: i for i, word in enumerate(vocabList)}
        labels = {label: i for i, label in enumerate(labelList)}
    else:
        if vocabulary is None:
            vocabulary = {}
        if labels is None:
            labels = {}
        elif not isinstance(labels, dict):
            labels = {label: i for i, label in enumerate(labels)}

    if paddingToken not in vocabulary or paddingToken not in labels:
        raise KeyError(f"padding token {paddingToken!r} must be present in both vocabulary and labels")
    padWordIndex = vocabulary[paddingToken]
    padLabelIndex = labels[paddingToken]

    max_len = max((len(sentence) for sentence, _ in data), default=0)
    # Fill with the padding indices rather than a hard-coded 0, so that a provided
    # vocabulary whose padding token is not at index 0 is still padded correctly.
    Xmatrix = np.full((len(data), max_len), padWordIndex, dtype=int)
    Ymatrix = np.full((len(data), max_len), padLabelIndex, dtype=int)

    for i, (sentence, sentence_labels) in enumerate(data):
        for j, word in enumerate(sentence):
            Xmatrix[i, j] = vocabulary.get(word, padWordIndex)
        for j, label in enumerate(sentence_labels):
            Ymatrix[i, j] = labels.get(label, padLabelIndex)

    return torch.tensor(Xmatrix, dtype=torch.long), torch.tensor(Ymatrix, dtype=torch.long), vocabulary, labels

In [4]:
import numpy as np
import torch
from torch import nn
from allennlp.modules.conditional_random_field import ConditionalRandomField as CRF

class baselineModel(torch.nn.Module):
    """
    Baseline sequence tagger: embedding -> bidirectional LSTM -> linear layer -> CRF.

    All inputs are batch-first, i.e. index tensors of shape (sentences, positions) such as
    the matrices returned by convertDataShape. The LSTM is therefore created with
    batch_first=True, so that it runs over the positions of each sentence; the CRF layer
    also treats its first dimension as the batch.
    """

    def __init__(self, nWords, tags, dimEmbed, dimHidden):
        """
        nWords    - int        - Size of the vocabulary (number of rows in the embedding table).
        tags      - sized      - Collection of tags; only its length is used, as the number of output classes.
        dimEmbed  - int        - Dimension of the word embeddings.
        dimHidden - int        - Hidden size of the LSTM, per direction.
        """
        super().__init__()
        self.dimEmbed = dimEmbed
        self.dimHidden = dimHidden
        self.vocabSize = nWords
        self.tagSetSize = len(tags)

        self.embed = nn.Embedding(nWords, dimEmbed)
        # batch_first=True: without it the LSTM would treat the sentence axis as the time axis.
        self.LSTM = nn.LSTM(dimEmbed, dimHidden, bidirectional=True, batch_first=True)
        # The bidirectional LSTM concatenates both directions, hence dimHidden * 2.
        self.linear = nn.Linear(dimHidden * 2, self.tagSetSize)

        # Initialize the CRF layer
        self.CRF = CRF(self.tagSetSize, constraints=None, include_start_end_transitions=True)

    def _emissions(self, inputData):
        """Return per-token tag scores of shape (sentences, positions, tagSetSize)."""
        wordVectors = self.embed(inputData)
        lstmOut, _ = self.LSTM(wordVectors)
        return self.linear(lstmOut)

    def forwardTrain(self, inputData, labels, mask=None):
        """
        Compute the training loss for a batch.

        inputData - 2D long tensor          - Word indices, (sentences, positions).
        labels    - 2D long tensor          - Gold tag indices, same shape as inputData.
        mask      - 2D bool tensor or None  - Optional; True for positions that should count towards the
                                              loss. With None (the default) every position, padding
                                              included, is scored.

        Returns the negative of the log-likelihood returned by the CRF layer.
        """
        emissions = self._emissions(inputData)
        log_likelihood = self.CRF(emissions, labels, mask=mask)
        # The loss is the negative log-likelihood
        return -log_likelihood

    def forwardPred(self, inputData, mask=None):
        """
        Decode the best tag sequence for every sentence in a batch.

        inputData - 2D long tensor          - Word indices, (sentences, positions).
        mask      - 2D bool tensor or None  - Optional; forwarded to the CRF's Viterbi decoder. With None
                                              (the default) all positions are decoded.

        Returns a list with one list of tag indices per sentence.
        """
        emissions = self._emissions(inputData)
        # Each decoded entry is a (path, score) pair; only the path is kept.
        best_paths = self.CRF.viterbi_tags(emissions, mask=mask)
        return [path for path, _ in best_paths]





In [9]:
# First two sentences of the EWT training dataset, so that quick debugging can be run

# NOTE(review): `tags` is not passed to convertDataShape below, so the label dictionary used
# for debugging is built only from the labels present in trainingDebugTags - confirm intended.
tags = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]

# Parallel lists: one list of words and one list of tags per sentence.
trainingDebugSen = [["Where", "in", "the", "world", "is", "Iguazu", "?"], ["Iguazu", "Falls"]]
trainingDebugTags = [["O", "O", "O", "O", "O", "B-LOC", "O"], ["B-LOC", "I-LOC"]]

# Build the padded index tensors together with the vocabulary and label dictionaries.
dataDebug, labelsDebug, vocabDebug, tagsDebug = convertDataShape(list(zip(trainingDebugSen, trainingDebugTags)))

In [11]:
# Quick training script on the debug dataset

# Hyperparameters for the debug run.
DIM_EMBEDDING = 100
LSTM_HIDDEN = 50
LEARNING_RATE = 0.01
EPOCHS = 5

# tagsDebug is the label dictionary built by convertDataShape; the model only uses its length.
model = baselineModel(len(vocabDebug), tagsDebug, DIM_EMBEDDING, LSTM_HIDDEN)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Each epoch is a single update on the whole debug set (no mini-batching).
for epoch in range(EPOCHS):
    model.train()
    
    # Clear gradients left over from the previous step before computing new ones.
    optimizer.zero_grad()
    # forwardTrain returns the negated CRF log-likelihood, which is minimised here.
    loss = model.forwardTrain(dataDebug, labelsDebug)
    
    loss.backward()
    optimizer.step()
    
    print(f"Epoch {epoch}, Loss: {loss.item()}")


Epoch 0, Loss: 26.6339168548584
Epoch 1, Loss: 19.067951202392578
Epoch 2, Loss: 13.250993728637695
Epoch 3, Loss: 8.547966003417969
Epoch 4, Loss: 4.967374801635742


In [22]:
# Getting predictions and checking accuracy

from torcheval.metrics.functional import multiclass_accuracy
from torcheval.metrics.functional import multiclass_confusion_matrix as MCM

# Decoding only: no gradients are needed.
with torch.no_grad():
    predicts = model.forwardPred(dataDebug)

# Confusion matrix over all positions: rows are the true labels, columns the predicted ones.
confMat = MCM(torch.flatten(torch.tensor(predicts, dtype=torch.long)), torch.flatten(labelsDebug), num_classes = len(tagsDebug))

# Accuracy = correct predictions / real tokens.
# Row 0 (true label is the padding token) is dropped because padding positions don't matter.
# Column 0 is kept in the denominator: a real token predicted as padding is still an error.
acc = torch.trace(confMat[1:,1:])/torch.sum(confMat[1:,:])

TypeError: 'function' object is not subscriptable