Just using this to write the easily test the code for the baseline model. Final implementation will be in a py script, so it can be run from command line using GPU.


# To do!
- create function to extract data to train model DONE!
- create function to output tags into appropriate format DONE!
- make model
  - Incorporate start, stop and unknown tokens into the convert data shape. Start and stop should be both a label and a vocab. Unknown should only be vocab
  - Define allowed transitions, such as cannot transition into start token, cannot transition into pad token, except from stop token, cannot transition out of stop token except into pad token, can only transition into I tokens, from the B token of the same category. Potentially use allowed_transitions from the allen nlp CRF module to create it, it should then be fed into the model on its creation
- train model This part should be working, but need the rest
- define hyperparamter space and random space search to optimize on dev dataset
- submit results

In [80]:
#Putting all the imports in one place for readability
import numpy as np
import torch
from torch import nn
from allennlp.modules.conditional_random_field import ConditionalRandomField as CRF
from allennlp.modules.conditional_random_field import allowed_transitions
from torcheval.metrics.functional import multiclass_accuracy
from torcheval.metrics.functional import multiclass_confusion_matrix as MCM
import random


# Setting seeds to ensure reproducibility of results

random.seed(666)
np.random.seed(666)
torch.manual_seed(666)

<torch._C.Generator at 0x196c3525ed0>

In [81]:
#Extracts the data into 2 lists of lists, one with the tokens another with the tags


def extractData(filePath):
    """
    Returns:tuple: A tuple containing input data (list of lists of words), tags (list of lists of tags),
    and metadata (list of tuples containing newdoc_id, sent_id, and text).
    """
    wordsData = []
    tagsData = []
    metadata = []
    currentSent = None
    with open(filePath, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line.startswith("# newdoc id"):
                newdocId = line.split("= ")[1]
            elif line.startswith("# sent_id"):
                sentId = line.split("= ")[1]
            elif line.startswith("# text"):
                text = line.split("= ")[1]
            elif line:
                parts = line.split('\t')
                word = parts[1]
                tag = parts[2]
                if sentId != currentSent:
                    currentSent = sentId
                    wordsData.append([word])
                    tagsData.append([tag])
                    metadata.append((newdocId, sentId, text))
                else:
                    wordsData[-1].append(word)
                    tagsData[-1].append(tag)
    return wordsData, tagsData, metadata

# Example usage:
#file_path = "../Data/UniversalNER/train/en_ewt-ud-train.iob2"
#words_data, tags_data, metadata = extract_data(file_path)
# for words, tags, meta in zip(words_data, tags_data, metadata):
#     print("Words:", words)
#     print("Tags:", tags)
#     print("Metadata:", meta)
#     print()


In [82]:
#Converts the Data into a tensor for use by the model

def convertDataShape(data, vocabulary={}, labels=[], training=True, paddingToken='<PAD>', START_TOKEN = '<START>', STOP_TOKEN = '<END>', UNKNOWN_TOKEN = '<UNKNOWN>'):
    """
    If training is enabled creates a vocabulary of all words in a list. Otherwise, a vocabulary should be passed.
    Does the same with the labels.
    Creates a matrix of sentences and positions, where each value indicates a word via its index in the vocabulary.
    Creates another matrix of sentences and positions, where the values indicate a label.
    '<PAD>' or another user defined token is used as padding for short sentences. Will also act as an unknown token, if not training, it is assumed to be in vocabulary.
    Returns, the vocabulary, the labels and the two matrices.
    
    Input:
    data         - (string list * string list) list - List of sentences. Each sentence is a tuple of two lists. The first is a list of words, the second a list of labels.
    vocabulary   - string : int dictionary          - Dictionary of words in the vocabulary, values are the indices. Should be provided if not training. Defaults to empty dict.
    labels       - string : int dictionary          - Dictionary of labels to classify, values are the indices. Should be provided if not training. Defaults to empty dict.
    training     - boolean                          - Boolean variable deffining whether training is taking place, if yes then a new vocabulary will be created. Defaults to yes.
    paddingToken - string                           - Token to be used as padding and unknown. Default is provided
    
    Output:
    Xmatrix      - 2D torch.tensor                  - 2d torch tensor containing the index of the word in the sentence in the vocabulary
    Ymatrix      - 2D torch.tensor                  - 2d torch tensor containing the index of the label in the sentence in the labels
    vocabulary   - string : int dictionary          - Dictionary of words, with indices as values, used for training.
    labels       - string : int dictionary          - Dictionary of all the labels, with indices as values, used for classification. (all the labels are expected to be present in the training data, or in other words, the label list provided should be exhaustive)
    """


    if training:
        vocabList = [paddingToken, START_TOKEN, STOP_TOKEN, UNKNOWN_TOKEN] + sorted(set(word for sentence, _ in data for word in sentence))
        labelList = [paddingToken, START_TOKEN, STOP_TOKEN, UNKNOWN_TOKEN] + sorted(set(label for _, sentence_labels in data for label in sentence_labels))
        vocabulary = {word: i for i, word in enumerate(vocabList)}
        labels = {label: i for i, label in enumerate(labelList)}
    
    max_len = max(len(sentence) for sentence, _ in data) + 2
    Xmatrix = np.zeros((len(data), max_len), dtype=int)
    Ymatrix = np.zeros((len(data), max_len), dtype=int)

    for i, (sentence, sentence_labels) in enumerate(data):
        Xmatrix[i, 0] = 1
        Ymatrix[i, 0] = 1
        for j, word in enumerate(sentence):
            Xmatrix[i, j+1] = vocabulary.get(word, vocabulary[paddingToken])
            last_word = j
        Xmatrix[i, last_word+2] = 2
        for j, label in enumerate(sentence_labels):
            Ymatrix[i, j+1] = labels.get(label, labels[paddingToken])
            last_word = j
        Xmatrix[i, last_word+2] = 2
    
    return torch.tensor(Xmatrix, dtype=torch.long), torch.tensor(Ymatrix, dtype=torch.long), vocabulary, labels

# two first sentences of EWT training dataset so that quickdebugging can be run

tags = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]

trainingDebugSen = [["Where", "in", "the", "world", "is", "Iguazu", "?"], ["Iguazu", "Falls"]]
trainingDebugTags = [["O", "O", "O", "O", "O", "B-LOC", "O"], ["B-LOC", "I-LOC"]]

dataDebug, labelsDebug, vocabDebug, tagsDebug = convertDataShape(list(zip(trainingDebugSen, trainingDebugTags)))
print(dataDebug)
print(labelsDebug)
print(vocabDebug)
print(tagsDebug)

tensor([[ 1,  7,  8, 10, 11,  9,  6,  4,  2],
        [ 1,  6,  5,  2,  0,  0,  0,  0,  0]])
tensor([[1, 6, 6, 6, 6, 6, 4, 6, 0],
        [1, 4, 5, 0, 0, 0, 0, 0, 0]])
{'<PAD>': 0, '<START>': 1, '<END>': 2, '<UNKNOWN>': 3, '?': 4, 'Falls': 5, 'Iguazu': 6, 'Where': 7, 'in': 8, 'is': 9, 'the': 10, 'world': 11}
{'<PAD>': 0, '<START>': 1, '<END>': 2, '<UNKNOWN>': 3, 'B-LOC': 4, 'I-LOC': 5, 'O': 6}


In [83]:


class baselineModel(torch.nn.Module):
    def __init__(self, nWords, tags, dimEmbed, dimHidden, constraint_type=None):
        super().__init__()
        self.dimEmbed = dimEmbed
        self.dimHidden = dimHidden
        self.vocabSize = nWords
        self.tagSetSize = len(tags)

        self.embed = nn.Embedding(nWords, dimEmbed)
        self.LSTM = nn.LSTM(dimEmbed, dimHidden, bidirectional=True)
        self.linear = nn.Linear(dimHidden * 2, self.tagSetSize)
        
        # 
        if constraint_type:
            tags_reversed = {v: k for k, v in tags.items()}
            constraints = allowed_transitions(constraint_type, tags_reversed)
        else:
            constraints = None

        # Initialize the CRF layer
        self.CRF = CRF(self.tagSetSize, constraints=constraints, include_start_end_transitions=True)

    def forwardTrain(self, inputData, labels):
        # Embedding and LSTM layers
        wordVectors = self.embed(inputData)
        lstmOut, _ = self.LSTM(wordVectors)
        
        # Linear layer
        emissions = self.linear(lstmOut)
        
        # CRF layer to compute the log likelihood loss
        log_likelihood = self.CRF(emissions, labels)
        
        # The loss is the negative log-likelihood
        loss = -log_likelihood
        return loss
        
    def forwardPred(self, inputData):
        # Embedding and LSTM layers
        wordVectors = self.embed(inputData)
        lstmOut, _ = self.LSTM(wordVectors)
        
        # Linear layer
        emissions = self.linear(lstmOut)
        
        # Decode the best path
        best_paths = self.CRF.viterbi_tags(emissions)
        
        # Extract the predicted tags from the paths
        predictions = [path for path, score in best_paths]
        return predictions


In [8]:

def saveToIob2(words, labels, outputFilePath):
    """
    Save words and their corresponding labels in IOB2 format.

    Args:
    words (list): List of lists containing words.
    labels (list): List of lists containing labels.
    output_file (str): Path to the output IOB2 file.
    """
    with open(outputFilePath, 'w', encoding='utf-8') as file:
        for i in range(len(words)):
            for j in range(len(words[i])):
                line = f"{j+1}\t{words[i][j]}\t{labels[i][j]}\n"
                file.write(line)
            file.write('\n')

In [58]:
# two first sentences of EWT training dataset so that quickdebugging can be run

tags = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]

trainingDebugSen = [["Where", "in", "the", "world", "is", "Iguazu", "?"], ["Iguazu", "Falls"]]
trainingDebugTags = [["O", "O", "O", "O", "O", "B-LOC", "O"], ["B-LOC", "I-LOC"]]

dataDebug, labelsDebug, vocabDebug, tagsDebug = convertDataShape(list(zip(trainingDebugSen, trainingDebugTags)))

In [89]:
#Quick traininig script on the debug dataset

DIM_EMBEDDING = 100
LSTM_HIDDEN = 50
LEARNING_RATE = 0.01
EPOCHS = 5

random.seed(666)
np.random.seed(666)
torch.manual_seed(666)

constraint_type = "BIOUL"

model = baselineModel(len(vocabDebug), tagsDebug, DIM_EMBEDDING, LSTM_HIDDEN, constraint_type)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    model.train()
    
    optimizer.zero_grad()
    loss = model.forwardTrain(dataDebug, labelsDebug)
    
    loss.backward()
    optimizer.step()
    
    print(f"Epoch {epoch}, Loss: {loss.item()}")


Epoch 0, Loss: 36.203163146972656
Epoch 1, Loss: 26.62387466430664
Epoch 2, Loss: 18.93437957763672
Epoch 3, Loss: 12.457599639892578
Epoch 4, Loss: 7.5099945068359375


In [84]:
#Getting predicitons and checking accuracy


with torch.no_grad():
    predictsDebug = model.forwardPred(dataDebug)

confMat = MCM(torch.flatten(torch.tensor(predictsDebug, dtype=torch.long)), torch.flatten(labelsDebug), num_classes = len(tagsDebug))

acc = torch.trace(confMat[1:,1:])/torch.sum(confMat[1:,1:]) #Taking away the first collumn and first row, because those correspond to the padding token and we don't care
acc

tensor(0.5455)

In [85]:
# Loading the EWT train data set

filePath = "../Data/UniversalNER/train/en_ewt-ud-train.iob2"
wordsData, tagsData, metadata = extractData(filePath)

trainData, trainLabels, vocab, labels = convertDataShape(list(zip(wordsData, tagsData)))

In [92]:
DIM_EMBEDDING = 100
LSTM_HIDDEN = 50
LEARNING_RATE = 0.01
EPOCHS = 5
BATCH_SIZE = 32
PADDING_TOKEN = '<PAD>'

random.seed(666)
np.random.seed(666)
torch.manual_seed(666)

numBatches = trainData.shape[0] // BATCH_SIZE

trainDataBatches = trainData[:BATCH_SIZE*numBatches].view(numBatches, trainData.shape[1], BATCH_SIZE)
trainLabelsBatches = trainLabels[:BATCH_SIZE*numBatches].view(numBatches, trainLabels.shape[1], BATCH_SIZE)


constraint_type = "BIO"

model = baselineModel(len(vocab), labels, DIM_EMBEDDING, LSTM_HIDDEN, constraint_type)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    model.train()

    model.zero_grad()

    for batch in zip(trainDataBatches, trainLabelsBatches): 
        optimizer.zero_grad()
        
        loss = model.forwardTrain(batch[0], batch[1])
        loss.backward()
        optimizer.step()
        
    
    print(epoch)  
    
    print(f"Epoch {epoch}, Loss: {loss.item()}")


0
Epoch 0, Loss: 188.17529296875


KeyboardInterrupt: 

In [87]:
#Loading the dev EWT dataset

devFilePath = "../Data/UniversalNER/dev/en_ewt-ud-dev.iob2"
devWordsData, devTagsData, metadata = extractData(devFilePath)

devData, devLabels, _, _ = convertDataShape(list(zip(devWordsData, devTagsData)), vocabulary = vocab, labels = labels, training = False)

In [88]:
#Getting predicitons and checking accuracy


with torch.no_grad():
    predicts = model.forwardPred(devData)

confMat = MCM(torch.flatten(torch.tensor(predicts, dtype=torch.long)), torch.flatten(devLabels), num_classes = len(labels))

acc = torch.trace(confMat[1:,1:])/torch.sum(confMat[1:,1:]) #Taking away the first collumn and first row, because those correspond to the padding token and we don't care
acc

IndexError: index out of range in self

In [110]:
outputFilePath = "./baselineModel.iob2"

#convert the predictions back into labels

# creates a list of lists of tags, where the padding token is excluded
predictLabels = [[list(labels.keys())[i] for i in j if list(labels.keys())[i] != PADDING_TOKEN] for j in predicts]

# the saveToIob2 works when provided data in the right format
saveToIob2(devWordsData, devTagsData, outputFilePath)
